* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6: (52 commits)
split invalidate_inodes()
fs: skip I_FREEING inodes in writeback_sb_inodes
fs: fold invalidate_list into invalidate_inodes
fs: do not drop inode_lock in dispose_list
fs: inode split IO and LRU lists
fs: switch bdev inode bdi's correctly
fs: fix buffer invalidation in invalidate_list
fsnotify: use dget_parent
smbfs: use dget_parent
exportfs: use dget_parent
fs: use RCU read side protection in d_validate
fs: clean up dentry lru modification
fs: split __shrink_dcache_sb
fs: improve DCACHE_REFERENCED usage
fs: use percpu counter for nr_dentry and nr_dentry_unused
fs: simplify __d_free
fs: take dcache_lock inside __d_path
fs: do not assign default i_ino in new_inode
fs: introduce a per-cpu last_ino allocator
new helper: ihold()
...
* and kswapd activity, but those code paths have their own
* higher-level throttling.
*/
- if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+ if (wbc->sync_mode != WB_SYNC_NONE) {
lock_buffer(bh);
} else if (!trylock_buffer(bh)) {
redirty_page_for_writepage(wbc, page);
}
EXPORT_SYMBOL(page_zero_new_buffers);
- int block_prepare_write(struct page *page, unsigned from, unsigned to,
+ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block)
{
+ unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned to = from + len;
struct inode *inode = page->mapping->host;
unsigned block_start, block_end;
sector_t block;
}
return err;
}
- EXPORT_SYMBOL(block_prepare_write);
+ EXPORT_SYMBOL(__block_write_begin);
static int __block_commit_write(struct inode *inode, struct page *page,
unsigned from, unsigned to)
return 0;
}
- int __block_write_begin(struct page *page, loff_t pos, unsigned len,
- get_block_t *get_block)
- {
- unsigned start = pos & (PAGE_CACHE_SIZE - 1);
-
- return block_prepare_write(page, start, start + len, get_block);
- }
- EXPORT_SYMBOL(__block_write_begin);
-
/*
* block_write_begin takes care of the basic task of block allocation and
* bringing partial write blocks uptodate first.
else
end = PAGE_CACHE_SIZE;
- ret = block_prepare_write(page, 0, end, get_block);
+ ret = __block_write_begin(page, 0, end, get_block);
if (!ret)
ret = block_commit_write(page, 0, end);
*fsdata = NULL;
if (page_has_buffers(page)) {
- unlock_page(page);
- page_cache_release(page);
- *pagep = NULL;
- return block_write_begin(mapping, pos, len, flags, pagep,
- get_block);
+ ret = __block_write_begin(page, pos, len, get_block);
+ if (unlikely(ret))
+ goto out_release;
+ return ret;
}
if (PageMappedToDisk(page))
return sb->s_bdi;
}
+ static inline struct inode *wb_inode(struct list_head *head)
+ {
+ return list_entry(head, struct inode, i_wb_list);
+ }
+
static void bdi_queue_work(struct backing_dev_info *bdi,
struct wb_writeback_work *work)
{
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;
- tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+ tail = wb_inode(wb->b_dirty.next);
if (time_before(inode->dirtied_when, tail->dirtied_when))
inode->dirtied_when = jiffies;
}
- list_move(&inode->i_list, &wb->b_dirty);
+ list_move(&inode->i_wb_list, &wb->b_dirty);
}
/*
{
struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
- list_move(&inode->i_list, &wb->b_more_io);
+ list_move(&inode->i_wb_list, &wb->b_more_io);
}
static void inode_sync_complete(struct inode *inode)
int do_sb_sort = 0;
while (!list_empty(delaying_queue)) {
- inode = list_entry(delaying_queue->prev, struct inode, i_list);
+ inode = wb_inode(delaying_queue->prev);
if (older_than_this &&
inode_dirtied_after(inode, *older_than_this))
break;
if (sb && sb != inode->i_sb)
do_sb_sort = 1;
sb = inode->i_sb;
- list_move(&inode->i_list, &tmp);
+ list_move(&inode->i_wb_list, &tmp);
}
/* just one sb in list, splice to dispatch_queue and we're done */
/* Move inodes from one superblock together */
while (!list_empty(&tmp)) {
- inode = list_entry(tmp.prev, struct inode, i_list);
- sb = inode->i_sb;
+ sb = wb_inode(tmp.prev)->i_sb;
list_for_each_prev_safe(pos, node, &tmp) {
- inode = list_entry(pos, struct inode, i_list);
+ inode = wb_inode(pos);
if (inode->i_sb == sb)
- list_move(&inode->i_list, dispatch_queue);
+ list_move(&inode->i_wb_list, dispatch_queue);
}
}
}
* completion.
*/
redirty_tail(inode);
- } else if (atomic_read(&inode->i_count)) {
- /*
- * The inode is clean, inuse
- */
- list_move(&inode->i_list, &inode_in_use);
} else {
/*
- * The inode is clean, unused
+ * The inode is clean. At this point we either have
+ * a reference to the inode or it's on it's way out.
+ * No need to add it back to the LRU.
*/
- list_move(&inode->i_list, &inode_unused);
+ list_del_init(&inode->i_wb_list);
}
}
inode_sync_complete(inode);
{
while (!list_empty(&wb->b_io)) {
long pages_skipped;
- struct inode *inode = list_entry(wb->b_io.prev,
- struct inode, i_list);
+ struct inode *inode = wb_inode(wb->b_io.prev);
if (inode->i_sb != sb) {
if (only_this_sb) {
return 0;
}
- if (inode->i_state & (I_NEW | I_WILL_FREE)) {
+ /*
+ * Don't bother with new inodes or inodes beeing freed, first
+ * kind does not need peridic writeout yet, and for the latter
+ * kind writeout is handled by the freer.
+ */
+ if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
requeue_io(inode);
continue;
}
+
/*
* Was this inode dirtied after sync_sb_inodes was called?
* This keeps sync from extra jobs and livelock.
if (inode_dirtied_after(inode, wbc->wb_start))
return 1;
- BUG_ON(inode->i_state & I_FREEING);
__iget(inode);
pages_skipped = wbc->pages_skipped;
writeback_single_inode(inode, wbc);
queue_io(wb, wbc->older_than_this);
while (!list_empty(&wb->b_io)) {
- struct inode *inode = list_entry(wb->b_io.prev,
- struct inode, i_list);
+ struct inode *inode = wb_inode(wb->b_io.prev);
struct super_block *sb = inode->i_sb;
if (!pin_sb_for_writeback(sb)) {
global_dirty_limits(&background_thresh, &dirty_thresh);
return (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+ global_page_state(NR_UNSTABLE_NFS) > background_thresh);
}
/*
*/
spin_lock(&inode_lock);
if (!list_empty(&wb->b_more_io)) {
- inode = list_entry(wb->b_more_io.prev,
- struct inode, i_list);
+ inode = wb_inode(wb->b_more_io.prev);
trace_wbc_writeback_wait(&wbc, wb->bdi);
inode_wait_for_writeback(inode);
}
return 0;
wb->last_old_flush = jiffies;
+ /*
+ * Add in the number of potentially dirty inodes, because each inode
+ * write can dirty pagecache in the underlying blockdev.
+ */
nr_pages = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS) +
- (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+ get_nr_dirty_inodes();
if (nr_pages) {
struct wb_writeback_work work = {
struct backing_dev_info *bdi = wb->bdi;
long pages_written;
- current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ current->flags |= PF_SWAPWRITE;
set_freezable();
wb->last_active = jiffies;
* dirty list. Add blockdev inodes as well.
*/
if (!S_ISBLK(inode->i_mode)) {
- if (hlist_unhashed(&inode->i_hash))
+ if (inode_unhashed(inode))
goto out;
}
if (inode->i_state & I_FREEING)
}
inode->dirtied_when = jiffies;
- list_move(&inode->i_list, &bdi->wb.b_dirty);
+ list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
}
}
out:
WARN_ON(!rwsem_is_locked(&sb->s_umount));
- work.nr_pages = nr_dirty + nr_unstable +
- (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+ work.nr_pages = nr_dirty + nr_unstable + get_nr_dirty_inodes();
bdi_queue_work(sb->s_bdi, &work);
wait_for_completion(&done);
return ret;
}
EXPORT_SYMBOL(sync_inode);
+
+ /**
+ * sync_inode - write an inode to disk
+ * @inode: the inode to sync
+ * @wait: wait for I/O to complete.
+ *
+ * Write an inode to disk and adjust it's dirty state after completion.
+ *
+ * Note: only writes the actual inode, no associated data or other metadata.
+ */
+ int sync_inode_metadata(struct inode *inode, int wait)
+ {
+ struct writeback_control wbc = {
+ .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+ .nr_to_write = 0, /* metadata-only */
+ };
+
+ return sync_inode(inode, &wbc);
+ }
+ EXPORT_SYMBOL(sync_inode_metadata);
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/magic.h>
+#include <linux/migrate.h>
#include <asm/uaccess.h>
inode = new_inode(sb);
if (inode) {
struct hugetlbfs_inode_info *info;
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_uid = uid;
inode->i_gid = gid;
return 0;
}
+static int hugetlbfs_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ int rc;
+
+ rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+ if (rc)
+ return rc;
+ migrate_page_copy(newpage, page);
+
+ return 0;
+}
+
static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
.write_begin = hugetlbfs_write_begin,
.write_end = hugetlbfs_write_end,
.set_page_dirty = hugetlbfs_set_page_dirty,
+ .migratepage = hugetlbfs_migrate_page,
};
#include <linux/mount.h>
#include <linux/async.h>
#include <linux/posix_acl.h>
+#include <linux/ima.h>
/*
* This is needed for the following functions:
* - inode_has_buffers
- * - invalidate_inode_buffers
* - invalidate_bdev
*
* FIXME: remove all knowledge of the buffer layer from this file
* allowing for low-overhead inode sync() operations.
*/
- LIST_HEAD(inode_in_use);
- LIST_HEAD(inode_unused);
+ static LIST_HEAD(inode_lru);
static struct hlist_head *inode_hashtable __read_mostly;
/*
*/
struct inodes_stat_t inodes_stat;
+ static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
+ static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
+
static struct kmem_cache *inode_cachep __read_mostly;
+ static inline int get_nr_inodes(void)
+ {
+ return percpu_counter_sum_positive(&nr_inodes);
+ }
+
+ static inline int get_nr_inodes_unused(void)
+ {
+ return percpu_counter_sum_positive(&nr_inodes_unused);
+ }
+
+ int get_nr_dirty_inodes(void)
+ {
+ int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
+ return nr_dirty > 0 ? nr_dirty : 0;
+
+ }
+
+ /*
+ * Handle nr_inode sysctl
+ */
+ #ifdef CONFIG_SYSCTL
+ int proc_nr_inodes(ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ inodes_stat.nr_inodes = get_nr_inodes();
+ inodes_stat.nr_unused = get_nr_inodes_unused();
+ return proc_dointvec(table, write, buffer, lenp, ppos);
+ }
+ #endif
+
static void wake_up_inode(struct inode *inode)
{
/*
inode->i_fsnotify_mask = 0;
#endif
+ percpu_counter_inc(&nr_inodes);
+
return 0;
out:
return -ENOMEM;
if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
posix_acl_release(inode->i_default_acl);
#endif
+ percpu_counter_dec(&nr_inodes);
}
EXPORT_SYMBOL(__destroy_inode);
- void destroy_inode(struct inode *inode)
+ static void destroy_inode(struct inode *inode)
{
+ BUG_ON(!list_empty(&inode->i_lru));
__destroy_inode(inode);
if (inode->i_sb->s_op->destroy_inode)
inode->i_sb->s_op->destroy_inode(inode);
INIT_HLIST_NODE(&inode->i_hash);
INIT_LIST_HEAD(&inode->i_dentry);
INIT_LIST_HEAD(&inode->i_devices);
+ INIT_LIST_HEAD(&inode->i_wb_list);
+ INIT_LIST_HEAD(&inode->i_lru);
INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
spin_lock_init(&inode->i_data.tree_lock);
spin_lock_init(&inode->i_data.i_mmap_lock);
*/
void __iget(struct inode *inode)
{
- if (atomic_inc_return(&inode->i_count) != 1)
- return;
+ atomic_inc(&inode->i_count);
+ }
+
+ /*
+ * get additional reference to inode; caller must already hold one.
+ */
+ void ihold(struct inode *inode)
+ {
+ WARN_ON(atomic_inc_return(&inode->i_count) < 2);
+ }
+ EXPORT_SYMBOL(ihold);
+
+ static void inode_lru_list_add(struct inode *inode)
+ {
+ if (list_empty(&inode->i_lru)) {
+ list_add(&inode->i_lru, &inode_lru);
+ percpu_counter_inc(&nr_inodes_unused);
+ }
+ }
- if (!(inode->i_state & (I_DIRTY|I_SYNC)))
- list_move(&inode->i_list, &inode_in_use);
- inodes_stat.nr_unused--;
+ static void inode_lru_list_del(struct inode *inode)
+ {
+ if (!list_empty(&inode->i_lru)) {
+ list_del_init(&inode->i_lru);
+ percpu_counter_dec(&nr_inodes_unused);
+ }
+ }
+
+ static inline void __inode_sb_list_add(struct inode *inode)
+ {
+ list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
}
+ /**
+ * inode_sb_list_add - add inode to the superblock list of inodes
+ * @inode: inode to add
+ */
+ void inode_sb_list_add(struct inode *inode)
+ {
+ spin_lock(&inode_lock);
+ __inode_sb_list_add(inode);
+ spin_unlock(&inode_lock);
+ }
+ EXPORT_SYMBOL_GPL(inode_sb_list_add);
+
+ static inline void __inode_sb_list_del(struct inode *inode)
+ {
+ list_del_init(&inode->i_sb_list);
+ }
+
+ static unsigned long hash(struct super_block *sb, unsigned long hashval)
+ {
+ unsigned long tmp;
+
+ tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+ L1_CACHE_BYTES;
+ tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
+ return tmp & I_HASHMASK;
+ }
+
+ /**
+ * __insert_inode_hash - hash an inode
+ * @inode: unhashed inode
+ * @hashval: unsigned long value used to locate this object in the
+ * inode_hashtable.
+ *
+ * Add an inode to the inode hash for this superblock.
+ */
+ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
+ {
+ struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
+
+ spin_lock(&inode_lock);
+ hlist_add_head(&inode->i_hash, b);
+ spin_unlock(&inode_lock);
+ }
+ EXPORT_SYMBOL(__insert_inode_hash);
+
+ /**
+ * __remove_inode_hash - remove an inode from the hash
+ * @inode: inode to unhash
+ *
+ * Remove an inode from the superblock.
+ */
+ static void __remove_inode_hash(struct inode *inode)
+ {
+ hlist_del_init(&inode->i_hash);
+ }
+
+ /**
+ * remove_inode_hash - remove an inode from the hash
+ * @inode: inode to unhash
+ *
+ * Remove an inode from the superblock.
+ */
+ void remove_inode_hash(struct inode *inode)
+ {
+ spin_lock(&inode_lock);
+ hlist_del_init(&inode->i_hash);
+ spin_unlock(&inode_lock);
+ }
+ EXPORT_SYMBOL(remove_inode_hash);
+
void end_writeback(struct inode *inode)
{
might_sleep();
*/
static void dispose_list(struct list_head *head)
{
- int nr_disposed = 0;
-
while (!list_empty(head)) {
struct inode *inode;
- inode = list_first_entry(head, struct inode, i_list);
- list_del(&inode->i_list);
+ inode = list_first_entry(head, struct inode, i_lru);
+ list_del_init(&inode->i_lru);
evict(inode);
spin_lock(&inode_lock);
- hlist_del_init(&inode->i_hash);
- list_del_init(&inode->i_sb_list);
+ __remove_inode_hash(inode);
+ __inode_sb_list_del(inode);
spin_unlock(&inode_lock);
wake_up_inode(inode);
destroy_inode(inode);
- nr_disposed++;
}
- spin_lock(&inode_lock);
- inodes_stat.nr_inodes -= nr_disposed;
- spin_unlock(&inode_lock);
}
- /*
- * Invalidate all inodes for a device.
+ /**
+ * evict_inodes - evict all evictable inodes for a superblock
+ * @sb: superblock to operate on
+ *
+ * Make sure that no inodes with zero refcount are retained. This is
+ * called by superblock shutdown after having MS_ACTIVE flag removed,
+ * so any inode reaching zero refcount during or after that call will
+ * be immediately evicted.
*/
- static int invalidate_list(struct list_head *head, struct list_head *dispose)
+ void evict_inodes(struct super_block *sb)
{
- struct list_head *next;
- int busy = 0, count = 0;
-
- next = head->next;
- for (;;) {
- struct list_head *tmp = next;
- struct inode *inode;
+ struct inode *inode, *next;
+ LIST_HEAD(dispose);
- /*
- * We can reschedule here without worrying about the list's
- * consistency because the per-sb list of inodes must not
- * change during umount anymore, and because iprune_sem keeps
- * shrink_icache_memory() away.
- */
- cond_resched_lock(&inode_lock);
+ down_write(&iprune_sem);
- next = next->next;
- if (tmp == head)
- break;
- inode = list_entry(tmp, struct inode, i_sb_list);
- if (inode->i_state & I_NEW)
+ spin_lock(&inode_lock);
+ list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+ if (atomic_read(&inode->i_count))
continue;
- invalidate_inode_buffers(inode);
- if (!atomic_read(&inode->i_count)) {
- list_move(&inode->i_list, dispose);
- WARN_ON(inode->i_state & I_NEW);
- inode->i_state |= I_FREEING;
- count++;
+
+ if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+ WARN_ON(1);
continue;
}
- busy = 1;
+
+ inode->i_state |= I_FREEING;
+
+ /*
+ * Move the inode off the IO lists and LRU once I_FREEING is
+ * set so that it won't get moved back on there if it is dirty.
+ */
+ list_move(&inode->i_lru, &dispose);
+ list_del_init(&inode->i_wb_list);
+ if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+ percpu_counter_dec(&nr_inodes_unused);
}
- /* only unused inodes may be cached with i_count zero */
- inodes_stat.nr_unused -= count;
- return busy;
+ spin_unlock(&inode_lock);
+
+ dispose_list(&dispose);
+ up_write(&iprune_sem);
}
/**
- * invalidate_inodes - discard the inodes on a device
- * @sb: superblock
+ * invalidate_inodes - attempt to free all inodes on a superblock
+ * @sb: superblock to operate on
*
- * Discard all of the inodes for a given superblock. If the discard
- * fails because there are busy inodes then a non zero value is returned.
- * If the discard is successful all the inodes have been discarded.
+ * Attempts to free all inodes for a given superblock. If there were any
+ * busy inodes return a non-zero value, else zero.
*/
int invalidate_inodes(struct super_block *sb)
{
- int busy;
- LIST_HEAD(throw_away);
+ int busy = 0;
+ struct inode *inode, *next;
+ LIST_HEAD(dispose);
down_write(&iprune_sem);
+
spin_lock(&inode_lock);
- fsnotify_unmount_inodes(&sb->s_inodes);
- busy = invalidate_list(&sb->s_inodes, &throw_away);
+ list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+ if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
+ continue;
+ if (atomic_read(&inode->i_count)) {
+ busy = 1;
+ continue;
+ }
+
+ inode->i_state |= I_FREEING;
+
+ /*
+ * Move the inode off the IO lists and LRU once I_FREEING is
+ * set so that it won't get moved back on there if it is dirty.
+ */
+ list_move(&inode->i_lru, &dispose);
+ list_del_init(&inode->i_wb_list);
+ if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+ percpu_counter_dec(&nr_inodes_unused);
+ }
spin_unlock(&inode_lock);
- dispose_list(&throw_away);
+ dispose_list(&dispose);
up_write(&iprune_sem);
return busy;
}
- EXPORT_SYMBOL(invalidate_inodes);
static int can_unuse(struct inode *inode)
{
- if (inode->i_state)
+ if (inode->i_state & ~I_REFERENCED)
return 0;
if (inode_has_buffers(inode))
return 0;
}
/*
- * Scan `goal' inodes on the unused list for freeable ones. They are moved to
- * a temporary list and then are freed outside inode_lock by dispose_list().
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
+ * temporary list and then are freed outside inode_lock by dispose_list().
*
* Any inodes which are pinned purely because of attached pagecache have their
- * pagecache removed. We expect the final iput() on that inode to add it to
- * the front of the inode_unused list. So look for it there and if the
- * inode is still freeable, proceed. The right inode is found 99.9% of the
- * time in testing on a 4-way.
+ * pagecache removed. If the inode has metadata buffers attached to
+ * mapping->private_list then try to remove them.
*
- * If the inode has metadata buffers attached to mapping->private_list then
- * try to remove them.
+ * If the inode has the I_REFERENCED flag set, then it means that it has been
+ * used recently - the flag is set in iput_final(). When we encounter such an
+ * inode, clear the flag and move it to the back of the LRU so it gets another
+ * pass through the LRU before it gets reclaimed. This is necessary because of
+ * the fact we are doing lazy LRU updates to minimise lock contention so the
+ * LRU does not have strict ordering. Hence we don't want to reclaim inodes
+ * with this flag set because they are the inodes that are out of order.
*/
static void prune_icache(int nr_to_scan)
{
LIST_HEAD(freeable);
- int nr_pruned = 0;
int nr_scanned;
unsigned long reap = 0;
for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
struct inode *inode;
- if (list_empty(&inode_unused))
+ if (list_empty(&inode_lru))
break;
- inode = list_entry(inode_unused.prev, struct inode, i_list);
+ inode = list_entry(inode_lru.prev, struct inode, i_lru);
- if (inode->i_state || atomic_read(&inode->i_count)) {
- list_move(&inode->i_list, &inode_unused);
+ /*
+ * Referenced or dirty inodes are still in use. Give them
+ * another pass through the LRU as we canot reclaim them now.
+ */
+ if (atomic_read(&inode->i_count) ||
+ (inode->i_state & ~I_REFERENCED)) {
+ list_del_init(&inode->i_lru);
+ percpu_counter_dec(&nr_inodes_unused);
+ continue;
+ }
+
+ /* recently referenced inodes get one more pass */
+ if (inode->i_state & I_REFERENCED) {
+ list_move(&inode->i_lru, &inode_lru);
+ inode->i_state &= ~I_REFERENCED;
continue;
}
if (inode_has_buffers(inode) || inode->i_data.nrpages) {
iput(inode);
spin_lock(&inode_lock);
- if (inode != list_entry(inode_unused.next,
- struct inode, i_list))
+ if (inode != list_entry(inode_lru.next,
+ struct inode, i_lru))
continue; /* wrong inode or list_empty */
if (!can_unuse(inode))
continue;
}
- list_move(&inode->i_list, &freeable);
WARN_ON(inode->i_state & I_NEW);
inode->i_state |= I_FREEING;
- nr_pruned++;
+
+ /*
+ * Move the inode off the IO lists and LRU once I_FREEING is
+ * set so that it won't get moved back on there if it is dirty.
+ */
+ list_move(&inode->i_lru, &freeable);
+ list_del_init(&inode->i_wb_list);
+ percpu_counter_dec(&nr_inodes_unused);
}
- inodes_stat.nr_unused -= nr_pruned;
if (current_is_kswapd())
__count_vm_events(KSWAPD_INODESTEAL, reap);
else
return -1;
prune_icache(nr);
}
- return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+ return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
}
static struct shrinker icache_shrinker = {
static void __wait_on_freeing_inode(struct inode *inode);
/*
* Called with the inode lock held.
- * NOTE: we are not increasing the inode-refcount, you must call __iget()
- * by hand after calling find_inode now! This simplifies iunique and won't
- * add any additional branch in the common code.
*/
static struct inode *find_inode(struct super_block *sb,
struct hlist_head *head,
__wait_on_freeing_inode(inode);
goto repeat;
}
- break;
+ __iget(inode);
+ return inode;
}
- return node ? inode : NULL;
+ return NULL;
}
/*
__wait_on_freeing_inode(inode);
goto repeat;
}
- break;
+ __iget(inode);
+ return inode;
}
- return node ? inode : NULL;
- }
-
- static unsigned long hash(struct super_block *sb, unsigned long hashval)
- {
- unsigned long tmp;
-
- tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
- L1_CACHE_BYTES;
- tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
- return tmp & I_HASHMASK;
- }
-
- static inline void
- __inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
- struct inode *inode)
- {
- inodes_stat.nr_inodes++;
- list_add(&inode->i_list, &inode_in_use);
- list_add(&inode->i_sb_list, &sb->s_inodes);
- if (head)
- hlist_add_head(&inode->i_hash, head);
+ return NULL;
}
- /**
- * inode_add_to_lists - add a new inode to relevant lists
- * @sb: superblock inode belongs to
- * @inode: inode to mark in use
+ /*
+ * Each cpu owns a range of LAST_INO_BATCH numbers.
+ * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
+ * to renew the exhausted range.
*
- * When an inode is allocated it needs to be accounted for, added to the in use
- * list, the owning superblock and the inode hash. This needs to be done under
- * the inode_lock, so export a function to do this rather than the inode lock
- * itself. We calculate the hash list to add to here so it is all internal
- * which requires the caller to have already set up the inode number in the
- * inode to add.
+ * This does not significantly increase overflow rate because every CPU can
+ * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
+ * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
+ * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
+ * overflow rate by 2x, which does not seem too significant.
+ *
+ * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
+ * error if st_ino won't fit in target struct field. Use 32bit counter
+ * here to attempt to avoid that.
*/
- void inode_add_to_lists(struct super_block *sb, struct inode *inode)
+ #define LAST_INO_BATCH 1024
+ static DEFINE_PER_CPU(unsigned int, last_ino);
+
+ unsigned int get_next_ino(void)
{
- struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino);
+ unsigned int *p = &get_cpu_var(last_ino);
+ unsigned int res = *p;
- spin_lock(&inode_lock);
- __inode_add_to_lists(sb, head, inode);
- spin_unlock(&inode_lock);
+ #ifdef CONFIG_SMP
+ if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
+ static atomic_t shared_last_ino;
+ int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
+
+ res = next - LAST_INO_BATCH;
+ }
+ #endif
+
+ *p = ++res;
+ put_cpu_var(last_ino);
+ return res;
}
- EXPORT_SYMBOL_GPL(inode_add_to_lists);
+ EXPORT_SYMBOL(get_next_ino);
/**
* new_inode - obtain an inode
*/
struct inode *new_inode(struct super_block *sb)
{
- /*
- * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
- * error if st_ino won't fit in target struct field. Use 32bit counter
- * here to attempt to avoid that.
- */
- static unsigned int last_ino;
struct inode *inode;
spin_lock_prefetch(&inode_lock);
inode = alloc_inode(sb);
if (inode) {
spin_lock(&inode_lock);
- __inode_add_to_lists(sb, NULL, inode);
- inode->i_ino = ++last_ino;
+ __inode_sb_list_add(inode);
inode->i_state = 0;
spin_unlock(&inode_lock);
}
void unlock_new_inode(struct inode *inode)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
- if (inode->i_mode & S_IFDIR) {
+ if (S_ISDIR(inode->i_mode)) {
struct file_system_type *type = inode->i_sb->s_type;
/* Set new key only if filesystem hasn't already changed it */
if (set(inode, data))
goto set_failed;
- __inode_add_to_lists(sb, head, inode);
+ hlist_add_head(&inode->i_hash, head);
+ __inode_sb_list_add(inode);
inode->i_state = I_NEW;
spin_unlock(&inode_lock);
* us. Use the old inode instead of the one we just
* allocated.
*/
- __iget(old);
spin_unlock(&inode_lock);
destroy_inode(inode);
inode = old;
old = find_inode_fast(sb, head, ino);
if (!old) {
inode->i_ino = ino;
- __inode_add_to_lists(sb, head, inode);
+ hlist_add_head(&inode->i_hash, head);
+ __inode_sb_list_add(inode);
inode->i_state = I_NEW;
spin_unlock(&inode_lock);
* us. Use the old inode instead of the one we just
* allocated.
*/
- __iget(old);
spin_unlock(&inode_lock);
destroy_inode(inode);
inode = old;
return inode;
}
+ /*
+ * search the inode cache for a matching inode number.
+ * If we find one, then the inode number we are trying to
+ * allocate is not unique and so we should not use it.
+ *
+ * Returns 1 if the inode number is unique, 0 if it is not.
+ */
+ static int test_inode_iunique(struct super_block *sb, unsigned long ino)
+ {
+ struct hlist_head *b = inode_hashtable + hash(sb, ino);
+ struct hlist_node *node;
+ struct inode *inode;
+
+ hlist_for_each_entry(inode, node, b, i_hash) {
+ if (inode->i_ino == ino && inode->i_sb == sb)
+ return 0;
+ }
+
+ return 1;
+ }
+
/**
* iunique - get a unique inode number
* @sb: superblock
* error if st_ino won't fit in target struct field. Use 32bit counter
* here to attempt to avoid that.
*/
+ static DEFINE_SPINLOCK(iunique_lock);
static unsigned int counter;
- struct inode *inode;
- struct hlist_head *head;
ino_t res;
spin_lock(&inode_lock);
+ spin_lock(&iunique_lock);
do {
if (counter <= max_reserved)
counter = max_reserved + 1;
res = counter++;
- head = inode_hashtable + hash(sb, res);
- inode = find_inode_fast(sb, head, res);
- } while (inode != NULL);
+ } while (!test_inode_iunique(sb, res));
+ spin_unlock(&iunique_lock);
spin_unlock(&inode_lock);
return res;
spin_lock(&inode_lock);
inode = find_inode(sb, head, test, data);
if (inode) {
- __iget(inode);
spin_unlock(&inode_lock);
if (likely(wait))
wait_on_inode(inode);
spin_lock(&inode_lock);
inode = find_inode_fast(sb, head, ino);
if (inode) {
- __iget(inode);
spin_unlock(&inode_lock);
wait_on_inode(inode);
return inode;
__iget(old);
spin_unlock(&inode_lock);
wait_on_inode(old);
- if (unlikely(!hlist_unhashed(&old->i_hash))) {
+ if (unlikely(!inode_unhashed(old))) {
iput(old);
return -EBUSY;
}
__iget(old);
spin_unlock(&inode_lock);
wait_on_inode(old);
- if (unlikely(!hlist_unhashed(&old->i_hash))) {
+ if (unlikely(!inode_unhashed(old))) {
iput(old);
return -EBUSY;
}
}
EXPORT_SYMBOL(insert_inode_locked4);
- /**
- * __insert_inode_hash - hash an inode
- * @inode: unhashed inode
- * @hashval: unsigned long value used to locate this object in the
- * inode_hashtable.
- *
- * Add an inode to the inode hash for this superblock.
- */
- void __insert_inode_hash(struct inode *inode, unsigned long hashval)
- {
- struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
- spin_lock(&inode_lock);
- hlist_add_head(&inode->i_hash, head);
- spin_unlock(&inode_lock);
- }
- EXPORT_SYMBOL(__insert_inode_hash);
-
- /**
- * remove_inode_hash - remove an inode from the hash
- * @inode: inode to unhash
- *
- * Remove an inode from the superblock.
- */
- void remove_inode_hash(struct inode *inode)
- {
- spin_lock(&inode_lock);
- hlist_del_init(&inode->i_hash);
- spin_unlock(&inode_lock);
- }
- EXPORT_SYMBOL(remove_inode_hash);
int generic_delete_inode(struct inode *inode)
{
*/
int generic_drop_inode(struct inode *inode)
{
- return !inode->i_nlink || hlist_unhashed(&inode->i_hash);
+ return !inode->i_nlink || inode_unhashed(inode);
}
EXPORT_SYMBOL_GPL(generic_drop_inode);
drop = generic_drop_inode(inode);
if (!drop) {
- if (!(inode->i_state & (I_DIRTY|I_SYNC)))
- list_move(&inode->i_list, &inode_unused);
- inodes_stat.nr_unused++;
if (sb->s_flags & MS_ACTIVE) {
+ inode->i_state |= I_REFERENCED;
+ if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
+ inode_lru_list_add(inode);
+ }
spin_unlock(&inode_lock);
return;
}
spin_lock(&inode_lock);
WARN_ON(inode->i_state & I_NEW);
inode->i_state &= ~I_WILL_FREE;
- inodes_stat.nr_unused--;
- hlist_del_init(&inode->i_hash);
+ __remove_inode_hash(inode);
}
- list_del_init(&inode->i_list);
- list_del_init(&inode->i_sb_list);
+
WARN_ON(inode->i_state & I_NEW);
inode->i_state |= I_FREEING;
- inodes_stat.nr_inodes--;
+
+ /*
+ * Move the inode off the IO lists and LRU once I_FREEING is
+ * set so that it won't get moved back on there if it is dirty.
+ */
+ inode_lru_list_del(inode);
+ list_del_init(&inode->i_wb_list);
+
+ __inode_sb_list_del(inode);
spin_unlock(&inode_lock);
evict(inode);
- spin_lock(&inode_lock);
- hlist_del_init(&inode->i_hash);
- spin_unlock(&inode_lock);
+ remove_inode_hash(inode);
wake_up_inode(inode);
BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
destroy_inode(inode);
SLAB_MEM_SPREAD),
init_once);
register_shrinker(&icache_shrinker);
+ percpu_counter_init(&nr_inodes, 0);
+ percpu_counter_init(&nr_inodes_unused, 0);
/* Hash may have been set up in inode_init_early */
if (!hashdist)
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/sched.h>
+#include <linux/vmalloc.h>
-#include "nfs4_fs.h"
#include "delegation.h"
#include "iostat.h"
#include "internal.h"
+#include "fscache.h"
/* #define NFS_DEBUG_VERBOSE 1 */
struct inode *, struct dentry *);
static int nfs_fsync_dir(struct file *, int);
static loff_t nfs_llseek_dir(struct file *, loff_t, int);
+static int nfs_readdir_clear_array(struct page*, gfp_t);
const struct file_operations nfs_dir_operations = {
.llseek = nfs_llseek_dir,
.setattr = nfs_setattr,
};
+const struct address_space_operations nfs_dir_addr_space_ops = {
+ .releasepage = nfs_readdir_clear_array,
+};
+
#ifdef CONFIG_NFS_V3
const struct inode_operations nfs3_dir_inode_operations = {
.create = nfs_create,
#ifdef CONFIG_NFS_V4
static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *);
+static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd);
const struct inode_operations nfs4_dir_inode_operations = {
- .create = nfs_create,
+ .create = nfs_open_create,
.lookup = nfs_atomic_lookup,
.link = nfs_link,
.unlink = nfs_unlink,
return res;
}
-typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int);
+struct nfs_cache_array_entry {
+ u64 cookie;
+ u64 ino;
+ struct qstr string;
+};
+
+struct nfs_cache_array {
+ unsigned int size;
+ int eof_index;
+ u64 last_cookie;
+ struct nfs_cache_array_entry array[0];
+};
+
+#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry))
+
+typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
typedef struct {
struct file *file;
struct page *page;
unsigned long page_index;
- __be32 *ptr;
u64 *dir_cookie;
loff_t current_index;
- struct nfs_entry *entry;
decode_dirent_t decode;
- int plus;
+
unsigned long timestamp;
unsigned long gencount;
- int timestamp_valid;
+ unsigned int cache_entry_index;
+ unsigned int plus:1;
+ unsigned int eof:1;
} nfs_readdir_descriptor_t;
-/* Now we cache directories properly, by stuffing the dirent
- * data directly in the page cache.
- *
- * Inode invalidation due to refresh etc. takes care of
- * _everything_, no sloppy entry flushing logic, no extraneous
- * copying, network direct to page cache, the way it was meant
- * to be.
- *
- * NOTE: Dirent information verification is done always by the
- * page-in of the RPC reply, nowhere else, this simplies
- * things substantially.
+/*
+ * The caller is responsible for calling nfs_readdir_release_array(page)
*/
static
-int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
+struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
+{
+ if (page == NULL)
+ return ERR_PTR(-EIO);
+ return (struct nfs_cache_array *)kmap(page);
+}
+
+static
+void nfs_readdir_release_array(struct page *page)
+{
+ kunmap(page);
+}
+
+/*
+ * we are freeing strings created by nfs_add_to_readdir_array()
+ */
+static
+int nfs_readdir_clear_array(struct page *page, gfp_t mask)
+{
+ struct nfs_cache_array *array = nfs_readdir_get_array(page);
+ int i;
+ for (i = 0; i < array->size; i++)
+ kfree(array->array[i].string.name);
+ nfs_readdir_release_array(page);
+ return 0;
+}
+
+/*
+ * the caller is responsible for freeing qstr.name
+ * when called by nfs_readdir_add_to_array, the strings will be freed in
+ * nfs_clear_readdir_array()
+ */
+static
+int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
+{
+ string->len = len;
+ string->name = kmemdup(name, len, GFP_KERNEL);
+ if (string->name == NULL)
+ return -ENOMEM;
+ string->hash = full_name_hash(name, len);
+ return 0;
+}
+
+static
+int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
+{
+ struct nfs_cache_array *array = nfs_readdir_get_array(page);
+ struct nfs_cache_array_entry *cache_entry;
+ int ret;
+
+ if (IS_ERR(array))
+ return PTR_ERR(array);
+ ret = -EIO;
+ if (array->size >= MAX_READDIR_ARRAY)
+ goto out;
+
+ cache_entry = &array->array[array->size];
+ cache_entry->cookie = entry->prev_cookie;
+ cache_entry->ino = entry->ino;
+ ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
+ if (ret)
+ goto out;
+ array->last_cookie = entry->cookie;
+ if (entry->eof == 1)
+ array->eof_index = array->size;
+ array->size++;
+out:
+ nfs_readdir_release_array(page);
+ return ret;
+}
+
+static
+int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+{
+ loff_t diff = desc->file->f_pos - desc->current_index;
+ unsigned int index;
+
+ if (diff < 0)
+ goto out_eof;
+ if (diff >= array->size) {
+ if (array->eof_index > 0)
+ goto out_eof;
+ desc->current_index += array->size;
+ return -EAGAIN;
+ }
+
+ index = (unsigned int)diff;
+ *desc->dir_cookie = array->array[index].cookie;
+ desc->cache_entry_index = index;
+ if (index == array->eof_index)
+ desc->eof = 1;
+ return 0;
+out_eof:
+ desc->eof = 1;
+ return -EBADCOOKIE;
+}
+
+static
+int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+{
+ int i;
+ int status = -EAGAIN;
+
+ for (i = 0; i < array->size; i++) {
+ if (i == array->eof_index) {
+ desc->eof = 1;
+ status = -EBADCOOKIE;
+ }
+ if (array->array[i].cookie == *desc->dir_cookie) {
+ desc->cache_entry_index = i;
+ status = 0;
+ break;
+ }
+ }
+
+ return status;
+}
+
+static
+int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
+{
+ struct nfs_cache_array *array;
+ int status = -EBADCOOKIE;
+
+ if (desc->dir_cookie == NULL)
+ goto out;
+
+ array = nfs_readdir_get_array(desc->page);
+ if (IS_ERR(array)) {
+ status = PTR_ERR(array);
+ goto out;
+ }
+
+ if (*desc->dir_cookie == 0)
+ status = nfs_readdir_search_for_pos(array, desc);
+ else
+ status = nfs_readdir_search_for_cookie(array, desc);
+
+ nfs_readdir_release_array(desc->page);
+out:
+ return status;
+}
+
+/* Fill a page with xdr information before transferring to the cache page */
+static
+int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
+ struct nfs_entry *entry, struct file *file, struct inode *inode)
{
- struct file *file = desc->file;
- struct inode *inode = file->f_path.dentry->d_inode;
struct rpc_cred *cred = nfs_file_cred(file);
unsigned long timestamp, gencount;
int error;
- dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
- __func__, (long long)desc->entry->cookie,
- page->index);
-
again:
timestamp = jiffies;
gencount = nfs_inc_attr_generation_counter();
- error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page,
+ error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
NFS_SERVER(inode)->dtsize, desc->plus);
if (error < 0) {
/* We requested READDIRPLUS, but the server doesn't grok it */
}
desc->timestamp = timestamp;
desc->gencount = gencount;
- desc->timestamp_valid = 1;
- SetPageUptodate(page);
- /* Ensure consistent page alignment of the data.
- * Note: assumes we have exclusive access to this mapping either
- * through inode->i_mutex or some other mechanism.
- */
- if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
- /* Should never happen */
- nfs_zap_mapping(inode, inode->i_mapping);
- }
- unlock_page(page);
- return 0;
- error:
- unlock_page(page);
- return -EIO;
+error:
+ return error;
}
-static inline
-int dir_decode(nfs_readdir_descriptor_t *desc)
+/* Fill in an entry based on the xdr code stored in desc->page */
+static
+int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream)
{
- __be32 *p = desc->ptr;
- p = desc->decode(p, desc->entry, desc->plus);
+ __be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus);
if (IS_ERR(p))
return PTR_ERR(p);
- desc->ptr = p;
- if (desc->timestamp_valid) {
- desc->entry->fattr->time_start = desc->timestamp;
- desc->entry->fattr->gencount = desc->gencount;
- } else
- desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
+
+ entry->fattr->time_start = desc->timestamp;
+ entry->fattr->gencount = desc->gencount;
return 0;
}
-static inline
-void dir_page_release(nfs_readdir_descriptor_t *desc)
+static
+int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
{
- kunmap(desc->page);
- page_cache_release(desc->page);
- desc->page = NULL;
- desc->ptr = NULL;
+ struct nfs_inode *node;
+ if (dentry->d_inode == NULL)
+ goto different;
+ node = NFS_I(dentry->d_inode);
+ if (node->fh.size != entry->fh->size)
+ goto different;
+ if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0)
+ goto different;
+ return 1;
+different:
+ return 0;
}
-/*
- * Given a pointer to a buffer that has already been filled by a call
- * to readdir, find the next entry with cookie '*desc->dir_cookie'.
- *
- * If the end of the buffer has been reached, return -EAGAIN, if not,
- * return the offset within the buffer of the next entry to be
- * read.
- */
-static inline
-int find_dirent(nfs_readdir_descriptor_t *desc)
+static
+void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
{
- struct nfs_entry *entry = desc->entry;
- int loop_count = 0,
- status;
-
- while((status = dir_decode(desc)) == 0) {
- dfprintk(DIRCACHE, "NFS: %s: examining cookie %Lu\n",
- __func__, (unsigned long long)entry->cookie);
- if (entry->prev_cookie == *desc->dir_cookie)
- break;
- if (loop_count++ > 200) {
- loop_count = 0;
- schedule();
+ struct qstr filename = {
+ .len = entry->len,
+ .name = entry->name,
+ };
+ struct dentry *dentry;
+ struct dentry *alias;
+ struct inode *dir = parent->d_inode;
+ struct inode *inode;
+
+ if (filename.name[0] == '.') {
+ if (filename.len == 1)
+ return;
+ if (filename.len == 2 && filename.name[1] == '.')
+ return;
+ }
+ filename.hash = full_name_hash(filename.name, filename.len);
+
+ dentry = d_lookup(parent, &filename);
+ if (dentry != NULL) {
+ if (nfs_same_file(dentry, entry)) {
+ nfs_refresh_inode(dentry->d_inode, entry->fattr);
+ goto out;
+ } else {
+ d_drop(dentry);
+ dput(dentry);
}
}
- return status;
+
+ dentry = d_alloc(parent, &filename);
+ if (dentry == NULL)
+ return;
+
+ dentry->d_op = NFS_PROTO(dir)->dentry_ops;
+ inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
+ if (IS_ERR(inode))
+ goto out;
+
+ alias = d_materialise_unique(dentry, inode);
+ if (IS_ERR(alias))
+ goto out;
+ else if (alias) {
+ nfs_set_verifier(alias, nfs_save_change_attribute(dir));
+ dput(alias);
+ } else
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+
+out:
+ dput(dentry);
+}
+
+/* Perform conversion from xdr to cache array */
+static
+void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
+ void *xdr_page, struct page *page, unsigned int buflen)
+{
+ struct xdr_stream stream;
+ struct xdr_buf buf;
+ __be32 *ptr = xdr_page;
+ int status;
+ struct nfs_cache_array *array;
+
+ buf.head->iov_base = xdr_page;
+ buf.head->iov_len = buflen;
+ buf.tail->iov_len = 0;
+ buf.page_base = 0;
+ buf.page_len = 0;
+ buf.buflen = buf.head->iov_len;
+ buf.len = buf.head->iov_len;
+
+ xdr_init_decode(&stream, &buf, ptr);
+
+
+ do {
+ status = xdr_decode(desc, entry, &stream);
+ if (status != 0)
+ break;
+
+ if (nfs_readdir_add_to_array(entry, page) == -1)
+ break;
+ if (desc->plus == 1)
+ nfs_prime_dcache(desc->file->f_path.dentry, entry);
+ } while (!entry->eof);
+
+ if (status == -EBADCOOKIE && entry->eof) {
+ array = nfs_readdir_get_array(page);
+ array->eof_index = array->size - 1;
+ status = 0;
+ nfs_readdir_release_array(page);
+ }
+}
+
+static
+void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
+{
+ unsigned int i;
+ for (i = 0; i < npages; i++)
+ put_page(pages[i]);
+}
+
+static
+void nfs_readdir_free_large_page(void *ptr, struct page **pages,
+ unsigned int npages)
+{
+ vm_unmap_ram(ptr, npages);
+ nfs_readdir_free_pagearray(pages, npages);
}
/*
- * Given a pointer to a buffer that has already been filled by a call
- * to readdir, find the entry at offset 'desc->file->f_pos'.
- *
- * If the end of the buffer has been reached, return -EAGAIN, if not,
- * return the offset within the buffer of the next entry to be
- * read.
+ * nfs_readdir_large_page will allocate pages that must be freed with a call
+ * to nfs_readdir_free_large_page
*/
-static inline
-int find_dirent_index(nfs_readdir_descriptor_t *desc)
+static
+void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
{
- struct nfs_entry *entry = desc->entry;
- int loop_count = 0,
- status;
+ void *ptr;
+ unsigned int i;
+
+ for (i = 0; i < npages; i++) {
+ struct page *page = alloc_page(GFP_KERNEL);
+ if (page == NULL)
+ goto out_freepages;
+ pages[i] = page;
+ }
- for(;;) {
- status = dir_decode(desc);
- if (status)
- break;
+ ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
+ if (!IS_ERR_OR_NULL(ptr))
+ return ptr;
+out_freepages:
+ nfs_readdir_free_pagearray(pages, i);
+ return NULL;
+}
- dfprintk(DIRCACHE, "NFS: found cookie %Lu at index %Ld\n",
- (unsigned long long)entry->cookie, desc->current_index);
+static
+int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
+{
+ struct page *pages[NFS_MAX_READDIR_PAGES];
+ void *pages_ptr = NULL;
+ struct nfs_entry entry;
+ struct file *file = desc->file;
+ struct nfs_cache_array *array;
+ int status = 0;
+ unsigned int array_size = ARRAY_SIZE(pages);
+
+ entry.prev_cookie = 0;
+ entry.cookie = *desc->dir_cookie;
+ entry.eof = 0;
+ entry.fh = nfs_alloc_fhandle();
+ entry.fattr = nfs_alloc_fattr();
+ if (entry.fh == NULL || entry.fattr == NULL)
+ goto out;
+
+ array = nfs_readdir_get_array(page);
+ memset(array, 0, sizeof(struct nfs_cache_array));
+ array->eof_index = -1;
- if (desc->file->f_pos == desc->current_index) {
- *desc->dir_cookie = entry->cookie;
+ pages_ptr = nfs_readdir_large_page(pages, array_size);
+ if (!pages_ptr)
+ goto out_release_array;
+ do {
+ status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
+
+ if (status < 0)
break;
- }
- desc->current_index++;
- if (loop_count++ > 200) {
- loop_count = 0;
- schedule();
- }
- }
+ nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE);
+ } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY);
+
+ nfs_readdir_free_large_page(pages_ptr, pages, array_size);
+out_release_array:
+ nfs_readdir_release_array(page);
+out:
+ nfs_free_fattr(entry.fattr);
+ nfs_free_fhandle(entry.fh);
return status;
}
/*
- * Find the given page, and call find_dirent() or find_dirent_index in
- * order to try to return the next entry.
+ * Now we cache directories properly, by converting xdr information
+ * to an array that can be used for lookups later. This results in
+ * fewer cache pages, since we can store more information on each page.
+ * We only need to convert from xdr once so future lookups are much simpler
*/
-static inline
-int find_dirent_page(nfs_readdir_descriptor_t *desc)
+static
+int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
{
struct inode *inode = desc->file->f_path.dentry->d_inode;
- struct page *page;
- int status;
- dfprintk(DIRCACHE, "NFS: %s: searching page %ld for target %Lu\n",
- __func__, desc->page_index,
- (long long) *desc->dir_cookie);
+ if (nfs_readdir_xdr_to_array(desc, page, inode) < 0)
+ goto error;
+ SetPageUptodate(page);
- /* If we find the page in the page_cache, we cannot be sure
- * how fresh the data is, so we will ignore readdir_plus attributes.
- */
- desc->timestamp_valid = 0;
- page = read_cache_page(inode->i_mapping, desc->page_index,
- (filler_t *)nfs_readdir_filler, desc);
- if (IS_ERR(page)) {
- status = PTR_ERR(page);
- goto out;
+ if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
+ /* Should never happen */
+ nfs_zap_mapping(inode, inode->i_mapping);
}
+ unlock_page(page);
+ return 0;
+ error:
+ unlock_page(page);
+ return -EIO;
+}
- /* NOTE: Someone else may have changed the READDIRPLUS flag */
- desc->page = page;
- desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */
- if (*desc->dir_cookie != 0)
- status = find_dirent(desc);
- else
- status = find_dirent_index(desc);
- if (status < 0)
- dir_page_release(desc);
- out:
- dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
- return status;
+static
+void cache_page_release(nfs_readdir_descriptor_t *desc)
+{
+ page_cache_release(desc->page);
+ desc->page = NULL;
+}
+
+static
+struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
+{
+ struct page *page;
+ page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
+ desc->page_index, (filler_t *)nfs_readdir_filler, desc);
+ if (IS_ERR(page))
+ desc->eof = 1;
+ return page;
}
/*
- * Recurse through the page cache pages, and return a
- * filled nfs_entry structure of the next directory entry if possible.
- *
- * The target for the search is '*desc->dir_cookie' if non-0,
- * 'desc->file->f_pos' otherwise
+ * Returns 0 if desc->dir_cookie was found on page desc->page_index
*/
+static
+int find_cache_page(nfs_readdir_descriptor_t *desc)
+{
+ int res;
+
+ desc->page = get_cache_page(desc);
+ if (IS_ERR(desc->page))
+ return PTR_ERR(desc->page);
+
+ res = nfs_readdir_search_array(desc);
+ if (res == 0)
+ return 0;
+ cache_page_release(desc);
+ return res;
+}
+
+/* Search for desc->dir_cookie from the beginning of the page cache */
static inline
int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
{
- int loop_count = 0;
- int res;
-
- /* Always search-by-index from the beginning of the cache */
- if (*desc->dir_cookie == 0) {
- dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for offset %Ld\n",
- (long long)desc->file->f_pos);
- desc->page_index = 0;
- desc->entry->cookie = desc->entry->prev_cookie = 0;
- desc->entry->eof = 0;
- desc->current_index = 0;
- } else
- dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for cookie %Lu\n",
- (unsigned long long)*desc->dir_cookie);
+ int res = -EAGAIN;
- for (;;) {
- res = find_dirent_page(desc);
+ while (1) {
+ res = find_cache_page(desc);
if (res != -EAGAIN)
break;
- /* Align to beginning of next page */
- desc->page_index ++;
- if (loop_count++ > 200) {
- loop_count = 0;
- schedule();
- }
+ desc->page_index++;
}
-
- dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, res);
return res;
}
return (inode->i_mode >> 12) & 15;
}
-static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc);
-
/*
* Once we've found the start of the dirent within a page: fill 'er up...
*/
filldir_t filldir)
{
struct file *file = desc->file;
- struct nfs_entry *entry = desc->entry;
- struct dentry *dentry = NULL;
- u64 fileid;
- int loop_count = 0,
- res;
-
- dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n",
- (unsigned long long)entry->cookie);
-
- for(;;) {
- unsigned d_type = DT_UNKNOWN;
- /* Note: entry->prev_cookie contains the cookie for
- * retrieving the current dirent on the server */
- fileid = entry->ino;
-
- /* Get a dentry if we have one */
- if (dentry != NULL)
- dput(dentry);
- dentry = nfs_readdir_lookup(desc);
+ int i = 0;
+ int res = 0;
+ struct nfs_cache_array *array = NULL;
+ unsigned int d_type = DT_UNKNOWN;
+ struct dentry *dentry = NULL;
- /* Use readdirplus info */
- if (dentry != NULL && dentry->d_inode != NULL) {
- d_type = dt_type(dentry->d_inode);
- fileid = NFS_FILEID(dentry->d_inode);
- }
+ array = nfs_readdir_get_array(desc->page);
+
+ for (i = desc->cache_entry_index; i < array->size; i++) {
+ d_type = DT_UNKNOWN;
- res = filldir(dirent, entry->name, entry->len,
- file->f_pos, nfs_compat_user_ino64(fileid),
- d_type);
+ res = filldir(dirent, array->array[i].string.name,
+ array->array[i].string.len, file->f_pos,
+ nfs_compat_user_ino64(array->array[i].ino), d_type);
if (res < 0)
break;
file->f_pos++;
- *desc->dir_cookie = entry->cookie;
- if (dir_decode(desc) != 0) {
- desc->page_index ++;
+ desc->cache_entry_index = i;
+ if (i < (array->size-1))
+ *desc->dir_cookie = array->array[i+1].cookie;
+ else
+ *desc->dir_cookie = array->last_cookie;
+ if (i == array->eof_index) {
+ desc->eof = 1;
break;
}
- if (loop_count++ > 200) {
- loop_count = 0;
- schedule();
- }
}
- dir_page_release(desc);
+
+ nfs_readdir_release_array(desc->page);
+ cache_page_release(desc);
if (dentry != NULL)
dput(dentry);
dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
filldir_t filldir)
{
- struct file *file = desc->file;
- struct inode *inode = file->f_path.dentry->d_inode;
- struct rpc_cred *cred = nfs_file_cred(file);
struct page *page = NULL;
int status;
- unsigned long timestamp, gencount;
+ struct inode *inode = desc->file->f_path.dentry->d_inode;
dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
(unsigned long long)*desc->dir_cookie);
status = -ENOMEM;
goto out;
}
- timestamp = jiffies;
- gencount = nfs_inc_attr_generation_counter();
- status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred,
- *desc->dir_cookie, page,
- NFS_SERVER(inode)->dtsize,
- desc->plus);
- desc->page = page;
- desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */
- if (status >= 0) {
- desc->timestamp = timestamp;
- desc->gencount = gencount;
- desc->timestamp_valid = 1;
- if ((status = dir_decode(desc)) == 0)
- desc->entry->prev_cookie = *desc->dir_cookie;
- } else
+
+ if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) {
status = -EIO;
- if (status < 0)
goto out_release;
+ }
+ desc->page_index = 0;
+ desc->page = page;
status = nfs_do_filldir(desc, dirent, filldir);
- /* Reset read descriptor so it searches the page cache from
- * the start upon the next call to readdir_search_pagecache() */
- desc->page_index = 0;
- desc->entry->cookie = desc->entry->prev_cookie = 0;
- desc->entry->eof = 0;
out:
dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
__func__, status);
return status;
out_release:
- dir_page_release(desc);
+ cache_page_release(desc);
goto out;
}
struct inode *inode = dentry->d_inode;
nfs_readdir_descriptor_t my_desc,
*desc = &my_desc;
- struct nfs_entry my_entry;
int res = -ENOMEM;
dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
desc->decode = NFS_PROTO(inode)->decode_dirent;
desc->plus = NFS_USE_READDIRPLUS(inode);
- my_entry.cookie = my_entry.prev_cookie = 0;
- my_entry.eof = 0;
- my_entry.fh = nfs_alloc_fhandle();
- my_entry.fattr = nfs_alloc_fattr();
- if (my_entry.fh == NULL || my_entry.fattr == NULL)
- goto out_alloc_failed;
-
- desc->entry = &my_entry;
-
nfs_block_sillyrename(dentry);
res = nfs_revalidate_mapping(inode, filp->f_mapping);
if (res < 0)
goto out;
- while(!desc->entry->eof) {
+ while (desc->eof != 1) {
res = readdir_search_pagecache(desc);
if (res == -EBADCOOKIE) {
/* This means either end of directory */
- if (*desc->dir_cookie && desc->entry->cookie != *desc->dir_cookie) {
+ if (*desc->dir_cookie && desc->eof == 0) {
/* Or that the server has 'lost' a cookie */
res = uncached_readdir(desc, dirent, filldir);
if (res >= 0)
if (res == -ETOOSMALL && desc->plus) {
clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
nfs_zap_caches(inode);
+ desc->page_index = 0;
desc->plus = 0;
- desc->entry->eof = 0;
+ desc->eof = 0;
continue;
}
if (res < 0)
nfs_unblock_sillyrename(dentry);
if (res > 0)
res = 0;
-out_alloc_failed:
- nfs_free_fattr(my_entry.fattr);
- nfs_free_fhandle(my_entry.fh);
dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
res);
return 1;
}
+static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd)
+{
+ struct path path = {
+ .mnt = nd->path.mnt,
+ .dentry = dentry,
+ };
+ struct nfs_open_context *ctx;
+ struct rpc_cred *cred;
+ fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
+
+ cred = rpc_lookup_cred();
+ if (IS_ERR(cred))
+ return ERR_CAST(cred);
+ ctx = alloc_nfs_open_context(&path, cred, fmode);
+ put_rpccred(cred);
+ if (ctx == NULL)
+ return ERR_PTR(-ENOMEM);
+ return ctx;
+}
+
+static int do_open(struct inode *inode, struct file *filp)
+{
+ nfs_fscache_set_inode_cookie(inode, filp);
+ return 0;
+}
+
+static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx)
+{
+ struct file *filp;
+ int ret = 0;
+
+ /* If the open_intent is for execute, we have an extra check to make */
+ if (ctx->mode & FMODE_EXEC) {
+ ret = nfs_may_open(ctx->path.dentry->d_inode,
+ ctx->cred,
+ nd->intent.open.flags);
+ if (ret < 0)
+ goto out;
+ }
+ filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open);
+ if (IS_ERR(filp))
+ ret = PTR_ERR(filp);
+ else
+ nfs_file_set_open_context(filp, ctx);
+out:
+ put_nfs_open_context(ctx);
+ return ret;
+}
+
static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
{
+ struct nfs_open_context *ctx;
+ struct iattr attr;
struct dentry *res = NULL;
- int error;
+ struct inode *inode;
+ int open_flags;
+ int err;
dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n",
dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
goto out;
}
+ ctx = nameidata_to_nfs_open_context(dentry, nd);
+ res = ERR_CAST(ctx);
+ if (IS_ERR(ctx))
+ goto out;
+
+ open_flags = nd->intent.open.flags;
+ if (nd->flags & LOOKUP_CREATE) {
+ attr.ia_mode = nd->intent.open.create_mode;
+ attr.ia_valid = ATTR_MODE;
+ if (!IS_POSIXACL(dir))
+ attr.ia_mode &= ~current_umask();
+ } else {
+ open_flags &= ~(O_EXCL | O_CREAT);
+ attr.ia_valid = 0;
+ }
+
/* Open the file on the server */
- res = nfs4_atomic_open(dir, dentry, nd);
- if (IS_ERR(res)) {
- error = PTR_ERR(res);
- switch (error) {
+ nfs_block_sillyrename(dentry->d_parent);
+ inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
+ if (IS_ERR(inode)) {
+ nfs_unblock_sillyrename(dentry->d_parent);
+ put_nfs_open_context(ctx);
+ switch (PTR_ERR(inode)) {
/* Make a negative dentry */
case -ENOENT:
+ d_add(dentry, NULL);
res = NULL;
goto out;
/* This turned out not to be a regular file */
goto no_open;
/* case -EINVAL: */
default:
+ res = ERR_CAST(inode);
goto out;
}
- } else if (res != NULL)
+ }
+ res = d_add_unique(dentry, inode);
+ nfs_unblock_sillyrename(dentry->d_parent);
+ if (res != NULL) {
+ dput(ctx->path.dentry);
+ ctx->path.dentry = dget(res);
dentry = res;
+ }
+ err = nfs_intent_set_file(nd, ctx);
+ if (err < 0) {
+ if (res != NULL)
+ dput(res);
+ return ERR_PTR(err);
+ }
out:
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
return res;
no_open:
return nfs_lookup(dir, dentry, nd);
struct dentry *parent = NULL;
struct inode *inode = dentry->d_inode;
struct inode *dir;
+ struct nfs_open_context *ctx;
int openflags, ret = 0;
if (!is_atomic_open(nd) || d_mountpoint(dentry))
goto no_open;
+
parent = dget_parent(dentry);
dir = parent->d_inode;
+
/* We can't create new files in nfs_open_revalidate(), so we
* optimize away revalidation of negative dentries.
*/
/* We can't create new files, or truncate existing ones here */
openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
+ ctx = nameidata_to_nfs_open_context(dentry, nd);
+ ret = PTR_ERR(ctx);
+ if (IS_ERR(ctx))
+ goto out;
/*
* Note: we're not holding inode->i_mutex and so may be racing with
* operations that change the directory. We therefore save the
* change attribute *before* we do the RPC call.
*/
- ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
+ inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ switch (ret) {
+ case -EPERM:
+ case -EACCES:
+ case -EDQUOT:
+ case -ENOSPC:
+ case -EROFS:
+ goto out_put_ctx;
+ default:
+ goto out_drop;
+ }
+ }
+ iput(inode);
+ if (inode != dentry->d_inode)
+ goto out_drop;
+
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ ret = nfs_intent_set_file(nd, ctx);
+ if (ret >= 0)
+ ret = 1;
out:
dput(parent);
- if (!ret)
- d_drop(dentry);
return ret;
+out_drop:
+ d_drop(dentry);
+ ret = 0;
+out_put_ctx:
+ put_nfs_open_context(ctx);
+ goto out;
+
no_open_dput:
dput(parent);
no_open:
return nfs_lookup_revalidate(dentry, nd);
}
-#endif /* CONFIG_NFSV4 */
-static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
+static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode,
+ struct nameidata *nd)
{
- struct dentry *parent = desc->file->f_path.dentry;
- struct inode *dir = parent->d_inode;
- struct nfs_entry *entry = desc->entry;
- struct dentry *dentry, *alias;
- struct qstr name = {
- .name = entry->name,
- .len = entry->len,
- };
- struct inode *inode;
- unsigned long verf = nfs_save_change_attribute(dir);
+ struct nfs_open_context *ctx = NULL;
+ struct iattr attr;
+ int error;
+ int open_flags = 0;
- switch (name.len) {
- case 2:
- if (name.name[0] == '.' && name.name[1] == '.')
- return dget_parent(parent);
- break;
- case 1:
- if (name.name[0] == '.')
- return dget(parent);
- }
+ dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
+ dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
- spin_lock(&dir->i_lock);
- if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) {
- spin_unlock(&dir->i_lock);
- return NULL;
- }
- spin_unlock(&dir->i_lock);
+ attr.ia_mode = mode;
+ attr.ia_valid = ATTR_MODE;
- name.hash = full_name_hash(name.name, name.len);
- dentry = d_lookup(parent, &name);
- if (dentry != NULL) {
- /* Is this a positive dentry that matches the readdir info? */
- if (dentry->d_inode != NULL &&
- (NFS_FILEID(dentry->d_inode) == entry->ino ||
- d_mountpoint(dentry))) {
- if (!desc->plus || entry->fh->size == 0)
- return dentry;
- if (nfs_compare_fh(NFS_FH(dentry->d_inode),
- entry->fh) == 0)
- goto out_renew;
- }
- /* No, so d_drop to allow one to be created */
- d_drop(dentry);
- dput(dentry);
- }
- if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR))
- return NULL;
- if (name.len > NFS_SERVER(dir)->namelen)
- return NULL;
- /* Note: caller is already holding the dir->i_mutex! */
- dentry = d_alloc(parent, &name);
- if (dentry == NULL)
- return NULL;
- dentry->d_op = NFS_PROTO(dir)->dentry_ops;
- inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
- if (IS_ERR(inode)) {
- dput(dentry);
- return NULL;
- }
+ if ((nd->flags & LOOKUP_CREATE) != 0) {
+ open_flags = nd->intent.open.flags;
- alias = d_materialise_unique(dentry, inode);
- if (alias != NULL) {
- dput(dentry);
- if (IS_ERR(alias))
- return NULL;
- dentry = alias;
+ ctx = nameidata_to_nfs_open_context(dentry, nd);
+ error = PTR_ERR(ctx);
+ if (IS_ERR(ctx))
+ goto out_err_drop;
}
-out_renew:
- nfs_set_verifier(dentry, verf);
- return dentry;
+ error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx);
+ if (error != 0)
+ goto out_put_ctx;
+ if (ctx != NULL) {
+ error = nfs_intent_set_file(nd, ctx);
+ if (error < 0)
+ goto out_err;
+ }
+ return 0;
+out_put_ctx:
+ if (ctx != NULL)
+ put_nfs_open_context(ctx);
+out_err_drop:
+ d_drop(dentry);
+out_err:
+ return error;
}
+#endif /* CONFIG_NFSV4 */
+
/*
* Code common to create, mkdir, and mknod.
*/
{
struct iattr attr;
int error;
- int open_flags = 0;
dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
attr.ia_mode = mode;
attr.ia_valid = ATTR_MODE;
- if ((nd->flags & LOOKUP_CREATE) != 0)
- open_flags = nd->intent.open.flags;
-
- error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
+ error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
if (error != 0)
goto out_err;
return 0;
return error;
}
-static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
-{
- static unsigned int sillycounter;
- const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2;
- const int countersize = sizeof(sillycounter)*2;
- const int slen = sizeof(".nfs")+fileidsize+countersize-1;
- char silly[slen+1];
- struct qstr qsilly;
- struct dentry *sdentry;
- int error = -EIO;
-
- dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
- dentry->d_parent->d_name.name, dentry->d_name.name,
- atomic_read(&dentry->d_count));
- nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
-
- /*
- * We don't allow a dentry to be silly-renamed twice.
- */
- error = -EBUSY;
- if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
- goto out;
-
- sprintf(silly, ".nfs%*.*Lx",
- fileidsize, fileidsize,
- (unsigned long long)NFS_FILEID(dentry->d_inode));
-
- /* Return delegation in anticipation of the rename */
- nfs_inode_return_delegation(dentry->d_inode);
-
- sdentry = NULL;
- do {
- char *suffix = silly + slen - countersize;
-
- dput(sdentry);
- sillycounter++;
- sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
-
- dfprintk(VFS, "NFS: trying to rename %s to %s\n",
- dentry->d_name.name, silly);
-
- sdentry = lookup_one_len(silly, dentry->d_parent, slen);
- /*
- * N.B. Better to return EBUSY here ... it could be
- * dangerous to delete the file while it's in use.
- */
- if (IS_ERR(sdentry))
- goto out;
- } while(sdentry->d_inode != NULL); /* need negative lookup */
-
- qsilly.name = silly;
- qsilly.len = strlen(silly);
- if (dentry->d_inode) {
- error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
- dir, &qsilly);
- nfs_mark_for_revalidate(dentry->d_inode);
- } else
- error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
- dir, &qsilly);
- if (!error) {
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
- d_move(dentry, sdentry);
- error = nfs_async_unlink(dir, dentry);
- /* If we return 0 we don't unlink */
- }
- dput(sdentry);
-out:
- return error;
-}
-
/*
* Remove a file after making sure there are no pending writes,
* and after checking that the file has only one user.
d_drop(dentry);
error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
if (error == 0) {
- atomic_inc(&inode->i_count);
+ ihold(inode);
d_add(dentry, inode);
}
return error;
int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
{
LIST_HEAD(head);
- struct nfs_inode *nfsi;
+ struct nfs_inode *nfsi, *next;
struct nfs_access_entry *cache;
if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
return (nr_to_scan == 0) ? 0 : -1;
spin_lock(&nfs_access_lru_lock);
- list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
+ list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
struct inode *inode;
if (nr_to_scan-- == 0)
static int mem_open(struct inode* inode, struct file* file)
{
file->private_data = (void*)((long)current->self_exec_id);
+ /* OK to pass negative loff_t, we can catch out-of-range */
+ file->f_mode |= FMODE_UNSIGNED_OFFSET;
return 0;
}
memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
- if (copy_from_user(buffer, buf, count))
- return -EFAULT;
+ if (copy_from_user(buffer, buf, count)) {
+ err = -EFAULT;
+ goto out;
+ }
err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
if (err)
- return -EINVAL;
+ goto out;
if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
- oom_adjust != OOM_DISABLE)
- return -EINVAL;
+ oom_adjust != OOM_DISABLE) {
+ err = -EINVAL;
+ goto out;
+ }
task = get_proc_task(file->f_path.dentry->d_inode);
- if (!task)
- return -ESRCH;
+ if (!task) {
+ err = -ESRCH;
+ goto out;
+ }
+
+ task_lock(task);
+ if (!task->mm) {
+ err = -EINVAL;
+ goto err_task_lock;
+ }
+
if (!lock_task_sighand(task, &flags)) {
- put_task_struct(task);
- return -ESRCH;
+ err = -ESRCH;
+ goto err_task_lock;
}
if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
- unlock_task_sighand(task, &flags);
- put_task_struct(task);
- return -EACCES;
+ err = -EACCES;
+ goto err_sighand;
+ }
+
+ if (oom_adjust != task->signal->oom_adj) {
+ if (oom_adjust == OOM_DISABLE)
+ atomic_inc(&task->mm->oom_disable_count);
+ if (task->signal->oom_adj == OOM_DISABLE)
+ atomic_dec(&task->mm->oom_disable_count);
}
/*
else
task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
-OOM_DISABLE;
+err_sighand:
unlock_task_sighand(task, &flags);
+err_task_lock:
+ task_unlock(task);
put_task_struct(task);
-
- return count;
+out:
+ return err < 0 ? err : count;
}
static const struct file_operations proc_oom_adjust_operations = {
memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
- if (copy_from_user(buffer, buf, count))
- return -EFAULT;
+ if (copy_from_user(buffer, buf, count)) {
+ err = -EFAULT;
+ goto out;
+ }
err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
if (err)
- return -EINVAL;
+ goto out;
if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
- oom_score_adj > OOM_SCORE_ADJ_MAX)
- return -EINVAL;
+ oom_score_adj > OOM_SCORE_ADJ_MAX) {
+ err = -EINVAL;
+ goto out;
+ }
task = get_proc_task(file->f_path.dentry->d_inode);
- if (!task)
- return -ESRCH;
+ if (!task) {
+ err = -ESRCH;
+ goto out;
+ }
+
+ task_lock(task);
+ if (!task->mm) {
+ err = -EINVAL;
+ goto err_task_lock;
+ }
+
if (!lock_task_sighand(task, &flags)) {
- put_task_struct(task);
- return -ESRCH;
+ err = -ESRCH;
+ goto err_task_lock;
}
+
if (oom_score_adj < task->signal->oom_score_adj &&
!capable(CAP_SYS_RESOURCE)) {
- unlock_task_sighand(task, &flags);
- put_task_struct(task);
- return -EACCES;
+ err = -EACCES;
+ goto err_sighand;
}
+ if (oom_score_adj != task->signal->oom_score_adj) {
+ if (oom_score_adj == OOM_SCORE_ADJ_MIN)
+ atomic_inc(&task->mm->oom_disable_count);
+ if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+ atomic_dec(&task->mm->oom_disable_count);
+ }
task->signal->oom_score_adj = oom_score_adj;
/*
* Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
else
task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
OOM_SCORE_ADJ_MAX;
+err_sighand:
unlock_task_sighand(task, &flags);
+err_task_lock:
+ task_unlock(task);
put_task_struct(task);
- return count;
+out:
+ return err < 0 ? err : count;
}
static const struct file_operations proc_oom_score_adj_operations = {
/* Common stuff */
ei = PROC_I(inode);
+ inode->i_ino = get_next_ino();
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode->i_op = &proc_def_inode_operations;
/* Initialize the inode */
ei = PROC_I(inode);
+ inode->i_ino = get_next_ino();
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
/*
int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to);
- int reiserfs_prepare_write(struct file *f, struct page *page,
- unsigned from, unsigned to);
void reiserfs_evict_inode(struct inode *inode)
{
** but tail is still sitting in a direct item, and we can't write to
** it. So, look through this page, and check all the mapped buffers
** to make sure they have valid block numbers. Any that don't need
- ** to be unmapped, so that block_prepare_write will correctly call
+ ** to be unmapped, so that __block_write_begin will correctly call
** reiserfs_get_block to convert the tail into an unformatted node
*/
static inline void fix_tail_page_for_writing(struct page *page)
}
/* special version of get_block that is only used by grab_tail_page right
- ** now. It is sent to block_prepare_write, and when you try to get a
+ ** now. It is sent to __block_write_begin, and when you try to get a
** block past the end of the file (or a block from a hole) it returns
- ** -ENOENT instead of a valid buffer. block_prepare_write expects to
+ ** -ENOENT instead of a valid buffer. __block_write_begin expects to
** be able to do i/o on the buffers returned, unless an error value
** is also returned.
**
- ** So, this allows block_prepare_write to be used for reading a single block
+ ** So, this allows __block_write_begin to be used for reading a single block
** in a page. Where it does not produce a valid page for holes, or past the
** end of the file. This turns out to be exactly what we need for reading
** tails for conversion.
**
** We must fix the tail page for writing because it might have buffers
** that are mapped, but have a block number of 0. This indicates tail
- ** data that has been read directly into the page, and block_prepare_write
- ** won't trigger a get_block in this case.
+ ** data that has been read directly into the page, and
+ ** __block_write_begin won't trigger a get_block in this case.
*/
fix_tail_page_for_writing(tail_page);
- retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
+ retval = __reiserfs_write_begin(tail_page, tail_start,
+ tail_end - tail_start);
if (retval)
goto unlock;
/* start within the page of the last block in the file */
start = (offset / blocksize) * blocksize;
- error = block_prepare_write(page, start, offset,
+ error = __block_write_begin(page, start, offset - start,
reiserfs_get_block_create_0);
if (error)
goto unlock;
/* from this point on, we know the buffer is mapped to a
* real block and not a direct item
*/
- if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+ if (wbc->sync_mode != WB_SYNC_NONE) {
lock_buffer(bh);
} else {
if (!trylock_buffer(bh)) {
return ret;
}
- int reiserfs_prepare_write(struct file *f, struct page *page,
- unsigned from, unsigned to)
+ int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
{
struct inode *inode = page->mapping->host;
int ret;
th->t_refcount++;
}
- ret = block_prepare_write(page, from, to, reiserfs_get_block);
+ ret = __block_write_begin(page, from, len, reiserfs_get_block);
if (ret && reiserfs_transaction_running(inode->i_sb)) {
struct reiserfs_transaction_handle *th = current->journal_info;
/* this gets a little ugly. If reiserfs_get_block returned an
/* Expect random access pattern */
#define FMODE_RANDOM ((__force fmode_t)0x1000)
+ /* File is huge (eg. /dev/kmem): treat loff_t as unsigned */
+ #define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000)
+
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x1000000)
#define S_NOCMTIME 128 /* Do not update file c/mtime */
#define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */
#define S_PRIVATE 512 /* Inode is fs-internal */
+#define S_IMA 1024 /* Inode has an associated IMA struct */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
#define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME)
#define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE)
#define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE)
+#define IS_IMA(inode) ((inode)->i_flags & S_IMA)
/* the read-only stuff doesn't really belong here, but any other place is
probably as bad and I don't want to create yet another include file. */
struct inode {
struct hlist_node i_hash;
- struct list_head i_list; /* backing dev IO list */
+ struct list_head i_wb_list; /* backing dev IO list */
+ struct list_head i_lru; /* inode LRU list */
struct list_head i_sb_list;
struct list_head i_dentry;
unsigned long i_ino;
unsigned int i_flags;
+#ifdef CONFIG_IMA
+ /* protected by i_lock */
+ unsigned int i_readcount; /* struct files open RO */
+#endif
atomic_t i_writecount;
#ifdef CONFIG_SECURITY
void *i_security;
void *i_private; /* fs or device private pointer */
};
+ static inline int inode_unhashed(struct inode *inode)
+ {
+ return hlist_unhashed(&inode->i_hash);
+ }
+
/*
* inode->i_mutex nesting subclasses for the lock validator:
*
*
* Q: What is the difference between I_WILL_FREE and I_FREEING?
*/
- #define I_DIRTY_SYNC 1
- #define I_DIRTY_DATASYNC 2
- #define I_DIRTY_PAGES 4
+ #define I_DIRTY_SYNC (1 << 0)
+ #define I_DIRTY_DATASYNC (1 << 1)
+ #define I_DIRTY_PAGES (1 << 2)
#define __I_NEW 3
#define I_NEW (1 << __I_NEW)
- #define I_WILL_FREE 16
- #define I_FREEING 32
- #define I_CLEAR 64
+ #define I_WILL_FREE (1 << 4)
+ #define I_FREEING (1 << 5)
+ #define I_CLEAR (1 << 6)
#define __I_SYNC 7
#define I_SYNC (1 << __I_SYNC)
+ #define I_REFERENCED (1 << 8)
#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
}
int sync_inode(struct inode *inode, struct writeback_control *wbc);
+ int sync_inode_metadata(struct inode *inode, int wait);
struct file_system_type {
const char *name;
extern int __invalidate_device(struct block_device *);
extern int invalidate_partition(struct gendisk *, int);
#endif
- extern int invalidate_inodes(struct super_block *);
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end);
extern int inode_init_always(struct super_block *, struct inode *);
extern void inode_init_once(struct inode *);
- extern void inode_add_to_lists(struct super_block *, struct inode *);
+ extern void ihold(struct inode * inode);
extern void iput(struct inode *);
extern struct inode * igrab(struct inode *);
extern ino_t iunique(struct super_block *, ino_t);
extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
extern int insert_inode_locked(struct inode *);
extern void unlock_new_inode(struct inode *);
+ extern unsigned int get_next_ino(void);
extern void __iget(struct inode * inode);
extern void iget_failed(struct inode *);
extern void end_writeback(struct inode *);
- extern void destroy_inode(struct inode *);
extern void __destroy_inode(struct inode *);
extern struct inode *new_inode(struct super_block *);
extern int should_remove_suid(struct dentry *);
extern void __insert_inode_hash(struct inode *, unsigned long hashval);
extern void remove_inode_hash(struct inode *);
- static inline void insert_inode_hash(struct inode *inode) {
+ static inline void insert_inode_hash(struct inode *inode)
+ {
__insert_inode_hash(inode, inode->i_ino);
}
+ extern void inode_sb_list_add(struct inode *inode);
#ifdef CONFIG_BLOCK
extern void submit_bio(int, struct bio *);
struct ctl_table;
int proc_nr_files(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
-
+ int proc_nr_dentry(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+ int proc_nr_inodes(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
int __init get_filesystem_list(char *buf);
#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
struct backing_dev_info;
extern spinlock_t inode_lock;
- extern struct list_head inode_in_use;
- extern struct list_head inode_unused;
/*
* fs/fs-writeback.c
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
void set_page_dirty_balance(struct page *page, int page_mkwrite);
void writeback_set_ratelimit(void);
+void tag_pages_for_writeback(struct address_space *mapping,
+ pgoff_t start, pgoff_t end);
/* pdflush.c */
extern int nr_pdflush_threads; /* Global so it can be exported to sysctl
extern int unaligned_dump_stack;
#endif
-extern struct ratelimit_state printk_ratelimit_state;
-
#ifdef CONFIG_PROC_SYSCTL
static int proc_do_cad_pid(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
.data = &inodes_stat,
.maxlen = 2*sizeof(int),
.mode = 0444,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_nr_inodes,
},
{
.procname = "inode-state",
.data = &inodes_stat,
.maxlen = 7*sizeof(int),
.mode = 0444,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_nr_inodes,
},
{
.procname = "file-nr",
.data = &dentry_stat,
.maxlen = 6*sizeof(int),
.mode = 0444,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_nr_dentry,
},
{
.procname = "overflowuid",
nr_wb = nr_dirty = nr_io = nr_more_io = 0;
spin_lock(&inode_lock);
- list_for_each_entry(inode, &wb->b_dirty, i_list)
+ list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
- list_for_each_entry(inode, &wb->b_io, i_list)
+ list_for_each_entry(inode, &wb->b_io, i_wb_list)
nr_io++;
- list_for_each_entry(inode, &wb->b_more_io, i_list)
+ list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++;
spin_unlock(&inode_lock);
{
struct bdi_writeback *me = ptr;
- current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ current->flags |= PF_SWAPWRITE;
set_freezable();
/*
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
};
+static atomic_t nr_bdi_congested[2];
void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
wait_queue_head_t *wqh = &congestion_wqh[sync];
bit = sync ? BDI_sync_congested : BDI_async_congested;
- clear_bit(bit, &bdi->state);
+ if (test_and_clear_bit(bit, &bdi->state))
+ atomic_dec(&nr_bdi_congested[sync]);
smp_mb__after_clear_bit();
if (waitqueue_active(wqh))
wake_up(wqh);
enum bdi_state bit;
bit = sync ? BDI_sync_congested : BDI_async_congested;
- set_bit(bit, &bdi->state);
+ if (!test_and_set_bit(bit, &bdi->state))
+ atomic_inc(&nr_bdi_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);
long congestion_wait(int sync, long timeout)
{
long ret;
+ unsigned long start = jiffies;
DEFINE_WAIT(wait);
wait_queue_head_t *wqh = &congestion_wqh[sync];
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
ret = io_schedule_timeout(timeout);
finish_wait(wqh, &wait);
+
+ trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
return ret;
}
EXPORT_SYMBOL(congestion_wait);
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * In the event of a congested backing_dev (any backing_dev) and the given
+ * @zone has experienced recent congestion, this waits for up to @timeout
+ * jiffies for either a BDI to exit congestion of the given @sync queue
+ * or a write to complete.
+ *
+ * In the absense of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+ long ret;
+ unsigned long start = jiffies;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+ /*
+ * If there is no congestion, or heavy congestion is not being
+ * encountered in the current zone, yield if necessary instead
+ * of sleeping on the congestion queue
+ */
+ if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+ !zone_is_reclaim_congested(zone)) {
+ cond_resched();
+
+ /* In case we scheduled, work out time remaining */
+ ret = timeout - (jiffies - start);
+ if (ret < 0)
+ ret = 0;
+
+ goto out;
+ }
+
+ /* Sleep until uncongested or a write happens */
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+
+out:
+ trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
+ return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
&socket_file_ops);
if (unlikely(!file)) {
/* drop dentry, keep inode */
- atomic_inc(&path.dentry->d_inode->i_count);
+ ihold(path.dentry->d_inode);
path_put(&path);
put_unused_fd(fd);
return -ENFILE;
sock = SOCKET_I(inode);
kmemcheck_annotate_bitfield(sock, type);
+ inode->i_ino = get_next_ino();
inode->i_mode = S_IFSOCK | S_IRWXUGO;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
}
EXPORT_SYMBOL(sock_wake_async);
-static int __sock_create(struct net *net, int family, int type, int protocol,
+int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
rcu_read_unlock();
goto out_sock_release;
}
+EXPORT_SYMBOL(__sock_create);
int sock_create(int family, int type, int protocol, struct socket **res)
{