Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6

author Linus Torvalds <torvalds@linux-foundation.org>

Wed, 27 Oct 2010 00:58:44 +0000 (17:58 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 27 Oct 2010 00:58:44 +0000 (17:58 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 27 Oct 2010 00:58:44 +0000 (17:58 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 27 Oct 2010 00:58:44 +0000 (17:58 -0700)
diff --combined fs/buffer.c

index 8d595ab2aed1a75bb79a3450ff6205d88dd359ba,d895d9fd5b715ca1405c35cc47d42ec489c09aa5..5930e382959bc504c58bbb428588a372742d4aa4
--- 1/fs/buffer.c
--- 2/fs/buffer.c
+++ b/fs/buffer.c
@@@ -1705,7 -1705,7 +1705,7 @@@ static int __block_write_full_page(stru
                  * and kswapd activity, but those code paths have their own
                  * higher-level throttling.
                  */
- -              if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+ +              if (wbc->sync_mode != WB_SYNC_NONE) {
                         lock_buffer(bh);
                 } else if (!trylock_buffer(bh)) {
                         redirty_page_for_writepage(wbc, page);
@@@ -1833,9 -1833,11 +1833,11 @@@ void page_zero_new_buffers(struct page 
   }
   EXPORT_SYMBOL(page_zero_new_buffers);
   
- int block_prepare_write(struct page *page, unsigned from, unsigned to,
+ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                 get_block_t *get_block)
   {
+       unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+       unsigned to = from + len;
         struct inode *inode = page->mapping->host;
         unsigned block_start, block_end;
         sector_t block;
@@@ -1915,7 -1917,7 +1917,7 @@@
         }
         return err;
   }
- EXPORT_SYMBOL(block_prepare_write);
+ EXPORT_SYMBOL(__block_write_begin);
   
   static int __block_commit_write(struct inode *inode, struct page *page,
                 unsigned from, unsigned to)
@@@ -1952,15 -1954,6 +1954,6 @@@
         return 0;
   }
   
- int __block_write_begin(struct page *page, loff_t pos, unsigned len,
-               get_block_t *get_block)
- {
-       unsigned start = pos & (PAGE_CACHE_SIZE - 1);
- 
-       return block_prepare_write(page, start, start + len, get_block);
- }
- EXPORT_SYMBOL(__block_write_begin);
- 
   /*
    * block_write_begin takes care of the basic task of block allocation and
    * bringing partial write blocks uptodate first.
@@@ -2378,7 -2371,7 +2371,7 @@@ block_page_mkwrite(struct vm_area_struc
         else
                 end = PAGE_CACHE_SIZE;
   
-       ret = block_prepare_write(page, 0, end, get_block);
+       ret = __block_write_begin(page, 0, end, get_block);
         if (!ret)
                 ret = block_commit_write(page, 0, end);
   
@@@ -2465,11 -2458,10 +2458,10 @@@ int nobh_write_begin(struct address_spa
         *fsdata = NULL;
   
         if (page_has_buffers(page)) {
-               unlock_page(page);
-               page_cache_release(page);
-               *pagep = NULL;
-               return block_write_begin(mapping, pos, len, flags, pagep,
-                                        get_block);
+               ret = __block_write_begin(page, pos, len, get_block);
+               if (unlikely(ret))
+                       goto out_release;
+               return ret;
         }
   
         if (PageMappedToDisk(page))
diff --combined fs/fs-writeback.c

index 9e46aec10d1aaaec3a4f0fe968ebc99412f09713,f6af81add4591420a9e9a011fac63584fa62d938..aed881a76b229602ece291fb8c55fbd5ec45ee90
--- 1/fs/fs-writeback.c
--- 2/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@@ -79,6 -79,11 +79,11 @@@ static inline struct backing_dev_info *
         return sb->s_bdi;
   }
   
+ static inline struct inode *wb_inode(struct list_head *head)
+ {
+       return list_entry(head, struct inode, i_wb_list);
+ }
+ 
   static void bdi_queue_work(struct backing_dev_info *bdi,
                 struct wb_writeback_work *work)
   {
@@@ -172,11 -177,11 +177,11 @@@ static void redirty_tail(struct inode *
         if (!list_empty(&wb->b_dirty)) {
                 struct inode *tail;
   
-               tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+               tail = wb_inode(wb->b_dirty.next);
                 if (time_before(inode->dirtied_when, tail->dirtied_when))
                         inode->dirtied_when = jiffies;
         }
-       list_move(&inode->i_list, &wb->b_dirty);
+       list_move(&inode->i_wb_list, &wb->b_dirty);
   }
   
   /*
@@@ -186,7 -191,7 +191,7 @@@ static void requeue_io(struct inode *in
   {
         struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
   
-       list_move(&inode->i_list, &wb->b_more_io);
+       list_move(&inode->i_wb_list, &wb->b_more_io);
   }
   
   static void inode_sync_complete(struct inode *inode)
@@@ -227,14 -232,14 +232,14 @@@ static void move_expired_inodes(struct 
         int do_sb_sort = 0;
   
         while (!list_empty(delaying_queue)) {
-               inode = list_entry(delaying_queue->prev, struct inode, i_list);
+               inode = wb_inode(delaying_queue->prev);
                 if (older_than_this &&
                     inode_dirtied_after(inode, *older_than_this))
                         break;
                 if (sb && sb != inode->i_sb)
                         do_sb_sort = 1;
                 sb = inode->i_sb;
-               list_move(&inode->i_list, &tmp);
+               list_move(&inode->i_wb_list, &tmp);
         }
   
         /* just one sb in list, splice to dispatch_queue and we're done */
@@@ -245,12 -250,11 +250,11 @@@
   
         /* Move inodes from one superblock together */
         while (!list_empty(&tmp)) {
-               inode = list_entry(tmp.prev, struct inode, i_list);
-               sb = inode->i_sb;
+               sb = wb_inode(tmp.prev)->i_sb;
                 list_for_each_prev_safe(pos, node, &tmp) {
-                       inode = list_entry(pos, struct inode, i_list);
+                       inode = wb_inode(pos);
                         if (inode->i_sb == sb)
-                               list_move(&inode->i_list, dispatch_queue);
+                               list_move(&inode->i_wb_list, dispatch_queue);
                 }
         }
   }
@@@ -408,16 -412,13 +412,13 @@@ writeback_single_inode(struct inode *in
                          * completion.
                          */
                         redirty_tail(inode);
-               } else if (atomic_read(&inode->i_count)) {
-                       /*
-                        * The inode is clean, inuse
-                        */
-                       list_move(&inode->i_list, &inode_in_use);
                 } else {
                         /*
-                        * The inode is clean, unused
+                        * The inode is clean.  At this point we either have
+                        * a reference to the inode or it's on its way out.
+                        * No need to add it back to the LRU.
                          */
-                       list_move(&inode->i_list, &inode_unused);
+                       list_del_init(&inode->i_wb_list);
                 }
         }
         inode_sync_complete(inode);
@@@ -465,8 -466,7 +466,7 @@@ static int writeback_sb_inodes(struct s
   {
         while (!list_empty(&wb->b_io)) {
                 long pages_skipped;
-               struct inode *inode = list_entry(wb->b_io.prev,
-                                                struct inode, i_list);
+               struct inode *inode = wb_inode(wb->b_io.prev);
   
                 if (inode->i_sb != sb) {
                         if (only_this_sb) {
@@@ -487,10 -487,16 +487,16 @@@
                         return 0;
                 }
   
-               if (inode->i_state & (I_NEW | I_WILL_FREE)) {
+               /*
+                * Don't bother with new inodes or inodes being freed, first
+                * kind does not need periodic writeout yet, and for the latter
+                * kind writeout is handled by the freer.
+                */
+               if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                         requeue_io(inode);
                         continue;
                 }
+ 
                 /*
                  * Was this inode dirtied after sync_sb_inodes was called?
                  * This keeps sync from extra jobs and livelock.
@@@ -498,7 -504,6 +504,6 @@@
                 if (inode_dirtied_after(inode, wbc->wb_start))
                         return 1;
   
-               BUG_ON(inode->i_state & I_FREEING);
                 __iget(inode);
                 pages_skipped = wbc->pages_skipped;
                 writeback_single_inode(inode, wbc);
@@@ -536,8 -541,7 +541,7 @@@ void writeback_inodes_wb(struct bdi_wri
                 queue_io(wb, wbc->older_than_this);
   
         while (!list_empty(&wb->b_io)) {
-               struct inode *inode = list_entry(wb->b_io.prev,
-                                                struct inode, i_list);
+               struct inode *inode = wb_inode(wb->b_io.prev);
                 struct super_block *sb = inode->i_sb;
   
                 if (!pin_sb_for_writeback(sb)) {
@@@ -582,7 -586,7 +586,7 @@@ static inline bool over_bground_thresh(
         global_dirty_limits(&background_thresh, &dirty_thresh);
   
         return (global_page_state(NR_FILE_DIRTY) +
- -              global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+ +              global_page_state(NR_UNSTABLE_NFS) > background_thresh);
   }
   
   /*
@@@ -675,8 -679,7 +679,7 @@@ static long wb_writeback(struct bdi_wri
                  */
                 spin_lock(&inode_lock);
                 if (!list_empty(&wb->b_more_io))  {
-                       inode = list_entry(wb->b_more_io.prev,
-                                               struct inode, i_list);
+                       inode = wb_inode(wb->b_more_io.prev);
                         trace_wbc_writeback_wait(&wbc, wb->bdi);
                         inode_wait_for_writeback(inode);
                 }
@@@ -721,13 -724,9 +724,13 @@@ static long wb_check_old_data_flush(str
                 return 0;
   
         wb->last_old_flush = jiffies;
+ +      /*
+ +       * Add in the number of potentially dirty inodes, because each inode
+ +       * write can dirty pagecache in the underlying blockdev.
+ +       */
         nr_pages = global_page_state(NR_FILE_DIRTY) +
                         global_page_state(NR_UNSTABLE_NFS) +
-                       (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+                       get_nr_dirty_inodes();
   
         if (nr_pages) {
                 struct wb_writeback_work work = {
@@@ -794,7 -793,7 +797,7 @@@ int bdi_writeback_thread(void *data
         struct backing_dev_info *bdi = wb->bdi;
         long pages_written;
   
- -      current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ +      current->flags |= PF_SWAPWRITE;
         set_freezable();
         wb->last_active = jiffies;
   
@@@ -966,7 -965,7 +969,7 @@@ void __mark_inode_dirty(struct inode *i
                  * dirty list.  Add blockdev inodes as well.
                  */
                 if (!S_ISBLK(inode->i_mode)) {
-                       if (hlist_unhashed(&inode->i_hash))
+                       if (inode_unhashed(inode))
                                 goto out;
                 }
                 if (inode->i_state & I_FREEING)
@@@ -994,7 -993,7 +997,7 @@@
                         }
   
                         inode->dirtied_when = jiffies;
-                       list_move(&inode->i_list, &bdi->wb.b_dirty);
+                       list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
                 }
         }
   out:
@@@ -1094,8 -1093,7 +1097,7 @@@ void writeback_inodes_sb(struct super_b
   
         WARN_ON(!rwsem_is_locked(&sb->s_umount));
   
-       work.nr_pages = nr_dirty + nr_unstable +
-                       (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+       work.nr_pages = nr_dirty + nr_unstable + get_nr_dirty_inodes();
   
         bdi_queue_work(sb->s_bdi, &work);
         wait_for_completion(&done);
@@@ -1202,3 -1200,23 +1204,23 @@@ int sync_inode(struct inode *inode, str
         return ret;
   }
   EXPORT_SYMBOL(sync_inode);
+ 
+ /**
+  * sync_inode_metadata - write an inode to disk
+  * @inode: the inode to sync
+  * @wait: wait for I/O to complete.
+  *
+  * Write an inode to disk and adjust its dirty state after completion.
+  *
+  * Note: only writes the actual inode, no associated data or other metadata.
+  */
+ int sync_inode_metadata(struct inode *inode, int wait)
+ {
+       struct writeback_control wbc = {
+               .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+               .nr_to_write = 0, /* metadata-only */
+       };
+ 
+       return sync_inode(inode, &wbc);
+ }
+ EXPORT_SYMBOL(sync_inode_metadata);
diff --combined fs/hugetlbfs/inode.c

index a14328d270e855a4d16d3772b9ad464bab296bdc,8d0607b3726618f9f6ca58506c17670e8e5ca879..b14be3f781c714129d4fafbd17a7ff9874193b3f
--- 1/fs/hugetlbfs/inode.c
--- 2/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@@ -31,7 -31,6 +31,7 @@@
   #include <linux/statfs.h>
   #include <linux/security.h>
   #include <linux/magic.h>
+ +#include <linux/migrate.h>
   
   #include <asm/uaccess.h>
   
@@@ -456,6 -455,7 +456,7 @@@ static struct inode *hugetlbfs_get_inod
         inode = new_inode(sb);
         if (inode) {
                 struct hugetlbfs_inode_info *info;
+               inode->i_ino = get_next_ino();
                 inode->i_mode = mode;
                 inode->i_uid = uid;
                 inode->i_gid = gid;
@@@ -574,19 -574,6 +575,19 @@@ static int hugetlbfs_set_page_dirty(str
         return 0;
   }
   
+ +static int hugetlbfs_migrate_page(struct address_space *mapping,
+ +                              struct page *newpage, struct page *page)
+ +{
+ +      int rc;
+ +
+ +      rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+ +      if (rc)
+ +              return rc;
+ +      migrate_page_copy(newpage, page);
+ +
+ +      return 0;
+ +}
+ +
   static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
   {
         struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@@ -673,7 -660,6 +674,7 @@@ static const struct address_space_opera
         .write_begin    = hugetlbfs_write_begin,
         .write_end      = hugetlbfs_write_end,
         .set_page_dirty = hugetlbfs_set_page_dirty,
+ +      .migratepage    = hugetlbfs_migrate_page,
   };
   
   
diff --combined fs/inode.c

index 56d909d69bc88cda72bb70a528fbf0c70e877d0d,a6d60682f0fd861757df6991ff8038c8ee38b7f4..ae2727ab0c3ab7695b14f256da0bccdfd3668d81
--- 1/fs/inode.c
--- 2/fs/inode.c
+++ b/fs/inode.c
@@@ -24,12 -24,10 +24,11 @@@
   #include <linux/mount.h>
   #include <linux/async.h>
   #include <linux/posix_acl.h>
+ +#include <linux/ima.h>
   
   /*
    * This is needed for the following functions:
    *  - inode_has_buffers
-  *  - invalidate_inode_buffers
    *  - invalidate_bdev
    *
    * FIXME: remove all knowledge of the buffer layer from this file
@@@ -73,8 -71,7 +72,7 @@@ static unsigned int i_hash_shift __read
    * allowing for low-overhead inode sync() operations.
    */
   
- LIST_HEAD(inode_in_use);
- LIST_HEAD(inode_unused);
+ static LIST_HEAD(inode_lru);
   static struct hlist_head *inode_hashtable __read_mostly;
   
   /*
@@@ -104,8 -101,41 +102,41 @@@ static DECLARE_RWSEM(iprune_sem)
    */
   struct inodes_stat_t inodes_stat;
   
+ static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
+ static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
+ 
   static struct kmem_cache *inode_cachep __read_mostly;
   
+ static inline int get_nr_inodes(void)
+ {
+       return percpu_counter_sum_positive(&nr_inodes);
+ }
+ 
+ static inline int get_nr_inodes_unused(void)
+ {
+       return percpu_counter_sum_positive(&nr_inodes_unused);
+ }
+ 
+ int get_nr_dirty_inodes(void)
+ {
+       int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
+       return nr_dirty > 0 ? nr_dirty : 0;
+ 
+ }
+ 
+ /*
+  * Handle nr_inode sysctl
+  */
+ #ifdef CONFIG_SYSCTL
+ int proc_nr_inodes(ctl_table *table, int write,
+                  void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+       inodes_stat.nr_inodes = get_nr_inodes();
+       inodes_stat.nr_unused = get_nr_inodes_unused();
+       return proc_dointvec(table, write, buffer, lenp, ppos);
+ }
+ #endif
+ 
   static void wake_up_inode(struct inode *inode)
   {
         /*
@@@ -193,6 -223,8 +224,8 @@@ int inode_init_always(struct super_bloc
         inode->i_fsnotify_mask = 0;
   #endif
   
+       percpu_counter_inc(&nr_inodes);
+ 
         return 0;
   out:
         return -ENOMEM;
@@@ -233,11 -265,13 +266,13 @@@ void __destroy_inode(struct inode *inod
         if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
                 posix_acl_release(inode->i_default_acl);
   #endif
+       percpu_counter_dec(&nr_inodes);
   }
   EXPORT_SYMBOL(__destroy_inode);
   
- void destroy_inode(struct inode *inode)
+ static void destroy_inode(struct inode *inode)
   {
+       BUG_ON(!list_empty(&inode->i_lru));
         __destroy_inode(inode);
         if (inode->i_sb->s_op->destroy_inode)
                 inode->i_sb->s_op->destroy_inode(inode);
@@@ -256,6 -290,8 +291,8 @@@ void inode_init_once(struct inode *inod
         INIT_HLIST_NODE(&inode->i_hash);
         INIT_LIST_HEAD(&inode->i_dentry);
         INIT_LIST_HEAD(&inode->i_devices);
+       INIT_LIST_HEAD(&inode->i_wb_list);
+       INIT_LIST_HEAD(&inode->i_lru);
         INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
         spin_lock_init(&inode->i_data.tree_lock);
         spin_lock_init(&inode->i_data.i_mmap_lock);
@@@ -282,14 -318,109 +319,109 @@@ static void init_once(void *foo
    */
   void __iget(struct inode *inode)
   {
-       if (atomic_inc_return(&inode->i_count) != 1)
-               return;
+       atomic_inc(&inode->i_count);
+ }
+ 
+ /*
+  * get additional reference to inode; caller must already hold one.
+  */
+ void ihold(struct inode *inode)
+ {
+       WARN_ON(atomic_inc_return(&inode->i_count) < 2);
+ }
+ EXPORT_SYMBOL(ihold);
+ 
+ static void inode_lru_list_add(struct inode *inode)
+ {
+       if (list_empty(&inode->i_lru)) {
+               list_add(&inode->i_lru, &inode_lru);
+               percpu_counter_inc(&nr_inodes_unused);
+       }
+ }
   
-       if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-               list_move(&inode->i_list, &inode_in_use);
-       inodes_stat.nr_unused--;
+ static void inode_lru_list_del(struct inode *inode)
+ {
+       if (!list_empty(&inode->i_lru)) {
+               list_del_init(&inode->i_lru);
+               percpu_counter_dec(&nr_inodes_unused);
+       }
+ }
+ 
+ static inline void __inode_sb_list_add(struct inode *inode)
+ {
+       list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
   }
   
+ /**
+  * inode_sb_list_add - add inode to the superblock list of inodes
+  * @inode: inode to add
+  */
+ void inode_sb_list_add(struct inode *inode)
+ {
+       spin_lock(&inode_lock);
+       __inode_sb_list_add(inode);
+       spin_unlock(&inode_lock);
+ }
+ EXPORT_SYMBOL_GPL(inode_sb_list_add);
+ 
+ static inline void __inode_sb_list_del(struct inode *inode)
+ {
+       list_del_init(&inode->i_sb_list);
+ }
+ 
+ static unsigned long hash(struct super_block *sb, unsigned long hashval)
+ {
+       unsigned long tmp;
+ 
+       tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+                       L1_CACHE_BYTES;
+       tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
+       return tmp & I_HASHMASK;
+ }
+ 
+ /**
+  *    __insert_inode_hash - hash an inode
+  *    @inode: unhashed inode
+  *    @hashval: unsigned long value used to locate this object in the
+  *            inode_hashtable.
+  *
+  *    Add an inode to the inode hash for this superblock.
+  */
+ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
+ {
+       struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
+ 
+       spin_lock(&inode_lock);
+       hlist_add_head(&inode->i_hash, b);
+       spin_unlock(&inode_lock);
+ }
+ EXPORT_SYMBOL(__insert_inode_hash);
+ 
+ /**
+  *    __remove_inode_hash - remove an inode from the hash
+  *    @inode: inode to unhash
+  *
+  *    Remove an inode from the superblock.
+  */
+ static void __remove_inode_hash(struct inode *inode)
+ {
+       hlist_del_init(&inode->i_hash);
+ }
+ 
+ /**
+  *    remove_inode_hash - remove an inode from the hash
+  *    @inode: inode to unhash
+  *
+  *    Remove an inode from the superblock.
+  */
+ void remove_inode_hash(struct inode *inode)
+ {
+       spin_lock(&inode_lock);
+       hlist_del_init(&inode->i_hash);
+       spin_unlock(&inode_lock);
+ }
+ EXPORT_SYMBOL(remove_inode_hash);
+ 
   void end_writeback(struct inode *inode)
   {
         might_sleep();
@@@ -328,101 -459,113 +460,113 @@@ static void evict(struct inode *inode
    */
   static void dispose_list(struct list_head *head)
   {
-       int nr_disposed = 0;
- 
         while (!list_empty(head)) {
                 struct inode *inode;
   
-               inode = list_first_entry(head, struct inode, i_list);
-               list_del(&inode->i_list);
+               inode = list_first_entry(head, struct inode, i_lru);
+               list_del_init(&inode->i_lru);
   
                 evict(inode);
   
                 spin_lock(&inode_lock);
-               hlist_del_init(&inode->i_hash);
-               list_del_init(&inode->i_sb_list);
+               __remove_inode_hash(inode);
+               __inode_sb_list_del(inode);
                 spin_unlock(&inode_lock);
   
                 wake_up_inode(inode);
                 destroy_inode(inode);
-               nr_disposed++;
         }
-       spin_lock(&inode_lock);
-       inodes_stat.nr_inodes -= nr_disposed;
-       spin_unlock(&inode_lock);
   }
   
- /*
-  * Invalidate all inodes for a device.
+ /**
+  * evict_inodes       - evict all evictable inodes for a superblock
+  * @sb:               superblock to operate on
+  *
+  * Make sure that no inodes with zero refcount are retained.  This is
+  * called by superblock shutdown after having MS_ACTIVE flag removed,
+  * so any inode reaching zero refcount during or after that call will
+  * be immediately evicted.
    */
- static int invalidate_list(struct list_head *head, struct list_head *dispose)
+ void evict_inodes(struct super_block *sb)
   {
-       struct list_head *next;
-       int busy = 0, count = 0;
- 
-       next = head->next;
-       for (;;) {
-               struct list_head *tmp = next;
-               struct inode *inode;
+       struct inode *inode, *next;
+       LIST_HEAD(dispose);
   
-               /*
-                * We can reschedule here without worrying about the list's
-                * consistency because the per-sb list of inodes must not
-                * change during umount anymore, and because iprune_sem keeps
-                * shrink_icache_memory() away.
-                */
-               cond_resched_lock(&inode_lock);
+       down_write(&iprune_sem);
   
-               next = next->next;
-               if (tmp == head)
-                       break;
-               inode = list_entry(tmp, struct inode, i_sb_list);
-               if (inode->i_state & I_NEW)
+       spin_lock(&inode_lock);
+       list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+               if (atomic_read(&inode->i_count))
                         continue;
-               invalidate_inode_buffers(inode);
-               if (!atomic_read(&inode->i_count)) {
-                       list_move(&inode->i_list, dispose);
-                       WARN_ON(inode->i_state & I_NEW);
-                       inode->i_state |= I_FREEING;
-                       count++;
+ 
+               if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+                       WARN_ON(1);
                         continue;
                 }
-               busy = 1;
+ 
+               inode->i_state |= I_FREEING;
+ 
+               /*
+                * Move the inode off the IO lists and LRU once I_FREEING is
+                * set so that it won't get moved back on there if it is dirty.
+                */
+               list_move(&inode->i_lru, &dispose);
+               list_del_init(&inode->i_wb_list);
+               if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+                       percpu_counter_dec(&nr_inodes_unused);
         }
-       /* only unused inodes may be cached with i_count zero */
-       inodes_stat.nr_unused -= count;
-       return busy;
+       spin_unlock(&inode_lock);
+ 
+       dispose_list(&dispose);
+       up_write(&iprune_sem);
   }
   
   /**
-  *    invalidate_inodes       - discard the inodes on a device
-  *    @sb: superblock
+  * invalidate_inodes  - attempt to free all inodes on a superblock
+  * @sb:               superblock to operate on
    *
-  *    Discard all of the inodes for a given superblock. If the discard
-  *    fails because there are busy inodes then a non zero value is returned.
-  *    If the discard is successful all the inodes have been discarded.
+  * Attempts to free all inodes for a given superblock.  If there were any
+  * busy inodes return a non-zero value, else zero.
    */
   int invalidate_inodes(struct super_block *sb)
   {
-       int busy;
-       LIST_HEAD(throw_away);
+       int busy = 0;
+       struct inode *inode, *next;
+       LIST_HEAD(dispose);
   
         down_write(&iprune_sem);
+ 
         spin_lock(&inode_lock);
-       fsnotify_unmount_inodes(&sb->s_inodes);
-       busy = invalidate_list(&sb->s_inodes, &throw_away);
+       list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+               if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
+                       continue;
+               if (atomic_read(&inode->i_count)) {
+                       busy = 1;
+                       continue;
+               }
+ 
+               inode->i_state |= I_FREEING;
+ 
+               /*
+                * Move the inode off the IO lists and LRU once I_FREEING is
+                * set so that it won't get moved back on there if it is dirty.
+                */
+               list_move(&inode->i_lru, &dispose);
+               list_del_init(&inode->i_wb_list);
+               if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+                       percpu_counter_dec(&nr_inodes_unused);
+       }
         spin_unlock(&inode_lock);
   
-       dispose_list(&throw_away);
+       dispose_list(&dispose);
         up_write(&iprune_sem);
   
         return busy;
   }
- EXPORT_SYMBOL(invalidate_inodes);
   
   static int can_unuse(struct inode *inode)
   {
-       if (inode->i_state)
+       if (inode->i_state & ~I_REFERENCED)
                 return 0;
         if (inode_has_buffers(inode))
                 return 0;
@@@ -434,22 -577,24 +578,24 @@@
   }
   
   /*
-  * Scan `goal' inodes on the unused list for freeable ones. They are moved to
-  * a temporary list and then are freed outside inode_lock by dispose_list().
+  * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
+  * temporary list and then are freed outside inode_lock by dispose_list().
    *
    * Any inodes which are pinned purely because of attached pagecache have their
-  * pagecache removed.  We expect the final iput() on that inode to add it to
-  * the front of the inode_unused list.  So look for it there and if the
-  * inode is still freeable, proceed.  The right inode is found 99.9% of the
-  * time in testing on a 4-way.
+  * pagecache removed.  If the inode has metadata buffers attached to
+  * mapping->private_list then try to remove them.
    *
-  * If the inode has metadata buffers attached to mapping->private_list then
-  * try to remove them.
+  * If the inode has the I_REFERENCED flag set, then it means that it has been
+  * used recently - the flag is set in iput_final(). When we encounter such an
+  * inode, clear the flag and move it to the back of the LRU so it gets another
+  * pass through the LRU before it gets reclaimed. This is necessary because of
+  * the fact we are doing lazy LRU updates to minimise lock contention so the
+  * LRU does not have strict ordering. Hence we don't want to reclaim inodes
+  * with this flag set because they are the inodes that are out of order.
    */
   static void prune_icache(int nr_to_scan)
   {
         LIST_HEAD(freeable);
-       int nr_pruned = 0;
         int nr_scanned;
         unsigned long reap = 0;
   
@@@ -458,13 -603,26 +604,26 @@@
         for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
                 struct inode *inode;
   
-               if (list_empty(&inode_unused))
+               if (list_empty(&inode_lru))
                         break;
   
-               inode = list_entry(inode_unused.prev, struct inode, i_list);
+               inode = list_entry(inode_lru.prev, struct inode, i_lru);
   
-               if (inode->i_state || atomic_read(&inode->i_count)) {
-                       list_move(&inode->i_list, &inode_unused);
+               /*
+                * Referenced or dirty inodes are still in use. Give them
+                * another pass through the LRU as we cannot reclaim them now.
+                */
+               if (atomic_read(&inode->i_count) ||
+                   (inode->i_state & ~I_REFERENCED)) {
+                       list_del_init(&inode->i_lru);
+                       percpu_counter_dec(&nr_inodes_unused);
+                       continue;
+               }
+ 
+               /* recently referenced inodes get one more pass */
+               if (inode->i_state & I_REFERENCED) {
+                       list_move(&inode->i_lru, &inode_lru);
+                       inode->i_state &= ~I_REFERENCED;
                         continue;
                 }
                 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
@@@ -476,18 -634,23 +635,23 @@@
                         iput(inode);
                         spin_lock(&inode_lock);
   
-                       if (inode != list_entry(inode_unused.next,
-                                               struct inode, i_list))
+                       if (inode != list_entry(inode_lru.next,
+                                               struct inode, i_lru))
                                 continue;       /* wrong inode or list_empty */
                         if (!can_unuse(inode))
                                 continue;
                 }
-               list_move(&inode->i_list, &freeable);
                 WARN_ON(inode->i_state & I_NEW);
                 inode->i_state |= I_FREEING;
-               nr_pruned++;
+ 
+               /*
+                * Move the inode off the IO lists and LRU once I_FREEING is
+                * set so that it won't get moved back on there if it is dirty.
+                */
+               list_move(&inode->i_lru, &freeable);
+               list_del_init(&inode->i_wb_list);
+               percpu_counter_dec(&nr_inodes_unused);
         }
-       inodes_stat.nr_unused -= nr_pruned;
         if (current_is_kswapd())
                 __count_vm_events(KSWAPD_INODESTEAL, reap);
         else
@@@ -519,7 -682,7 +683,7 @@@ static int shrink_icache_memory(struct 
                         return -1;
                 prune_icache(nr);
         }
-       return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+       return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
   }
   
   static struct shrinker icache_shrinker = {
@@@ -530,9 -693,6 +694,6 @@@
   static void __wait_on_freeing_inode(struct inode *inode);
   /*
    * Called with the inode lock held.
-  * NOTE: we are not increasing the inode-refcount, you must call __iget()
-  * by hand after calling find_inode now! This simplifies iunique and won't
-  * add any additional branch in the common code.
    */
   static struct inode *find_inode(struct super_block *sb,
                                 struct hlist_head *head,
@@@ -552,9 -712,10 +713,10 @@@ repeat
                         __wait_on_freeing_inode(inode);
                         goto repeat;
                 }
-               break;
+               __iget(inode);
+               return inode;
         }
-       return node ? inode : NULL;
+       return NULL;
   }
   
   /*
@@@ -577,53 -738,49 +739,49 @@@ repeat
                         __wait_on_freeing_inode(inode);
                         goto repeat;
                 }
-               break;
+               __iget(inode);
+               return inode;
         }
-       return node ? inode : NULL;
- }
- 
- static unsigned long hash(struct super_block *sb, unsigned long hashval)
- {
-       unsigned long tmp;
- 
-       tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
-                       L1_CACHE_BYTES;
-       tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
-       return tmp & I_HASHMASK;
- }
- 
- static inline void
- __inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
-                       struct inode *inode)
- {
-       inodes_stat.nr_inodes++;
-       list_add(&inode->i_list, &inode_in_use);
-       list_add(&inode->i_sb_list, &sb->s_inodes);
-       if (head)
-               hlist_add_head(&inode->i_hash, head);
+       return NULL;
   }
   
- /**
-  * inode_add_to_lists - add a new inode to relevant lists
-  * @sb: superblock inode belongs to
-  * @inode: inode to mark in use
+ /*
+  * Each cpu owns a range of LAST_INO_BATCH numbers.
+  * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
+  * to renew the exhausted range.
   *
-  * When an inode is allocated it needs to be accounted for, added to the in use
-  * list, the owning superblock and the inode hash. This needs to be done under
-  * the inode_lock, so export a function to do this rather than the inode lock
-  * itself. We calculate the hash list to add to here so it is all internal
-  * which requires the caller to have already set up the inode number in the
-  * inode to add.
+  * This does not significantly increase overflow rate because every CPU can
+  * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
+  * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
+  * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
+  * overflow rate by 2x, which does not seem too significant.
+  *
+  * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
+  * error if st_ino won't fit in target struct field. Use 32bit counter
+  * here to attempt to avoid that.
+  *
+  * get_next_ino() increments the calling CPU's 'last_ino' and returns the new
+  * value.  Under CONFIG_SMP, when the per-cpu value is a multiple of
+  * LAST_INO_BATCH (including the initial 0), a new range is first claimed
+  * from 'shared_last_ino' with atomic_add_return().
   */
- void inode_add_to_lists(struct super_block *sb, struct inode *inode)
+ #define LAST_INO_BATCH 1024
+ static DEFINE_PER_CPU(unsigned int, last_ino);
+ 
+ unsigned int get_next_ino(void)
   {
-       struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino);
+       unsigned int *p = &get_cpu_var(last_ino);       /* paired with put_cpu_var() below */
+       unsigned int res = *p;
   
-       spin_lock(&inode_lock);
-       __inode_add_to_lists(sb, head, inode);
-       spin_unlock(&inode_lock);
+ #ifdef CONFIG_SMP
+       if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
+               static atomic_t shared_last_ino;
+               int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
+ 
+               res = next - LAST_INO_BATCH;    /* first value of the range just claimed */
+       }
+ #endif
+ 
+       *p = ++res;     /* store the number being handed out */
+       put_cpu_var(last_ino);
+       return res;
   }
- EXPORT_SYMBOL_GPL(inode_add_to_lists);
+ EXPORT_SYMBOL(get_next_ino);
   
   /**
    *    new_inode       - obtain an inode
@@@ -639,12 -796,6 +797,6 @@@
    */
   struct inode *new_inode(struct super_block *sb)
   {
-       /*
-        * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
-        * error if st_ino won't fit in target struct field. Use 32bit counter
-        * here to attempt to avoid that.
-        */
-       static unsigned int last_ino;
         struct inode *inode;
   
         spin_lock_prefetch(&inode_lock);
@@@ -652,8 -803,7 +804,7 @@@
         inode = alloc_inode(sb);
         if (inode) {
                 spin_lock(&inode_lock);
-               __inode_add_to_lists(sb, NULL, inode);
-               inode->i_ino = ++last_ino;
+               __inode_sb_list_add(inode);
                 inode->i_state = 0;
                 spin_unlock(&inode_lock);
         }
@@@ -664,7 -814,7 +815,7 @@@ EXPORT_SYMBOL(new_inode)
   void unlock_new_inode(struct inode *inode)
   {
   #ifdef CONFIG_DEBUG_LOCK_ALLOC
-       if (inode->i_mode & S_IFDIR) {
+       if (S_ISDIR(inode->i_mode)) {
                 struct file_system_type *type = inode->i_sb->s_type;
   
                 /* Set new key only if filesystem hasn't already changed it */
@@@ -721,7 -871,8 +872,8 @@@ static struct inode *get_new_inode(stru
                         if (set(inode, data))
                                 goto set_failed;
   
-                       __inode_add_to_lists(sb, head, inode);
+                       hlist_add_head(&inode->i_hash, head);
+                       __inode_sb_list_add(inode);
                         inode->i_state = I_NEW;
                         spin_unlock(&inode_lock);
   
@@@ -736,7 -887,6 +888,6 @@@
                  * us. Use the old inode instead of the one we just
                  * allocated.
                  */
-               __iget(old);
                 spin_unlock(&inode_lock);
                 destroy_inode(inode);
                 inode = old;
@@@ -768,7 -918,8 +919,8 @@@ static struct inode *get_new_inode_fast
                 old = find_inode_fast(sb, head, ino);
                 if (!old) {
                         inode->i_ino = ino;
-                       __inode_add_to_lists(sb, head, inode);
+                       hlist_add_head(&inode->i_hash, head);
+                       __inode_sb_list_add(inode);
                         inode->i_state = I_NEW;
                         spin_unlock(&inode_lock);
   
@@@ -783,7 -934,6 +935,6 @@@
                  * us. Use the old inode instead of the one we just
                  * allocated.
                  */
-               __iget(old);
                 spin_unlock(&inode_lock);
                 destroy_inode(inode);
                 inode = old;
@@@ -792,6 -942,27 +943,27 @@@
         return inode;
   }
   
+ /*
+  * Search the inode cache for a matching inode number.
+  * If we find one, then the inode number we are trying to
+  * allocate is not unique and so we should not use it.
+  *
+  * Only the hash chain selected by hash(sb, ino) is walked, and an inode
+  * matches when both its i_ino and its i_sb are equal to the arguments.
+  * No reference is taken on a matching inode.  The caller visible in this
+  * file, iunique(), calls this with inode_lock held.
+  *
+  * Returns 1 if the inode number is unique, 0 if it is not.
+  */
+ static int test_inode_iunique(struct super_block *sb, unsigned long ino)
+ {
+       struct hlist_head *b = inode_hashtable + hash(sb, ino);
+       struct hlist_node *node;
+       struct inode *inode;
+ 
+       hlist_for_each_entry(inode, node, b, i_hash) {
+               if (inode->i_ino == ino && inode->i_sb == sb)
+                       return 0;
+       }
+ 
+       return 1;
+ }
+ 
   /**
    *    iunique - get a unique inode number
    *    @sb: superblock
@@@ -813,19 -984,18 +985,18 @@@ ino_t iunique(struct super_block *sb, i
          * error if st_ino won't fit in target struct field. Use 32bit counter
          * here to attempt to avoid that.
          */
+       static DEFINE_SPINLOCK(iunique_lock);
         static unsigned int counter;
-       struct inode *inode;
-       struct hlist_head *head;
         ino_t res;
   
         spin_lock(&inode_lock);
+       spin_lock(&iunique_lock);
         do {
                 if (counter <= max_reserved)
                         counter = max_reserved + 1;
                 res = counter++;
-               head = inode_hashtable + hash(sb, res);
-               inode = find_inode_fast(sb, head, res);
-       } while (inode != NULL);
+       } while (!test_inode_iunique(sb, res));
+       spin_unlock(&iunique_lock);
         spin_unlock(&inode_lock);
   
         return res;
@@@ -877,7 -1047,6 +1048,6 @@@ static struct inode *ifind(struct super
         spin_lock(&inode_lock);
         inode = find_inode(sb, head, test, data);
         if (inode) {
-               __iget(inode);
                 spin_unlock(&inode_lock);
                 if (likely(wait))
                         wait_on_inode(inode);
@@@ -910,7 -1079,6 +1080,6 @@@ static struct inode *ifind_fast(struct 
         spin_lock(&inode_lock);
         inode = find_inode_fast(sb, head, ino);
         if (inode) {
-               __iget(inode);
                 spin_unlock(&inode_lock);
                 wait_on_inode(inode);
                 return inode;
@@@ -1096,7 -1264,7 +1265,7 @@@ int insert_inode_locked(struct inode *i
                 __iget(old);
                 spin_unlock(&inode_lock);
                 wait_on_inode(old);
-               if (unlikely(!hlist_unhashed(&old->i_hash))) {
+               if (unlikely(!inode_unhashed(old))) {
                         iput(old);
                         return -EBUSY;
                 }
@@@ -1135,7 -1303,7 +1304,7 @@@ int insert_inode_locked4(struct inode *
                 __iget(old);
                 spin_unlock(&inode_lock);
                 wait_on_inode(old);
-               if (unlikely(!hlist_unhashed(&old->i_hash))) {
+               if (unlikely(!inode_unhashed(old))) {
                         iput(old);
                         return -EBUSY;
                 }
@@@ -1144,36 -1312,6 +1313,6 @@@
   }
   EXPORT_SYMBOL(insert_inode_locked4);
   
- /**
-  *    __insert_inode_hash - hash an inode
-  *    @inode: unhashed inode
-  *    @hashval: unsigned long value used to locate this object in the
-  *            inode_hashtable.
-  *
-  *    Add an inode to the inode hash for this superblock.
-  */
- void __insert_inode_hash(struct inode *inode, unsigned long hashval)
- {
-       struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
-       spin_lock(&inode_lock);
-       hlist_add_head(&inode->i_hash, head);
-       spin_unlock(&inode_lock);
- }
- EXPORT_SYMBOL(__insert_inode_hash);
- 
- /**
-  *    remove_inode_hash - remove an inode from the hash
-  *    @inode: inode to unhash
-  *
-  *    Remove an inode from the superblock.
-  */
- void remove_inode_hash(struct inode *inode)
- {
-       spin_lock(&inode_lock);
-       hlist_del_init(&inode->i_hash);
-       spin_unlock(&inode_lock);
- }
- EXPORT_SYMBOL(remove_inode_hash);
   
   int generic_delete_inode(struct inode *inode)
   {
@@@ -1188,7 -1326,7 +1327,7 @@@ EXPORT_SYMBOL(generic_delete_inode)
    */
   int generic_drop_inode(struct inode *inode)
   {
-       return !inode->i_nlink || hlist_unhashed(&inode->i_hash);
+       return !inode->i_nlink || inode_unhashed(inode);
   }
   EXPORT_SYMBOL_GPL(generic_drop_inode);
   
@@@ -1214,10 -1352,11 +1353,11 @@@ static void iput_final(struct inode *in
                 drop = generic_drop_inode(inode);
   
         if (!drop) {
-               if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-                       list_move(&inode->i_list, &inode_unused);
-               inodes_stat.nr_unused++;
                 if (sb->s_flags & MS_ACTIVE) {
+                       inode->i_state |= I_REFERENCED;
+                       if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
+                               inode_lru_list_add(inode);
+                       }
                         spin_unlock(&inode_lock);
                         return;
                 }
@@@ -1228,19 -1367,23 +1368,23 @@@
                 spin_lock(&inode_lock);
                 WARN_ON(inode->i_state & I_NEW);
                 inode->i_state &= ~I_WILL_FREE;
-               inodes_stat.nr_unused--;
-               hlist_del_init(&inode->i_hash);
+               __remove_inode_hash(inode);
         }
-       list_del_init(&inode->i_list);
-       list_del_init(&inode->i_sb_list);
+ 
         WARN_ON(inode->i_state & I_NEW);
         inode->i_state |= I_FREEING;
-       inodes_stat.nr_inodes--;
+ 
+       /*
+        * Move the inode off the IO lists and LRU once I_FREEING is
+        * set so that it won't get moved back on there if it is dirty.
+        */
+       inode_lru_list_del(inode);
+       list_del_init(&inode->i_wb_list);
+ 
+       __inode_sb_list_del(inode);
         spin_unlock(&inode_lock);
         evict(inode);
-       spin_lock(&inode_lock);
-       hlist_del_init(&inode->i_hash);
-       spin_unlock(&inode_lock);
+       remove_inode_hash(inode);
         wake_up_inode(inode);
         BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
         destroy_inode(inode);
@@@ -1504,6 -1647,8 +1648,8 @@@ void __init inode_init(void
                                          SLAB_MEM_SPREAD),
                                          init_once);
         register_shrinker(&icache_shrinker);
+       percpu_counter_init(&nr_inodes, 0);
+       percpu_counter_init(&nr_inodes_unused, 0);
   
         /* Hash may have been set up in inode_init_early */
         if (!hashdist)
diff --combined fs/nfs/dir.c

index 257e4052492e9eb174d2db47b4cdc94bd8f81ef6,0fac7fea18efe669a40656303fb4ab4cc46bb610..07ac3847e562b54c26efd30c9a9eabab468c309d
--- 1/fs/nfs/dir.c
--- 2/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@@ -33,12 -33,11 +33,12 @@@
   #include <linux/namei.h>
   #include <linux/mount.h>
   #include <linux/sched.h>
+ +#include <linux/vmalloc.h>
   
- -#include "nfs4_fs.h"
   #include "delegation.h"
   #include "iostat.h"
   #include "internal.h"
+ +#include "fscache.h"
   
   /* #define NFS_DEBUG_VERBOSE 1 */
   
@@@ -56,7 -55,6 +56,7 @@@ static int nfs_rename(struct inode *, s
                       struct inode *, struct dentry *);
   static int nfs_fsync_dir(struct file *, int);
   static loff_t nfs_llseek_dir(struct file *, loff_t, int);
+ +static int nfs_readdir_clear_array(struct page*, gfp_t);
   
   const struct file_operations nfs_dir_operations = {
         .llseek         = nfs_llseek_dir,
@@@ -82,10 -80,6 +82,10 @@@ const struct inode_operations nfs_dir_i
         .setattr        = nfs_setattr,
   };
   
+ +const struct address_space_operations nfs_dir_addr_space_ops = {
+ +      .releasepage = nfs_readdir_clear_array,
+ +};
+ +
   #ifdef CONFIG_NFS_V3
   const struct inode_operations nfs3_dir_inode_operations = {
         .create         = nfs_create,
@@@ -110,9 -104,8 +110,9 @@@
   #ifdef CONFIG_NFS_V4
   
   static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *);
+ +static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd);
   const struct inode_operations nfs4_dir_inode_operations = {
- -      .create         = nfs_create,
+ +      .create         = nfs_open_create,
         .lookup         = nfs_atomic_lookup,
         .link           = nfs_link,
         .unlink         = nfs_unlink,
@@@ -157,197 -150,51 +157,197 @@@ nfs_opendir(struct inode *inode, struc
         return res;
   }
   
- -typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int);
+ +struct nfs_cache_array_entry {
+ +      u64 cookie;             /* entry->prev_cookie when the entry was added */
+ +      u64 ino;                /* entry->ino */
+ +      struct qstr string;     /* kmemdup()'ed copy of the name, with its hash */
+ +};
+ +
+ +struct nfs_cache_array {
+ +      unsigned int size;      /* number of entries stored in array[] */
+ +      int eof_index;          /* index of the entry flagged eof, -1 until one is stored */
+ +      u64 last_cookie;        /* entry->cookie of the most recently added entry */
+ +      struct nfs_cache_array_entry array[0];  /* entries follow the header in the same page */
+ +};
+ +
+ +#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry))
+ +
+ +typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
   typedef struct {
         struct file     *file;
         struct page     *page;
         unsigned long   page_index;
- -      __be32          *ptr;
         u64             *dir_cookie;   /* cookie being searched for; 0 selects the search by f_pos */
         loff_t          current_index; /* subtracted from f_pos to index into the current array */
- -      struct nfs_entry *entry;
         decode_dirent_t decode;        /* per-entry decoder, called from xdr_decode() */
- -      int             plus;
+ +
         unsigned long   timestamp;     /* copied into entry->fattr->time_start by xdr_decode() */
         unsigned long   gencount;      /* copied into entry->fattr->gencount by xdr_decode() */
- -      int             timestamp_valid;
+ +      unsigned int    cache_entry_index;      /* index of the entry found by the array search */
+ +      unsigned int    plus:1;         /* passed to ->readdir() and the decoder; enables dcache priming */
+ +      unsigned int    eof:1;          /* set when the search reaches the eof entry */
   } nfs_readdir_descriptor_t;
   
- -/* Now we cache directories properly, by stuffing the dirent
- - * data directly in the page cache.
- - *
- - * Inode invalidation due to refresh etc. takes care of
- - * _everything_, no sloppy entry flushing logic, no extraneous
- - * copying, network direct to page cache, the way it was meant
- - * to be.
- - *
- - * NOTE: Dirent information verification is done always by the
- - *     page-in of the RPC reply, nowhere else, this simplies
- - *     things substantially.
+ +/*
+ + * The caller is responsible for calling nfs_readdir_release_array(page)
+ + *
+ + * Maps @page with kmap() and returns the mapping as a struct
+ + * nfs_cache_array pointer, or ERR_PTR(-EIO) if @page is NULL.
   */
   static
- -int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
+ +struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
+ +{
+ +      if (page == NULL)
+ +              return ERR_PTR(-EIO);
+ +      return (struct nfs_cache_array *)kmap(page);
+ +}
+ +
+ +static
+ +void nfs_readdir_release_array(struct page *page)
+ +{
+ +      kunmap(page);   /* undoes the kmap() done by nfs_readdir_get_array() */
+ +}
+ +
+ +/*
+ + * we are freeing strings created by nfs_readdir_add_to_array()
+ + *
+ + * Installed as the ->releasepage method of nfs_dir_addr_space_ops.  Every
+ + * name stored in the cache array of @page is kfree()'d; @mask is unused and
+ + * the function always returns 0.
+ + * NOTE(review): the array pointer is dereferenced without an IS_ERR() check,
+ + * so this relies on @page never being NULL here - confirm.
+ + */
+ +static
+ +int nfs_readdir_clear_array(struct page *page, gfp_t mask)
+ +{
+ +      struct nfs_cache_array *array = nfs_readdir_get_array(page);
+ +      int i;
+ +      for (i = 0; i < array->size; i++)
+ +              kfree(array->array[i].string.name);
+ +      nfs_readdir_release_array(page);
+ +      return 0;
+ +}
+ +
+ +/*
+ + * the caller is responsible for freeing qstr.name
+ + * when called by nfs_readdir_add_to_array, the strings will be freed in
+ + * nfs_readdir_clear_array()
+ + *
+ + * Fills @string with a kmemdup()'ed copy of the first @len bytes of @name,
+ + * the length, and the full_name_hash() of the name.  Exactly @len bytes are
+ + * copied, so no terminating NUL is added here.
+ + * Returns 0 on success or -ENOMEM if the copy cannot be allocated.
+ + */
+ +static
+ +int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
+ +{
+ +      string->len = len;
+ +      string->name = kmemdup(name, len, GFP_KERNEL);
+ +      if (string->name == NULL)
+ +              return -ENOMEM;
+ +      string->hash = full_name_hash(name, len);
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Append @entry to the cache array stored in @page.
+ + *
+ + * Returns 0 on success, -EIO if the array already holds MAX_READDIR_ARRAY
+ + * entries, the error from nfs_readdir_get_array() if the page cannot be
+ + * mapped, or the error from nfs_readdir_make_qstr() if the name cannot be
+ + * copied.  The array is only modified (size, last_cookie, eof_index) once
+ + * the name copy has succeeded.
+ + */
+ +static
+ +int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
+ +{
+ +      struct nfs_cache_array *array = nfs_readdir_get_array(page);
+ +      struct nfs_cache_array_entry *cache_entry;
+ +      int ret;
+ +
+ +      if (IS_ERR(array))
+ +              return PTR_ERR(array);
+ +      ret = -EIO;
+ +      if (array->size >= MAX_READDIR_ARRAY)
+ +              goto out;
+ +
+ +      cache_entry = &array->array[array->size];
+ +      cache_entry->cookie = entry->prev_cookie;
+ +      cache_entry->ino = entry->ino;
+ +      ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
+ +      if (ret)
+ +              goto out;
+ +      array->last_cookie = entry->cookie;
+ +      if (entry->eof == 1)
+ +              array->eof_index = array->size; /* index of the entry being added */
+ +      array->size++;
+ +out:
+ +      nfs_readdir_release_array(page);
+ +      return ret;
+ +}
+ +
+ +/*
+ + * Position-based search of @array, used by nfs_readdir_search_array() when
+ + * *desc->dir_cookie is 0.  The wanted index is f_pos - desc->current_index.
+ + *
+ + * Returns 0 and sets *desc->dir_cookie and desc->cache_entry_index when the
+ + * index falls inside @array, -EAGAIN (after advancing desc->current_index by
+ + * array->size) when it lies beyond an array that holds no eof entry, and
+ + * -EBADCOOKIE with desc->eof set when it is negative or lies beyond an
+ + * array that does hold the eof entry.
+ + */
+ +static
+ +int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+ +{
+ +      loff_t diff = desc->file->f_pos - desc->current_index;
+ +      unsigned int index;
+ +
+ +      if (diff < 0)
+ +              goto out_eof;
+ +      if (diff >= array->size) {
+ +              /*
+ +               * eof_index is -1 until an eof entry has been stored and 0
+ +               * is a valid entry index, so the test must be >= 0.
+ +               */
+ +              if (array->eof_index >= 0)
+ +                      goto out_eof;
+ +              desc->current_index += array->size;
+ +              return -EAGAIN;
+ +      }
+ +
+ +      index = (unsigned int)diff;
+ +      *desc->dir_cookie = array->array[index].cookie;
+ +      desc->cache_entry_index = index;
+ +      if (index == array->eof_index)
+ +              desc->eof = 1;
+ +      return 0;
+ +out_eof:
+ +      desc->eof = 1;
+ +      return -EBADCOOKIE;
+ +}
+ +
+ +/*
+ + * Cookie-based search of @array for an entry whose cookie equals
+ + * *desc->dir_cookie.
+ + *
+ + * Returns 0 and sets desc->cache_entry_index on a match.  Without a match
+ + * the result is -EAGAIN, or -EBADCOOKIE if the scan passed the array's eof
+ + * entry.  desc->eof is set whenever the eof entry is reached, including when
+ + * that entry itself is the match.
+ + */
+ +static
+ +int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+ +{
+ +      int i;
+ +      int status = -EAGAIN;
+ +
+ +      for (i = 0; i < array->size; i++) {
+ +              if (i == array->eof_index) {
+ +                      desc->eof = 1;
+ +                      status = -EBADCOOKIE;
+ +              }
+ +              if (array->array[i].cookie == *desc->dir_cookie) {
+ +                      desc->cache_entry_index = i;
+ +                      status = 0;
+ +                      break;
+ +              }
+ +      }
+ +
+ +      return status;
+ +}
+ +
+ +/*
+ + * Search the cache array in desc->page for the entry @desc refers to.
+ + *
+ + * A zero *desc->dir_cookie selects nfs_readdir_search_for_pos(); any other
+ + * value selects nfs_readdir_search_for_cookie().  Returns the status of the
+ + * search, -EBADCOOKIE if desc->dir_cookie is NULL, or the error from
+ + * nfs_readdir_get_array() if the page cannot be mapped.
+ + */
+ +static
+ +int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
+ +{
+ +      struct nfs_cache_array *array;
+ +      int status = -EBADCOOKIE;
+ +
+ +      if (desc->dir_cookie == NULL)
+ +              goto out;
+ +
+ +      array = nfs_readdir_get_array(desc->page);
+ +      if (IS_ERR(array)) {
+ +              status = PTR_ERR(array);
+ +              goto out;
+ +      }
+ +
+ +      if (*desc->dir_cookie == 0)
+ +              status = nfs_readdir_search_for_pos(array, desc);
+ +      else
+ +              status = nfs_readdir_search_for_cookie(array, desc);
+ +
+ +      nfs_readdir_release_array(desc->page);
+ +out:
+ +      return status;
+ +}
+ +
+ +/* Fill a page with xdr information before transferring to the cache page */
+ +static
+ +int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
+ +                      struct nfs_entry *entry, struct file *file, struct inode *inode)
   {
- -      struct file     *file = desc->file;
- -      struct inode    *inode = file->f_path.dentry->d_inode;
         struct rpc_cred *cred = nfs_file_cred(file);
         unsigned long   timestamp, gencount;
         int             error;
   
- -      dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
- -                      __func__, (long long)desc->entry->cookie,
- -                      page->index);
- -
    again:
         timestamp = jiffies;
         gencount = nfs_inc_attr_generation_counter();
- -      error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page,
+ +      error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
                                           NFS_SERVER(inode)->dtsize, desc->plus);
         if (error < 0) {
                 /* We requested READDIRPLUS, but the server doesn't grok it */
@@@ -361,292 -208,190 +361,292 @@@
         }
         desc->timestamp = timestamp;
         desc->gencount = gencount;
- -      desc->timestamp_valid = 1;
- -      SetPageUptodate(page);
- -      /* Ensure consistent page alignment of the data.
- -       * Note: assumes we have exclusive access to this mapping either
- -       *       through inode->i_mutex or some other mechanism.
- -       */
- -      if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
- -              /* Should never happen */
- -              nfs_zap_mapping(inode, inode->i_mapping);
- -      }
- -      unlock_page(page);
- -      return 0;
- - error:
- -      unlock_page(page);
- -      return -EIO;
+ +error:
+ +      return error;
   }
   
- -static inline
- -int dir_decode(nfs_readdir_descriptor_t *desc)
+ +/*
+ + * Fill in @entry by decoding the next directory entry from @stream with
+ + * desc->decode, then stamp entry->fattr with desc->timestamp and
+ + * desc->gencount.
+ + *
+ + * Returns 0 on success, or the error carried by the decoder's return
+ + * pointer when IS_ERR() is true for it.
+ + */
+ +static
+ +int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream)
   {
- -      __be32  *p = desc->ptr;
- -      p = desc->decode(p, desc->entry, desc->plus);
+ +      __be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus);
         if (IS_ERR(p))
                 return PTR_ERR(p);
- -      desc->ptr = p;
- -      if (desc->timestamp_valid) {
- -              desc->entry->fattr->time_start = desc->timestamp;
- -              desc->entry->fattr->gencount = desc->gencount;
- -      } else
- -              desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
+ +
+ +      entry->fattr->time_start = desc->timestamp;
+ +      entry->fattr->gencount = desc->gencount;
         return 0;
   }
   
- -static inline
- -void dir_page_release(nfs_readdir_descriptor_t *desc)
+ +/*
+ + * Return 1 if @dentry has an inode whose NFS file handle is identical to the
+ + * one decoded into @entry, 0 otherwise (including for a negative dentry).
+ + *
+ + * File handles are opaque binary data that may contain zero bytes, so the
+ + * contents are compared with memcmp(); a string comparison would stop at
+ + * the first NUL and could treat different handles as equal.
+ + */
+ +static
+ +int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
   {
- -      kunmap(desc->page);
- -      page_cache_release(desc->page);
- -      desc->page = NULL;
- -      desc->ptr = NULL;
+ +      struct nfs_inode *node;
+ +      if (dentry->d_inode == NULL)
+ +              goto different;
+ +      node = NFS_I(dentry->d_inode);
+ +      if (node->fh.size != entry->fh->size)
+ +              goto different;
+ +      if (memcmp(node->fh.data, entry->fh->data, node->fh.size) != 0)
+ +              goto different;
+ +      return 1;
+ +different:
+ +      return 0;
   }
   
- -/*
- - * Given a pointer to a buffer that has already been filled by a call
- - * to readdir, find the next entry with cookie '*desc->dir_cookie'.
- - *
- - * If the end of the buffer has been reached, return -EAGAIN, if not,
- - * return the offset within the buffer of the next entry to be
- - * read.
- - */
- -static inline
- -int find_dirent(nfs_readdir_descriptor_t *desc)
+ +/*
+ + * Use the file handle and attributes decoded into @entry to set up a dentry
+ + * for the entry's name under @parent.  "." and ".." are skipped.
+ + *
+ + * If d_lookup() finds an existing dentry and nfs_same_file() says it refers
+ + * to the same file, only the inode attributes are refreshed.  Otherwise the
+ + * existing dentry (if any) is dropped, and a new dentry is allocated and
+ + * attached to the inode returned by nfs_fhget().  All failures are ignored:
+ + * the function returns void and simply leaves the dcache unprimed.
+ + */
+ +static
+ +void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
   {
- -      struct nfs_entry *entry = desc->entry;
- -      int             loop_count = 0,
- -                      status;
- -
- -      while((status = dir_decode(desc)) == 0) {
- -              dfprintk(DIRCACHE, "NFS: %s: examining cookie %Lu\n",
- -                              __func__, (unsigned long long)entry->cookie);
- -              if (entry->prev_cookie == *desc->dir_cookie)
- -                      break;
- -              if (loop_count++ > 200) {
- -                      loop_count = 0;
- -                      schedule();
+ +      struct qstr filename = {
+ +              .len = entry->len,
+ +              .name = entry->name,
+ +      };
+ +      struct dentry *dentry;
+ +      struct dentry *alias;
+ +      struct inode *dir = parent->d_inode;
+ +      struct inode *inode;
+ +
+ +      if (filename.name[0] == '.') {
+ +              if (filename.len == 1)
+ +                      return;
+ +              if (filename.len == 2 && filename.name[1] == '.')
+ +                      return;
+ +      }
+ +      filename.hash = full_name_hash(filename.name, filename.len);
+ +
+ +      dentry = d_lookup(parent, &filename);
+ +      if (dentry != NULL) {
+ +              if (nfs_same_file(dentry, entry)) {
+ +                      nfs_refresh_inode(dentry->d_inode, entry->fattr);
+ +                      goto out;
+ +              } else {
+ +                      d_drop(dentry); /* name now refers to a different file handle */
+ +                      dput(dentry);
                 }
         }
- -      return status;
+ +
+ +      dentry = d_alloc(parent, &filename);
+ +      if (dentry == NULL)
+ +              return;
+ +
+ +      dentry->d_op = NFS_PROTO(dir)->dentry_ops;
+ +      inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
+ +      if (IS_ERR(inode))
+ +              goto out;
+ +
+ +      alias = d_materialise_unique(dentry, inode);
+ +      if (IS_ERR(alias))
+ +              goto out;
+ +      else if (alias) {
+ +              nfs_set_verifier(alias, nfs_save_change_attribute(dir));
+ +              dput(alias);
+ +      } else
+ +              nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ +
+ +out:
+ +      dput(dentry);
+ +}
+ +
+ +/*
+ + * Perform conversion from xdr to cache array.
+ + *
+ + * Decodes entries from the @buflen bytes at @xdr_page, one at a time into
+ + * @entry, and appends each one to the cache array in @page.  The loop stops
+ + * when decoding fails, when the array refuses an entry, or once an entry
+ + * flagged eof has been stored.  With desc->plus set, every stored entry is
+ + * also handed to nfs_prime_dcache().
+ + *
+ + * If decoding ends with -EBADCOOKIE while @entry is flagged eof, the last
+ + * stored entry is marked as the eof entry of the array.
+ + */
+ +static
+ +void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
+ +                              void *xdr_page, struct page *page, unsigned int buflen)
+ +{
+ +      struct xdr_stream stream;
+ +      struct xdr_buf buf;
+ +      __be32 *ptr = xdr_page;
+ +      int status;
+ +      struct nfs_cache_array *array;
+ +
+ +      buf.head->iov_base = xdr_page;
+ +      buf.head->iov_len = buflen;
+ +      buf.tail->iov_len = 0;
+ +      buf.page_base = 0;
+ +      buf.page_len = 0;
+ +      buf.buflen = buf.head->iov_len;
+ +      buf.len = buf.head->iov_len;
+ +
+ +      xdr_init_decode(&stream, &buf, ptr);
+ +
+ +
+ +      do {
+ +              status = xdr_decode(desc, entry, &stream);
+ +              if (status != 0)
+ +                      break;
+ +
+ +              /*
+ +               * nfs_readdir_add_to_array() reports failure with a negative
+ +               * errno (-EIO when the array is full, or the error from
+ +               * copying the name), never with a literal -1.
+ +               */
+ +              if (nfs_readdir_add_to_array(entry, page) < 0)
+ +                      break;
+ +              if (desc->plus == 1)
+ +                      nfs_prime_dcache(desc->file->f_path.dentry, entry);
+ +      } while (!entry->eof);
+ +
+ +      if (status == -EBADCOOKIE && entry->eof) {
+ +              array = nfs_readdir_get_array(page);
+ +              /* nfs_readdir_get_array() returns an ERR_PTR for a NULL page */
+ +              if (!IS_ERR(array)) {
+ +                      array->eof_index = array->size - 1;
+ +                      nfs_readdir_release_array(page);
+ +              }
+ +              status = 0;
+ +      }
+ +}
+ +
+ +/* Drop one reference, with put_page(), on each of the first @npages pages. */
+ +static
+ +void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
+ +{
+ +      unsigned int i;
+ +      for (i = 0; i < npages; i++)
+ +              put_page(pages[i]);
+ +}
+ +
+ +/*
+ + * Undo nfs_readdir_large_page(): remove the mapping at @ptr with
+ + * vm_unmap_ram() and then release the @npages backing pages.
+ + */
+ +static
+ +void nfs_readdir_free_large_page(void *ptr, struct page **pages,
+ +              unsigned int npages)
+ +{
+ +      vm_unmap_ram(ptr, npages);
+ +      nfs_readdir_free_pagearray(pages, npages);
+ +}
   
   /*
- - * Given a pointer to a buffer that has already been filled by a call
- - * to readdir, find the entry at offset 'desc->file->f_pos'.
- - *
- - * If the end of the buffer has been reached, return -EAGAIN, if not,
- - * return the offset within the buffer of the next entry to be
- - * read.
+ + * nfs_readdir_large_page will allocate pages that must be freed with a call
+ + * to nfs_readdir_free_large_page
+ + *
+ + * Allocates @npages pages with alloc_page(GFP_KERNEL), stores them in
+ + * @pages and maps them with vm_map_ram().  Returns the mapped address, or
+ + * NULL if a page allocation or the mapping fails; on failure every page
+ + * allocated so far has already been released.
    */
- -static inline
- -int find_dirent_index(nfs_readdir_descriptor_t *desc)
+ +static
+ +void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
   {
- -      struct nfs_entry *entry = desc->entry;
- -      int             loop_count = 0,
- -                      status;
+ +      void *ptr;
+ +      unsigned int i;
+ +
+ +      for (i = 0; i < npages; i++) {
+ +              struct page *page = alloc_page(GFP_KERNEL);
+ +              if (page == NULL)
+ +                      goto out_freepages;
+ +              pages[i] = page;
+ +      }
   
- -      for(;;) {
- -              status = dir_decode(desc);
- -              if (status)
- -                      break;
+ +      ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
+ +      if (!IS_ERR_OR_NULL(ptr))
+ +              return ptr;
+ +out_freepages:
+ +      nfs_readdir_free_pagearray(pages, i);   /* i counts the pages allocated so far */
+ +      return NULL;
+ +}
   
- -              dfprintk(DIRCACHE, "NFS: found cookie %Lu at index %Ld\n",
- -                              (unsigned long long)entry->cookie, desc->current_index);
+ +/*
+ + * Fill the cache array in @page for the directory @inode.
+ + *
+ + * READDIR replies are read through nfs_readdir_xdr_filler() into a temporary
+ + * buffer obtained from nfs_readdir_large_page() and converted with
+ + * nfs_readdir_page_filler().  This repeats until an eof entry has been
+ + * stored, the array is full, or a READDIR call fails.
+ + *
+ + * Returns the status of the last nfs_readdir_xdr_filler() call, -ENOMEM if
+ + * the file handle, attribute or buffer allocations fail, or the error from
+ + * nfs_readdir_get_array() if @page cannot be mapped.
+ + */
+ +static
+ +int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
+ +{
+ +      struct page *pages[NFS_MAX_READDIR_PAGES];
+ +      void *pages_ptr = NULL;
+ +      struct nfs_entry entry;
+ +      struct file     *file = desc->file;
+ +      struct nfs_cache_array *array;
+ +      /* Allocation failures below must not be reported as success. */
+ +      int status = -ENOMEM;
+ +      unsigned int array_size = ARRAY_SIZE(pages);
+ +
+ +      entry.prev_cookie = 0;
+ +      entry.cookie = *desc->dir_cookie;
+ +      entry.eof = 0;
+ +      entry.fh = nfs_alloc_fhandle();
+ +      entry.fattr = nfs_alloc_fattr();
+ +      if (entry.fh == NULL || entry.fattr == NULL)
+ +              goto out;
+ +
+ +      array = nfs_readdir_get_array(page);
+ +      if (IS_ERR(array)) {
+ +              status = PTR_ERR(array);
+ +              goto out;
+ +      }
+ +      memset(array, 0, sizeof(struct nfs_cache_array));
+ +      array->eof_index = -1;  /* no eof entry stored yet */
   
- -              if (desc->file->f_pos == desc->current_index) {
- -                      *desc->dir_cookie = entry->cookie;
+ +      pages_ptr = nfs_readdir_large_page(pages, array_size);
+ +      if (!pages_ptr)
+ +              goto out_release_array;
+ +      do {
+ +              status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
+ +
+ +              if (status < 0)
                         break;
- -              }
- -              desc->current_index++;
- -              if (loop_count++ > 200) {
- -                      loop_count = 0;
- -                      schedule();
- -              }
- -      }
+ +              nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE);
+ +      } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY);
+ +
+ +      nfs_readdir_free_large_page(pages_ptr, pages, array_size);
+ +out_release_array:
+ +      nfs_readdir_release_array(page);
+ +out:
+ +      nfs_free_fattr(entry.fattr);
+ +      nfs_free_fhandle(entry.fh);
         return status;
   }
   
   /*
- - * Find the given page, and call find_dirent() or find_dirent_index in
- - * order to try to return the next entry.
+ + * Now we cache directories properly, by converting xdr information
+ + * to an array that can be used for lookups later.  This results in
+ + * fewer cache pages, since we can store more information on each page.
+ + * We only need to convert from xdr once so future lookups are much simpler
    */
- -static inline
- -int find_dirent_page(nfs_readdir_descriptor_t *desc)
+ +static
+ +int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
   {
         struct inode    *inode = desc->file->f_path.dentry->d_inode;
- -      struct page     *page;
- -      int             status;
   
- -      dfprintk(DIRCACHE, "NFS: %s: searching page %ld for target %Lu\n",
- -                      __func__, desc->page_index,
- -                      (long long) *desc->dir_cookie);
+ +      if (nfs_readdir_xdr_to_array(desc, page, inode) < 0)
+ +              goto error;
+ +      SetPageUptodate(page);
   
- -      /* If we find the page in the page_cache, we cannot be sure
- -       * how fresh the data is, so we will ignore readdir_plus attributes.
- -       */
- -      desc->timestamp_valid = 0;
- -      page = read_cache_page(inode->i_mapping, desc->page_index,
- -                             (filler_t *)nfs_readdir_filler, desc);
- -      if (IS_ERR(page)) {
- -              status = PTR_ERR(page);
- -              goto out;
+ +      if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
+ +              /* Should never happen */
+ +              nfs_zap_mapping(inode, inode->i_mapping);
         }
+ +      unlock_page(page);
+ +      return 0;
+ + error:
+ +      unlock_page(page);
+ +      return -EIO;
+ +}
   
- -      /* NOTE: Someone else may have changed the READDIRPLUS flag */
- -      desc->page = page;
- -      desc->ptr = kmap(page);         /* matching kunmap in nfs_do_filldir */
- -      if (*desc->dir_cookie != 0)
- -              status = find_dirent(desc);
- -      else
- -              status = find_dirent_index(desc);
- -      if (status < 0)
- -              dir_page_release(desc);
- - out:
- -      dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
- -      return status;
+ +static
+ +void cache_page_release(nfs_readdir_descriptor_t *desc)
+ +{
+ +      page_cache_release(desc->page);
+ +      desc->page = NULL;
+ +}
+ +
+ +static
+ +struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
+ +{
+ +      struct page *page;
+ +      page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
+ +                      desc->page_index, (filler_t *)nfs_readdir_filler, desc);
+ +      if (IS_ERR(page))
+ +              desc->eof = 1;
+ +      return page;
   }
   
   /*
- - * Recurse through the page cache pages, and return a
- - * filled nfs_entry structure of the next directory entry if possible.
- - *
- - * The target for the search is '*desc->dir_cookie' if non-0,
- - * 'desc->file->f_pos' otherwise
+ + * Returns 0 if desc->dir_cookie was found on page desc->page_index
    */
+ +static
+ +int find_cache_page(nfs_readdir_descriptor_t *desc)
+ +{
+ +      int res;
+ +
+ +      desc->page = get_cache_page(desc);
+ +      if (IS_ERR(desc->page))
+ +              return PTR_ERR(desc->page);
+ +
+ +      res = nfs_readdir_search_array(desc);
+ +      if (res == 0)
+ +              return 0;
+ +      cache_page_release(desc);
+ +      return res;
+ +}
+ +
+ +/* Search for desc->dir_cookie from the beginning of the page cache */
   static inline
   int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
   {
- -      int             loop_count = 0;
- -      int             res;
- -
- -      /* Always search-by-index from the beginning of the cache */
- -      if (*desc->dir_cookie == 0) {
- -              dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for offset %Ld\n",
- -                              (long long)desc->file->f_pos);
- -              desc->page_index = 0;
- -              desc->entry->cookie = desc->entry->prev_cookie = 0;
- -              desc->entry->eof = 0;
- -              desc->current_index = 0;
- -      } else
- -              dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for cookie %Lu\n",
- -                              (unsigned long long)*desc->dir_cookie);
+ +      int res = -EAGAIN;
   
- -      for (;;) {
- -              res = find_dirent_page(desc);
+ +      while (1) {
+ +              res = find_cache_page(desc);
                 if (res != -EAGAIN)
                         break;
- -              /* Align to beginning of next page */
- -              desc->page_index ++;
- -              if (loop_count++ > 200) {
- -                      loop_count = 0;
- -                      schedule();
- -              }
+ +              desc->page_index++;
         }
- -
- -      dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, res);
         return res;
   }
   
@@@ -655,6 -400,8 +655,6 @@@ static inline unsigned int dt_type(stru
         return (inode->i_mode >> 12) & 15;
   }
   
- -static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc);
- -
   /*
    * Once we've found the start of the dirent within a page: fill 'er up...
    */
@@@ -663,36 -410,49 +663,36 @@@ int nfs_do_filldir(nfs_readdir_descript
                    filldir_t filldir)
   {
         struct file     *file = desc->file;
- -      struct nfs_entry *entry = desc->entry;
- -      struct dentry   *dentry = NULL;
- -      u64             fileid;
- -      int             loop_count = 0,
- -                      res;
- -
- -      dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n",
- -                      (unsigned long long)entry->cookie);
- -
- -      for(;;) {
- -              unsigned d_type = DT_UNKNOWN;
- -              /* Note: entry->prev_cookie contains the cookie for
- -               *       retrieving the current dirent on the server */
- -              fileid = entry->ino;
- -
- -              /* Get a dentry if we have one */
- -              if (dentry != NULL)
- -                      dput(dentry);
- -              dentry = nfs_readdir_lookup(desc);
+ +      int i = 0;
+ +      int res = 0;
+ +      struct nfs_cache_array *array = NULL;
+ +      unsigned int d_type = DT_UNKNOWN;
+ +      struct dentry *dentry = NULL;
   
- -              /* Use readdirplus info */
- -              if (dentry != NULL && dentry->d_inode != NULL) {
- -                      d_type = dt_type(dentry->d_inode);
- -                      fileid = NFS_FILEID(dentry->d_inode);
- -              }
+ +      array = nfs_readdir_get_array(desc->page);
+ +
+ +      for (i = desc->cache_entry_index; i < array->size; i++) {
+ +              d_type = DT_UNKNOWN;
   
- -              res = filldir(dirent, entry->name, entry->len, 
- -                            file->f_pos, nfs_compat_user_ino64(fileid),
- -                            d_type);
+ +              res = filldir(dirent, array->array[i].string.name,
+ +                      array->array[i].string.len, file->f_pos,
+ +                      nfs_compat_user_ino64(array->array[i].ino), d_type);
                 if (res < 0)
                         break;
                 file->f_pos++;
- -              *desc->dir_cookie = entry->cookie;
- -              if (dir_decode(desc) != 0) {
- -                      desc->page_index ++;
+ +              desc->cache_entry_index = i;
+ +              if (i < (array->size-1))
+ +                      *desc->dir_cookie = array->array[i+1].cookie;
+ +              else
+ +                      *desc->dir_cookie = array->last_cookie;
+ +              if (i == array->eof_index) {
+ +                      desc->eof = 1;
                         break;
                 }
- -              if (loop_count++ > 200) {
- -                      loop_count = 0;
- -                      schedule();
- -              }
         }
- -      dir_page_release(desc);
+ +
+ +      nfs_readdir_release_array(desc->page);
+ +      cache_page_release(desc);
         if (dentry != NULL)
                 dput(dentry);
         dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
@@@ -716,9 -476,12 +716,9 @@@ static inlin
   int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
                      filldir_t filldir)
   {
- -      struct file     *file = desc->file;
- -      struct inode    *inode = file->f_path.dentry->d_inode;
- -      struct rpc_cred *cred = nfs_file_cred(file);
         struct page     *page = NULL;
         int             status;
- -      unsigned long   timestamp, gencount;
+ +      struct inode *inode = desc->file->f_path.dentry->d_inode;
   
         dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
                         (unsigned long long)*desc->dir_cookie);
@@@ -728,22 -491,38 +728,22 @@@
                 status = -ENOMEM;
                 goto out;
         }
- -      timestamp = jiffies;
- -      gencount = nfs_inc_attr_generation_counter();
- -      status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred,
- -                                              *desc->dir_cookie, page,
- -                                              NFS_SERVER(inode)->dtsize,
- -                                              desc->plus);
- -      desc->page = page;
- -      desc->ptr = kmap(page);         /* matching kunmap in nfs_do_filldir */
- -      if (status >= 0) {
- -              desc->timestamp = timestamp;
- -              desc->gencount = gencount;
- -              desc->timestamp_valid = 1;
- -              if ((status = dir_decode(desc)) == 0)
- -                      desc->entry->prev_cookie = *desc->dir_cookie;
- -      } else
+ +
+ +      if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) {
                 status = -EIO;
- -      if (status < 0)
                 goto out_release;
+ +      }
   
+ +      desc->page_index = 0;
+ +      desc->page = page;
         status = nfs_do_filldir(desc, dirent, filldir);
   
- -      /* Reset read descriptor so it searches the page cache from
- -       * the start upon the next call to readdir_search_pagecache() */
- -      desc->page_index = 0;
- -      desc->entry->cookie = desc->entry->prev_cookie = 0;
- -      desc->entry->eof = 0;
    out:
         dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
                         __func__, status);
         return status;
    out_release:
- -      dir_page_release(desc);
+ +      cache_page_release(desc);
         goto out;
   }
   
@@@ -757,6 -536,7 +757,6 @@@ static int nfs_readdir(struct file *fil
         struct inode    *inode = dentry->d_inode;
         nfs_readdir_descriptor_t my_desc,
                         *desc = &my_desc;
- -      struct nfs_entry my_entry;
         int res = -ENOMEM;
   
         dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
@@@ -777,17 -557,26 +777,17 @@@
         desc->decode = NFS_PROTO(inode)->decode_dirent;
         desc->plus = NFS_USE_READDIRPLUS(inode);
   
- -      my_entry.cookie = my_entry.prev_cookie = 0;
- -      my_entry.eof = 0;
- -      my_entry.fh = nfs_alloc_fhandle();
- -      my_entry.fattr = nfs_alloc_fattr();
- -      if (my_entry.fh == NULL || my_entry.fattr == NULL)
- -              goto out_alloc_failed;
- -
- -      desc->entry = &my_entry;
- -
         nfs_block_sillyrename(dentry);
         res = nfs_revalidate_mapping(inode, filp->f_mapping);
         if (res < 0)
                 goto out;
   
- -      while(!desc->entry->eof) {
+ +      while (desc->eof != 1) {
                 res = readdir_search_pagecache(desc);
   
                 if (res == -EBADCOOKIE) {
                         /* This means either end of directory */
- -                      if (*desc->dir_cookie && desc->entry->cookie != *desc->dir_cookie) {
+ +                      if (*desc->dir_cookie && desc->eof == 0) {
                                 /* Or that the server has 'lost' a cookie */
                                 res = uncached_readdir(desc, dirent, filldir);
                                 if (res >= 0)
@@@ -799,9 -588,8 +799,9 @@@
                 if (res == -ETOOSMALL && desc->plus) {
                         clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
                         nfs_zap_caches(inode);
+ +                      desc->page_index = 0;
                         desc->plus = 0;
- -                      desc->entry->eof = 0;
+ +                      desc->eof = 0;
                         continue;
                 }
                 if (res < 0)
@@@ -817,6 -605,9 +817,6 @@@ out
         nfs_unblock_sillyrename(dentry);
         if (res > 0)
                 res = 0;
- -out_alloc_failed:
- -      nfs_free_fattr(my_entry.fattr);
- -      nfs_free_fhandle(my_entry.fh);
         dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
                         dentry->d_parent->d_name.name, dentry->d_name.name,
                         res);
@@@ -1238,63 -1029,10 +1238,63 @@@ static int is_atomic_open(struct nameid
         return 1;
   }
   
+ +static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd)
+ +{
+ +      struct path path = {
+ +              .mnt = nd->path.mnt,
+ +              .dentry = dentry,
+ +      };
+ +      struct nfs_open_context *ctx;
+ +      struct rpc_cred *cred;
+ +      fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
+ +
+ +      cred = rpc_lookup_cred();
+ +      if (IS_ERR(cred))
+ +              return ERR_CAST(cred);
+ +      ctx = alloc_nfs_open_context(&path, cred, fmode);
+ +      put_rpccred(cred);
+ +      if (ctx == NULL)
+ +              return ERR_PTR(-ENOMEM);
+ +      return ctx;
+ +}
+ +
+ +static int do_open(struct inode *inode, struct file *filp)
+ +{
+ +      nfs_fscache_set_inode_cookie(inode, filp);
+ +      return 0;
+ +}
+ +
+ +static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx)
+ +{
+ +      struct file *filp;
+ +      int ret = 0;
+ +
+ +      /* If the open_intent is for execute, we have an extra check to make */
+ +      if (ctx->mode & FMODE_EXEC) {
+ +              ret = nfs_may_open(ctx->path.dentry->d_inode,
+ +                              ctx->cred,
+ +                              nd->intent.open.flags);
+ +              if (ret < 0)
+ +                      goto out;
+ +      }
+ +      filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open);
+ +      if (IS_ERR(filp))
+ +              ret = PTR_ERR(filp);
+ +      else
+ +              nfs_file_set_open_context(filp, ctx);
+ +out:
+ +      put_nfs_open_context(ctx);
+ +      return ret;
+ +}
+ +
   static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
   {
+ +      struct nfs_open_context *ctx;
+ +      struct iattr attr;
         struct dentry *res = NULL;
- -      int error;
+ +      struct inode *inode;
+ +      int open_flags;
+ +      int err;
   
         dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n",
                         dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@@ -1316,32 -1054,13 +1316,32 @@@
                 goto out;
         }
   
+ +      ctx = nameidata_to_nfs_open_context(dentry, nd);
+ +      res = ERR_CAST(ctx);
+ +      if (IS_ERR(ctx))
+ +              goto out;
+ +
+ +      open_flags = nd->intent.open.flags;
+ +      if (nd->flags & LOOKUP_CREATE) {
+ +              attr.ia_mode = nd->intent.open.create_mode;
+ +              attr.ia_valid = ATTR_MODE;
+ +              if (!IS_POSIXACL(dir))
+ +                      attr.ia_mode &= ~current_umask();
+ +      } else {
+ +              open_flags &= ~(O_EXCL | O_CREAT);
+ +              attr.ia_valid = 0;
+ +      }
+ +
         /* Open the file on the server */
- -      res = nfs4_atomic_open(dir, dentry, nd);
- -      if (IS_ERR(res)) {
- -              error = PTR_ERR(res);
- -              switch (error) {
+ +      nfs_block_sillyrename(dentry->d_parent);
+ +      inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
+ +      if (IS_ERR(inode)) {
+ +              nfs_unblock_sillyrename(dentry->d_parent);
+ +              put_nfs_open_context(ctx);
+ +              switch (PTR_ERR(inode)) {
                         /* Make a negative dentry */
                         case -ENOENT:
+ +                              d_add(dentry, NULL);
                                 res = NULL;
                                 goto out;
                         /* This turned out not to be a regular file */
@@@ -1353,25 -1072,11 +1353,25 @@@
                                         goto no_open;
                         /* case -EINVAL: */
                         default:
+ +                              res = ERR_CAST(inode);
                                 goto out;
                 }
- -      } else if (res != NULL)
+ +      }
+ +      res = d_add_unique(dentry, inode);
+ +      nfs_unblock_sillyrename(dentry->d_parent);
+ +      if (res != NULL) {
+ +              dput(ctx->path.dentry);
+ +              ctx->path.dentry = dget(res);
                 dentry = res;
+ +      }
+ +      err = nfs_intent_set_file(nd, ctx);
+ +      if (err < 0) {
+ +              if (res != NULL)
+ +                      dput(res);
+ +              return ERR_PTR(err);
+ +      }
   out:
+ +      nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
         return res;
   no_open:
         return nfs_lookup(dir, dentry, nd);
@@@ -1382,15 -1087,12 +1382,15 @@@ static int nfs_open_revalidate(struct d
         struct dentry *parent = NULL;
         struct inode *inode = dentry->d_inode;
         struct inode *dir;
+ +      struct nfs_open_context *ctx;
         int openflags, ret = 0;
   
         if (!is_atomic_open(nd) || d_mountpoint(dentry))
                 goto no_open;
+ +
         parent = dget_parent(dentry);
         dir = parent->d_inode;
+ +
         /* We can't create new files in nfs_open_revalidate(), so we
          * optimize away revalidation of negative dentries.
          */
@@@ -1410,96 -1112,99 +1410,96 @@@
         /* We can't create new files, or truncate existing ones here */
         openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
   
+ +      ctx = nameidata_to_nfs_open_context(dentry, nd);
+ +      ret = PTR_ERR(ctx);
+ +      if (IS_ERR(ctx))
+ +              goto out;
         /*
          * Note: we're not holding inode->i_mutex and so may be racing with
          * operations that change the directory. We therefore save the
          * change attribute *before* we do the RPC call.
          */
- -      ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
+ +      inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL);
+ +      if (IS_ERR(inode)) {
+ +              ret = PTR_ERR(inode);
+ +              switch (ret) {
+ +              case -EPERM:
+ +              case -EACCES:
+ +              case -EDQUOT:
+ +              case -ENOSPC:
+ +              case -EROFS:
+ +                      goto out_put_ctx;
+ +              default:
+ +                      goto out_drop;
+ +              }
+ +      }
+ +      iput(inode);
+ +      if (inode != dentry->d_inode)
+ +              goto out_drop;
+ +
+ +      nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ +      ret = nfs_intent_set_file(nd, ctx);
+ +      if (ret >= 0)
+ +              ret = 1;
   out:
         dput(parent);
- -      if (!ret)
- -              d_drop(dentry);
         return ret;
+ +out_drop:
+ +      d_drop(dentry);
+ +      ret = 0;
+ +out_put_ctx:
+ +      put_nfs_open_context(ctx);
+ +      goto out;
+ +
   no_open_dput:
         dput(parent);
   no_open:
         return nfs_lookup_revalidate(dentry, nd);
   }
- -#endif /* CONFIG_NFSV4 */
   
- -static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
+ +static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode,
+ +              struct nameidata *nd)
   {
- -      struct dentry *parent = desc->file->f_path.dentry;
- -      struct inode *dir = parent->d_inode;
- -      struct nfs_entry *entry = desc->entry;
- -      struct dentry *dentry, *alias;
- -      struct qstr name = {
- -              .name = entry->name,
- -              .len = entry->len,
- -      };
- -      struct inode *inode;
- -      unsigned long verf = nfs_save_change_attribute(dir);
+ +      struct nfs_open_context *ctx = NULL;
+ +      struct iattr attr;
+ +      int error;
+ +      int open_flags = 0;
   
- -      switch (name.len) {
- -              case 2:
- -                      if (name.name[0] == '.' && name.name[1] == '.')
- -                              return dget_parent(parent);
- -                      break;
- -              case 1:
- -                      if (name.name[0] == '.')
- -                              return dget(parent);
- -      }
+ +      dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
+ +                      dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
   
- -      spin_lock(&dir->i_lock);
- -      if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) {
- -              spin_unlock(&dir->i_lock);
- -              return NULL;
- -      }
- -      spin_unlock(&dir->i_lock);
+ +      attr.ia_mode = mode;
+ +      attr.ia_valid = ATTR_MODE;
   
- -      name.hash = full_name_hash(name.name, name.len);
- -      dentry = d_lookup(parent, &name);
- -      if (dentry != NULL) {
- -              /* Is this a positive dentry that matches the readdir info? */
- -              if (dentry->d_inode != NULL &&
- -                              (NFS_FILEID(dentry->d_inode) == entry->ino ||
- -                              d_mountpoint(dentry))) {
- -                      if (!desc->plus || entry->fh->size == 0)
- -                              return dentry;
- -                      if (nfs_compare_fh(NFS_FH(dentry->d_inode),
- -                                              entry->fh) == 0)
- -                              goto out_renew;
- -              }
- -              /* No, so d_drop to allow one to be created */
- -              d_drop(dentry);
- -              dput(dentry);
- -      }
- -      if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR))
- -              return NULL;
- -      if (name.len > NFS_SERVER(dir)->namelen)
- -              return NULL;
- -      /* Note: caller is already holding the dir->i_mutex! */
- -      dentry = d_alloc(parent, &name);
- -      if (dentry == NULL)
- -              return NULL;
- -      dentry->d_op = NFS_PROTO(dir)->dentry_ops;
- -      inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
- -      if (IS_ERR(inode)) {
- -              dput(dentry);
- -              return NULL;
- -      }
+ +      if ((nd->flags & LOOKUP_CREATE) != 0) {
+ +              open_flags = nd->intent.open.flags;
   
- -      alias = d_materialise_unique(dentry, inode);
- -      if (alias != NULL) {
- -              dput(dentry);
- -              if (IS_ERR(alias))
- -                      return NULL;
- -              dentry = alias;
+ +              ctx = nameidata_to_nfs_open_context(dentry, nd);
+ +              error = PTR_ERR(ctx);
+ +              if (IS_ERR(ctx))
+ +                      goto out_err_drop;
         }
   
- -out_renew:
- -      nfs_set_verifier(dentry, verf);
- -      return dentry;
+ +      error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx);
+ +      if (error != 0)
+ +              goto out_put_ctx;
+ +      if (ctx != NULL) {
+ +              error = nfs_intent_set_file(nd, ctx);
+ +              if (error < 0)
+ +                      goto out_err;
+ +      }
+ +      return 0;
+ +out_put_ctx:
+ +      if (ctx != NULL)
+ +              put_nfs_open_context(ctx);
+ +out_err_drop:
+ +      d_drop(dentry);
+ +out_err:
+ +      return error;
   }
   
+ +#endif /* CONFIG_NFSV4 */
+ +
   /*
    * Code common to create, mkdir, and mknod.
    */
@@@ -1553,6 -1258,7 +1553,6 @@@ static int nfs_create(struct inode *dir
   {
         struct iattr attr;
         int error;
- -      int open_flags = 0;
   
         dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
                         dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@@ -1560,7 -1266,10 +1560,7 @@@
         attr.ia_mode = mode;
         attr.ia_valid = ATTR_MODE;
   
- -      if ((nd->flags & LOOKUP_CREATE) != 0)
- -              open_flags = nd->intent.open.flags;
- -
- -      error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
+ +      error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
         if (error != 0)
                 goto out_err;
         return 0;
@@@ -1642,6 -1351,76 +1642,6 @@@ static int nfs_rmdir(struct inode *dir
         return error;
   }
   
- -static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
- -{
- -      static unsigned int sillycounter;
- -      const int      fileidsize  = sizeof(NFS_FILEID(dentry->d_inode))*2;
- -      const int      countersize = sizeof(sillycounter)*2;
- -      const int      slen        = sizeof(".nfs")+fileidsize+countersize-1;
- -      char           silly[slen+1];
- -      struct qstr    qsilly;
- -      struct dentry *sdentry;
- -      int            error = -EIO;
- -
- -      dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
- -              dentry->d_parent->d_name.name, dentry->d_name.name, 
- -              atomic_read(&dentry->d_count));
- -      nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
- -
- -      /*
- -       * We don't allow a dentry to be silly-renamed twice.
- -       */
- -      error = -EBUSY;
- -      if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
- -              goto out;
- -
- -      sprintf(silly, ".nfs%*.*Lx",
- -              fileidsize, fileidsize,
- -              (unsigned long long)NFS_FILEID(dentry->d_inode));
- -
- -      /* Return delegation in anticipation of the rename */
- -      nfs_inode_return_delegation(dentry->d_inode);
- -
- -      sdentry = NULL;
- -      do {
- -              char *suffix = silly + slen - countersize;
- -
- -              dput(sdentry);
- -              sillycounter++;
- -              sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
- -
- -              dfprintk(VFS, "NFS: trying to rename %s to %s\n",
- -                              dentry->d_name.name, silly);
- -              
- -              sdentry = lookup_one_len(silly, dentry->d_parent, slen);
- -              /*
- -               * N.B. Better to return EBUSY here ... it could be
- -               * dangerous to delete the file while it's in use.
- -               */
- -              if (IS_ERR(sdentry))
- -                      goto out;
- -      } while(sdentry->d_inode != NULL); /* need negative lookup */
- -
- -      qsilly.name = silly;
- -      qsilly.len  = strlen(silly);
- -      if (dentry->d_inode) {
- -              error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
- -                              dir, &qsilly);
- -              nfs_mark_for_revalidate(dentry->d_inode);
- -      } else
- -              error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
- -                              dir, &qsilly);
- -      if (!error) {
- -              nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
- -              d_move(dentry, sdentry);
- -              error = nfs_async_unlink(dir, dentry);
- -              /* If we return 0 we don't unlink */
- -      }
- -      dput(sdentry);
- -out:
- -      return error;
- -}
- -
   /*
    * Remove a file after making sure there are no pending writes,
    * and after checking that the file has only one user. 
@@@ -1801,7 -1580,7 +1801,7 @@@ nfs_link(struct dentry *old_dentry, str
         d_drop(dentry);
         error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
         if (error == 0) {
-               atomic_inc(&inode->i_count);
+               ihold(inode);
                 d_add(dentry, inode);
         }
         return error;
@@@ -1932,14 -1711,14 +1932,14 @@@ static void nfs_access_free_list(struc
   int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
   {
         LIST_HEAD(head);
- -      struct nfs_inode *nfsi;
+ +      struct nfs_inode *nfsi, *next;
         struct nfs_access_entry *cache;
   
         if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
                 return (nr_to_scan == 0) ? 0 : -1;
   
         spin_lock(&nfs_access_lru_lock);
- -      list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
+ +      list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
                 struct inode *inode;
   
                 if (nr_to_scan-- == 0)
diff --combined fs/proc/base.c

index 53dc8ad40ae682b7ad2781782538b05c45da53b1,9883f1e18332da35da06ba1f15d3828703b89712..9b094c1c846542160ff6c265171467d5ef0050ff
--- 1/fs/proc/base.c
--- 2/fs/proc/base.c
+++ b/fs/proc/base.c
@@@ -771,6 -771,8 +771,8 @@@ static const struct file_operations pro
   static int mem_open(struct inode* inode, struct file* file)
   {
         file->private_data = (void*)((long)current->self_exec_id);
+       /* OK to pass negative loff_t, we can catch out-of-range */
+       file->f_mode |= FMODE_UNSIGNED_OFFSET;
         return 0;
   }
   
@@@ -1023,47 -1025,28 +1025,47 @@@ static ssize_t oom_adjust_write(struct 
         memset(buffer, 0, sizeof(buffer));
         if (count > sizeof(buffer) - 1)
                 count = sizeof(buffer) - 1;
- -      if (copy_from_user(buffer, buf, count))
- -              return -EFAULT;
+ +      if (copy_from_user(buffer, buf, count)) {
+ +              err = -EFAULT;
+ +              goto out;
+ +      }
   
         err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
         if (err)
- -              return -EINVAL;
+ +              goto out;
         if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
- -           oom_adjust != OOM_DISABLE)
- -              return -EINVAL;
+ +           oom_adjust != OOM_DISABLE) {
+ +              err = -EINVAL;
+ +              goto out;
+ +      }
   
         task = get_proc_task(file->f_path.dentry->d_inode);
- -      if (!task)
- -              return -ESRCH;
+ +      if (!task) {
+ +              err = -ESRCH;
+ +              goto out;
+ +      }
+ +
+ +      task_lock(task);
+ +      if (!task->mm) {
+ +              err = -EINVAL;
+ +              goto err_task_lock;
+ +      }
+ +
         if (!lock_task_sighand(task, &flags)) {
- -              put_task_struct(task);
- -              return -ESRCH;
+ +              err = -ESRCH;
+ +              goto err_task_lock;
         }
   
         if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
- -              unlock_task_sighand(task, &flags);
- -              put_task_struct(task);
- -              return -EACCES;
+ +              err = -EACCES;
+ +              goto err_sighand;
+ +      }
+ +
+ +      if (oom_adjust != task->signal->oom_adj) {
+ +              if (oom_adjust == OOM_DISABLE)
+ +                      atomic_inc(&task->mm->oom_disable_count);
+ +              if (task->signal->oom_adj == OOM_DISABLE)
+ +                      atomic_dec(&task->mm->oom_disable_count);
         }
   
         /*
@@@ -1084,13 -1067,10 +1086,13 @@@
         else
                 task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
                                                                 -OOM_DISABLE;
+ +err_sighand:
         unlock_task_sighand(task, &flags);
+ +err_task_lock:
+ +      task_unlock(task);
         put_task_struct(task);
- -
- -      return count;
+ +out:
+ +      return err < 0 ? err : count;
   }
   
   static const struct file_operations proc_oom_adjust_operations = {
@@@ -1131,49 -1111,30 +1133,49 @@@ static ssize_t oom_score_adj_write(stru
         memset(buffer, 0, sizeof(buffer));
         if (count > sizeof(buffer) - 1)
                 count = sizeof(buffer) - 1;
- -      if (copy_from_user(buffer, buf, count))
- -              return -EFAULT;
+ +      if (copy_from_user(buffer, buf, count)) {
+ +              err = -EFAULT;
+ +              goto out;
+ +      }
   
         err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
         if (err)
- -              return -EINVAL;
+ +              goto out;
         if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
- -                      oom_score_adj > OOM_SCORE_ADJ_MAX)
- -              return -EINVAL;
+ +                      oom_score_adj > OOM_SCORE_ADJ_MAX) {
+ +              err = -EINVAL;
+ +              goto out;
+ +      }
   
         task = get_proc_task(file->f_path.dentry->d_inode);
- -      if (!task)
- -              return -ESRCH;
+ +      if (!task) {
+ +              err = -ESRCH;
+ +              goto out;
+ +      }
+ +
+ +      task_lock(task);
+ +      if (!task->mm) {
+ +              err = -EINVAL;
+ +              goto err_task_lock;
+ +      }
+ +
         if (!lock_task_sighand(task, &flags)) {
- -              put_task_struct(task);
- -              return -ESRCH;
+ +              err = -ESRCH;
+ +              goto err_task_lock;
         }
+ +
         if (oom_score_adj < task->signal->oom_score_adj &&
                         !capable(CAP_SYS_RESOURCE)) {
- -              unlock_task_sighand(task, &flags);
- -              put_task_struct(task);
- -              return -EACCES;
+ +              err = -EACCES;
+ +              goto err_sighand;
         }
   
+ +      if (oom_score_adj != task->signal->oom_score_adj) {
+ +              if (oom_score_adj == OOM_SCORE_ADJ_MIN)
+ +                      atomic_inc(&task->mm->oom_disable_count);
+ +              if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+ +                      atomic_dec(&task->mm->oom_disable_count);
+ +      }
         task->signal->oom_score_adj = oom_score_adj;
         /*
          * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
@@@ -1184,13 -1145,9 +1186,13 @@@
         else
                 task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
                                                         OOM_SCORE_ADJ_MAX;
+ +err_sighand:
         unlock_task_sighand(task, &flags);
+ +err_task_lock:
+ +      task_unlock(task);
         put_task_struct(task);
- -      return count;
+ +out:
+ +      return err < 0 ? err : count;
   }
   
   static const struct file_operations proc_oom_score_adj_operations = {
@@@ -1646,6 -1603,7 +1648,7 @@@ static struct inode *proc_pid_make_inod
   
         /* Common stuff */
         ei = PROC_I(inode);
+       inode->i_ino = get_next_ino();
         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
         inode->i_op = &proc_def_inode_operations;
   
@@@ -2592,6 -2550,7 +2595,7 @@@ static struct dentry *proc_base_instant
   
         /* Initialize the inode */
         ei = PROC_I(inode);
+       inode->i_ino = get_next_ino();
         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
   
         /*
diff --combined fs/reiserfs/inode.c

index c1f93896cb538082ce0b4158bbdfb565edb523cd,4dcb88046030f2c756c1e9c31496ce91784e389d..41656d40dc5c87fc8bcfd9ec5aea4bc635ebc092
--- 1/fs/reiserfs/inode.c
--- 2/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@@ -22,8 -22,6 +22,6 @@@
   
   int reiserfs_commit_write(struct file *f, struct page *page,
                           unsigned from, unsigned to);
- int reiserfs_prepare_write(struct file *f, struct page *page,
-                          unsigned from, unsigned to);
   
   void reiserfs_evict_inode(struct inode *inode)
   {
@@@ -165,7 -163,7 +163,7 @@@ inline void make_le_item_head(struct it
   ** but tail is still sitting in a direct item, and we can't write to
   ** it.  So, look through this page, and check all the mapped buffers
   ** to make sure they have valid block numbers.  Any that don't need
- ** to be unmapped, so that block_prepare_write will correctly call
+ ** to be unmapped, so that __block_write_begin will correctly call
   ** reiserfs_get_block to convert the tail into an unformatted node
   */
   static inline void fix_tail_page_for_writing(struct page *page)
@@@ -439,13 -437,13 +437,13 @@@ static int reiserfs_bmap(struct inode *
   }
   
   /* special version of get_block that is only used by grab_tail_page right
- ** now.  It is sent to block_prepare_write, and when you try to get a
+ ** now.  It is sent to __block_write_begin, and when you try to get a
   ** block past the end of the file (or a block from a hole) it returns
- ** -ENOENT instead of a valid buffer.  block_prepare_write expects to
+ ** -ENOENT instead of a valid buffer.  __block_write_begin expects to
   ** be able to do i/o on the buffers returned, unless an error value
   ** is also returned.
   **
- ** So, this allows block_prepare_write to be used for reading a single block
+ ** So, this allows __block_write_begin to be used for reading a single block
   ** in a page.  Where it does not produce a valid page for holes, or past the
   ** end of the file.  This turns out to be exactly what we need for reading
   ** tails for conversion.
@@@ -558,11 -556,12 +556,12 @@@ static int convert_tail_for_hole(struc
          **
          ** We must fix the tail page for writing because it might have buffers
          ** that are mapped, but have a block number of 0.  This indicates tail
-        ** data that has been read directly into the page, and block_prepare_write
-        ** won't trigger a get_block in this case.
+        ** data that has been read directly into the page, and
+        ** __block_write_begin won't trigger a get_block in this case.
          */
         fix_tail_page_for_writing(tail_page);
-       retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
+       retval = __reiserfs_write_begin(tail_page, tail_start,
+                                     tail_end - tail_start);
         if (retval)
                 goto unlock;
   
@@@ -2033,7 -2032,7 +2032,7 @@@ static int grab_tail_page(struct inode 
         /* start within the page of the last block in the file */
         start = (offset / blocksize) * blocksize;
   
-       error = block_prepare_write(page, start, offset,
+       error = __block_write_begin(page, start, offset - start,
                                     reiserfs_get_block_create_0);
         if (error)
                 goto unlock;
@@@ -2438,7 -2437,7 +2437,7 @@@ static int reiserfs_write_full_page(str
                 /* from this point on, we know the buffer is mapped to a
                  * real block and not a direct item
                  */
- -              if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+ +              if (wbc->sync_mode != WB_SYNC_NONE) {
                         lock_buffer(bh);
                 } else {
                         if (!trylock_buffer(bh)) {
@@@ -2628,8 -2627,7 +2627,7 @@@ static int reiserfs_write_begin(struct 
         return ret;
   }
   
- int reiserfs_prepare_write(struct file *f, struct page *page,
-                          unsigned from, unsigned to)
+ int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
   {
         struct inode *inode = page->mapping->host;
         int ret;
@@@ -2650,7 -2648,7 +2648,7 @@@
                 th->t_refcount++;
         }
   
-       ret = block_prepare_write(page, from, to, reiserfs_get_block);
+       ret = __block_write_begin(page, from, len, reiserfs_get_block);
         if (ret && reiserfs_transaction_running(inode->i_sb)) {
                 struct reiserfs_transaction_handle *th = current->journal_info;
                 /* this gets a little ugly.  If reiserfs_get_block returned an
diff --combined include/linux/fs.h

index 4658777b41cc24d7a6cca8465cdbf72399e1fed5,f300a650881814da8f60402d9d933624445c90c5..240eb1d4f87645672217429ed0699cf21ce84d72
--- 1/include/linux/fs.h
--- 2/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -92,6 -92,9 +92,9 @@@ struct inodes_stat_t 
   /* Expect random access pattern */
   #define FMODE_RANDOM          ((__force fmode_t)0x1000)
   
+ /* File is huge (eg. /dev/kmem): treat loff_t as unsigned */
+ #define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000)
+ 
   /* File was opened by fanotify and shouldn't generate fanotify events */
   #define FMODE_NONOTIFY                ((__force fmode_t)0x1000000)
   
@@@ -231,7 -234,6 +234,7 @@@
   #define S_NOCMTIME    128     /* Do not update file c/mtime */
   #define S_SWAPFILE    256     /* Do not truncate: swapon got its bmaps */
   #define S_PRIVATE     512     /* Inode is fs-internal */
+ +#define S_IMA         1024    /* Inode has an associated IMA struct */
   
   /*
    * Note that nosuid etc flags are inode-specific: setting some file-system
@@@ -266,7 -268,6 +269,7 @@@
   #define IS_NOCMTIME(inode)    ((inode)->i_flags & S_NOCMTIME)
   #define IS_SWAPFILE(inode)    ((inode)->i_flags & S_SWAPFILE)
   #define IS_PRIVATE(inode)     ((inode)->i_flags & S_PRIVATE)
+ +#define IS_IMA(inode)         ((inode)->i_flags & S_IMA)
   
   /* the read-only stuff doesn't really belong here, but any other place is
      probably as bad and I don't want to create yet another include file. */
@@@ -722,7 -723,8 +725,8 @@@ struct posix_acl
   
   struct inode {
         struct hlist_node       i_hash;
-       struct list_head        i_list;         /* backing dev IO list */
+       struct list_head        i_wb_list;      /* backing dev IO list */
+       struct list_head        i_lru;          /* inode LRU list */
         struct list_head        i_sb_list;
         struct list_head        i_dentry;
         unsigned long           i_ino;
@@@ -774,10 -776,6 +778,10 @@@
   
         unsigned int            i_flags;
   
+ +#ifdef CONFIG_IMA
+ +      /* protected by i_lock */
+ +      unsigned int            i_readcount; /* struct files open RO */
+ +#endif
         atomic_t                i_writecount;
   #ifdef CONFIG_SECURITY
         void                    *i_security;
@@@ -789,6 -787,11 +793,11 @@@
         void                    *i_private; /* fs or device private pointer */
   };
   
+ static inline int inode_unhashed(struct inode *inode)
+ {
+       return hlist_unhashed(&inode->i_hash);
+ }
+ 
   /*
    * inode->i_mutex nesting subclasses for the lock validator:
    *
@@@ -1639,16 -1642,17 +1648,17 @@@ struct super_operations 
    *
    * Q: What is the difference between I_WILL_FREE and I_FREEING?
    */
- #define I_DIRTY_SYNC          1
- #define I_DIRTY_DATASYNC      2
- #define I_DIRTY_PAGES         4
+ #define I_DIRTY_SYNC          (1 << 0)
+ #define I_DIRTY_DATASYNC      (1 << 1)
+ #define I_DIRTY_PAGES         (1 << 2)
   #define __I_NEW                       3
   #define I_NEW                 (1 << __I_NEW)
- #define I_WILL_FREE           16
- #define I_FREEING             32
- #define I_CLEAR                       64
+ #define I_WILL_FREE           (1 << 4)
+ #define I_FREEING             (1 << 5)
+ #define I_CLEAR                       (1 << 6)
   #define __I_SYNC              7
   #define I_SYNC                        (1 << __I_SYNC)
+ #define I_REFERENCED          (1 << 8)
   
   #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
   
@@@ -1740,6 -1744,7 +1750,7 @@@ static inline void file_accessed(struc
   }
   
   int sync_inode(struct inode *inode, struct writeback_control *wbc);
+ int sync_inode_metadata(struct inode *inode, int wait);
   
   struct file_system_type {
         const char *name;
@@@ -2084,7 -2089,6 +2095,6 @@@ extern int check_disk_change(struct blo
   extern int __invalidate_device(struct block_device *);
   extern int invalidate_partition(struct gendisk *, int);
   #endif
- extern int invalidate_inodes(struct super_block *);
   unsigned long invalidate_mapping_pages(struct address_space *mapping,
                                         pgoff_t start, pgoff_t end);
   
@@@ -2168,7 -2172,7 +2178,7 @@@ extern loff_t vfs_llseek(struct file *f
   
   extern int inode_init_always(struct super_block *, struct inode *);
   extern void inode_init_once(struct inode *);
- extern void inode_add_to_lists(struct super_block *, struct inode *);
+ extern void ihold(struct inode * inode);
   extern void iput(struct inode *);
   extern struct inode * igrab(struct inode *);
   extern ino_t iunique(struct super_block *, ino_t);
@@@ -2188,11 -2192,11 +2198,11 @@@ extern struct inode * iget_locked(struc
   extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
   extern int insert_inode_locked(struct inode *);
   extern void unlock_new_inode(struct inode *);
+ extern unsigned int get_next_ino(void);
   
   extern void __iget(struct inode * inode);
   extern void iget_failed(struct inode *);
   extern void end_writeback(struct inode *);
- extern void destroy_inode(struct inode *);
   extern void __destroy_inode(struct inode *);
   extern struct inode *new_inode(struct super_block *);
   extern int should_remove_suid(struct dentry *);
@@@ -2200,9 -2204,11 +2210,11 @@@ extern int file_remove_suid(struct fil
   
   extern void __insert_inode_hash(struct inode *, unsigned long hashval);
   extern void remove_inode_hash(struct inode *);
- static inline void insert_inode_hash(struct inode *inode) {
+ static inline void insert_inode_hash(struct inode *inode)
+ {
         __insert_inode_hash(inode, inode->i_ino);
   }
+ extern void inode_sb_list_add(struct inode *inode);
   
   #ifdef CONFIG_BLOCK
   extern void submit_bio(int, struct bio *);
@@@ -2485,7 -2491,10 +2497,10 @@@ ssize_t simple_attr_write(struct file *
   struct ctl_table;
   int proc_nr_files(struct ctl_table *table, int write,
                   void __user *buffer, size_t *lenp, loff_t *ppos);
- 
+ int proc_nr_dentry(struct ctl_table *table, int write,
+                 void __user *buffer, size_t *lenp, loff_t *ppos);
+ int proc_nr_inodes(struct ctl_table *table, int write,
+                  void __user *buffer, size_t *lenp, loff_t *ppos);
   int __init get_filesystem_list(char *buf);
   
   #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
diff --combined include/linux/writeback.h

index c7299d2ace6b373d802a01aee3216acfa8a64f41,242b6f812ba6103ab3e2d6b4f7dd309172335b17..d5c7aaadda59a926032794a4d3a27a46f01bab3c
--- 1/include/linux/writeback.h
--- 2/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@@ -10,8 -10,6 +10,6 @@@
   struct backing_dev_info;
   
   extern spinlock_t inode_lock;
- extern struct list_head inode_in_use;
- extern struct list_head inode_unused;
   
   /*
    * fs/fs-writeback.c
@@@ -149,8 -147,6 +147,8 @@@ int write_cache_pages(struct address_sp
   int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
   void set_page_dirty_balance(struct page *page, int page_mkwrite);
   void writeback_set_ratelimit(void);
+ +void tag_pages_for_writeback(struct address_space *mapping,
+ +                           pgoff_t start, pgoff_t end);
   
   /* pdflush.c */
   extern int nr_pdflush_threads;        /* Global so it can be exported to sysctl
diff --combined kernel/sysctl.c

index 48d9d689498fbaa394e4d6ec8f4403c90767d7f1,8b77ff5c502c60cd0b73dd6849ece649420daeed..c33a1edb799fda6db2e16fdf9d1d0401a9f78278
--- 1/kernel/sysctl.c
--- 2/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@@ -161,6 -161,8 +161,6 @@@ extern int no_unaligned_warning
   extern int unaligned_dump_stack;
   #endif
   
- -extern struct ratelimit_state printk_ratelimit_state;
- -
   #ifdef CONFIG_PROC_SYSCTL
   static int proc_do_cad_pid(struct ctl_table *table, int write,
                   void __user *buffer, size_t *lenp, loff_t *ppos);
@@@ -1338,14 -1340,14 +1338,14 @@@ static struct ctl_table fs_table[] = 
                 .data           = &inodes_stat,
                 .maxlen         = 2*sizeof(int),
                 .mode           = 0444,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = proc_nr_inodes,
         },
         {
                 .procname       = "inode-state",
                 .data           = &inodes_stat,
                 .maxlen         = 7*sizeof(int),
                 .mode           = 0444,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = proc_nr_inodes,
         },
         {
                 .procname       = "file-nr",
@@@ -1375,7 -1377,7 +1375,7 @@@
                 .data           = &dentry_stat,
                 .maxlen         = 6*sizeof(int),
                 .mode           = 0444,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = proc_nr_dentry,
         },
         {
                 .procname       = "overflowuid",
diff --combined mm/backing-dev.c

index f2eb27884ffa88c99d4662fa2cdb3035f216289c,15d5097de821bbaa3c12ba12e59d32392d6e6a8f..027100d30227fead0a4010d1feb0e2791a98fcaa
--- 1/mm/backing-dev.c
--- 2/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@@ -74,11 -74,11 +74,11 @@@ static int bdi_debug_stats_show(struct 
   
         nr_wb = nr_dirty = nr_io = nr_more_io = 0;
         spin_lock(&inode_lock);
-       list_for_each_entry(inode, &wb->b_dirty, i_list)
+       list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
                 nr_dirty++;
-       list_for_each_entry(inode, &wb->b_io, i_list)
+       list_for_each_entry(inode, &wb->b_io, i_wb_list)
                 nr_io++;
-       list_for_each_entry(inode, &wb->b_more_io, i_list)
+       list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
                 nr_more_io++;
         spin_unlock(&inode_lock);
   
@@@ -362,7 -362,7 +362,7 @@@ static int bdi_forker_thread(void *ptr
   {
         struct bdi_writeback *me = ptr;
   
- -      current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ +      current->flags |= PF_SWAPWRITE;
         set_freezable();
   
         /*
@@@ -729,7 -729,6 +729,7 @@@ static wait_queue_head_t congestion_wqh
                 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
                 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
         };
+ +static atomic_t nr_bdi_congested[2];
   
   void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
   {
@@@ -737,8 -736,7 +737,8 @@@
         wait_queue_head_t *wqh = &congestion_wqh[sync];
   
         bit = sync ? BDI_sync_congested : BDI_async_congested;
- -      clear_bit(bit, &bdi->state);
+ +      if (test_and_clear_bit(bit, &bdi->state))
+ +              atomic_dec(&nr_bdi_congested[sync]);
         smp_mb__after_clear_bit();
         if (waitqueue_active(wqh))
                 wake_up(wqh);
@@@ -750,8 -748,7 +750,8 @@@ void set_bdi_congested(struct backing_d
         enum bdi_state bit;
   
         bit = sync ? BDI_sync_congested : BDI_async_congested;
- -      set_bit(bit, &bdi->state);
+ +      if (!test_and_set_bit(bit, &bdi->state))
+ +              atomic_inc(&nr_bdi_congested[sync]);
   }
   EXPORT_SYMBOL(set_bdi_congested);
   
@@@ -767,72 -764,13 +767,72 @@@
   long congestion_wait(int sync, long timeout)
   {
         long ret;
+ +      unsigned long start = jiffies;
         DEFINE_WAIT(wait);
         wait_queue_head_t *wqh = &congestion_wqh[sync];
   
         prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
         ret = io_schedule_timeout(timeout);
         finish_wait(wqh, &wait);
+ +
+ +      trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
+ +                                      jiffies_to_usecs(jiffies - start));
+ +
         return ret;
   }
   EXPORT_SYMBOL(congestion_wait);
   
+ +/**
+ + * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ + * @zone: A zone to check if it is heavily congested
+ + * @sync: SYNC or ASYNC IO
+ + * @timeout: timeout in jiffies
+ + *
+ + * In the event of a congested backing_dev (any backing_dev) and the given
+ + * @zone has experienced recent congestion, this waits for up to @timeout
+ + * jiffies for either a BDI to exit congestion of the given @sync queue
+ + * or a write to complete.
+ + *
+ + * In the absence of zone congestion, cond_resched() is called to yield
+ + * the processor if necessary but otherwise does not sleep.
+ + *
+ + * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ + * it is the number of jiffies that were still remaining when the function
+ + * returned. return_value == timeout implies the function did not sleep.
+ + */
+ +long wait_iff_congested(struct zone *zone, int sync, long timeout)
+ +{
+ +      long ret;
+ +      unsigned long start = jiffies;
+ +      DEFINE_WAIT(wait);
+ +      wait_queue_head_t *wqh = &congestion_wqh[sync];
+ +
+ +      /*
+ +       * If there is no congestion, or heavy congestion is not being
+ +       * encountered in the current zone, yield if necessary instead
+ +       * of sleeping on the congestion queue
+ +       */
+ +      if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+ +                      !zone_is_reclaim_congested(zone)) {
+ +              cond_resched();
+ +
+ +              /* In case we scheduled, work out time remaining */
+ +              ret = timeout - (jiffies - start);
+ +              if (ret < 0)
+ +                      ret = 0;
+ +
+ +              goto out;
+ +      }
+ +
+ +      /* Sleep until uncongested or a write happens */
+ +      prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ +      ret = io_schedule_timeout(timeout);
+ +      finish_wait(wqh, &wait);
+ +
+ +out:
+ +      trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+ +                                      jiffies_to_usecs(jiffies - start));
+ +
+ +      return ret;
+ +}
+ +EXPORT_SYMBOL(wait_iff_congested);
diff --combined net/socket.c

index 7f67c072d4969670a0355c7aaad033dbdc651a5f,5cac1c707755e4c2d9768f510504b2351745b5ef..ee3cd280c76e9cfb58024fa22f3017352f0467bc
--- 1/net/socket.c
--- 2/net/socket.c
+++ b/net/socket.c
@@@ -377,7 -377,7 +377,7 @@@ static int sock_alloc_file(struct socke
                   &socket_file_ops);
         if (unlikely(!file)) {
                 /* drop dentry, keep inode */
-               atomic_inc(&path.dentry->d_inode->i_count);
+               ihold(path.dentry->d_inode);
                 path_put(&path);
                 put_unused_fd(fd);
                 return -ENFILE;
@@@ -480,6 -480,7 +480,7 @@@ static struct socket *sock_alloc(void
         sock = SOCKET_I(inode);
   
         kmemcheck_annotate_bitfield(sock, type);
+       inode->i_ino = get_next_ino();
         inode->i_mode = S_IFSOCK | S_IRWXUGO;
         inode->i_uid = current_fsuid();
         inode->i_gid = current_fsgid();
@@@ -1145,7 -1146,7 +1146,7 @@@ call_kill
   }
   EXPORT_SYMBOL(sock_wake_async);
   
- -static int __sock_create(struct net *net, int family, int type, int protocol,
+ +int __sock_create(struct net *net, int family, int type, int protocol,
                          struct socket **res, int kern)
   {
         int err;
@@@ -1257,7 -1258,6 +1258,7 @@@ out_release
         rcu_read_unlock();
         goto out_sock_release;
   }
+ +EXPORT_SYMBOL(__sock_create);
   
   int sock_create(int family, int type, int protocol, struct socket **res)
   {
author	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 27 Oct 2010 00:58:44 +0000 (17:58 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 27 Oct 2010 00:58:44 +0000 (17:58 -0700)
		1	2
fs/buffer.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/fs-writeback.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/hugetlbfs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/nfs/dir.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/proc/base.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/reiserfs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/fs.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/writeback.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sysctl.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/backing-dev.c	patch \|	diff1 \|	diff2 \|	blob \| history
net/socket.c	patch \|	diff1 \|	diff2 \|	blob \| history