Merge branch 'akpm-current/current'

diff --git a/mm/filemap.c b/mm/filemap.c
index bc943867d68c68dab4109fe715c8d37d6c72757a..6414803972f7a1cc04a9eba7d9b8b995379fc39d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
  *    ->tree_lock              (page_remove_rmap->set_page_dirty)
  *    bdi.wb->list_lock                (page_remove_rmap->set_page_dirty)
  *    ->inode->i_lock          (page_remove_rmap->set_page_dirty)
- *    ->memcg->move_lock       (page_remove_rmap->mem_cgroup_begin_page_stat)
+ *    ->memcg->move_lock       (page_remove_rmap->lock_page_memcg)
  *    bdi.wb->list_lock                (zap_pte_range->set_page_dirty)
  *    ->inode->i_lock          (zap_pte_range->set_page_dirty)
  *    ->private_lock           (zap_pte_range->__set_page_dirty_buffers)
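
The rename in the lock-ordering comment above reflects the memcg stat-locking
change made throughout this series: mem_cgroup_begin_page_stat()/
mem_cgroup_end_page_stat() become lock_page_memcg()/unlock_page_memcg(), and
the page alone identifies its memcg. A minimal before/after sketch, assuming
the 4.6-era memcg statistics helpers:

    /* Before: the caller carried the memcg returned by the stat lock. */
    struct mem_cgroup *memcg = mem_cgroup_begin_page_stat(page);
    if (TestClearPageDirty(page))
            mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
    mem_cgroup_end_page_stat(memcg);

    /* After: lock and stat update both key off the page itself. */
    lock_page_memcg(page);
    if (TestClearPageDirty(page))
            mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
    unlock_page_memcg(page);

This is why __delete_from_page_cache() below no longer needs a memcg argument
and delete_from_page_cache() no longer brackets tree_lock with the begin/end
pair.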
@@ -176,11 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
 /*
  * Delete a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold the mapping's tree_lock and
- * mem_cgroup_begin_page_stat().
+ * is safe.  The caller must hold the mapping's tree_lock.
  */
-void __delete_from_page_cache(struct page *page, void *shadow,
-                             struct mem_cgroup *memcg)
+void __delete_from_page_cache(struct page *page, void *shadow)
 {
        struct address_space *mapping = page->mapping;
 
@@ -216,8 +214,7 @@ void __delete_from_page_cache(struct page *page, void *shadow,
         * anyway will be cleared before returning page into buddy allocator.
         */
        if (WARN_ON_ONCE(PageDirty(page)))
-               account_page_cleaned(page, mapping, memcg,
-                                    inode_to_wb(mapping->host));
+               account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
 }
 
 /**
@@ -231,7 +228,6 @@ void __delete_from_page_cache(struct page *page, void *shadow,
 void delete_from_page_cache(struct page *page)
 {
        struct address_space *mapping = page->mapping;
-       struct mem_cgroup *memcg;
        unsigned long flags;
 
        void (*freepage)(struct page *);
@@ -240,11 +236,9 @@ void delete_from_page_cache(struct page *page)
 
        freepage = mapping->a_ops->freepage;
 
-       memcg = mem_cgroup_begin_page_stat(page);
        spin_lock_irqsave(&mapping->tree_lock, flags);
-       __delete_from_page_cache(page, NULL, memcg);
+       __delete_from_page_cache(page, NULL);
        spin_unlock_irqrestore(&mapping->tree_lock, flags);
-       mem_cgroup_end_page_stat(memcg);
 
        if (freepage)
                freepage(page);
@@ -532,7 +526,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
        if (!error) {
                struct address_space *mapping = old->mapping;
                void (*freepage)(struct page *);
-               struct mem_cgroup *memcg;
                unsigned long flags;
 
                pgoff_t offset = old->index;
@@ -542,9 +535,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
                new->mapping = mapping;
                new->index = offset;
 
-               memcg = mem_cgroup_begin_page_stat(old);
                spin_lock_irqsave(&mapping->tree_lock, flags);
-               __delete_from_page_cache(old, NULL, memcg);
+               __delete_from_page_cache(old, NULL);
                error = radix_tree_insert(&mapping->page_tree, offset, new);
                BUG_ON(error);
                mapping->nrpages++;
@@ -557,8 +549,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
                if (PageSwapBacked(new))
                        __inc_zone_page_state(new, NR_SHMEM);
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
-               mem_cgroup_end_page_stat(memcg);
-               mem_cgroup_replace_page(old, new);
+               mem_cgroup_migrate(old, new);
                radix_tree_preload_end();
                if (freepage)
                        freepage(old);
@@ -576,7 +567,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
        void **slot;
        int error;
 
-       error = __radix_tree_create(&mapping->page_tree, page->index,
+       error = __radix_tree_create(&mapping->page_tree, page->index, 0,
                                    &node, &slot);
        if (error)
                return error;
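
The new third argument to __radix_tree_create() is the entry order from the
multi-order radix tree work: order 0 preserves the old one-slot-per-index
behaviour, while order N lets a single entry span 2^N consecutive indices.
A minimal sketch of the updated call, assuming the 4.6-era signature:

    struct radix_tree_node *node;
    void **slot;
    int err;

    /* order == 0: a plain single-page entry, exactly as before */
    err = __radix_tree_create(&mapping->page_tree, index, 0, &node, &slot);
    if (!err)
            radix_tree_replace_slot(slot, page);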
@@ -1245,7 +1236,6 @@ unsigned find_get_entries(struct address_space *mapping,
                return 0;
 
        rcu_read_lock();
-restart:
        radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
                struct page *page;
 repeat:
@@ -1253,8 +1243,10 @@ repeat:
                if (unlikely(!page))
                        continue;
                if (radix_tree_exception(page)) {
-                       if (radix_tree_deref_retry(page))
-                               goto restart;
+                       if (radix_tree_deref_retry(page)) {
+                               slot = radix_tree_iter_retry(&iter);
+                               continue;
+                       }
                        /*
                         * A shadow entry of a recently evicted page, a swap
                         * entry from shmem/tmpfs or a DAX entry.  Return it
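
The change above (repeated in the lookup functions that follow) swaps the old
"goto restart" for radix_tree_iter_retry(). Restarting rewound the walk all
the way back to the starting index, which was only safe before any page had
been gathered; radix_tree_iter_retry() instead resets the iterator so the
next pass re-looks-up the current index, and returns NULL so the "continue"
re-enters the loop without discarding results already collected. The idiom,
as a condensed sketch (exceptional-entry handling elided):

    rcu_read_lock();
    radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
            struct page *page = radix_tree_deref_slot(slot);

            if (unlikely(!page))
                    continue;
            if (radix_tree_deref_retry(page)) {
                    /* re-look-up iter.index only; pages so far stand */
                    slot = radix_tree_iter_retry(&iter);
                    continue;
            }
            /* ... take a reference and use page ... */
    }
    rcu_read_unlock();

This is also why the "WARN_ON(iter.index)" comments are deleted below: the
restart-from-scratch limitation they documented no longer exists.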
@@ -1307,7 +1299,6 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
                return 0;
 
        rcu_read_lock();
-restart:
        radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
                struct page *page;
 repeat:
@@ -1317,13 +1308,8 @@ repeat:
 
                if (radix_tree_exception(page)) {
                        if (radix_tree_deref_retry(page)) {
-                               /*
-                                * Transient condition which can only trigger
-                                * when entry at index 0 moves out of or back
-                                * to root: none yet gotten, safe to restart.
-                                */
-                               WARN_ON(iter.index);
-                               goto restart;
+                               slot = radix_tree_iter_retry(&iter);
+                               continue;
                        }
                        /*
                         * A shadow entry of a recently evicted page,
@@ -1374,7 +1360,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
                return 0;
 
        rcu_read_lock();
-restart:
        radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
                struct page *page;
 repeat:
@@ -1385,12 +1370,8 @@ repeat:
 
                if (radix_tree_exception(page)) {
                        if (radix_tree_deref_retry(page)) {
-                               /*
-                                * Transient condition which can only trigger
-                                * when entry at index 0 moves out of or back
-                                * to root: none yet gotten, safe to restart.
-                                */
-                               goto restart;
+                               slot = radix_tree_iter_retry(&iter);
+                               continue;
                        }
                        /*
                         * A shadow entry of a recently evicted page,
@@ -1450,7 +1431,6 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
                return 0;
 
        rcu_read_lock();
-restart:
        radix_tree_for_each_tagged(slot, &mapping->page_tree,
                                   &iter, *index, tag) {
                struct page *page;
@@ -1461,12 +1441,8 @@ repeat:
 
                if (radix_tree_exception(page)) {
                        if (radix_tree_deref_retry(page)) {
-                               /*
-                                * Transient condition which can only trigger
-                                * when entry at index 0 moves out of or back
-                                * to root: none yet gotten, safe to restart.
-                                */
-                               goto restart;
+                               slot = radix_tree_iter_retry(&iter);
+                               continue;
                        }
                        /*
                         * A shadow entry of a recently evicted page.
@@ -1529,7 +1505,6 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
                return 0;
 
        rcu_read_lock();
-restart:
        radix_tree_for_each_tagged(slot, &mapping->page_tree,
                                   &iter, start, tag) {
                struct page *page;
@@ -1539,12 +1514,8 @@ repeat:
                        continue;
                if (radix_tree_exception(page)) {
                        if (radix_tree_deref_retry(page)) {
-                               /*
-                                * Transient condition which can only trigger
-                                * when entry at index 0 moves out of or back
-                                * to root: none yet gotten, safe to restart.
-                                */
-                               goto restart;
+                               slot = radix_tree_iter_retry(&iter);
+                               continue;
                        }
 
                        /*
@@ -1649,6 +1620,15 @@ find_page:
                                        index, last_index - index);
                }
                if (!PageUptodate(page)) {
+                       /*
+                        * See comment in do_read_cache_page on why
+                        * wait_on_page_locked is used to avoid unnecessary
+                        * serialisation and why it's safe.
+                        */
+                       wait_on_page_locked_killable(page);
+                       if (PageUptodate(page))
+                               goto page_ok;
+
                        if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
                                        !mapping->a_ops->is_partially_uptodate)
                                goto page_not_up_to_date;
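
Note that wait_on_page_locked_killable() returns an error if the task is
fatally signalled; the hunk above ignores it and falls through to the
existing slow path, where lock_page_killable() catches the signal slightly
later. A hedged variant that propagates it immediately (an assumption, not
what this patch does; readpage_error is the function's existing error label):

    error = wait_on_page_locked_killable(page);
    if (unlikely(error))
            goto readpage_error;
    if (PageUptodate(page))
            goto page_ok;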
@@ -1890,6 +1870,7 @@ EXPORT_SYMBOL(generic_file_read_iter);
  * page_cache_read - adds requested page to the page cache if not already there
  * @file:      file to read
  * @offset:    page index
+ * @gfp_mask:  memory allocation flags
  *
  * This adds the requested page to the page cache if it isn't already there,
  * and schedules an I/O to read in its contents from disk.
@@ -2151,10 +2132,11 @@ repeat:
                if (unlikely(!page))
                        goto next;
                if (radix_tree_exception(page)) {
-                       if (radix_tree_deref_retry(page))
-                               break;
-                       else
-                               goto next;
+                       if (radix_tree_deref_retry(page)) {
+                               slot = radix_tree_iter_retry(&iter);
+                               continue;
+                       }
+                       goto next;
                }
 
                if (!page_cache_get_speculative(page))
@@ -2283,7 +2265,7 @@ static struct page *wait_on_page_read(struct page *page)
        return page;
 }
 
-static struct page *__read_cache_page(struct address_space *mapping,
+static struct page *do_read_cache_page(struct address_space *mapping,
                                pgoff_t index,
                                int (*filler)(void *, struct page *),
                                void *data,
@@ -2305,53 +2287,74 @@ repeat:
                        /* Presumably ENOMEM for radix tree node */
                        return ERR_PTR(err);
                }
+
+filler:
                err = filler(data, page);
                if (err < 0) {
                        page_cache_release(page);
-                       page = ERR_PTR(err);
-               } else {
-                       page = wait_on_page_read(page);
+                       return ERR_PTR(err);
                }
-       }
-       return page;
-}
-
-static struct page *do_read_cache_page(struct address_space *mapping,
-                               pgoff_t index,
-                               int (*filler)(void *, struct page *),
-                               void *data,
-                               gfp_t gfp)
 
-{
-       struct page *page;
-       int err;
+               page = wait_on_page_read(page);
+               if (IS_ERR(page))
+                       return page;
+               goto out;
+       }
+       if (PageUptodate(page))
+               goto out;
 
-retry:
-       page = __read_cache_page(mapping, index, filler, data, gfp);
-       if (IS_ERR(page))
-               return page;
+       /*
+        * Page is not up to date and may be locked due to one of the
+        * following cases:
+        * case a: Page is being filled and the page lock is held
+        * case b: Read/write error clearing the page uptodate status
+        * case c: Truncation in progress (page locked)
+        * case d: Reclaim in progress
+        *
+        * Case a, the page will be up to date when the page is unlocked.
+        *    There is no need to serialise on the page lock here as the page
+        *    is pinned so the lock gives no additional protection. Even if
+        *    the page is truncated, the data is still valid if PageUptodate
+        *    as it's a read vs truncate race.
+        * Case b, the page will not be up to date
+        * Case c, the page may be truncated but in itself, the data may still
+        *    be valid after IO completes as it's a read vs truncate race. The
+        *    operation must restart if the page is not uptodate on unlock but
+        *    otherwise serialising on the page lock to stabilise the mapping
+        *    gives no additional guarantees to the caller as the page lock is
+        *    released before return.
+        * Case d, similar to truncation. If reclaim holds the page lock, it
+        *    will be a race with remove_mapping that determines if the mapping
+        *    is valid on unlock but otherwise the data is valid and there is
+        *    no need to serialise on the page lock.
+        *
+        * As the page lock gives no additional guarantee, we optimistically
+        * wait for the page to be unlocked, check if it's up to date and, if
+        * so, use the page. Otherwise, the page lock is required to
+        * distinguish between the different cases. The motivation is that we
+        * avoid spurious serialisations and wakeups when multiple processes
+        * wait on the same page for IO to complete.
+        */
+       wait_on_page_locked(page);
        if (PageUptodate(page))
                goto out;
 
+       /* Distinguish between all the cases under the safety of the lock */
        lock_page(page);
+
+       /* Case c or d, restart the operation */
        if (!page->mapping) {
                unlock_page(page);
                page_cache_release(page);
-               goto retry;
+               goto repeat;
        }
+
+       /* Someone else locked and filled the page in a very small window */
        if (PageUptodate(page)) {
                unlock_page(page);
                goto out;
        }
-       err = filler(data, page);
-       if (err < 0) {
-               page_cache_release(page);
-               return ERR_PTR(err);
-       } else {
-               page = wait_on_page_read(page);
-               if (IS_ERR(page))
-                       return page;
-       }
+       goto filler;
+
 out:
        mark_page_accessed(page);
        return page;
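
For context, this is the path filesystems reach through read_cache_page();
many tasks may request the same not-yet-uptodate index concurrently, which is
what makes the lock-free wait above worthwhile. A hedged caller-side sketch
using the existing API:

    /*
     * Hypothetical caller: concurrent readers of the same index all funnel
     * into do_read_cache_page(); the returned page is uptodate and holds a
     * reference, but the page lock is not held.
     */
    struct page *page;

    page = read_cache_page(mapping, index, filler, data);
    if (IS_ERR(page))
            return PTR_ERR(page);
    /* ... read from the page ... */
    page_cache_release(page);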
@@ -2424,7 +2427,7 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
 
        if (limit != RLIM_INFINITY) {
                if (iocb->ki_pos >= limit) {
-                       send_sig(SIGXFSZ, current, 0);
+                       io_send_sig(SIGXFSZ);
                        return -EFBIG;
                }
                iov_iter_truncate(from, limit - (unsigned long)pos);
@@ -2435,8 +2438,10 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
         */
        if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
                                !(file->f_flags & O_LARGEFILE))) {
-               if (pos >= MAX_NON_LFS)
+               if (pos >= MAX_NON_LFS) {
+                       io_send_sig(SIGXFSZ);
                        return -EFBIG;
+               }
                iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
        }
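
Taken together, the two hunks above make the EFBIG cases consistent: the
first merely switches the existing rlimit signal to the io_send_sig() helper,
while the second makes the non-LFS check send SIGXFSZ as well, so any write
that starts at or beyond its limit now signals the writer and fails with
-EFBIG. A write that merely crosses the limit is still silently shortened by
iov_iter_truncate(). The userspace-visible contract can be exercised with a
small demo (an illustration only, not part of the patch; the file path is
arbitrary):

    #include <errno.h>
    #include <fcntl.h>
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/resource.h>
    #include <unistd.h>

    static void on_xfsz(int sig) { (void)sig; } /* default action is fatal */

    int main(void)
    {
            struct rlimit rl = { .rlim_cur = 4096, .rlim_max = 4096 };
            char buf[8192];
            ssize_t n;
            int fd;

            memset(buf, 'x', sizeof(buf));
            signal(SIGXFSZ, on_xfsz);
            setrlimit(RLIMIT_FSIZE, &rl);

            fd = open("/tmp/xfsz-demo", O_CREAT | O_TRUNC | O_WRONLY, 0600);

            /* Crosses the limit: shortened to 4096 bytes, no error. */
            n = write(fd, buf, sizeof(buf));
            printf("first write:  %zd\n", n);

            /* Starts at the limit: SIGXFSZ delivered, fails with EFBIG. */
            n = write(fd, buf, 1);
            printf("second write: %zd (%s)\n", n, strerror(errno));

            close(fd);
            return 0;
    }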