Merge tag 'fbdev-v4.11' of git://github.com/bzolnier/linux
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1c6e0321205dd2d34abc7f39a0753c128eb7ae53..2cac12cc9abe2dbbd7e5e385de91cf93a4ab61d1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -34,6 +34,7 @@
 #include <linux/frontswap.h>
 #include <linux/swapfile.h>
 #include <linux/export.h>
+#include <linux/swap_slots.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -257,6 +258,47 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
        info->data = 0;
 }
 
+static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
+                                                    unsigned long offset)
+{
+       struct swap_cluster_info *ci;
+
+       ci = si->cluster_info;
+       if (ci) {
+               ci += offset / SWAPFILE_CLUSTER;
+               spin_lock(&ci->lock);
+       }
+       return ci;
+}
+
+static inline void unlock_cluster(struct swap_cluster_info *ci)
+{
+       if (ci)
+               spin_unlock(&ci->lock);
+}
+
+static inline struct swap_cluster_info *lock_cluster_or_swap_info(
+       struct swap_info_struct *si,
+       unsigned long offset)
+{
+       struct swap_cluster_info *ci;
+
+       ci = lock_cluster(si, offset);
+       if (!ci)
+               spin_lock(&si->lock);
+
+       return ci;
+}
+
+static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
+                                              struct swap_cluster_info *ci)
+{
+       if (ci)
+               unlock_cluster(ci);
+       else
+               spin_unlock(&si->lock);
+}
+
 static inline bool cluster_list_empty(struct swap_cluster_list *list)
 {
        return cluster_is_null(&list->head);
@@ -281,9 +323,17 @@ static void cluster_list_add_tail(struct swap_cluster_list *list,
                cluster_set_next_flag(&list->head, idx, 0);
                cluster_set_next_flag(&list->tail, idx, 0);
        } else {
+               struct swap_cluster_info *ci_tail;
                unsigned int tail = cluster_next(&list->tail);
 
-               cluster_set_next(&ci[tail], idx);
+               /*
+                * Nested cluster lock, but both cluster locks are
+                * only acquired while we hold swap_info_struct->lock
+                */
+               ci_tail = ci + tail;
+               spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
+               cluster_set_next(ci_tail, idx);
+               unlock_cluster(ci_tail);
                cluster_set_next_flag(&list->tail, idx, 0);
        }
 }
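
The spin_lock_nested(..., SINGLE_DEPTH_NESTING) above is a lockdep annotation rather than a different locking primitive: a caller may already hold the lock of the cluster it is freeing, and the tail cluster's lock belongs to the same lock class, so the second acquisition must be marked as intentional. A generic sketch of the pattern, with illustrative names, assuming si->lock is held and i != j:

static void chain_clusters(struct swap_info_struct *si,
			   unsigned int i, unsigned int j)
{
	struct swap_cluster_info *a = si->cluster_info + i;
	struct swap_cluster_info *b = si->cluster_info + j;

	spin_lock(&a->lock);
	/* b->lock shares a lock class with a->lock: annotate for lockdep */
	spin_lock_nested(&b->lock, SINGLE_DEPTH_NESTING);
	cluster_set_next(b, i);		/* make cluster j point at cluster i */
	spin_unlock(&b->lock);
	spin_unlock(&a->lock);
}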
@@ -328,7 +378,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
 */
 static void swap_do_scheduled_discard(struct swap_info_struct *si)
 {
-       struct swap_cluster_info *info;
+       struct swap_cluster_info *info, *ci;
        unsigned int idx;
 
        info = si->cluster_info;
@@ -341,10 +391,14 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)
                                SWAPFILE_CLUSTER);
 
                spin_lock(&si->lock);
-               cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
+               ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
+               cluster_set_flag(ci, CLUSTER_FLAG_FREE);
+               unlock_cluster(ci);
                cluster_list_add_tail(&si->free_clusters, info, idx);
+               ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
                memset(si->swap_map + idx * SWAPFILE_CLUSTER,
                                0, SWAPFILE_CLUSTER);
+               unlock_cluster(ci);
        }
 }
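
lock_cluster() takes a page offset rather than a cluster index, hence the idx * SWAPFILE_CLUSTER conversions above. A worked example, assuming SWAPFILE_CLUSTER is 256 (its value elsewhere in swapfile.c):

/*
 * idx = 3, SWAPFILE_CLUSTER = 256:
 *   cluster 3 covers swap offsets 768..1023
 *   lock_cluster(si, 3 * 256) computes ci += 768 / 256 = 3,
 *   i.e. it locks &si->cluster_info[3]
 */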
 
@@ -443,12 +497,13 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
  * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
  * might involve allocating a new cluster for current CPU too.
  */
-static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
+static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
        unsigned long *offset, unsigned long *scan_base)
 {
        struct percpu_cluster *cluster;
+       struct swap_cluster_info *ci;
        bool found_free;
-       unsigned long tmp;
+       unsigned long tmp, max;
 
 new_cluster:
        cluster = this_cpu_ptr(si->percpu_cluster);
@@ -466,7 +521,7 @@ new_cluster:
                        *scan_base = *offset = si->cluster_next;
                        goto new_cluster;
                } else
-                       return;
+                       return false;
        }
 
        found_free = false;
@@ -476,14 +531,21 @@ new_cluster:
         * check if there is still free entry in the cluster
         */
        tmp = cluster->next;
-       while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
-              SWAPFILE_CLUSTER) {
+       max = min_t(unsigned long, si->max,
+                   (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
+       if (tmp >= max) {
+               cluster_set_null(&cluster->index);
+               goto new_cluster;
+       }
+       ci = lock_cluster(si, tmp);
+       while (tmp < max) {
                if (!si->swap_map[tmp]) {
                        found_free = true;
                        break;
                }
                tmp++;
        }
+       unlock_cluster(ci);
        if (!found_free) {
                cluster_set_null(&cluster->index);
                goto new_cluster;
@@ -491,15 +553,22 @@ new_cluster:
        cluster->next = tmp + 1;
        *offset = tmp;
        *scan_base = tmp;
+       return found_free;
 }
 
-static unsigned long scan_swap_map(struct swap_info_struct *si,
-                                  unsigned char usage)
+static int scan_swap_map_slots(struct swap_info_struct *si,
+                              unsigned char usage, int nr,
+                              swp_entry_t slots[])
 {
+       struct swap_cluster_info *ci;
        unsigned long offset;
        unsigned long scan_base;
        unsigned long last_in_cluster = 0;
        int latency_ration = LATENCY_LIMIT;
+       int n_ret = 0;
+
+       if (nr > SWAP_BATCH)
+               nr = SWAP_BATCH;
 
        /*
         * We try to cluster swap pages by allocating them sequentially
@@ -517,8 +586,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 
        /* SSD algorithm */
        if (si->cluster_info) {
-               scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
-               goto checks;
+               if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+                       goto checks;
+               else
+                       goto scan;
        }
 
        if (unlikely(!si->cluster_nr--)) {
@@ -562,8 +633,14 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 
 checks:
        if (si->cluster_info) {
-               while (scan_swap_map_ssd_cluster_conflict(si, offset))
-                       scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
+               while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
+                       /* take a break if we already got some slots */
+                       if (n_ret)
+                               goto done;
+                       if (!scan_swap_map_try_ssd_cluster(si, &offset,
+                                                       &scan_base))
+                               goto scan;
+               }
        }
        if (!(si->flags & SWP_WRITEOK))
                goto no_page;
@@ -572,9 +649,11 @@ checks:
        if (offset > si->highest_bit)
                scan_base = offset = si->lowest_bit;
 
+       ci = lock_cluster(si, offset);
        /* reuse swap entry of cache-only swap if not busy. */
        if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
                int swap_was_freed;
+               unlock_cluster(ci);
                spin_unlock(&si->lock);
                swap_was_freed = __try_to_reclaim_swap(si, offset);
                spin_lock(&si->lock);
@@ -584,8 +663,13 @@ checks:
                goto scan; /* check next one */
        }
 
-       if (si->swap_map[offset])
-               goto scan;
+       if (si->swap_map[offset]) {
+               unlock_cluster(ci);
+               if (!n_ret)
+                       goto scan;
+               else
+                       goto done;
+       }
 
        if (offset == si->lowest_bit)
                si->lowest_bit++;
@@ -601,10 +685,45 @@ checks:
        }
        si->swap_map[offset] = usage;
        inc_cluster_info_page(si, si->cluster_info, offset);
+       unlock_cluster(ci);
        si->cluster_next = offset + 1;
-       si->flags -= SWP_SCANNING;
+       slots[n_ret++] = swp_entry(si->type, offset);
+
+       /* got enough slots or reach max slots? */
+       if ((n_ret == nr) || (offset >= si->highest_bit))
+               goto done;
+
+       /* search for next available slot */
 
-       return offset;
+       /* time to take a break? */
+       if (unlikely(--latency_ration < 0)) {
+               if (n_ret)
+                       goto done;
+               spin_unlock(&si->lock);
+               cond_resched();
+               spin_lock(&si->lock);
+               latency_ration = LATENCY_LIMIT;
+       }
+
+       /* try to get more slots in cluster */
+       if (si->cluster_info) {
+               if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+                       goto checks;
+               else
+                       goto done;
+       }
+       /* non-ssd case */
+       ++offset;
+
+       /* non-ssd case, still more slots in cluster? */
+       if (si->cluster_nr && !si->swap_map[offset]) {
+               --si->cluster_nr;
+               goto checks;
+       }
+
+done:
+       si->flags -= SWP_SCANNING;
+       return n_ret;
 
 scan:
        spin_unlock(&si->lock);
@@ -642,17 +761,41 @@ scan:
 
 no_page:
        si->flags -= SWP_SCANNING;
-       return 0;
+       return n_ret;
 }
 
-swp_entry_t get_swap_page(void)
+static unsigned long scan_swap_map(struct swap_info_struct *si,
+                                  unsigned char usage)
+{
+       swp_entry_t entry;
+       int n_ret;
+
+       n_ret = scan_swap_map_slots(si, usage, 1, &entry);
+
+       if (n_ret)
+               return swp_offset(entry);
+       else
+               return 0;
+
+}
+
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[])
 {
        struct swap_info_struct *si, *next;
-       pgoff_t offset;
+       long avail_pgs;
+       int n_ret = 0;
 
-       if (atomic_long_read(&nr_swap_pages) <= 0)
+       avail_pgs = atomic_long_read(&nr_swap_pages);
+       if (avail_pgs <= 0)
                goto noswap;
-       atomic_long_dec(&nr_swap_pages);
+
+       if (n_goal > SWAP_BATCH)
+               n_goal = SWAP_BATCH;
+
+       if (n_goal > avail_pgs)
+               n_goal = avail_pgs;
+
+       atomic_long_sub(n_goal, &nr_swap_pages);
 
        spin_lock(&swap_avail_lock);
 
@@ -678,14 +821,14 @@ start_over:
                        spin_unlock(&si->lock);
                        goto nextsi;
                }
-
-               /* This is called for allocating swap entry for cache */
-               offset = scan_swap_map(si, SWAP_HAS_CACHE);
+               n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+                                           n_goal, swp_entries);
                spin_unlock(&si->lock);
-               if (offset)
-                       return swp_entry(si->type, offset);
+               if (n_ret)
+                       goto check_out;
                pr_debug("scan_swap_map of si %d failed to find offset\n",
-                      si->type);
+                       si->type);
+
                spin_lock(&swap_avail_lock);
 nextsi:
                /*
@@ -696,7 +839,8 @@ nextsi:
                 * up between us dropping swap_avail_lock and taking si->lock.
                 * Since we dropped the swap_avail_lock, the swap_avail_head
                 * list may have been modified; so if next is still in the
-                * swap_avail_head list then try it, otherwise start over.
+                * swap_avail_head list then try it, otherwise start over
+                * if we have not gotten any slots.
                 */
                if (plist_node_empty(&next->avail_list))
                        goto start_over;
@@ -704,9 +848,11 @@ nextsi:
 
        spin_unlock(&swap_avail_lock);
 
-       atomic_long_inc(&nr_swap_pages);
+check_out:
+       if (n_ret < n_goal)
+               atomic_long_add((long)(n_goal - n_ret), &nr_swap_pages);
 noswap:
-       return (swp_entry_t) {0};
+       return n_ret;
 }
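
get_swap_pages() and swapcache_free_entries() (further down) are the batched interfaces consumed by the new per-CPU swap slot cache in mm/swap_slots.c, which is outside this file. A minimal sketch of the contract; the function name and flow are illustrative only:

/* Illustrative: allocate a batch of cache-only entries, then release it. */
static void swap_batch_demo(void)
{
	swp_entry_t slots[SWAP_BATCH];
	int got;

	got = get_swap_pages(SWAP_BATCH, slots);	/* may return fewer, or 0 */
	if (got)
		swapcache_free_entries(slots, got);	/* entries still SWAP_HAS_CACHE */
}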
 
 /* The only caller of this function is now suspend routine */
@@ -731,7 +877,7 @@ swp_entry_t get_swap_page_of_type(int type)
        return (swp_entry_t) {0};
 }
 
-static struct swap_info_struct *swap_info_get(swp_entry_t entry)
+static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
 {
        struct swap_info_struct *p;
        unsigned long offset, type;
@@ -747,34 +893,76 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
        offset = swp_offset(entry);
        if (offset >= p->max)
                goto bad_offset;
-       if (!p->swap_map[offset])
-               goto bad_free;
-       spin_lock(&p->lock);
        return p;
 
-bad_free:
-       pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val);
-       goto out;
 bad_offset:
-       pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val);
+       pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
        goto out;
 bad_device:
-       pr_err("swap_free: %s%08lx\n", Unused_file, entry.val);
+       pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
        goto out;
 bad_nofile:
-       pr_err("swap_free: %s%08lx\n", Bad_file, entry.val);
+       pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
+out:
+       return NULL;
+}
+
+static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
+{
+       struct swap_info_struct *p;
+
+       p = __swap_info_get(entry);
+       if (!p)
+               goto out;
+       if (!p->swap_map[swp_offset(entry)])
+               goto bad_free;
+       return p;
+
+bad_free:
+       pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
+       goto out;
 out:
        return NULL;
 }
 
-static unsigned char swap_entry_free(struct swap_info_struct *p,
-                                    swp_entry_t entry, unsigned char usage)
+static struct swap_info_struct *swap_info_get(swp_entry_t entry)
+{
+       struct swap_info_struct *p;
+
+       p = _swap_info_get(entry);
+       if (p)
+               spin_lock(&p->lock);
+       return p;
+}
+
+static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
+                                       struct swap_info_struct *q)
 {
+       struct swap_info_struct *p;
+
+       p = _swap_info_get(entry);
+
+       if (p != q) {
+               if (q != NULL)
+                       spin_unlock(&q->lock);
+               if (p != NULL)
+                       spin_lock(&p->lock);
+       }
+       return p;
+}
+
+static unsigned char __swap_entry_free(struct swap_info_struct *p,
+                                      swp_entry_t entry, unsigned char usage)
+{
+       struct swap_cluster_info *ci;
        unsigned long offset = swp_offset(entry);
        unsigned char count;
        unsigned char has_cache;
 
+       ci = lock_cluster_or_swap_info(p, offset);
+
        count = p->swap_map[offset];
+
        has_cache = count & SWAP_HAS_CACHE;
        count &= ~SWAP_HAS_CACHE;
 
@@ -798,38 +986,52 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
        }
 
        usage = count | has_cache;
-       p->swap_map[offset] = usage;
-
-       /* free if no reference */
-       if (!usage) {
-               mem_cgroup_uncharge_swap(entry);
-               dec_cluster_info_page(p, p->cluster_info, offset);
-               if (offset < p->lowest_bit)
-                       p->lowest_bit = offset;
-               if (offset > p->highest_bit) {
-                       bool was_full = !p->highest_bit;
-                       p->highest_bit = offset;
-                       if (was_full && (p->flags & SWP_WRITEOK)) {
-                               spin_lock(&swap_avail_lock);
-                               WARN_ON(!plist_node_empty(&p->avail_list));
-                               if (plist_node_empty(&p->avail_list))
-                                       plist_add(&p->avail_list,
-                                                 &swap_avail_head);
-                               spin_unlock(&swap_avail_lock);
-                       }
-               }
-               atomic_long_inc(&nr_swap_pages);
-               p->inuse_pages--;
-               frontswap_invalidate_page(p->type, offset);
-               if (p->flags & SWP_BLKDEV) {
-                       struct gendisk *disk = p->bdev->bd_disk;
-                       if (disk->fops->swap_slot_free_notify)
-                               disk->fops->swap_slot_free_notify(p->bdev,
-                                                                 offset);
+       p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
+
+       unlock_cluster_or_swap_info(p, ci);
+
+       return usage;
+}
+
+static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
+{
+       struct swap_cluster_info *ci;
+       unsigned long offset = swp_offset(entry);
+       unsigned char count;
+
+       ci = lock_cluster(p, offset);
+       count = p->swap_map[offset];
+       VM_BUG_ON(count != SWAP_HAS_CACHE);
+       p->swap_map[offset] = 0;
+       dec_cluster_info_page(p, p->cluster_info, offset);
+       unlock_cluster(ci);
+
+       mem_cgroup_uncharge_swap(entry);
+       if (offset < p->lowest_bit)
+               p->lowest_bit = offset;
+       if (offset > p->highest_bit) {
+               bool was_full = !p->highest_bit;
+
+               p->highest_bit = offset;
+               if (was_full && (p->flags & SWP_WRITEOK)) {
+                       spin_lock(&swap_avail_lock);
+                       WARN_ON(!plist_node_empty(&p->avail_list));
+                       if (plist_node_empty(&p->avail_list))
+                               plist_add(&p->avail_list,
+                                         &swap_avail_head);
+                       spin_unlock(&swap_avail_lock);
                }
        }
+       atomic_long_inc(&nr_swap_pages);
+       p->inuse_pages--;
+       frontswap_invalidate_page(p->type, offset);
+       if (p->flags & SWP_BLKDEV) {
+               struct gendisk *disk = p->bdev->bd_disk;
 
-       return usage;
+               if (disk->fops->swap_slot_free_notify)
+                       disk->fops->swap_slot_free_notify(p->bdev,
+                                                         offset);
+       }
 }
 
 /*
@@ -840,10 +1042,10 @@ void swap_free(swp_entry_t entry)
 {
        struct swap_info_struct *p;
 
-       p = swap_info_get(entry);
+       p = _swap_info_get(entry);
        if (p) {
-               swap_entry_free(p, entry, 1);
-               spin_unlock(&p->lock);
+               if (!__swap_entry_free(p, entry, 1))
+                       free_swap_slot(entry);
        }
 }
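
swap_free() is now a two-stage operation: __swap_entry_free() drops the usage count under the cluster (or swap) lock, and only when the count reaches zero is the entry handed to free_swap_slot(), which batches the final swap_entry_free() through swapcache_free_entries(). free_swap_slot() lives in mm/swap_slots.c, outside this diff; the sequence below sketches the last-reference case:

/*
 * swap_free(entry), last map reference, not in swap cache:
 *   __swap_entry_free(p, entry, 1)  -> returns 0; swap_map[offset] is left as
 *                                      SWAP_HAS_CACHE so the slot cannot be
 *                                      reallocated yet
 *   free_swap_slot(entry)           -> queues the entry in a per-CPU cache
 *   (later, when the cache drains)
 *   swapcache_free_entries(...)     -> swap_entry_free() clears swap_map[offset],
 *                                      updates lowest/highest_bit and nr_swap_pages
 */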
 
@@ -854,11 +1056,33 @@ void swapcache_free(swp_entry_t entry)
 {
        struct swap_info_struct *p;
 
-       p = swap_info_get(entry);
+       p = _swap_info_get(entry);
        if (p) {
-               swap_entry_free(p, entry, SWAP_HAS_CACHE);
-               spin_unlock(&p->lock);
+               if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
+                       free_swap_slot(entry);
+       }
+}
+
+void swapcache_free_entries(swp_entry_t *entries, int n)
+{
+       struct swap_info_struct *p, *prev;
+       int i;
+
+       if (n <= 0)
+               return;
+
+       prev = NULL;
+       p = NULL;
+       for (i = 0; i < n; ++i) {
+               p = swap_info_get_cont(entries[i], prev);
+               if (p)
+                       swap_entry_free(p, entries[i]);
+               else
+                       break;
+               prev = p;
        }
+       if (p)
+               spin_unlock(&p->lock);
 }
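
swap_info_get_cont() lets the loop above keep one si->lock held across consecutive entries that live on the same swap device. A worked example with hypothetical input:

/*
 * entries[0..2] on swap device A, entries[3] on device B:
 *   i = 0:    lock A
 *   i = 1, 2: p == prev, A stays locked
 *   i = 3:    unlock A, lock B
 *   after the loop: unlock B
 * A single-device batch therefore takes si->lock once instead of n times.
 */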
 
 /*
@@ -870,13 +1094,39 @@ int page_swapcount(struct page *page)
 {
        int count = 0;
        struct swap_info_struct *p;
+       struct swap_cluster_info *ci;
        swp_entry_t entry;
+       unsigned long offset;
 
        entry.val = page_private(page);
-       p = swap_info_get(entry);
+       p = _swap_info_get(entry);
        if (p) {
-               count = swap_count(p->swap_map[swp_offset(entry)]);
-               spin_unlock(&p->lock);
+               offset = swp_offset(entry);
+               ci = lock_cluster_or_swap_info(p, offset);
+               count = swap_count(p->swap_map[offset]);
+               unlock_cluster_or_swap_info(p, ci);
+       }
+       return count;
+}
+
+/*
+ * How many references to @entry are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
+ */
+int __swp_swapcount(swp_entry_t entry)
+{
+       int count = 0;
+       pgoff_t offset;
+       struct swap_info_struct *si;
+       struct swap_cluster_info *ci;
+
+       si = __swap_info_get(entry);
+       if (si) {
+               offset = swp_offset(entry);
+               ci = lock_cluster_or_swap_info(si, offset);
+               count = swap_count(si->swap_map[offset]);
+               unlock_cluster_or_swap_info(si, ci);
        }
        return count;
 }
@@ -889,22 +1139,26 @@ int swp_swapcount(swp_entry_t entry)
 {
        int count, tmp_count, n;
        struct swap_info_struct *p;
+       struct swap_cluster_info *ci;
        struct page *page;
        pgoff_t offset;
        unsigned char *map;
 
-       p = swap_info_get(entry);
+       p = _swap_info_get(entry);
        if (!p)
                return 0;
 
-       count = swap_count(p->swap_map[swp_offset(entry)]);
+       offset = swp_offset(entry);
+
+       ci = lock_cluster_or_swap_info(p, offset);
+
+       count = swap_count(p->swap_map[offset]);
        if (!(count & COUNT_CONTINUED))
                goto out;
 
        count &= ~COUNT_CONTINUED;
        n = SWAP_MAP_MAX + 1;
 
-       offset = swp_offset(entry);
        page = vmalloc_to_page(p->swap_map + offset);
        offset &= ~PAGE_MASK;
        VM_BUG_ON(page_private(page) != SWP_CONTINUED);
@@ -919,7 +1173,7 @@ int swp_swapcount(swp_entry_t entry)
                n *= (SWAP_CONT_MAX + 1);
        } while (tmp_count & COUNT_CONTINUED);
 out:
-       spin_unlock(&p->lock);
+       unlock_cluster_or_swap_info(p, ci);
        return count;
 }
 
@@ -943,11 +1197,25 @@ bool reuse_swap_page(struct page *page, int *total_mapcount)
        count = page_trans_huge_mapcount(page, total_mapcount);
        if (count <= 1 && PageSwapCache(page)) {
                count += page_swapcount(page);
-               if (count == 1 && !PageWriteback(page)) {
+               if (count != 1)
+                       goto out;
+               if (!PageWriteback(page)) {
                        delete_from_swap_cache(page);
                        SetPageDirty(page);
+               } else {
+                       swp_entry_t entry;
+                       struct swap_info_struct *p;
+
+                       entry.val = page_private(page);
+                       p = swap_info_get(entry);
+                       if (p->flags & SWP_STABLE_WRITES) {
+                               spin_unlock(&p->lock);
+                               return false;
+                       }
+                       spin_unlock(&p->lock);
                }
        }
+out:
        return count <= 1;
 }
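
The new branch above refuses to reuse a swapped page that is still under writeback when its device needs stable pages. SWP_STABLE_WRITES is set at swapon time from bdi_cap_stable_pages_required() (see the swapon hunk below); the comment here summarizes the rationale rather than quoting the patch:

/*
 * Stable-pages devices (for example, those using block integrity data)
 * require that a page is not modified while it is being written out.
 * Letting a write fault reuse the page before writeback completes would
 * break that guarantee, so reuse_swap_page() returns false and the fault
 * path falls back to copying the page.
 */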
 
@@ -997,21 +1265,23 @@ int free_swap_and_cache(swp_entry_t entry)
 {
        struct swap_info_struct *p;
        struct page *page = NULL;
+       unsigned char count;
 
        if (non_swap_entry(entry))
                return 1;
 
-       p = swap_info_get(entry);
+       p = _swap_info_get(entry);
        if (p) {
-               if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
+               count = __swap_entry_free(p, entry, 1);
+               if (count == SWAP_HAS_CACHE) {
                        page = find_get_page(swap_address_space(entry),
                                             swp_offset(entry));
                        if (page && !trylock_page(page)) {
                                put_page(page);
                                page = NULL;
                        }
-               }
-               spin_unlock(&p->lock);
+               } else if (!count)
+                       free_swap_slot(entry);
        }
        if (page) {
                /*
@@ -1839,6 +2109,17 @@ static void reinsert_swap_info(struct swap_info_struct *p)
        spin_unlock(&swap_lock);
 }
 
+bool has_usable_swap(void)
+{
+       bool ret = true;
+
+       spin_lock(&swap_lock);
+       if (plist_head_empty(&swap_active_head))
+               ret = false;
+       spin_unlock(&swap_lock);
+       return ret;
+}
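
has_usable_swap() simply reports whether at least one swap device is still active, giving other swap code a cheap gate for optional work. An illustrative caller, not taken from this patch:

/* Illustrative only: skip optional maintenance when swap is fully off. */
static void swap_cache_maintenance(void)
{
	if (!has_usable_swap())
		return;
	/* e.g. drain or refill per-CPU swap slot caches here */
}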
+
 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 {
        struct swap_info_struct *p = NULL;
@@ -1909,6 +2190,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
 
+       disable_swap_slots_cache_lock();
+
        set_current_oom_origin();
        err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
        clear_current_oom_origin();
@@ -1916,9 +2199,12 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        if (err) {
                /* re-insert swap space back into swap_list */
                reinsert_swap_info(p);
+               reenable_swap_slots_cache_unlock();
                goto out_dput;
        }
 
+       reenable_swap_slots_cache_unlock();
+
        flush_work(&p->discard_work);
 
        destroy_swap_extents(p);
@@ -1961,6 +2247,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        vfree(frontswap_map);
        /* Destroy swap account information */
        swap_cgroup_swapoff(p->type);
+       exit_swap_address_space(p->type);
 
        inode = mapping->host;
        if (S_ISBLK(inode->i_mode)) {
@@ -2284,6 +2571,13 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
        return maxpages;
 }
 
+#define SWAP_CLUSTER_INFO_COLS                                         \
+       DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
+#define SWAP_CLUSTER_SPACE_COLS                                                \
+       DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
+#define SWAP_CLUSTER_COLS                                              \
+       max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
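
SWAP_CLUSTER_COLS is whichever is larger: clusters per cluster_info cache line or clusters per swap address space. A worked example with typical x86_64 values; the concrete numbers are assumptions, not part of the patch:

/*
 * Assuming L1_CACHE_BYTES = 64 and sizeof(struct swap_cluster_info) = 8:
 *   SWAP_CLUSTER_INFO_COLS  = DIV_ROUND_UP(64, 8)      = 8
 * Assuming SWAP_ADDRESS_SPACE_PAGES = 16384 and SWAPFILE_CLUSTER = 256:
 *   SWAP_CLUSTER_SPACE_COLS = DIV_ROUND_UP(16384, 256) = 64
 *   SWAP_CLUSTER_COLS       = max(8, 64)               = 64
 */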
+
 static int setup_swap_map_and_extents(struct swap_info_struct *p,
                                        union swap_header *swap_header,
                                        unsigned char *swap_map,
@@ -2291,11 +2585,12 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
                                        unsigned long maxpages,
                                        sector_t *span)
 {
-       int i;
+       unsigned int j, k;
        unsigned int nr_good_pages;
        int nr_extents;
        unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
-       unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER;
+       unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
+       unsigned long i, idx;
 
        nr_good_pages = maxpages - 1;   /* omit header page */
 
@@ -2343,15 +2638,23 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
        if (!cluster_info)
                return nr_extents;
 
-       for (i = 0; i < nr_clusters; i++) {
-               if (!cluster_count(&cluster_info[idx])) {
+
+       /*
+        * Interleave the free cluster list to reduce false cache line
+        * sharing on cluster_info and across swap address spaces.
+        */
+       for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
+               j = (k + col) % SWAP_CLUSTER_COLS;
+               for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
+                       idx = i * SWAP_CLUSTER_COLS + j;
+                       if (idx >= nr_clusters)
+                               continue;
+                       if (cluster_count(&cluster_info[idx]))
+                               continue;
                        cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
                        cluster_list_add_tail(&p->free_clusters, cluster_info,
                                              idx);
                }
-               idx++;
-               if (idx == nr_clusters)
-                       idx = 0;
        }
        return nr_extents;
 }
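
The rewritten loop fills the free cluster list column by column instead of sequentially, so consecutive free-list entries come from different cache lines and address spaces. A worked example with deliberately small, hypothetical numbers:

/*
 * SWAP_CLUSTER_COLS = 4, nr_clusters = 10, col = 0 (all clusters free):
 *   k = 0 (j = 0): idx = 0, 4, 8
 *   k = 1 (j = 1): idx = 1, 5, 9
 *   k = 2 (j = 2): idx = 2, 6   (10 skipped: >= nr_clusters)
 *   k = 3 (j = 3): idx = 3, 7   (11 skipped)
 * resulting free_clusters order: 0 4 8 1 5 9 2 6 3 7
 */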
@@ -2448,8 +2751,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                error = -ENOMEM;
                goto bad_swap;
        }
+
+       if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
+               p->flags |= SWP_STABLE_WRITES;
+
        if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
                int cpu;
+               unsigned long ci, nr_cluster;
 
                p->flags |= SWP_SOLIDSTATE;
                /*
@@ -2457,13 +2765,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                 * SSD
                 */
                p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
+               nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
 
-               cluster_info = vzalloc(DIV_ROUND_UP(maxpages,
-                       SWAPFILE_CLUSTER) * sizeof(*cluster_info));
+               cluster_info = vzalloc(nr_cluster * sizeof(*cluster_info));
                if (!cluster_info) {
                        error = -ENOMEM;
                        goto bad_swap;
                }
+
+               for (ci = 0; ci < nr_cluster; ci++)
+                       spin_lock_init(&((cluster_info + ci)->lock));
+
                p->percpu_cluster = alloc_percpu(struct percpu_cluster);
                if (!p->percpu_cluster) {
                        error = -ENOMEM;
@@ -2520,6 +2832,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                }
        }
 
+       error = init_swap_address_space(p->type, maxpages);
+       if (error)
+               goto bad_swap;
+
        mutex_lock(&swapon_mutex);
        prio = -1;
        if (swap_flags & SWAP_FLAG_PREFER)
@@ -2575,6 +2891,8 @@ out:
                putname(name);
        if (inode && S_ISREG(inode->i_mode))
                inode_unlock(inode);
+       if (!error)
+               enable_swap_slots_cache();
        return error;
 }
 
@@ -2609,6 +2927,7 @@ void si_swapinfo(struct sysinfo *val)
 static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 {
        struct swap_info_struct *p;
+       struct swap_cluster_info *ci;
        unsigned long offset, type;
        unsigned char count;
        unsigned char has_cache;
@@ -2622,10 +2941,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
                goto bad_file;
        p = swap_info[type];
        offset = swp_offset(entry);
-
-       spin_lock(&p->lock);
        if (unlikely(offset >= p->max))
-               goto unlock_out;
+               goto out;
+
+       ci = lock_cluster_or_swap_info(p, offset);
 
        count = p->swap_map[offset];
 
@@ -2668,7 +2987,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
        p->swap_map[offset] = count | has_cache;
 
 unlock_out:
-       spin_unlock(&p->lock);
+       unlock_cluster_or_swap_info(p, ci);
 out:
        return err;
 
@@ -2757,6 +3076,7 @@ EXPORT_SYMBOL_GPL(__page_file_index);
 int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 {
        struct swap_info_struct *si;
+       struct swap_cluster_info *ci;
        struct page *head;
        struct page *page;
        struct page *list_page;
@@ -2780,6 +3100,9 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
        }
 
        offset = swp_offset(entry);
+
+       ci = lock_cluster(si, offset);
+
        count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
 
        if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
@@ -2792,6 +3115,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
        }
 
        if (!page) {
+               unlock_cluster(ci);
                spin_unlock(&si->lock);
                return -ENOMEM;
        }
@@ -2840,6 +3164,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
        list_add_tail(&page->lru, &head->lru);
        page = NULL;                    /* now it's attached, don't free it */
 out:
+       unlock_cluster(ci);
        spin_unlock(&si->lock);
 outer:
        if (page)
@@ -2853,7 +3178,8 @@ outer:
  * into, carry if so, or else fail until a new continuation page is allocated;
  * when the original swap_map count is decremented from 0 with continuation,
  * borrow from the continuation and report whether it still holds more.
- * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
+ * Called while __swap_duplicate() or __swap_entry_free() holds swap or cluster
+ * lock.
  */
 static bool swap_count_continued(struct swap_info_struct *si,
                                 pgoff_t offset, unsigned char count)