diff --git a/mm/swapfile.c b/mm/swapfile.c
index d1fbeb486de52ed31a02bdd957c985fcde552c73..3963fc24fcc1b6f8c4d365de4d99bda8993311b7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -175,12 +175,6 @@ static void discard_swap_cluster(struct swap_info_struct *si,
        }
 }
 
-static int wait_for_discard(void *word)
-{
-       schedule();
-       return 0;
-}
-
 #define SWAPFILE_CLUSTER       256
 #define LATENCY_LIMIT          256
 
@@ -242,6 +236,90 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
        info->data = 0;
 }
 
+/* Add a cluster to the discard list and schedule the discard */
+static void swap_cluster_schedule_discard(struct swap_info_struct *si,
+               unsigned int idx)
+{
+       /*
+        * If scan_swap_map() can't find a free cluster, it will check
+        * si->swap_map directly. To make sure the discarding cluster isn't
+        * taken by scan_swap_map(), mark the swap entries bad (occupied).
+        * They will be cleared after the discard finishes.
+        */
+       memset(si->swap_map + idx * SWAPFILE_CLUSTER,
+                       SWAP_MAP_BAD, SWAPFILE_CLUSTER);
+
+       if (cluster_is_null(&si->discard_cluster_head)) {
+               cluster_set_next_flag(&si->discard_cluster_head,
+                                               idx, 0);
+               cluster_set_next_flag(&si->discard_cluster_tail,
+                                               idx, 0);
+       } else {
+               unsigned int tail = cluster_next(&si->discard_cluster_tail);
+               cluster_set_next(&si->cluster_info[tail], idx);
+               cluster_set_next_flag(&si->discard_cluster_tail,
+                                               idx, 0);
+       }
+
+       schedule_work(&si->discard_work);
+}
+
+/*
+ * Do the scheduled discards. After a cluster discard finishes, the cluster
+ * will be added to the free cluster list. The caller should hold si->lock.
+ */
+static void swap_do_scheduled_discard(struct swap_info_struct *si)
+{
+       struct swap_cluster_info *info;
+       unsigned int idx;
+
+       info = si->cluster_info;
+
+       while (!cluster_is_null(&si->discard_cluster_head)) {
+               idx = cluster_next(&si->discard_cluster_head);
+
+               cluster_set_next_flag(&si->discard_cluster_head,
+                                               cluster_next(&info[idx]), 0);
+               if (cluster_next(&si->discard_cluster_tail) == idx) {
+                       cluster_set_null(&si->discard_cluster_head);
+                       cluster_set_null(&si->discard_cluster_tail);
+               }
+               spin_unlock(&si->lock);
+
+               discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
+                               SWAPFILE_CLUSTER);
+
+               spin_lock(&si->lock);
+               cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
+               if (cluster_is_null(&si->free_cluster_head)) {
+                       cluster_set_next_flag(&si->free_cluster_head,
+                                               idx, 0);
+                       cluster_set_next_flag(&si->free_cluster_tail,
+                                               idx, 0);
+               } else {
+                       unsigned int tail;
+
+                       tail = cluster_next(&si->free_cluster_tail);
+                       cluster_set_next(&info[tail], idx);
+                       cluster_set_next_flag(&si->free_cluster_tail,
+                                               idx, 0);
+               }
+               memset(si->swap_map + idx * SWAPFILE_CLUSTER,
+                               0, SWAPFILE_CLUSTER);
+       }
+}
+
+static void swap_discard_work(struct work_struct *work)
+{
+       struct swap_info_struct *si;
+
+       si = container_of(work, struct swap_info_struct, discard_work);
+
+       spin_lock(&si->lock);
+       swap_do_scheduled_discard(si);
+       spin_unlock(&si->lock);
+}
+
 /*
  * The cluster corresponding to page_nr will be used. The cluster will be
  * removed from free cluster list and its usage counter will be increased.
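For context, the cluster bookkeeping above (cluster_next(), cluster_set_next_flag(), CLUSTER_FLAG_FREE, and the discard list head/tail) operates on the swap_cluster_info structure from the companion include/linux/swap.h change. A rough sketch of that structure is reproduced below purely as a reading aid; the exact field widths and flag values are assumed from that header and are not part of this hunk:

	/*
	 * A cluster is a SWAPFILE_CLUSTER-sized, naturally aligned block of
	 * swap space. "data" holds the next cluster index while the cluster
	 * is free, or its usage counter otherwise; "flags" marks free/null
	 * clusters. Protected by swap_info_struct.lock.
	 */
	struct swap_cluster_info {
		unsigned int data:24;
		unsigned int flags:8;
	};
	#define CLUSTER_FLAG_FREE      1 /* cluster is on the free list */
	#define CLUSTER_FLAG_NEXT_NULL 2 /* cluster has no next cluster */
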
@@ -287,6 +365,17 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
                cluster_count(&cluster_info[idx]) - 1);
 
        if (cluster_count(&cluster_info[idx]) == 0) {
+               /*
+                * If the swap is discardable, prepare a discard of the
+                * cluster instead of freeing it immediately. The cluster
+                * will be freed after the discard finishes.
+                */
+               if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
+                                (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
+                       swap_cluster_schedule_discard(p, idx);
+                       return;
+               }
+
                cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
                if (cluster_is_null(&p->free_cluster_head)) {
                        cluster_set_next_flag(&p->free_cluster_head, idx, 0);
@@ -303,13 +392,78 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
  * It's possible that scan_swap_map() uses a free cluster in the middle of the
  * free cluster list. Avoid such abuse to prevent list corruption.
  */
-static inline bool scan_swap_map_recheck_cluster(struct swap_info_struct *si,
+static bool
+scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
        unsigned long offset)
 {
+       struct percpu_cluster *percpu_cluster;
+       bool conflict;
+
        offset /= SWAPFILE_CLUSTER;
-       return !cluster_is_null(&si->free_cluster_head) &&
+       conflict = !cluster_is_null(&si->free_cluster_head) &&
                offset != cluster_next(&si->free_cluster_head) &&
                cluster_is_free(&si->cluster_info[offset]);
+
+       if (!conflict)
+               return false;
+
+       percpu_cluster = this_cpu_ptr(si->percpu_cluster);
+       cluster_set_null(&percpu_cluster->index);
+       return true;
+}
+
+/*
+ * Try to get a swap entry from the current CPU's swap entry pool (a cluster).
+ * This might involve allocating a new cluster for the current CPU too.
+ */
+static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
+       unsigned long *offset, unsigned long *scan_base)
+{
+       struct percpu_cluster *cluster;
+       bool found_free;
+       unsigned long tmp;
+
+new_cluster:
+       cluster = this_cpu_ptr(si->percpu_cluster);
+       if (cluster_is_null(&cluster->index)) {
+               if (!cluster_is_null(&si->free_cluster_head)) {
+                       cluster->index = si->free_cluster_head;
+                       cluster->next = cluster_next(&cluster->index) *
+                                       SWAPFILE_CLUSTER;
+               } else if (!cluster_is_null(&si->discard_cluster_head)) {
+                       /*
+                        * We don't have a free cluster, but some clusters are
+                        * being discarded; do the discard now and reclaim them.
+                        */
+                       swap_do_scheduled_discard(si);
+                       *scan_base = *offset = si->cluster_next;
+                       goto new_cluster;
+               } else
+                       return;
+       }
+
+       found_free = false;
+
+       /*
+        * Other CPUs can use our cluster if they can't find a free cluster;
+        * check whether there is still a free entry in the cluster.
+        */
+       tmp = cluster->next;
+       while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
+              SWAPFILE_CLUSTER) {
+               if (!si->swap_map[tmp]) {
+                       found_free = true;
+                       break;
+               }
+               tmp++;
+       }
+       if (!found_free) {
+               cluster_set_null(&cluster->index);
+               goto new_cluster;
+       }
+       cluster->next = tmp + 1;
+       *offset = tmp;
+       *scan_base = tmp;
 }
 
 static unsigned long scan_swap_map(struct swap_info_struct *si,
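The per-CPU allocation path above (scan_swap_map_try_ssd_cluster() and scan_swap_map_ssd_cluster_conflict()) draws entries from a small per-CPU structure hung off swap_info_struct in include/linux/swap.h. Roughly, as a sketch assumed from that companion header change:

	/*
	 * Each CPU gets its own cluster so it can allocate swap entries and
	 * swap out sequentially, which improves swapout throughput on SSDs.
	 */
	struct percpu_cluster {
		struct swap_cluster_info index;	/* current cluster index */
		unsigned int next;		/* likely next allocation offset */
	};

	/* in struct swap_info_struct: */
	struct percpu_cluster __percpu *percpu_cluster;
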
@@ -319,7 +473,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
        unsigned long scan_base;
        unsigned long last_in_cluster = 0;
        int latency_ration = LATENCY_LIMIT;
-       int found_free_cluster = 0;
 
        /*
         * We try to cluster swap pages by allocating them sequentially
@@ -335,42 +488,17 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
        si->flags += SWP_SCANNING;
        scan_base = offset = si->cluster_next;
 
+       /* SSD algorithm */
+       if (si->cluster_info) {
+               scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
+               goto checks;
+       }
+
        if (unlikely(!si->cluster_nr--)) {
                if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
                        si->cluster_nr = SWAPFILE_CLUSTER - 1;
                        goto checks;
                }
-               if (si->flags & SWP_PAGE_DISCARD) {
-                       /*
-                        * Start range check on racing allocations, in case
-                        * they overlap the cluster we eventually decide on
-                        * (we scan without swap_lock to allow preemption).
-                        * It's hardly conceivable that cluster_nr could be
-                        * wrapped during our scan, but don't depend on it.
-                        */
-                       if (si->lowest_alloc)
-                               goto checks;
-                       si->lowest_alloc = si->max;
-                       si->highest_alloc = 0;
-               }
-check_cluster:
-               if (!cluster_is_null(&si->free_cluster_head)) {
-                       offset = cluster_next(&si->free_cluster_head) *
-                                               SWAPFILE_CLUSTER;
-                       last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
-                       si->cluster_next = offset;
-                       si->cluster_nr = SWAPFILE_CLUSTER - 1;
-                       found_free_cluster = 1;
-                       goto checks;
-               } else if (si->cluster_info) {
-                       /*
-                        * Checking free cluster is fast enough, we can do the
-                        * check every time
-                        */
-                       si->cluster_nr = 0;
-                       si->lowest_alloc = 0;
-                       goto checks;
-               }
 
                spin_unlock(&si->lock);
 
@@ -395,7 +523,6 @@ check_cluster:
                                offset -= SWAPFILE_CLUSTER - 1;
                                si->cluster_next = offset;
                                si->cluster_nr = SWAPFILE_CLUSTER - 1;
-                               found_free_cluster = 1;
                                goto checks;
                        }
                        if (unlikely(--latency_ration < 0)) {
@@ -416,7 +543,6 @@ check_cluster:
                                offset -= SWAPFILE_CLUSTER - 1;
                                si->cluster_next = offset;
                                si->cluster_nr = SWAPFILE_CLUSTER - 1;
-                               found_free_cluster = 1;
                                goto checks;
                        }
                        if (unlikely(--latency_ration < 0)) {
@@ -428,12 +554,13 @@ check_cluster:
                offset = scan_base;
                spin_lock(&si->lock);
                si->cluster_nr = SWAPFILE_CLUSTER - 1;
-               si->lowest_alloc = 0;
        }
 
 checks:
-       if (scan_swap_map_recheck_cluster(si, offset))
-               goto check_cluster;
+       if (si->cluster_info) {
+               while (scan_swap_map_ssd_cluster_conflict(si, offset))
+                       scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
+       }
        if (!(si->flags & SWP_WRITEOK))
                goto no_page;
        if (!si->highest_bit)
@@ -470,59 +597,6 @@ checks:
        si->cluster_next = offset + 1;
        si->flags -= SWP_SCANNING;
 
-       if (si->lowest_alloc) {
-               /*
-                * Only set when SWP_PAGE_DISCARD, and there's a scan
-                * for a free cluster in progress or just completed.
-                */
-               if (found_free_cluster) {
-                       /*
-                        * To optimize wear-levelling, discard the
-                        * old data of the cluster, taking care not to
-                        * discard any of its pages that have already
-                        * been allocated by racing tasks (offset has
-                        * already stepped over any at the beginning).
-                        */
-                       if (offset < si->highest_alloc &&
-                           si->lowest_alloc <= last_in_cluster)
-                               last_in_cluster = si->lowest_alloc - 1;
-                       si->flags |= SWP_DISCARDING;
-                       spin_unlock(&si->lock);
-
-                       if (offset < last_in_cluster)
-                               discard_swap_cluster(si, offset,
-                                       last_in_cluster - offset + 1);
-
-                       spin_lock(&si->lock);
-                       si->lowest_alloc = 0;
-                       si->flags &= ~SWP_DISCARDING;
-
-                       smp_mb();       /* wake_up_bit advises this */
-                       wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
-
-               } else if (si->flags & SWP_DISCARDING) {
-                       /*
-                        * Delay using pages allocated by racing tasks
-                        * until the whole discard has been issued. We
-                        * could defer that delay until swap_writepage,
-                        * but it's easier to keep this self-contained.
-                        */
-                       spin_unlock(&si->lock);
-                       wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
-                               wait_for_discard, TASK_UNINTERRUPTIBLE);
-                       spin_lock(&si->lock);
-               } else {
-                       /*
-                        * Note pages allocated by racing tasks while
-                        * scan for a free cluster is in progress, so
-                        * that its final discard can exclude them.
-                        */
-                       if (offset < si->lowest_alloc)
-                               si->lowest_alloc = offset;
-                       if (offset > si->highest_alloc)
-                               si->highest_alloc = offset;
-               }
-       }
        return offset;
 
 scan:
@@ -1258,7 +1332,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
                        else
                                continue;
                }
-               count = si->swap_map[i];
+               count = ACCESS_ONCE(si->swap_map[i]);
                if (count && swap_count(count) != SWAP_MAP_BAD)
                        break;
        }
@@ -1278,7 +1352,11 @@ int try_to_unuse(unsigned int type, bool frontswap,
 {
        struct swap_info_struct *si = swap_info[type];
        struct mm_struct *start_mm;
-       unsigned char *swap_map;
+       volatile unsigned char *swap_map; /* swap_map is accessed without
+                                          * locking. Mark it as volatile
+                                          * to prevent the compiler from
+                                          * doing something odd.
+                                          */
        unsigned char swcount;
        struct page *page;
        swp_entry_t entry;
@@ -1329,7 +1407,15 @@ int try_to_unuse(unsigned int type, bool frontswap,
                         * reused since sys_swapoff() already disabled
                         * allocation from here, or alloc_page() failed.
                         */
-                       if (!*swap_map)
+                       swcount = *swap_map;
+                       /*
+                        * We don't hold the lock here, so the swap entry
+                        * could be SWAP_MAP_BAD (when the cluster is being
+                        * discarded). Instead of failing out, we can just
+                        * skip the swap entry because swapoff will wait for
+                        * the discard to finish anyway.
+                        */
+                       if (!swcount || swcount == SWAP_MAP_BAD)
                                continue;
                        retval = -ENOMEM;
                        break;
@@ -1806,6 +1892,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
                goto out_dput;
        }
 
+       flush_work(&p->discard_work);
+
        destroy_swap_extents(p);
        if (p->flags & SWP_CONTINUED)
                free_swap_count_continuations(p);
@@ -1839,6 +1927,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        spin_unlock(&swap_lock);
        frontswap_invalidate_area(type);
        mutex_unlock(&swapon_mutex);
+       free_percpu(p->percpu_cluster);
+       p->percpu_cluster = NULL;
        vfree(swap_map);
        vfree(cluster_info);
        vfree(frontswap_map);
@@ -2172,6 +2262,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 
        cluster_set_null(&p->free_cluster_head);
        cluster_set_null(&p->free_cluster_tail);
+       cluster_set_null(&p->discard_cluster_head);
+       cluster_set_null(&p->discard_cluster_tail);
 
        for (i = 0; i < swap_header->info.nr_badpages; i++) {
                unsigned int page_nr = swap_header->info.badpages[i];
@@ -2281,6 +2373,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        if (IS_ERR(p))
                return PTR_ERR(p);
 
+       INIT_WORK(&p->discard_work, swap_discard_work);
+
        name = getname(specialfile);
        if (IS_ERR(name)) {
                error = PTR_ERR(name);
@@ -2354,6 +2448,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                        error = -ENOMEM;
                        goto bad_swap;
                }
+               p->percpu_cluster = alloc_percpu(struct percpu_cluster);
+               if (!p->percpu_cluster) {
+                       error = -ENOMEM;
+                       goto bad_swap;
+               }
+               for_each_possible_cpu(i) {
+                       struct percpu_cluster *cluster;
+                       cluster = per_cpu_ptr(p->percpu_cluster, i);
+                       cluster_set_null(&cluster->index);
+               }
        }
 
        error = swap_cgroup_swapon(p->type, maxpages);
@@ -2426,6 +2530,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        error = 0;
        goto out;
 bad_swap:
+       free_percpu(p->percpu_cluster);
+       p->percpu_cluster = NULL;
        if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
                set_blocksize(p->bdev, p->old_block_size);
                blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
@@ -2507,6 +2613,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
                goto unlock_out;
 
        count = p->swap_map[offset];
+
+       /*
+        * swapin_readahead() doesn't check if a swap entry is valid, so the
+        * swap entry could be SWAP_MAP_BAD. Check here with lock held.
+        */
+       if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
+               err = -ENOENT;
+               goto unlock_out;
+       }
+
        has_cache = count & SWAP_HAS_CACHE;
        count &= ~SWAP_HAS_CACHE;
        err = 0;