mm: support madvise(MADV_FREE)
/*
 *      linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

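/* Per-walk context handed to the page table walker used by MADV_FREE below. */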
struct madvise_free_private {
        struct vm_area_struct *vma;
        struct mmu_gather *tlb;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
        case MADV_FREE:
                return 0;
        default:
                /* be safe, default to 1. list exceptions explicitly */
                return 1;
        }
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
                     struct vm_area_struct **prev,
                     unsigned long start, unsigned long end, int behavior)
{
        struct mm_struct *mm = vma->vm_mm;
        int error = 0;
        pgoff_t pgoff;
        unsigned long new_flags = vma->vm_flags;

        switch (behavior) {
        case MADV_NORMAL:
                new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
                break;
        case MADV_SEQUENTIAL:
                new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
                break;
        case MADV_RANDOM:
                new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
                break;
        case MADV_DONTFORK:
                new_flags |= VM_DONTCOPY;
                break;
        case MADV_DOFORK:
                if (vma->vm_flags & VM_IO) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags &= ~VM_DONTCOPY;
                break;
        case MADV_DONTDUMP:
                new_flags |= VM_DONTDUMP;
                break;
        case MADV_DODUMP:
                if (new_flags & VM_SPECIAL) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags &= ~VM_DONTDUMP;
                break;
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
                error = ksm_madvise(vma, start, end, behavior, &new_flags);
                if (error)
                        goto out;
                break;
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
                error = hugepage_madvise(vma, &new_flags, behavior);
                if (error)
                        goto out;
                break;
        }

        if (new_flags == vma->vm_flags) {
                *prev = vma;
                goto out;
        }

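        /*
         * Try to merge the updated flags into an adjacent vma first; only
         * if that fails do we split, so that exactly [start, end) ends up
         * carrying the new flags.
         */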
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
                          vma->vm_file, pgoff, vma_policy(vma),
                          vma->vm_userfaultfd_ctx);
        if (*prev) {
                vma = *prev;
                goto success;
        }

        *prev = vma;

        if (start != vma->vm_start) {
                error = split_vma(mm, vma, start, 1);
                if (error)
                        goto out;
        }

        if (end != vma->vm_end) {
                error = split_vma(mm, vma, end, 0);
                if (error)
                        goto out;
        }

success:
        /*
         * vm_flags is protected by the mmap_sem held in write mode.
         */
        vma->vm_flags = new_flags;

out:
        if (error == -ENOMEM)
                error = -EAGAIN;
        return error;
}

#ifdef CONFIG_SWAP
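/*
 * Walk the ptes of an anonymous range and start asynchronous reads for any
 * swapped-out pages, so that a later fault finds them already in the swap
 * cache.  Used by MADV_WILLNEED when swap is configured.
 */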
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
        unsigned long end, struct mm_walk *walk)
{
        pte_t *orig_pte;
        struct vm_area_struct *vma = walk->private;
        unsigned long index;

        if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                return 0;

        for (index = start; index != end; index += PAGE_SIZE) {
                pte_t pte;
                swp_entry_t entry;
                struct page *page;
                spinlock_t *ptl;

                orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
                pte = *(orig_pte + ((index - start) / PAGE_SIZE));
                pte_unmap_unlock(orig_pte, ptl);

                if (pte_present(pte) || pte_none(pte))
                        continue;
                entry = pte_to_swp_entry(pte);
                if (unlikely(non_swap_entry(entry)))
                        continue;

                page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
                                                                vma, index);
                if (page)
                        page_cache_release(page);
        }

        return 0;
}

static void force_swapin_readahead(struct vm_area_struct *vma,
                unsigned long start, unsigned long end)
{
        struct mm_walk walk = {
                .mm = vma->vm_mm,
                .pmd_entry = swapin_walk_pmd_entry,
                .private = vma,
        };

        walk_page_range(start, end, &walk);

        lru_add_drain();        /* Push any new pages onto the LRU now */
}

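/*
 * Same idea for shmem/tmpfs mappings: swapped-out pages are recorded as
 * exceptional entries in the mapping's radix tree, so look them up there
 * and start asynchronous reads.
 */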
static void force_shm_swapin_readahead(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct address_space *mapping)
{
        pgoff_t index;
        struct page *page;
        swp_entry_t swap;

        for (; start < end; start += PAGE_SIZE) {
                index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

                page = find_get_entry(mapping, index);
                if (!radix_tree_exceptional_entry(page)) {
                        if (page)
                                page_cache_release(page);
                        continue;
                }
                swap = radix_to_swp_entry(page);
                page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
                                                                NULL, 0);
                if (page)
                        page_cache_release(page);
        }

        lru_add_drain();        /* Push any new pages onto the LRU now */
}
#endif          /* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end)
{
        struct file *file = vma->vm_file;

#ifdef CONFIG_SWAP
        if (!file) {
                *prev = vma;
                force_swapin_readahead(vma, start, end);
                return 0;
        }

        if (shmem_mapping(file->f_mapping)) {
                *prev = vma;
                force_shm_swapin_readahead(vma, start, end,
                                        file->f_mapping);
                return 0;
        }
#else
        if (!file)
                return -EBADF;
#endif

        if (IS_DAX(file_inode(file))) {
                /* no bad return value, but ignore advice */
                return 0;
        }

        *prev = vma;
        start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
        if (end > vma->vm_end)
                end = vma->vm_end;
        end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

        force_page_cache_readahead(file->f_mapping, file, start, end - start);
        return 0;
}

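/*
 * Core of MADV_FREE: for every present pte in the range, drop any swap
 * cache backing and mark the page old and clean.  Reclaim can then discard
 * the page outright instead of swapping it out, unless the application
 * dirties it again first.
 */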
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)
{
        struct madvise_free_private *fp = walk->private;
        struct mmu_gather *tlb = fp->tlb;
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = fp->vma;
        spinlock_t *ptl;
        pte_t *pte, ptent;
        struct page *page;

        split_huge_page_pmd(vma, addr, pmd);
        if (pmd_trans_unstable(pmd))
                return 0;

        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        arch_enter_lazy_mmu_mode();
        for (; addr != end; pte++, addr += PAGE_SIZE) {
                ptent = *pte;

                if (!pte_present(ptent))
                        continue;

                page = vm_normal_page(vma, addr, ptent);
                if (!page)
                        continue;

                if (PageSwapCache(page)) {
                        if (!trylock_page(page))
                                continue;

                        if (!try_to_free_swap(page)) {
                                unlock_page(page);
                                continue;
                        }

                        ClearPageDirty(page);
                        unlock_page(page);
                }

                /*
                 * Some architectures (e.g. PPC) don't update the TLB
                 * with set_pte_at() and tlb_remove_tlb_entry() alone,
                 * so for portability clear the pte first and then
                 * re-install it as old and clean.
                 */
                ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                tlb->fullmm);
                ptent = pte_mkold(ptent);
                ptent = pte_mkclean(ptent);
                set_pte_at(mm, addr, pte, ptent);
                tlb_remove_tlb_entry(tlb, pte, addr);
        }
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(pte - 1, ptl);
        cond_resched();
        return 0;
}

static void madvise_free_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end)
{
        struct madvise_free_private fp = {
                .vma = vma,
                .tlb = tlb,
        };

        struct mm_walk free_walk = {
                .pmd_entry = madvise_free_pte_range,
                .mm = vma->vm_mm,
                .private = &fp,
        };

        BUG_ON(addr >= end);
        tlb_start_vma(tlb, vma);
        walk_page_range(addr, end, &free_walk);
        tlb_end_vma(tlb, vma);
}

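/*
 * Apply MADV_FREE to the portion of a single anonymous vma that overlaps
 * [start_addr, end_addr), with the usual mmu_gather and mmu_notifier
 * bracketing around the page table walk.
 */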
static int madvise_free_single_vma(struct vm_area_struct *vma,
                        unsigned long start_addr, unsigned long end_addr)
{
        unsigned long start, end;
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_gather tlb;

        if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
                return -EINVAL;

        /* MADV_FREE currently only works on anonymous vmas */
        if (vma->vm_file)
                return -EINVAL;

        start = max(vma->vm_start, start_addr);
        if (start >= vma->vm_end)
                return -EINVAL;
        end = min(vma->vm_end, end_addr);
        if (end <= vma->vm_start)
                return -EINVAL;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm, start, end);
        update_hiwater_rss(mm);

        mmu_notifier_invalidate_range_start(mm, start, end);
        madvise_free_page_range(&tlb, vma, start, end);
        mmu_notifier_invalidate_range_end(mm, start, end);
        tlb_finish_mmu(&tlb, start, end);

        return 0;
}

static long madvise_free(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end)
{
        *prev = vma;
        return madvise_free_single_vma(vma, start, end);
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end)
{
        *prev = vma;
        if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
                return -EINVAL;

        zap_page_range(vma, start, end - start, NULL);
        return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
                                struct vm_area_struct **prev,
                                unsigned long start, unsigned long end)
{
        loff_t offset;
        int error;
        struct file *f;

        *prev = NULL;   /* tell sys_madvise we drop mmap_sem */

        if (vma->vm_flags & VM_LOCKED)
                return -EINVAL;

        f = vma->vm_file;

        if (!f || !f->f_mapping || !f->f_mapping->host)
                return -EINVAL;

        if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
                return -EACCES;

        offset = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

        /*
         * Filesystem's fallocate may need to take i_mutex.  We need to
         * explicitly grab a reference because the vma (and hence the
         * vma's reference to the file) can go away as soon as we drop
         * mmap_sem.
         */
        get_file(f);
        up_read(&current->mm->mmap_sem);
        error = vfs_fallocate(f,
                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                                offset, end - start);
        fput(f);
        down_read(&current->mm->mmap_sem);
        return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
{
        struct page *p;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        for (; start < end; start += PAGE_SIZE <<
                                compound_order(compound_head(p))) {
                int ret;

                ret = get_user_pages_fast(start, 1, 0, &p);
                if (ret != 1)
                        return ret;

                if (PageHWPoison(p)) {
                        put_page(p);
                        continue;
                }
                if (bhv == MADV_SOFT_OFFLINE) {
                        pr_info("Soft offlining page %#lx at %#lx\n",
                                page_to_pfn(p), start);
                        ret = soft_offline_page(p, MF_COUNT_INCREASED);
                        if (ret)
                                return ret;
                        continue;
                }
                pr_info("Injecting memory failure for page %#lx at %#lx\n",
                       page_to_pfn(p), start);
                /* Ignore return value for now */
                memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
        }
        return 0;
}
#endif

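/* Dispatch one vma's worth of advice to the appropriate handler. */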
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
                unsigned long start, unsigned long end, int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
                return madvise_remove(vma, prev, start, end);
        case MADV_WILLNEED:
                return madvise_willneed(vma, prev, start, end);
        case MADV_FREE:
                /*
                 * XXX: In this implementation, MADV_FREE works like
                 * MADV_DONTNEED on a swapless system or when swap is full.
                 */
                if (get_nr_swap_pages() > 0)
                        return madvise_free(vma, prev, start, end);
                /* fall through */
        case MADV_DONTNEED:
                return madvise_dontneed(vma, prev, start, end);
        default:
                return madvise_behavior(vma, prev, start, end, behavior);
        }
}

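/*
 * Reject advice values this kernel cannot honour; the KSM and transparent
 * hugepage hints are only accepted when the corresponding support is
 * built in.
 */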
static bool
madvise_behavior_valid(int behavior)
{
        switch (behavior) {
        case MADV_DOFORK:
        case MADV_DONTFORK:
        case MADV_NORMAL:
        case MADV_SEQUENTIAL:
        case MADV_RANDOM:
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
        case MADV_FREE:
#ifdef CONFIG_KSM
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
#endif
        case MADV_DONTDUMP:
        case MADV_DODUMP:
                return true;

        default:
                return false;
        }
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *              results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *              on any access, since it is unlikely that the
 *              application will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *              once, so they can be aggressively read ahead, and
 *              can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *              some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *              so the kernel can free resources associated with it.
 *  MADV_REMOVE - the application wants to free up the given range of
 *              pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *              typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *              this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *              "behavior" is not a valid value, or application
 *              is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *              mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
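/*
 * Illustrative userspace sketch (not part of this kernel source): an
 * allocator handing a no-longer-needed anonymous region back to the kernel
 * lazily with MADV_FREE.  Error handling is omitted and "len" is a
 * placeholder for a page-aligned length.
 *
 *      char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *      ... use buf ...
 *      madvise(buf, len, MADV_FREE);
 *
 * After the call the kernel may discard the pages under memory pressure
 * instead of swapping them out; writing to the range again before that
 * happens keeps the data.
 */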
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
        unsigned long end, tmp;
        struct vm_area_struct *vma, *prev;
        int unmapped_error = 0;
        int error = -EINVAL;
        int write;
        size_t len;
        struct blk_plug plug;

#ifdef CONFIG_MEMORY_FAILURE
        if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
                return madvise_hwpoison(behavior, start, start+len_in);
#endif
        if (!madvise_behavior_valid(behavior))
                return error;

        if (start & ~PAGE_MASK)
                return error;
        len = (len_in + ~PAGE_MASK) & PAGE_MASK;

        /* Check to see whether len was rounded up from small -ve to zero */
        if (len_in && !len)
                return error;

        end = start + len;
        if (end < start)
                return error;

        error = 0;
        if (end == start)
                return error;

        write = madvise_need_mmap_write(behavior);
        if (write)
                down_write(&current->mm->mmap_sem);
        else
                down_read(&current->mm->mmap_sem);

        /*
         * If the interval [start,end) covers some unmapped address
         * ranges, just ignore them, but return -ENOMEM at the end.
         * - different from the way of handling in mlock etc.
         */
        vma = find_vma_prev(current->mm, start, &prev);
        if (vma && start > vma->vm_start)
                prev = vma;

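        /*
         * Plug block I/O so that requests issued while applying the advice
         * (e.g. MADV_WILLNEED readahead) can be batched and merged.
         */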
        blk_start_plug(&plug);
        for (;;) {
                /* Still start < end. */
                error = -ENOMEM;
                if (!vma)
                        goto out;

                /* Here start < (end|vma->vm_end). */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;
                        if (start >= end)
                                goto out;
                }

                /* Here vma->vm_start <= start < (end|vma->vm_end) */
                tmp = vma->vm_end;
                if (end < tmp)
                        tmp = end;

                /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
                error = madvise_vma(vma, &prev, start, tmp, behavior);
                if (error)
                        goto out;
                start = tmp;
                if (prev && start < prev->vm_end)
                        start = prev->vm_end;
                error = unmapped_error;
                if (start >= end)
                        goto out;
                if (prev)
                        vma = prev->vm_next;
                else    /* madvise_remove dropped mmap_sem */
                        vma = find_vma(current->mm, start);
        }
out:
        blk_finish_plug(&plug);
        if (write)
                up_write(&current->mm->mmap_sem);
        else
                up_read(&current->mm->mmap_sem);

        return error;
}