From: Michel Lespinasse Date: Wed, 20 Feb 2013 02:14:06 +0000 (+1100) Subject: mm: introduce mm_populate() for populating new vmas X-Git-Tag: next-20130220~1^2~568 X-Git-Url: https://git.karo-electronics.de/?a=commitdiff_plain;h=51486b8ee3d286628695b213c2528486593e16ef;p=karo-tx-linux.git mm: introduce mm_populate() for populating new vmas When creating new mappings using the MAP_POPULATE / MAP_LOCKED flags (or with MCL_FUTURE in effect), we want to populate the pages within the newly created vmas. This may take a while as we may have to read pages from disk, so ideally we want to do this outside of the write-locked mmap_sem region. This change introduces mm_populate(), which is used to defer populating such mappings until after the mmap_sem write lock has been released. This is implemented as a generalization of the former do_mlock_pages(), which accomplished the same task but was used during mlock() / mlockall(). Signed-off-by: Michel Lespinasse Reported-by: Andy Lutomirski Acked-by: Rik van Riel Tested-by: Andy Lutomirski Cc: Greg Ungerer Cc: David Howells Signed-off-by: Andrew Morton --- diff --git a/fs/aio.c b/fs/aio.c index 71f613cf4a85..82eec7c7b4bb 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -103,6 +103,7 @@ static int aio_setup_ring(struct kioctx *ctx) unsigned nr_events = ctx->max_reqs; unsigned long size; int nr_pages; + bool populate; /* Compensate for the ring buffer's head/tail overlap entry */ nr_events += 2; /* 1 is required, 2 for good luck */ @@ -129,7 +130,8 @@ static int aio_setup_ring(struct kioctx *ctx) down_write(&ctx->mm->mmap_sem); info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, 0); + MAP_ANONYMOUS|MAP_PRIVATE, 0, + &populate); if (IS_ERR((void *)info->mmap_base)) { up_write(&ctx->mm->mmap_sem); info->mmap_size = 0; @@ -147,6 +149,8 @@ static int aio_setup_ring(struct kioctx *ctx) aio_free_ring(ctx); return -EAGAIN; } + if (populate) + mm_populate(info->mmap_base, info->mmap_size); 
ctx->user_id = info->mmap_base; diff --git a/include/linux/mm.h b/include/linux/mm.h index 82b3b3e1b9c8..0ace5feea4c4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1476,11 +1476,23 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff); -extern unsigned long do_mmap_pgoff(struct file *, unsigned long, - unsigned long, unsigned long, - unsigned long, unsigned long); +extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, unsigned long flags, + unsigned long pgoff, bool *populate); extern int do_munmap(struct mm_struct *, unsigned long, size_t); +#ifdef CONFIG_MMU +extern int __mm_populate(unsigned long addr, unsigned long len, + int ignore_errors); +static inline void mm_populate(unsigned long addr, unsigned long len) +{ + /* Ignore errors */ + (void) __mm_populate(addr, len, 1); +} +#else +static inline void mm_populate(unsigned long addr, unsigned long len) {} +#endif + /* These take the mm semaphore themselves */ extern unsigned long vm_brk(unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); diff --git a/ipc/shm.c b/ipc/shm.c index 4fa6d8fee730..9f047ba69e62 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -967,11 +967,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, unsigned long flags; unsigned long prot; int acc_mode; - unsigned long user_addr; struct ipc_namespace *ns; struct shm_file_data *sfd; struct path path; fmode_t f_mode; + bool populate = false; err = -EINVAL; if (shmid < 0) @@ -1070,13 +1070,15 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, goto invalid; } - user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0); - *raddr = user_addr; + addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); + *raddr = 
addr; err = 0; - if (IS_ERR_VALUE(user_addr)) - err = (long)user_addr; + if (IS_ERR_VALUE(addr)) + err = (long)addr; invalid: up_write(&current->mm->mmap_sem); + if (populate) + mm_populate(addr, size); out_fput: fput(file); diff --git a/mm/mlock.c b/mm/mlock.c index c9bd528b01d2..a296a49865df 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -416,7 +416,14 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } -static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) +/* + * __mm_populate - populate and/or mlock pages within a range of address space. + * + * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap + * flags. VMAs must be already marked with the desired vm_flags, and + * mmap_sem must not be held. + */ +int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) { struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; @@ -498,7 +505,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) error = do_mlock(start, len, 1); up_write(&current->mm->mmap_sem); if (!error) - error = do_mlock_pages(start, len, 0); + error = __mm_populate(start, len, 0); return error; } @@ -564,10 +571,8 @@ SYSCALL_DEFINE1(mlockall, int, flags) capable(CAP_IPC_LOCK)) ret = do_mlockall(flags); up_write(&current->mm->mmap_sem); - if (!ret && (flags & MCL_CURRENT)) { - /* Ignore errors */ - do_mlock_pages(0, TASK_SIZE, 1); - } + if (!ret && (flags & MCL_CURRENT)) + mm_populate(0, TASK_SIZE); out: return ret; } diff --git a/mm/mmap.c b/mm/mmap.c index 09da0b264982..9b12e3047a86 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1154,12 +1154,15 @@ static inline unsigned long round_hint_to_min(unsigned long hint) unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long pgoff) + unsigned long flags, unsigned long pgoff, + bool *populate) { struct mm_struct * mm = current->mm; struct inode *inode; vm_flags_t vm_flags; + *populate 
= false; + /* * Does the application expect PROT_READ to imply PROT_EXEC? * @@ -1280,7 +1283,12 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } } - return mmap_region(file, addr, len, flags, vm_flags, pgoff); + addr = mmap_region(file, addr, len, flags, vm_flags, pgoff); + if (!IS_ERR_VALUE(addr) && + ((vm_flags & VM_LOCKED) || + (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) + *populate = true; + return addr; } SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, @@ -1531,10 +1539,12 @@ out: vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { - if (!mlock_vma_pages_range(vma, addr, addr + len)) + if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current->mm))) mm->locked_vm += (len >> PAGE_SHIFT); - } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) - make_pages_present(addr, addr + len); + else + vma->vm_flags &= ~VM_LOCKED; + } if (file) uprobe_mmap(vma); diff --git a/mm/nommu.c b/mm/nommu.c index b20db4e22263..7296a5a280e7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1250,7 +1250,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long pgoff) + unsigned long pgoff, + bool *populate) { struct vm_area_struct *vma; struct vm_region *region; @@ -1260,6 +1261,8 @@ unsigned long do_mmap_pgoff(struct file *file, kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); + *populate = false; + /* decide whether we should attempt the mapping, and if so what sort of * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, diff --git a/mm/util.c b/mm/util.c index c55e26b17d93..13467e043e9e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -355,12 +355,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, { unsigned long ret; struct mm_struct *mm = current->mm; + bool populate; ret = security_mmap_file(file, prot, flag); if (!ret) { 
down_write(&mm->mmap_sem); - ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff); + ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, + &populate); up_write(&mm->mmap_sem); + if (!IS_ERR_VALUE(ret) && populate) + mm_populate(ret, len); } return ret; }