mm: do not use mm->nr_pmds on !MMU configurations

[linux-beck.git] / include / linux / mm.h
diff --git a/include/linux/mm.h b/include/linux/mm.h

index dd5ea3016fc4e854ded6b1e7c2e096224d83317f..bd52e2f140276d9aa22caf7753463f1de2960d85 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -138,7 +138,6 @@ extern unsigned int kobjsize(const void *objp);
  #define VM_ACCOUNT     0x00100000      /* Is a VM accounted object */
  #define VM_NORESERVE   0x00200000      /* should the VM suppress accounting */
  #define VM_HUGETLB     0x00400000      /* Huge TLB Page VM */
-#define VM_NONLINEAR   0x00800000      /* Is non-linear (remap_file_pages) */
  #define VM_ARCH_1      0x01000000      /* Architecture-specific flag */
  #define VM_ARCH_2      0x02000000
  #define VM_DONTDUMP    0x04000000      /* Do not include in the core dump */
@@ -206,21 +205,19 @@ extern unsigned int kobjsize(const void *objp);
  extern pgprot_t protection_map[16];
  
  #define FAULT_FLAG_WRITE       0x01    /* Fault was a write access */
-#define FAULT_FLAG_NONLINEAR   0x02    /* Fault was via a nonlinear mapping */
-#define FAULT_FLAG_MKWRITE     0x04    /* Fault was mkwrite of existing pte */
-#define FAULT_FLAG_ALLOW_RETRY 0x08    /* Retry fault if blocking */
-#define FAULT_FLAG_RETRY_NOWAIT        0x10    /* Don't drop mmap_sem and wait when retrying */
-#define FAULT_FLAG_KILLABLE    0x20    /* The fault task is in SIGKILL killable region */
-#define FAULT_FLAG_TRIED       0x40    /* second try */
-#define FAULT_FLAG_USER                0x80    /* The fault originated in userspace */
+#define FAULT_FLAG_MKWRITE     0x02    /* Fault was mkwrite of existing pte */
+#define FAULT_FLAG_ALLOW_RETRY 0x04    /* Retry fault if blocking */
+#define FAULT_FLAG_RETRY_NOWAIT        0x08    /* Don't drop mmap_sem and wait when retrying */
+#define FAULT_FLAG_KILLABLE    0x10    /* The fault task is in SIGKILL killable region */
+#define FAULT_FLAG_TRIED       0x20    /* Second try */
+#define FAULT_FLAG_USER                0x40    /* The fault originated in userspace */
  
  /*
   * vm_fault is filled by the pagefault handler and passed to the vma's
   * ->fault function. The vma's ->fault is responsible for returning a bitmask
   * of VM_FAULT_xxx flags that give details about how the fault was handled.
   *
- * pgoff should be used in favour of virtual_address, if possible. If pgoff
- * is used, one may implement ->remap_pages to get nonlinear mapping support.
+ * pgoff should be used in favour of virtual_address, if possible.
   */
  struct vm_fault {
         unsigned int flags;             /* FAULT_FLAG_xxx flags */
@@ -287,9 +284,13 @@ struct vm_operations_struct {
         struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
                                         unsigned long addr);
  #endif
-       /* called by sys_remap_file_pages() to populate non-linear mapping */
-       int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr,
-                          unsigned long size, pgoff_t pgoff);
+       /*
+        * Called by vm_normal_page() for special PTEs to find the
+        * page for @addr.  This is useful if the default behavior
+        * (using pte_page()) would not find the correct page.
+        */
+       struct page *(*find_special_page)(struct vm_area_struct *vma,
+                                         unsigned long addr);
  };
  
  struct mmu_gather;
@@ -446,6 +447,12 @@ static inline struct page *compound_head_by_tail(struct page *tail)
         return tail;
  }
  
+/*
+ * A compound page can be dismantled asynchronously by THP, and callers
+ * may look at an arbitrarily positioned struct page asynchronously, so
+ * checking the tail flag is racy. Handling that race requires an
+ * smp_rmb() before checking the tail flag; compound_head_by_tail() does it.
+ */
  static inline struct page *compound_head(struct page *page)
  {
         if (unlikely(PageTail(page)))
@@ -453,6 +460,18 @@ static inline struct page *compound_head(struct page *page)
         return page;
  }
  
+/*
+ * If a compound page is accessed synchronously, such as when accessing an
+ * already allocated page, there is no tail flag race to handle, so the
+ * tail flag can be checked directly without any synchronization primitive.
+ */
+static inline struct page *compound_head_fast(struct page *page)
+{
+       if (unlikely(PageTail(page)))
+               return page->first_page;
+       return page;
+}
+
  /*
   * The atomic page->_mapcount, starts from -1: so that transitions
   * both from it and to it can be tracked, using atomic_inc_and_test
@@ -465,7 +484,8 @@ static inline void page_mapcount_reset(struct page *page)
  
  static inline int page_mapcount(struct page *page)
  {
-       return atomic_read(&(page)->_mapcount) + 1;
+       VM_BUG_ON_PAGE(PageSlab(page), page);
+       return atomic_read(&page->_mapcount) + 1;
  }
  
  static inline int page_count(struct page *page)
@@ -531,7 +551,14 @@ static inline void get_page(struct page *page)
  static inline struct page *virt_to_head_page(const void *x)
  {
         struct page *page = virt_to_page(x);
-       return compound_head(page);
+
+       /*
+        * We don't need to worry about synchronization of the tail flag
+        * when we call virt_to_head_page(), since it is only called for an
+        * already allocated page and this page won't be freed until
+        * virt_to_head_page() has finished. So use the _fast variant.
+        */
+       return compound_head_fast(page);
  }
  
  /*
@@ -601,29 +628,28 @@ int split_free_page(struct page *page);
   * prototype for that function and accessor functions.
   * These are _only_ valid on the head of a PG_compound page.
   */
-typedef void compound_page_dtor(struct page *);
  
  static inline void set_compound_page_dtor(struct page *page,
                                                 compound_page_dtor *dtor)
  {
-       page[1].lru.next = (void *)dtor;
+       page[1].compound_dtor = dtor;
  }
  
  static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
  {
-       return (compound_page_dtor *)page[1].lru.next;
+       return page[1].compound_dtor;
  }
  
  static inline int compound_order(struct page *page)
  {
         if (!PageHead(page))
                 return 0;
-       return (unsigned long)page[1].lru.prev;
+       return page[1].compound_order;
  }
  
  static inline void set_compound_order(struct page *page, unsigned long order)
  {
-       page[1].lru.prev = (void *)order;
+       page[1].compound_order = order;
  }
  
  #ifdef CONFIG_MMU
@@ -1121,7 +1147,6 @@ extern void user_shm_unlock(size_t, struct user_struct *);
   * Parameter block passed down to zap_pte_range in exceptional cases.
   */
  struct zap_details {
-       struct vm_area_struct *nonlinear_vma;   /* Check page->index if set */
         struct address_space *check_mapping;    /* Check page->mapping if set */
         pgoff_t first_index;                    /* Lowest page->index to unmap */
         pgoff_t last_index;                     /* Highest page->index to unmap */
@@ -1139,8 +1164,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
  
  /**
   * mm_walk - callbacks for walk_page_range
- * @pgd_entry: if set, called for each non-empty PGD (top-level) entry
- * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
   * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
   *            this handler is required to be able to handle
   *            pmd_trans_huge() pmds.  They may simply choose to
@@ -1148,16 +1171,18 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
   * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
   * @pte_hole: if set, called for each hole at all levels
   * @hugetlb_entry: if set, called for each hugetlb entry
- *                *Caution*: The caller must hold mmap_sem() if @hugetlb_entry
- *                           is used.
+ * @test_walk: caller specific callback function to determine whether
+ *             we walk over the current vma or not. A positive returned
+ *             value means "do page table walk over the current vma,"
+ *             and a negative one means "abort current page table walk
+ *             right now." 0 means "skip the current vma."
+ * @mm:        mm_struct representing the target process of page table walk
+ * @vma:       vma currently walked (NULL if walking outside vmas)
+ * @private:   private data for callbacks' usage
   *
- * (see walk_page_range for more details)
+ * (see the comment on walk_page_range() for more details)
   */
  struct mm_walk {
-       int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
-                        unsigned long next, struct mm_walk *walk);
-       int (*pud_entry)(pud_t *pud, unsigned long addr,
-                        unsigned long next, struct mm_walk *walk);
         int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
                          unsigned long next, struct mm_walk *walk);
         int (*pte_entry)(pte_t *pte, unsigned long addr,
@@ -1167,12 +1192,16 @@ struct mm_walk {
         int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
                              unsigned long addr, unsigned long next,
                              struct mm_walk *walk);
+       int (*test_walk)(unsigned long addr, unsigned long next,
+                       struct mm_walk *walk);
         struct mm_struct *mm;
+       struct vm_area_struct *vma;
         void *private;
  };
  
  int walk_page_range(unsigned long addr, unsigned long end,
                 struct mm_walk *walk);
+int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk);
  void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
                 unsigned long end, unsigned long floor, unsigned long ceiling);
  int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
@@ -1236,6 +1265,17 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                     unsigned long start, unsigned long nr_pages,
                     int write, int force, struct page **pages,
                     struct vm_area_struct **vmas);
+long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
+                   unsigned long start, unsigned long nr_pages,
+                   int write, int force, struct page **pages,
+                   int *locked);
+long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+                              unsigned long start, unsigned long nr_pages,
+                              int write, int force, struct page **pages,
+                              unsigned int gup_flags);
+long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+                   unsigned long start, unsigned long nr_pages,
+                   int write, int force, struct page **pages);
  int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                         struct page **pages);
  struct kvec;
@@ -1407,14 +1447,45 @@ static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd,
  int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
  #endif
  
-#ifdef __PAGETABLE_PMD_FOLDED
+#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
  static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
                                                 unsigned long address)
  {
         return 0;
  }
+
+static inline void mm_nr_pmds_init(struct mm_struct *mm) {}
+
+static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+{
+       return 0;
+}
+
+static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
+static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
+
  #else
  int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
+
+static inline void mm_nr_pmds_init(struct mm_struct *mm)
+{
+       atomic_long_set(&mm->nr_pmds, 0);
+}
+
+static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+{
+       return atomic_long_read(&mm->nr_pmds);
+}
+
+static inline void mm_inc_nr_pmds(struct mm_struct *mm)
+{
+       atomic_long_inc(&mm->nr_pmds);
+}
+
+static inline void mm_dec_nr_pmds(struct mm_struct *mm)
+{
+       atomic_long_dec(&mm->nr_pmds);
+}
  #endif
  
  int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -1777,12 +1848,6 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
         for (vma = vma_interval_tree_iter_first(root, start, last);     \
              vma; vma = vma_interval_tree_iter_next(vma, start, last))
  
-static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
-                                       struct list_head *list)
-{
-       list_add_tail(&vma->shared.nonlinear, list);
-}
-
  void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
                                    struct rb_root *root);
  void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
@@ -2110,9 +2175,8 @@ int drop_caches_sysctl_handler(struct ctl_table *, int,
                                         void __user *, size_t *, loff_t *);
  #endif
  
-unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
-                               unsigned long nr_scanned,
-                               unsigned long nr_eligible);
+void drop_slab(void);
+void drop_slab_node(int nid);
  
  #ifndef CONFIG_MMU
  #define randomize_va_space 0