Merge misc fixes from Andrew Morton.
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (22 commits)
pidns: fix free_pid() to handle the first fork failure
ipc,msg: prevent race with rmid in msgsnd,msgrcv
ipc/sem.c: update sem_otime for all operations
mm/hwpoison: fix the lack of one reference count against poisoned page
mm/hwpoison: fix false report on 2nd attempt at page recovery
mm/hwpoison: fix test for a transparent huge page
mm/hwpoison: fix traversal of hugetlbfs pages to avoid printk flood
block: change config option name for cmdline partition parsing
mm/mlock.c: prevent walking off the end of a pagetable in no-pmd configuration
mm: avoid reinserting isolated balloon pages into LRU lists
arch/parisc/mm/fault.c: fix uninitialized variable usage
include/asm-generic/vtime.h: avoid zero-length file
nilfs2: fix issue with race condition of competition between segments for dirty blocks
Documentation/kernel-parameters.txt: replace kernelcore with Movable
mm/bounce.c: fix a regression where MS_SNAP_STABLE (stable pages snapshotting) was ignored
kernel/kmod.c: check for NULL in call_usermodehelper_exec()
ipc/sem.c: synchronize the proc interface
ipc/sem.c: optimize sem_lock()
ipc/sem.c: fix race in sem_lock()
mm/compaction.c: periodically schedule when freeing pages
...
- Generic Block Device Capability (/sys/block/<device>/capability)
cfq-iosched.txt
- CFQ IO scheduler tunables
+cmdline-partition.txt
+ - how to specify block device partitions on kernel command line
data-integrity.txt
- Block data integrity
deadline-iosched.txt
-Embedded device command line partition
+Embedded device command line partition parsing
=====================================================================
-Read block device partition table from command line.
-The partition used for fixed block device (eMMC) embedded device.
-It is no MBR, save storage space. Bootloader can be easily accessed
+Support for reading the block device partition table from the command line.
+It is typically used for fixed block (eMMC) embedded devices.
+It has no MBR, so saves storage space. Bootloader can be easily accessed
by absolute address of data on the block device.
Users can easily change the partition.
Format: <io>,<irq>,<mode>
See header of drivers/net/hamradio/baycom_ser_hdx.c.
+ blkdevparts= Manual partition parsing of block device(s) for
+ embedded devices based on command line input.
+ See Documentation/block/cmdline-partition.txt
+
boot_delay= Milliseconds to delay each printk during boot.
Values larger than 10 seconds (10000) are changed to
no delay (0).
pages. In the event, a node is too small to have both
kernelcore and Movable pages, kernelcore pages will
take priority and other nodes will have a larger number
- of kernelcore pages. The Movable zone is used for the
+ of Movable pages. The Movable zone is used for the
allocation of pages that may be reclaimed or moved
by the page migration subsystem. This means that
HugeTLB pages may not be allocated from this zone.
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
+
+ acc_type = parisc_acctyp(code, regs->iir);
+
if (acc_type & VM_WRITE)
flags |= FAULT_FLAG_WRITE;
retry:
good_area:
- acc_type = parisc_acctyp(code,regs->iir);
-
if ((vma->vm_flags & acc_type) != acc_type)
goto bad_area;
See Documentation/cgroups/blkio-controller.txt for more information.
-config CMDLINE_PARSER
+config BLK_CMDLINE_PARSER
bool "Block device command line partition parser"
default n
---help---
- Parsing command line, get the partitions information.
+ Enabling this option allows you to specify the partition layout from
+ the kernel boot args. This is typically of use for embedded devices
+ which don't otherwise have any standardized method for listing the
+ partitions on a block device.
+
+ See Documentation/block/cmdline-partition.txt for more information.
menu "Partition Types"
obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
-obj-$(CONFIG_CMDLINE_PARSER) += cmdline-parser.o
+obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
config CMDLINE_PARTITION
bool "Command line partition support" if PARTITION_ADVANCED
- select CMDLINE_PARSER
+ select BLK_CMDLINE_PARSER
help
- Say Y here if you would read the partitions table from bootargs.
+ Say Y here if you want to read the partition table from bootargs.
The format for the command line is just like mtdparts.
* Copyright (C) 2013 HUAWEI
* Author: Cai Zhiyong <caizhiyong@huawei.com>
*
- * Read block device partition table from command line.
- * The partition used for fixed block device (eMMC) embedded device.
- * It is no MBR, save storage space. Bootloader can be easily accessed
+ * Read block device partition table from the command line.
+ * Typically used for fixed block (eMMC) embedded devices.
+ * It has no MBR, so saves storage space. Bootloader can be easily accessed
* by absolute address of data on the block device.
* Users can easily change the partition.
*
* The format for the command line is just like mtdparts.
*
- * Verbose config please reference "Documentation/block/cmdline-partition.txt"
+ * For further information, see "Documentation/block/cmdline-partition.txt"
*
*/
* long file_ofs
* followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL...
*/
-static void fill_files_note(struct memelfnote *note)
+static int fill_files_note(struct memelfnote *note)
{
struct vm_area_struct *vma;
unsigned count, size, names_ofs, remaining, n;
names_ofs = (2 + 3 * count) * sizeof(data[0]);
alloc:
if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */
- goto err;
+ return -EINVAL;
size = round_up(size, PAGE_SIZE);
data = vmalloc(size);
if (!data)
- goto err;
+ return -ENOMEM;
start_end_ofs = data + 2;
name_base = name_curpos = ((char *)data) + names_ofs;
size = name_curpos - (char *)data;
fill_note(note, "CORE", NT_FILE, size, data);
- err: ;
+ return 0;
}
#ifdef CORE_DUMP_USE_REGSET
fill_auxv_note(&info->auxv, current->mm);
info->size += notesize(&info->auxv);
- fill_files_note(&info->files);
- info->size += notesize(&info->files);
+ if (fill_files_note(&info->files) == 0)
+ info->size += notesize(&info->files);
return 1;
}
return 0;
if (first && !writenote(&info->auxv, file, foffset))
return 0;
- if (first && !writenote(&info->files, file, foffset))
+ if (first && info->files.data &&
+ !writenote(&info->files, file, foffset))
return 0;
for (i = 1; i < info->thread_notes; ++i)
struct elf_note_info {
struct memelfnote *notes;
+ struct memelfnote *notes_files;
struct elf_prstatus *prstatus; /* NT_PRSTATUS */
struct elf_prpsinfo *psinfo; /* NT_PRPSINFO */
struct list_head thread_list;
fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo);
fill_auxv_note(info->notes + 3, current->mm);
- fill_files_note(info->notes + 4);
+ info->numnote = 4;
- info->numnote = 5;
+ if (fill_files_note(info->notes + info->numnote) == 0) {
+ info->notes_files = info->notes + info->numnote;
+ info->numnote++;
+ }
/* Try to dump the FPU. */
info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
kfree(list_entry(tmp, struct elf_thread_status, list));
}
- /* Free data allocated by fill_files_note(): */
- vfree(info->notes[4].data);
+ /* Free data possibly allocated by fill_files_note(): */
+ if (info->notes_files)
+ vfree(info->notes_files->data);
kfree(info->prstatus);
kfree(info->psinfo);
struct vm_area_struct *vma, *gate_vma;
struct elfhdr *elf = NULL;
loff_t offset = 0, dataoff, foffset;
- struct elf_note_info info;
+ struct elf_note_info info = { };
struct elf_phdr *phdr4note = NULL;
struct elf_shdr *shdr4extnum = NULL;
Elf_Half e_phnum;
clear_buffer_nilfs_volatile(bh);
clear_buffer_nilfs_checked(bh);
clear_buffer_nilfs_redirected(bh);
+ clear_buffer_async_write(bh);
clear_buffer_dirty(bh);
if (nilfs_page_buffers_clean(page))
__nilfs_clear_page_dirty(page);
"discard block %llu, size %zu",
(u64)bh->b_blocknr, bh->b_size);
}
+ clear_buffer_async_write(bh);
clear_buffer_dirty(bh);
clear_buffer_nilfs_volatile(bh);
clear_buffer_nilfs_checked(bh);
bh = head = page_buffers(page);
do {
- if (!buffer_dirty(bh))
+ if (!buffer_dirty(bh) || buffer_async_write(bh))
continue;
get_bh(bh);
list_add_tail(&bh->b_assoc_buffers, listp);
for (i = 0; i < pagevec_count(&pvec); i++) {
bh = head = page_buffers(pvec.pages[i]);
do {
- if (buffer_dirty(bh)) {
+ if (buffer_dirty(bh) &&
+ !buffer_async_write(bh)) {
get_bh(bh);
list_add_tail(&bh->b_assoc_buffers,
listp);
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
+ set_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page) {
lock_page(bd_page);
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
+ set_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
if (bh->b_page != bd_page) {
lock_page(bd_page);
list_for_each_entry(segbuf, logs, sb_list) {
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
+ clear_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page)
end_page_writeback(bd_page);
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
+ clear_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
if (bh->b_page != bd_page) {
end_page_writeback(bd_page);
b_assoc_buffers) {
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
+ clear_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page)
end_page_writeback(bd_page);
b_assoc_buffers) {
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
+ clear_buffer_async_write(bh);
clear_buffer_delay(bh);
clear_buffer_nilfs_volatile(bh);
clear_buffer_nilfs_redirected(bh);
+/* no content, but patch(1) dislikes empty files */
return false;
}
+/*
+ * isolated_balloon_page - identify an isolated balloon page on private
+ * compaction/migration page lists.
+ *
+ * After a compaction thread isolates a balloon page for migration, it raises
+ * the page refcount to prevent concurrent compaction threads from re-isolating
+ * the same page. For that reason putback_movable_pages(), or other routines
+ * that need to identify isolated balloon pages on private pagelists, cannot
+ * rely on balloon_page_movable() to accomplish the task.
+ */
+static inline bool isolated_balloon_page(struct page *page)
+{
+ /* Already isolated balloon pages, by default, have a raised refcount */
+ if (page_flags_cleared(page) && !page_mapped(page) &&
+ page_count(page) >= 2)
+ return __is_movable_balloon_page(page);
+
+ return false;
+}
+
/*
* balloon_page_insert - insert a page into the balloon's page list and make
* the page->mapping assignment accordingly.
return false;
}
+static inline bool isolated_balloon_page(struct page *page)
+{
+ return false;
+}
+
static inline bool balloon_page_isolate(struct page *page)
{
return false;
if (ipcperms(ns, &msq->q_perm, S_IWUGO))
goto out_unlock0;
+ /* raced with RMID? */
+ if (msq->q_perm.deleted) {
+ err = -EIDRM;
+ goto out_unlock0;
+ }
+
err = security_msg_queue_msgsnd(msq, msg, msgflg);
if (err)
goto out_unlock0;
goto out_unlock1;
ipc_lock_object(&msq->q_perm);
+
+ /* raced with RMID? */
+ if (msq->q_perm.deleted) {
+ msg = ERR_PTR(-EIDRM);
+ goto out_unlock0;
+ }
+
msg = find_msg(msq, &msgtyp, mode);
if (!IS_ERR(msg)) {
/*
ipc_rcu_free(head);
}
+/*
+ * Wait until all currently ongoing simple ops have completed.
+ * Caller must own sem_perm.lock.
+ * New simple ops cannot start, because simple ops first check
+ * that sem_perm.lock is free.
+ * that a) sem_perm.lock is free and b) complex_count is 0.
+ */
+static void sem_wait_array(struct sem_array *sma)
+{
+ int i;
+ struct sem *sem;
+
+ if (sma->complex_count) {
+ /* The thread that increased sma->complex_count waited on
+ * all sem->lock locks. Thus we don't need to wait again.
+ */
+ return;
+ }
+
+ for (i = 0; i < sma->sem_nsems; i++) {
+ sem = sma->sem_base + i;
+ spin_unlock_wait(&sem->lock);
+ }
+}
+
/*
* If the request contains only one semaphore operation, and there are
* no complex transactions pending, lock only the semaphore involved.
* Otherwise, lock the entire semaphore array, since we either have
* multiple semaphores in our own semops, or we need to look at
* semaphores from other pending complex operations.
- *
- * Carefully guard against sma->complex_count changing between zero
- * and non-zero while we are spinning for the lock. The value of
- * sma->complex_count cannot change while we are holding the lock,
- * so sem_unlock should be fine.
- *
- * The global lock path checks that all the local locks have been released,
- * checking each local lock once. This means that the local lock paths
- * cannot start their critical sections while the global lock is held.
*/
static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
int nsops)
{
- int locknum;
- again:
- if (nsops == 1 && !sma->complex_count) {
- struct sem *sem = sma->sem_base + sops->sem_num;
+ struct sem *sem;
- /* Lock just the semaphore we are interested in. */
- spin_lock(&sem->lock);
+ if (nsops != 1) {
+ /* Complex operation - acquire a full lock */
+ ipc_lock_object(&sma->sem_perm);
- /*
- * If sma->complex_count was set while we were spinning,
- * we may need to look at things we did not lock here.
+ /* And wait until all simple ops that are processed
+ * right now have dropped their locks.
*/
- if (unlikely(sma->complex_count)) {
- spin_unlock(&sem->lock);
- goto lock_array;
- }
+ sem_wait_array(sma);
+ return -1;
+ }
+ /*
+ * Only one semaphore affected - try to optimize locking.
+ * The rules are:
+ * - optimized locking is possible if no complex operation
+ * is either enqueued or processed right now.
+ * - The test for enqueued complex ops is simple:
+ * sma->complex_count != 0
+ * - Testing for complex ops that are processed right now is
+ * a bit more difficult. Complex ops acquire the full lock
+ * and first wait that the running simple ops have completed.
+ * (see above)
+ * Thus: If we own a simple lock and the global lock is free
+ * and complex_count is now 0, then it will stay 0 and
+ * thus just locking sem->lock is sufficient.
+ */
+ sem = sma->sem_base + sops->sem_num;
+
+ if (sma->complex_count == 0) {
/*
- * Another process is holding the global lock on the
- * sem_array; we cannot enter our critical section,
- * but have to wait for the global lock to be released.
+ * It appears that no complex operation is around.
+ * Acquire the per-semaphore lock.
*/
- if (unlikely(spin_is_locked(&sma->sem_perm.lock))) {
- spin_unlock(&sem->lock);
- spin_unlock_wait(&sma->sem_perm.lock);
- goto again;
+ spin_lock(&sem->lock);
+
+ /* Then check that the global lock is free */
+ if (!spin_is_locked(&sma->sem_perm.lock)) {
+ /* spin_is_locked() is not a memory barrier */
+ smp_mb();
+
+ /* Now repeat the test of complex_count:
+ * It can't change anymore until we drop sem->lock.
+ * Thus: if is now 0, then it will stay 0.
+ */
+ if (sma->complex_count == 0) {
+ /* fast path successful! */
+ return sops->sem_num;
+ }
}
+ spin_unlock(&sem->lock);
+ }
- locknum = sops->sem_num;
+ /* slow path: acquire the full lock */
+ ipc_lock_object(&sma->sem_perm);
+
+ if (sma->complex_count == 0) {
+ /* False alarm:
+ * There is no complex operation, thus we can switch
+ * back to the fast path.
+ */
+ spin_lock(&sem->lock);
+ ipc_unlock_object(&sma->sem_perm);
+ return sops->sem_num;
} else {
- int i;
- /*
- * Lock the semaphore array, and wait for all of the
- * individual semaphore locks to go away. The code
- * above ensures no new single-lock holders will enter
- * their critical section while the array lock is held.
+ /* Not a false alarm, thus complete the sequence for a
+ * full lock.
*/
- lock_array:
- ipc_lock_object(&sma->sem_perm);
- for (i = 0; i < sma->sem_nsems; i++) {
- struct sem *sem = sma->sem_base + i;
- spin_unlock_wait(&sem->lock);
- }
- locknum = -1;
+ sem_wait_array(sma);
+ return -1;
}
- return locknum;
}
static inline void sem_unlock(struct sem_array *sma, int locknum)
return semop_completed;
}
+/**
+ * set_semotime(sma, sops) - set sem_otime
+ * @sma: semaphore array
+ * @sops: operations that modified the array, may be NULL
+ *
+ * sem_otime is replicated to avoid cache line trashing.
+ * This function sets one instance to the current time.
+ */
+static void set_semotime(struct sem_array *sma, struct sembuf *sops)
+{
+ if (sops == NULL) {
+ sma->sem_base[0].sem_otime = get_seconds();
+ } else {
+ sma->sem_base[sops[0].sem_num].sem_otime =
+ get_seconds();
+ }
+}
+
/**
* do_smart_update(sma, sops, nsops, otime, pt) - optimized update_queue
* @sma: semaphore array
}
}
}
- if (otime) {
- if (sops == NULL) {
- sma->sem_base[0].sem_otime = get_seconds();
- } else {
- sma->sem_base[sops[0].sem_num].sem_otime =
- get_seconds();
- }
- }
+ if (otime)
+ set_semotime(sma, sops);
}
-
/* The following counts are associated to each semaphore:
* semncnt number of tasks waiting on semval being nonzero
* semzcnt number of tasks waiting on semval being zero
error = perform_atomic_semop(sma, sops, nsops, un,
task_tgid_vnr(current));
- if (error <= 0) {
- if (alter && error == 0)
+ if (error == 0) {
+ /* If the operation was successful, then do
+ * the required updates.
+ */
+ if (alter)
do_smart_update(sma, sops, nsops, 1, &tasks);
-
- goto out_unlock_free;
+ else
+ set_semotime(sma, sops);
}
+ if (error <= 0)
+ goto out_unlock_free;
/* We need to sleep on this operation, so we put the current
* task into the pending queue and go to sleep.
struct sem_array *sma = it;
time_t sem_otime;
+ /*
+ * The proc interface isn't aware of sem_lock(), it calls
+ * ipc_lock_object() directly (in sysvipc_find_ipc).
+ * In order to stay compatible with sem_lock(), we must wait until
+ * all simple semop() calls have left their critical regions.
+ */
+ sem_wait_array(sma);
+
sem_otime = get_semotime(sma);
return seq_printf(s,
DECLARE_COMPLETION_ONSTACK(done);
int retval = 0;
+ if (!sub_info->path) {
+ call_usermodehelper_freeinfo(sub_info);
+ return -EINVAL;
+ }
helper_lock();
if (!khelper_wq || usermodehelper_disabled) {
retval = -EBUSY;
*/
wake_up_process(ns->child_reaper);
break;
+ case PIDNS_HASH_ADDING:
+ /* Handle a fork failure of the first process */
+ WARN_ON(ns->child_reaper);
+ ns->nr_hashed = 0;
+ /* fall through */
case 0:
schedule_work(&ns->proc_work);
break;
struct bio_vec *to, *from;
unsigned i;
+ if (force)
+ goto bounce;
bio_for_each_segment(from, *bio_orig, i)
if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q))
goto bounce;
pfn -= pageblock_nr_pages) {
unsigned long isolated;
+ /*
+ * This can iterate a massively long zone without finding any
+ * suitable migration targets, so periodically check if we need
+ * to schedule.
+ */
+ cond_resched();
+
if (!pfn_valid(pfn))
continue;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!hwpoison_filter_enable)
- goto inject;
if (!pfn_valid(pfn))
return -ENXIO;
if (!get_page_unless_zero(hpage))
return 0;
+ if (!hwpoison_filter_enable)
+ goto inject;
+
if (!PageLRU(p) && !PageHuge(p))
shake_page(p, 0);
/*
*/
static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
{
+ struct page *p;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- for (; start < end; start += PAGE_SIZE) {
- struct page *p;
+ for (; start < end; start += PAGE_SIZE <<
+ compound_order(compound_head(p))) {
int ret;
ret = get_user_pages_fast(start, 1, 0, &p);
* shake_page could have turned it free.
*/
if (is_free_buddy_page(p)) {
- action_result(pfn, "free buddy, 2nd try",
- DELAYED);
+ if (flags & MF_COUNT_INCREASED)
+ action_result(pfn, "free buddy", DELAYED);
+ else
+ action_result(pfn, "free buddy, 2nd try", DELAYED);
return 0;
}
action_result(pfn, "non LRU", IGNORED);
* worked by memory_failure() and the page lock is not held yet.
* In such case, we yield to memory_failure() and make unpoison fail.
*/
- if (PageTransHuge(page)) {
+ if (!PageHuge(page) && PageTransHuge(page)) {
pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
return 0;
}
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
- if (unlikely(balloon_page_movable(page)))
+ if (unlikely(isolated_balloon_page(page)))
balloon_page_putback(page);
else
putback_lru_page(page);
/*
* Initialize pte walk starting at the already pinned page where we
- * are sure that there is a pte.
+ * are sure that there is a pte, as it was pinned under the same
+ * mmap_sem write op.
*/
pte = get_locked_pte(vma->vm_mm, start, &ptl);
- end = min(end, pmd_addr_end(start, end));
+ /* Make sure we do not cross the page table boundary */
+ end = pgd_addr_end(start, end);
+ end = pud_addr_end(start, end);
+ end = pmd_addr_end(start, end);
/* The page next to the pinned page is the first we will try to get */
start += PAGE_SIZE;
list_del(&page->lru);
rmv_page_order(page);
zone->free_area[order].nr_free--;
-#ifdef CONFIG_HIGHMEM
- if (PageHighMem(page))
- totalhigh_pages -= 1 << order;
-#endif
for (i = 0; i < (1 << order); i++)
SetPageReserved((page+i));
pfn += (1 << order);
#include <asm/div64.h>
#include <linux/swapops.h>
+#include <linux/balloon_compaction.h>
#include "internal.h"
LIST_HEAD(clean_pages);
list_for_each_entry_safe(page, next, page_list, lru) {
- if (page_is_file_cache(page) && !PageDirty(page)) {
+ if (page_is_file_cache(page) && !PageDirty(page) &&
+ !isolated_balloon_page(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
}