instructions should set this. And it shouldn't hurt to set it
on architectures that don't have such instructions.
- config HAVE_SYSCALL_WRAPPERS
- bool
-
config KRETPROBES
def_bool y
depends on KPROBES && HAVE_KRETPROBES
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION
bool
-config HAVE_VIRT_TO_BUS
- bool
- help
- An architecture should select this if it implements the
- deprecated interface virt_to_bus(). All new architectures
- should probably not select this.
-
config HAVE_ARCH_SECCOMP_FILTER
bool
help
select HAVE_AOUT
select HAVE_IDE
select HAVE_OPROFILE
- select HAVE_SYSCALL_WRAPPERS
select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
select HAVE_DMA_ATTRS
select HAVE_GENERIC_HARDIRQS
- select HAVE_VIRT_TO_BUS
+ select VIRT_TO_BUS
select GENERIC_IRQ_PROBE
select AUTO_IRQ_AFFINITY if SMP
select GENERIC_IRQ_SHOW
select HAVE_KRETPROBES
select HAVE_DEBUG_KMEMLEAK
select ARCH_BINFMT_ELF_RANDOMIZE_PIE
- select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT
select RTC_LIB if !MACH_LOONGSON
select GENERIC_ATOMIC64 if !64BIT
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select GENERIC_CLOCKEVENTS
select GENERIC_CMOS_UPDATE
select HAVE_MOD_ARCH_SPECIFIC
- select HAVE_VIRT_TO_BUS
+ select VIRT_TO_BUS
select MODULES_USE_ELF_REL if MODULES
select MODULES_USE_ELF_RELA if MODULES && 64BIT
select CLONE_BACKWARDS
select IRQ_CPU
select SERIAL_8250
select SERIAL_8250_CONSOLE
+ select USB_EHCI_BIG_ENDIAN_MMIO
+ select USB_EHCI_BIG_ENDIAN_DESC
help
This adds support for the PMC-Sierra family of Multi-Service
Processor System-On-A-Chips. These parts include a number
bool "SNI RM200/300/400"
select FW_ARC if CPU_LITTLE_ENDIAN
select FW_ARC32 if CPU_LITTLE_ENDIAN
- select SNIPROM if CPU_BIG_ENDIAN
+ select FW_SNIPROM if CPU_BIG_ENDIAN
select ARCH_MAY_HAVE_PC_FDC
select BOOT_ELF32
select CEVT_R4K
config FW_ARC32
bool
-config SNIPROM
+config FW_SNIPROM
bool
config BOOT_ELF32
select CPU_SUPPORTS_HUGEPAGES
select LIBFDT
select USE_OF
+ select USB_EHCI_BIG_ENDIAN_MMIO
help
The Cavium Octeon processor is a highly integrated chip containing
many ethernet hardware widgets for networking tasks. The processor
select CPU_SUPPORTS_32BIT_KERNEL
select CPU_SUPPORTS_64BIT_KERNEL
select CPU_SUPPORTS_HIGHMEM
- select CPU_HAS_LLSC
select WEAK_ORDERING
select WEAK_REORDERING_BEYOND_LLSC
select CPU_HAS_PREFETCH
config 64BIT
bool "64-bit kernel"
depends on CPU_SUPPORTS_64BIT_KERNEL && SYS_SUPPORTS_64BIT_KERNEL
- select HAVE_SYSCALL_WRAPPERS
help
Select this option if you want to build a 64-bit kernel.
endmenu
-source "arch/mips/kernel/cpufreq/Kconfig"
+config MIPS_EXTERNAL_TIMER
+ bool
+
+if CPU_SUPPORTS_CPUFREQ && MIPS_EXTERNAL_TIMER
+menu "CPU Power Management"
+source "drivers/cpufreq/Kconfig"
+endmenu
+endif
source "net/Kconfig"
config PPC
bool
default y
+ select BINFMT_ELF
select OF
select OF_EARLY_FLATTREE
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_TRACER
select SYSCTL_EXCEPTION_TRACE
select ARCH_WANT_OPTIONAL_GPIOLIB
- select HAVE_VIRT_TO_BUS if !PPC64
+ select VIRT_TO_BUS if !PPC64
select HAVE_IDE
select HAVE_IOREMAP_PROT
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select USE_GENERIC_SMP_HELPERS if SMP
select HAVE_OPROFILE
select HAVE_DEBUG_KMEMLEAK
- select HAVE_SYSCALL_WRAPPERS if PPC64
select GENERIC_ATOMIC64 if PPC32
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select HAVE_PERF_EVENTS
SYSCALL_SPU(capget)
SYSCALL_SPU(capset)
COMPAT_SYS(sigaltstack)
- SYSX_SPU(sys_sendfile,compat_sys_sendfile_wrapper,sys_sendfile)
+ COMPAT_SYS_SPU(sendfile)
SYSCALL(ni_syscall)
SYSCALL(ni_syscall)
PPC_SYS(vfork)
COMPAT_SYS_SPU(sched_getaffinity)
SYSCALL(ni_syscall)
SYSCALL(ni_syscall)
- SYSX(sys_ni_syscall,compat_sys_sendfile64_wrapper,sys_sendfile64)
+ SYS32ONLY(sendfile64)
COMPAT_SYS_SPU(io_setup)
SYSCALL_SPU(io_destroy)
COMPAT_SYS_SPU(io_getevents)
SYSCALL(set_tid_address)
SYSX_SPU(sys_fadvise64,ppc32_fadvise64,sys_fadvise64)
SYSCALL(exit_group)
- SYSX(sys_lookup_dcookie,ppc32_lookup_dcookie,sys_lookup_dcookie)
+ COMPAT_SYS(lookup_dcookie)
SYSCALL_SPU(epoll_create)
SYSCALL_SPU(epoll_ctl)
SYSCALL_SPU(epoll_wait)
COMPAT_SYS(mq_notify)
COMPAT_SYS(mq_getsetattr)
COMPAT_SYS(kexec_load)
- COMPAT_SYS(add_key)
- COMPAT_SYS(request_key)
+ SYSCALL(add_key)
+ SYSCALL(request_key)
COMPAT_SYS(keyctl)
COMPAT_SYS(waitid)
SYSCALL(ioprio_set)
COMPAT_SYS(process_vm_readv)
COMPAT_SYS(process_vm_writev)
SYSCALL(finit_module)
+SYSCALL(ni_syscall) /* sys_kcmp */
#include <uapi/asm/unistd.h>
-#define __NR_syscalls 354
+#define __NR_syscalls 355
#define __NR__exit __NR_exit
#define NR_syscalls __NR_syscalls
#define __ARCH_WANT_SYS_VFORK
#define __ARCH_WANT_SYS_CLONE
- /*
- * "Conditional" syscalls
- */
- #define cond_syscall(x) \
- asmlinkage long x (void) __attribute__((weak,alias("sys_ni_syscall")))
-
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_UNISTD_H_ */
select ARCH_INLINE_WRITE_UNLOCK_BH
select ARCH_INLINE_WRITE_UNLOCK_IRQ
select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+ select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_SAVE_PAGE_KEYS if HIBERNATION
select ARCH_WANT_IPC_PARSE_VERSION
select BUILDTIME_EXTABLE_SORT
select HAVE_PERF_EVENTS
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_SYSCALL_TRACEPOINTS
- select HAVE_SYSCALL_WRAPPERS
select HAVE_UID16 if 32BIT
select HAVE_VIRT_CPU_ACCOUNTING
- select HAVE_VIRT_TO_BUS
+ select VIRT_TO_BUS
select INIT_ALL_POSSIBLE
select KTIME_SCALAR if 32BIT
select MODULES_USE_ELF_RELA
Say Y if you are unsure.
-config SMALL_STACK
- def_bool n
- prompt "Use 8kb for kernel stack instead of 16kb"
- depends on PACK_STACK && 64BIT && !LOCKDEP
- help
- If you say Y here and the compiler supports the -mkernel-backchain
- option the kernel will use a smaller kernel stack size. The reduced
- size is 8kb instead of 16kb. This allows to run more threads on a
- system and reduces the pressure on the memory management for higher
- order page allocations.
-
- Say N if you are unsure.
-
config CHECK_STACK
def_bool y
prompt "Detect kernel stack overflow"
select HAVE_RCU_TABLE_FREE if SMP
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
- select HAVE_SYSCALL_WRAPPERS
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_DYNAMIC_FTRACE
select HAVE_FTRACE_MCOUNT_RECORD
default "arch/sparc/configs/sparc32_defconfig" if SPARC32
default "arch/sparc/configs/sparc64_defconfig" if SPARC64
-# CONFIG_BITS can be used at source level to get 32/64 bits
-config BITS
- int
- default 32 if SPARC32
- default 64 if SPARC64
-
config IOMMU_HELPER
bool
default y if SPARC64
config GENERIC_HWEIGHT
bool
- default y if !ULTRA_HAS_POPULATION_COUNT
+ default y
config GENERIC_CALIBRATE_DELAY
bool
if SPARC64
source "drivers/cpufreq/Kconfig"
-
-config US3_FREQ
- tristate "UltraSPARC-III CPU Frequency driver"
- depends on CPU_FREQ
- select CPU_FREQ_TABLE
- help
- This adds the CPUFreq driver for UltraSPARC-III processors.
-
- For details, take a look at <file:Documentation/cpu-freq>.
-
- If in doubt, say N.
-
-config US2E_FREQ
- tristate "UltraSPARC-IIe CPU Frequency driver"
- depends on CPU_FREQ
- select CPU_FREQ_TABLE
- help
- This adds the CPUFreq driver for UltraSPARC-IIe processors.
-
- For details, take a look at <file:Documentation/cpu-freq>.
-
- If in doubt, say N.
-
endif
config US3_MC
config SPARC_LEON
bool "Sparc Leon processor family"
depends on SPARC32
+ select USB_EHCI_BIG_ENDIAN_MMIO
+ select USB_EHCI_BIG_ENDIAN_DESC
---help---
If you say Y here if you are running on a SPARC-LEON processor.
The LEON processor is a synthesizable VHDL model of the
select GENERIC_PENDING_IRQ if SMP
select GENERIC_IRQ_SHOW
select HAVE_DEBUG_BUGVERBOSE
- select HAVE_SYSCALL_WRAPPERS if TILEGX
- select HAVE_VIRT_TO_BUS
+ select VIRT_TO_BUS
select SYS_HYPERVISOR
+ select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_CLOCKEVENTS
select MODULES_USE_ELF_RELA
config SMP
def_bool y
-# Allow checking for compile-time determined overflow errors in
-# copy_from_user(). There are still unprovable places in the
-# generic code as of 2.6.34, so this option is not really compatible
-# with -Werror, which is more useful in general.
-config DEBUG_COPY_FROM_USER
- def_bool n
-
config HVC_TILE
depends on TTY
select HVC_DRIVER
* adapt the usual convention.
*/
-long compat_sys_truncate64(char __user *filename, u32 dummy, u32 low, u32 high)
+COMPAT_SYSCALL_DEFINE4(truncate64, char __user *, filename, u32, dummy,
+ u32, low, u32, high)
{
return sys_truncate(filename, ((loff_t)high << 32) | low);
}
-long compat_sys_ftruncate64(unsigned int fd, u32 dummy, u32 low, u32 high)
+COMPAT_SYSCALL_DEFINE4(ftruncate64, unsigned int, fd, u32, dummy,
+ u32, low, u32, high)
{
return sys_ftruncate(fd, ((loff_t)high << 32) | low);
}
-long compat_sys_pread64(unsigned int fd, char __user *ubuf, size_t count,
- u32 dummy, u32 low, u32 high)
+COMPAT_SYSCALL_DEFINE6(pread64, unsigned int, fd, char __user *, ubuf,
+ size_t, count, u32, dummy, u32, low, u32, high)
{
return sys_pread64(fd, ubuf, count, ((loff_t)high << 32) | low);
}
-long compat_sys_pwrite64(unsigned int fd, char __user *ubuf, size_t count,
- u32 dummy, u32 low, u32 high)
+COMPAT_SYSCALL_DEFINE6(pwrite64, unsigned int, fd, char __user *, ubuf,
+ size_t, count, u32, dummy, u32, low, u32, high)
{
return sys_pwrite64(fd, ubuf, count, ((loff_t)high << 32) | low);
}
- COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, low, u32, high,
- char __user *, buf, size_t, len)
- {
- return sys_lookup_dcookie(((loff_t)high << 32) | low, buf, len);
- }
-
-long compat_sys_sync_file_range2(int fd, unsigned int flags,
- u32 offset_lo, u32 offset_hi,
- u32 nbytes_lo, u32 nbytes_hi)
+COMPAT_SYSCALL_DEFINE6(sync_file_range2, int, fd, unsigned int, flags,
+ u32, offset_lo, u32, offset_hi,
+ u32, nbytes_lo, u32, nbytes_hi)
{
return sys_sync_file_range(fd, ((loff_t)offset_hi << 32) | offset_lo,
((loff_t)nbytes_hi << 32) | nbytes_lo,
flags);
}
-long compat_sys_fallocate(int fd, int mode,
- u32 offset_lo, u32 offset_hi,
- u32 len_lo, u32 len_hi)
+COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode,
+ u32, offset_lo, u32, offset_hi,
+ u32, len_lo, u32, len_hi)
{
return sys_fallocate(fd, mode, ((loff_t)offset_hi << 32) | offset_lo,
((loff_t)len_hi << 32) | len_lo);
}
+/*
+ * Avoid bug in generic sys_llseek() that specifies offset_high and
+ * offset_low as "unsigned long", thus making it possible to pass
+ * a sign-extended high 32 bits in offset_low.
+ */
+COMPAT_SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned int, offset_high,
+ unsigned int, offset_low, loff_t __user *, result,
+ unsigned int, origin)
+{
+ return sys_llseek(fd, offset_high, offset_low, result, origin);
+}
+
/* Provide the compat syscall number to call mapping. */
#undef __SYSCALL
#define __SYSCALL(nr, call) [nr] = (call),
/* See comments in sys.c */
#define compat_sys_fadvise64_64 sys32_fadvise64_64
#define compat_sys_readahead sys32_readahead
+#define sys_llseek compat_sys_llseek
/* Call the assembly trampolines where necessary. */
#define compat_sys_rt_sigreturn _compat_sys_rt_sigreturn
spin_unlock(&info->ring_lock);
out:
- kunmap_atomic(ring);
dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
(unsigned long)ring->head, (unsigned long)ring->tail);
+ kunmap_atomic(ring);
return ret;
}
ret = read_events(ioctx, min_nr, nr, events, timeout);
put_ioctx(ioctx);
}
-
- asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout);
return ret;
}
#include <linux/signal.h>
#include <linux/poll.h>
#include <linux/mm.h>
- #include <linux/eventpoll.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
}
*ret_pointer = iov;
+ ret = -EFAULT;
+ if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
+ goto out;
+
/*
* Single unix specification:
* We should -EINVAL if an element length is not >= 0 and fitting an
if (!file->f_op)
goto out;
- ret = -EFAULT;
- if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
- goto out;
-
- tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
+ ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
UIO_FASTIOV, iovstack, &iov);
- if (tot_len == 0) {
- ret = 0;
+ if (ret <= 0)
goto out;
- }
+ tot_len = ret;
ret = rw_verify_area(type, file, pos, tot_len);
if (ret < 0)
goto out;
return compat_sys_pwritev64(fd, vec, vlen, pos);
}
- asmlinkage long
- compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,
- unsigned int nr_segs, unsigned int flags)
- {
- unsigned i;
- struct iovec __user *iov;
- if (nr_segs > UIO_MAXIOV)
- return -EINVAL;
- iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
- for (i = 0; i < nr_segs; i++) {
- struct compat_iovec v;
- if (get_user(v.iov_base, &iov32[i].iov_base) ||
- get_user(v.iov_len, &iov32[i].iov_len) ||
- put_user(compat_ptr(v.iov_base), &iov[i].iov_base) ||
- put_user(v.iov_len, &iov[i].iov_len))
- return -EFAULT;
- }
- return sys_vmsplice(fd, iov, nr_segs, flags);
- }
-
/*
* Exactly like fs/open.c:sys_open(), except that it doesn't set the
* O_LARGEFILE flag.
return ret;
}
- #ifdef CONFIG_EPOLL
-
- asmlinkage long compat_sys_epoll_pwait(int epfd,
- struct compat_epoll_event __user *events,
- int maxevents, int timeout,
- const compat_sigset_t __user *sigmask,
- compat_size_t sigsetsize)
- {
- long err;
- compat_sigset_t csigmask;
- sigset_t ksigmask, sigsaved;
-
- /*
- * If the caller wants a certain signal mask to be set during the wait,
- * we apply it here.
- */
- if (sigmask) {
- if (sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
- if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
- return -EFAULT;
- sigset_from_compat(&ksigmask, &csigmask);
- sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
- }
-
- err = sys_epoll_wait(epfd, events, maxevents, timeout);
-
- /*
- * If we changed the signal mask, we need to restore the original one.
- * In case we've got a signal while waiting, we do not restore the
- * signal mask yet, and we allow do_signal() to deliver the signal on
- * the way back to userspace, before the signal mask is restored.
- */
- if (sigmask) {
- if (err == -EINTR) {
- memcpy(¤t->saved_sigmask, &sigsaved,
- sizeof(sigsaved));
- set_restore_sigmask();
- } else
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
- }
-
- return err;
- }
-
- #endif /* CONFIG_EPOLL */
-
- #ifdef CONFIG_SIGNALFD
-
- asmlinkage long compat_sys_signalfd4(int ufd,
- const compat_sigset_t __user *sigmask,
- compat_size_t sigsetsize, int flags)
- {
- compat_sigset_t ss32;
- sigset_t tmp;
- sigset_t __user *ksigmask;
-
- if (sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
- if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
- return -EFAULT;
- sigset_from_compat(&tmp, &ss32);
- ksigmask = compat_alloc_user_space(sizeof(sigset_t));
- if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t)))
- return -EFAULT;
-
- return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags);
- }
-
- asmlinkage long compat_sys_signalfd(int ufd,
- const compat_sigset_t __user *sigmask,
- compat_size_t sigsetsize)
- {
- return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0);
- }
- #endif /* CONFIG_SIGNALFD */
-
#ifdef CONFIG_FHANDLE
/*
* Exactly like fs/open.c:sys_open_by_handle_at(), except that it
return do_handle_open(mountdirfd, handle, flags);
}
#endif
-
- #ifdef __ARCH_WANT_COMPAT_SYS_SENDFILE
- asmlinkage long compat_sys_sendfile(int out_fd, int in_fd,
- compat_off_t __user *offset, compat_size_t count)
- {
- loff_t pos;
- off_t off;
- ssize_t ret;
-
- if (offset) {
- if (unlikely(get_user(off, offset)))
- return -EFAULT;
- pos = off;
- ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
- if (unlikely(put_user(pos, offset)))
- return -EFAULT;
- return ret;
- }
-
- return do_sendfile(out_fd, in_fd, NULL, count, 0);
- }
- #endif /* __ARCH_WANT_COMPAT_SYS_SENDFILE */
#include <linux/atomic.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+ #include <linux/compat.h>
/*
* LOCKING:
struct epoll_filefd {
struct file *file;
int fd;
-};
+} __packed;
/*
* Structure used to track possible nested calls, for too deep recursions
/*
* Each file descriptor added to the eventpoll interface will
* have an entry of this type linked to the "rbr" RB tree.
+ * Avoid increasing the size of this struct, there can be many thousands
+ * of these on a server and we do not want this to take another cache line.
*/
struct epitem {
/* RB tree node used to link this structure to the eventpoll RB tree */
struct list_head fllink;
/* wakeup_source used when EPOLLWAKEUP is set */
- struct wakeup_source *ws;
+ struct wakeup_source __rcu *ws;
/* The structure that describe the interested events and the source fd */
struct epoll_event event;
}
}
+/* call only when ep->mtx is held */
+static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
+{
+ return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
+}
+
+/* call only when ep->mtx is held */
+static inline void ep_pm_stay_awake(struct epitem *epi)
+{
+ struct wakeup_source *ws = ep_wakeup_source(epi);
+
+ if (ws)
+ __pm_stay_awake(ws);
+}
+
+static inline bool ep_has_wakeup_source(struct epitem *epi)
+{
+ return rcu_access_pointer(epi->ws) ? true : false;
+}
+
+/* call when ep->mtx cannot be held (ep_poll_callback) */
+static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
+{
+ struct wakeup_source *ws;
+
+ rcu_read_lock();
+ ws = rcu_dereference(epi->ws);
+ if (ws)
+ __pm_stay_awake(ws);
+ rcu_read_unlock();
+}
+
/**
* ep_scan_ready_list - Scans the ready list in a way that makes possible for
* the scan code, to call f_op->poll(). Also allows for
*/
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake(epi);
}
}
/*
list_del_init(&epi->rdllink);
spin_unlock_irqrestore(&ep->lock, flags);
- wakeup_source_unregister(epi->ws);
+ wakeup_source_unregister(ep_wakeup_source(epi));
/* At this point it is safe to free the eventpoll item */
kmem_cache_free(epi_cache, epi);
* point we are sure no poll callbacks will be lingering around, and also by
* holding "epmutex" we can be sure that no file cleanup code will hit
* us during this operation. So we can avoid the lock on "ep->lock".
+ * We do not need to lock ep->mtx, either, we only do it to prevent
+ * a lockdep warning.
*/
+ mutex_lock(&ep->mtx);
while ((rbp = rb_first(&ep->rbr)) != NULL) {
epi = rb_entry(rbp, struct epitem, rbn);
ep_remove(ep, epi);
}
+ mutex_unlock(&ep->mtx);
mutex_unlock(&epmutex);
mutex_destroy(&ep->mtx);
return 0;
}
+static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
+{
+ pt->_key = epi->event.events;
+
+ return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
+}
+
static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
void *priv)
{
poll_table pt;
init_poll_funcptr(&pt, NULL);
+
list_for_each_entry_safe(epi, tmp, head, rdllink) {
- pt._key = epi->event.events;
- if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
- epi->event.events)
+ if (ep_item_poll(epi, &pt))
return POLLIN | POLLRDNORM;
else {
/*
* callback, but it's not actually ready, as far as
* caller requested events goes. We can remove it here.
*/
- __pm_relax(epi->ws);
+ __pm_relax(ep_wakeup_source(epi));
list_del_init(&epi->rdllink);
}
}
/* If this file is already in the ready list we exit soon */
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake_rcu(epi);
}
/*
static int ep_create_wakeup_source(struct epitem *epi)
{
const char *name;
+ struct wakeup_source *ws;
if (!epi->ep->ws) {
epi->ep->ws = wakeup_source_register("eventpoll");
}
name = epi->ffd.file->f_path.dentry->d_name.name;
- epi->ws = wakeup_source_register(name);
- if (!epi->ws)
+ ws = wakeup_source_register(name);
+
+ if (!ws)
return -ENOMEM;
+ rcu_assign_pointer(epi->ws, ws);
return 0;
}
-static void ep_destroy_wakeup_source(struct epitem *epi)
+/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
+static noinline void ep_destroy_wakeup_source(struct epitem *epi)
{
- wakeup_source_unregister(epi->ws);
- epi->ws = NULL;
+ struct wakeup_source *ws = ep_wakeup_source(epi);
+
+ RCU_INIT_POINTER(epi->ws, NULL);
+
+ /*
+ * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
+ * used internally by wakeup_source_remove, too (called by
+ * wakeup_source_unregister), so we cannot use call_rcu
+ */
+ synchronize_rcu();
+ wakeup_source_unregister(ws);
}
/*
if (error)
goto error_create_wakeup_source;
} else {
- epi->ws = NULL;
+ RCU_INIT_POINTER(epi->ws, NULL);
}
/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
- epq.pt._key = event->events;
/*
* Attach the item to the poll hooks and get current event bits.
* this operation completes, the poll callback can start hitting
* the new item.
*/
- revents = tfile->f_op->poll(tfile, &epq.pt);
+ revents = ep_item_poll(epi, &epq.pt);
/*
* We have to check if something went wrong during the poll wait queue
/* If the file is already "ready" we drop it inside the ready list */
if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
list_del_init(&epi->rdllink);
spin_unlock_irqrestore(&ep->lock, flags);
- wakeup_source_unregister(epi->ws);
+ wakeup_source_unregister(ep_wakeup_source(epi));
error_create_wakeup_source:
kmem_cache_free(epi_cache, epi);
* f_op->poll() call and the new event set registering.
*/
epi->event.events = event->events; /* need barrier below */
- pt._key = event->events;
epi->event.data = event->data; /* protected by mtx */
if (epi->event.events & EPOLLWAKEUP) {
- if (!epi->ws)
+ if (!ep_has_wakeup_source(epi))
ep_create_wakeup_source(epi);
- } else if (epi->ws) {
+ } else if (ep_has_wakeup_source(epi)) {
ep_destroy_wakeup_source(epi);
}
* Get current event bits. We can safely use the file* here because
* its usage count has been increased by the caller of this function.
*/
- revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
+ revents = ep_item_poll(epi, &pt);
/*
* If the item is "hot" and it is not registered inside the ready
spin_lock_irq(&ep->lock);
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
unsigned int revents;
struct epitem *epi;
struct epoll_event __user *uevent;
+ struct wakeup_source *ws;
poll_table pt;
init_poll_funcptr(&pt, NULL);
* instead, but then epi->ws would temporarily be out of sync
* with ep_is_linked().
*/
- if (epi->ws && epi->ws->active)
- __pm_stay_awake(ep->ws);
- __pm_relax(epi->ws);
+ ws = ep_wakeup_source(epi);
+ if (ws) {
+ if (ws->active)
+ __pm_stay_awake(ep->ws);
+ __pm_relax(ws);
+ }
+
list_del_init(&epi->rdllink);
- pt._key = epi->event.events;
- revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
- epi->event.events;
+ revents = ep_item_poll(epi, &pt);
/*
* If the event mask intersect the caller-requested one,
if (__put_user(revents, &uevent->events) ||
__put_user(epi->event.data, &uevent->data)) {
list_add(&epi->rdllink, head);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake(epi);
return eventcnt ? eventcnt : -EFAULT;
}
eventcnt++;
* poll callback will queue them in ep->ovflist.
*/
list_add_tail(&epi->rdllink, &ep->rdllist);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake(epi);
}
}
}
return error;
}
+ #ifdef CONFIG_COMPAT
+ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
+ struct epoll_event __user *, events,
+ int, maxevents, int, timeout,
+ const compat_sigset_t __user *, sigmask,
+ compat_size_t, sigsetsize)
+ {
+ long err;
+ compat_sigset_t csigmask;
+ sigset_t ksigmask, sigsaved;
+
+ /*
+ * If the caller wants a certain signal mask to be set during the wait,
+ * we apply it here.
+ */
+ if (sigmask) {
+ if (sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+ if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
+ return -EFAULT;
+ sigset_from_compat(&ksigmask, &csigmask);
+ sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ }
+
+ err = sys_epoll_wait(epfd, events, maxevents, timeout);
+
+ /*
+ * If we changed the signal mask, we need to restore the original one.
+ * In case we've got a signal while waiting, we do not restore the
+ * signal mask yet, and we allow do_signal() to deliver the signal on
+ * the way back to userspace, before the signal mask is restored.
+ */
+ if (sigmask) {
+ if (err == -EINTR) {
+ memcpy(¤t->saved_sigmask, &sigsaved,
+ sizeof(sigsaved));
+ set_restore_sigmask();
+ } else
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+ }
+
+ return err;
+ }
+ #endif
+
static int __init eventpoll_init(void)
{
struct sysinfo si;
/* Initialize the structure used to perform file's f_op->poll() calls */
ep_nested_calls_init(&poll_readywalk_ncalls);
+ /*
+ * We can have many thousands of epitems, so prevent this from
+ * using an extra cache line on 64-bit (and smaller) CPUs
+ */
+ BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
+
/* Allocates slab cache used to allocate "struct epitem" items */
epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
#include <linux/splice.h>
#include <linux/compat.h>
#include "read_write.h"
+#include "internal.h"
#include <asm/uaccess.h>
#include <asm/unistd.h>
*
* This is a generic implemenation of ->llseek useable for all normal local
* filesystems. It just updates the file offset to the value specified by
- * @offset and @whence under i_mutex.
+ * @offset and @whence.
*/
loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
{
EXPORT_SYMBOL(do_sync_write);
+ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
+{
+ mm_segment_t old_fs;
+ const char __user *p;
+ ssize_t ret;
+
+ if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
+ return -EINVAL;
+
+ old_fs = get_fs();
+ set_fs(get_ds());
+ p = (__force const char __user *)buf;
+ if (count > MAX_RW_COUNT)
+ count = MAX_RW_COUNT;
+ if (file->f_op->write)
+ ret = file->f_op->write(file, p, count, pos);
+ else
+ ret = do_sync_write(file, p, count, pos);
+ set_fs(old_fs);
+ if (ret > 0) {
+ fsnotify_modify(file);
+ add_wchar(current, ret);
+ }
+ inc_syscw(current);
+ return ret;
+}
+
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
return ret;
}
- SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
- size_t count, loff_t pos)
+ SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
+ size_t, count, loff_t, pos)
{
struct fd f;
ssize_t ret = -EBADF;
return ret;
}
- #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
- asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
- {
- return SYSC_pread64((unsigned int) fd, (char __user *) buf,
- (size_t) count, pos);
- }
- SYSCALL_ALIAS(sys_pread64, SyS_pread64);
- #endif
- SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
- size_t count, loff_t pos)
+ SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
+ size_t, count, loff_t, pos)
{
struct fd f;
ssize_t ret = -EBADF;
return ret;
}
- #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
- asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
- {
- return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
- (size_t) count, pos);
- }
- SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
- #endif
/*
* Reduce an iovec's length in-place. Return the resulting number of segments
return ret;
}
- ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
- loff_t max)
+ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
+ size_t count, loff_t max)
{
struct fd in, out;
struct inode *in_inode, *out_inode;
return do_sendfile(out_fd, in_fd, NULL, count, 0);
}
+
+ #ifdef CONFIG_COMPAT
+ COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
+ compat_off_t __user *, offset, compat_size_t, count)
+ {
+ loff_t pos;
+ off_t off;
+ ssize_t ret;
+
+ if (offset) {
+ if (unlikely(get_user(off, offset)))
+ return -EFAULT;
+ pos = off;
+ ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
+ if (unlikely(put_user(pos, offset)))
+ return -EFAULT;
+ return ret;
+ }
+
+ return do_sendfile(out_fd, in_fd, NULL, count, 0);
+ }
+
+ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
+ compat_loff_t __user *, offset, compat_size_t, count)
+ {
+ loff_t pos;
+ ssize_t ret;
+
+ if (offset) {
+ if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
+ return -EFAULT;
+ ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
+ if (unlikely(put_user(pos, offset)))
+ return -EFAULT;
+ return ret;
+ }
+
+ return do_sendfile(out_fd, in_fd, NULL, count, 0);
+ }
+ #endif
#include <linux/security.h>
#include <linux/gfp.h>
#include <linux/socket.h>
+ #include <linux/compat.h>
+#include "internal.h"
/*
* Attempt to steal a page from a pipe buffer. This should perhaps go into
{
int ret;
void *data;
+ loff_t tmp = sd->pos;
data = buf->ops->map(pipe, buf, 0);
- ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
+ ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
buf->ops->unmap(pipe, buf, data);
return ret;
return error;
}
+ #ifdef CONFIG_COMPAT
+ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
+ unsigned int, nr_segs, unsigned int, flags)
+ {
+ unsigned i;
+ struct iovec __user *iov;
+ if (nr_segs > UIO_MAXIOV)
+ return -EINVAL;
+ iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
+ for (i = 0; i < nr_segs; i++) {
+ struct compat_iovec v;
+ if (get_user(v.iov_base, &iov32[i].iov_base) ||
+ get_user(v.iov_len, &iov32[i].iov_len) ||
+ put_user(compat_ptr(v.iov_base), &iov[i].iov_base) ||
+ put_user(v.iov_len, &iov[i].iov_len))
+ return -EFAULT;
+ }
+ return sys_vmsplice(fd, iov, nr_segs, flags);
+ }
+ #endif
+
SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
int, fd_out, loff_t __user *, off_out,
size_t, len, unsigned int, flags)
#define __SC_DELOUSE(t,v) ((t)(unsigned long)(v))
#endif
- #define __SC_CCAST1(t1, a1) __SC_DELOUSE(t1,a1)
- #define __SC_CCAST2(t2, a2, ...) __SC_DELOUSE(t2,a2), __SC_CCAST1(__VA_ARGS__)
- #define __SC_CCAST3(t3, a3, ...) __SC_DELOUSE(t3,a3), __SC_CCAST2(__VA_ARGS__)
- #define __SC_CCAST4(t4, a4, ...) __SC_DELOUSE(t4,a4), __SC_CCAST3(__VA_ARGS__)
- #define __SC_CCAST5(t5, a5, ...) __SC_DELOUSE(t5,a5), __SC_CCAST4(__VA_ARGS__)
- #define __SC_CCAST6(t6, a6, ...) __SC_DELOUSE(t6,a6), __SC_CCAST5(__VA_ARGS__)
#define COMPAT_SYSCALL_DEFINE1(name, ...) \
COMPAT_SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define COMPAT_SYSCALL_DEFINE2(name, ...) \
#define COMPAT_SYSCALL_DEFINE6(name, ...) \
COMPAT_SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
- #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-
#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \
- asmlinkage long compat_sys##name(__SC_DECL##x(__VA_ARGS__)); \
- static inline long C_SYSC##name(__SC_DECL##x(__VA_ARGS__)); \
- asmlinkage long compat_SyS##name(__SC_LONG##x(__VA_ARGS__)) \
+ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
+ static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
+ asmlinkage long compat_SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))\
{ \
- return (long) C_SYSC##name(__SC_CCAST##x(__VA_ARGS__)); \
+ return C_SYSC##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__)); \
} \
SYSCALL_ALIAS(compat_sys##name, compat_SyS##name); \
- static inline long C_SYSC##name(__SC_DECL##x(__VA_ARGS__))
-
- #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
-
- #define COMPAT_SYSCALL_DEFINEx(x, name, ...) \
- asmlinkage long compat_sys##name(__SC_DECL##x(__VA_ARGS__))
-
- #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */
+ static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
#ifndef compat_user_stack_pointer
#define compat_user_stack_pointer() current_user_stack_pointer()
} compat_sigset_t;
struct compat_sigaction {
-#ifndef __ARCH_HAS_ODD_SIGACTION
+#ifndef __ARCH_HAS_IRIX_SIGACTION
compat_uptr_t sa_handler;
compat_ulong_t sa_flags;
#else
- compat_ulong_t sa_flags;
+ compat_uint_t sa_flags;
compat_uptr_t sa_handler;
#endif
#ifdef __ARCH_HAS_SA_RESTORER
compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
compat_size_t __user *len_ptr);
- #ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC
- long compat_sys_semctl(int first, int second, int third, void __user *uptr);
- long compat_sys_msgsnd(int first, int second, int third, void __user *uptr);
- long compat_sys_msgrcv(int first, int second, int msgtyp, int third,
- int version, void __user *uptr);
- long compat_sys_shmat(int first, int second, compat_uptr_t third, int version,
- void __user *uptr);
- #else
- long compat_sys_semctl(int semid, int semnum, int cmd, int arg);
- long compat_sys_msgsnd(int msqid, struct compat_msgbuf __user *msgp,
+ asmlinkage long compat_sys_ipc(u32, int, int, u32, compat_uptr_t, u32);
+ asmlinkage long compat_sys_shmat(int shmid, compat_uptr_t shmaddr, int shmflg);
+ asmlinkage long compat_sys_semctl(int semid, int semnum, int cmd, int arg);
+ asmlinkage long compat_sys_msgsnd(int msqid, compat_uptr_t msgp,
compat_ssize_t msgsz, int msgflg);
- long compat_sys_msgrcv(int msqid, struct compat_msgbuf __user *msgp,
+ asmlinkage long compat_sys_msgrcv(int msqid, compat_uptr_t msgp,
compat_ssize_t msgsz, long msgtyp, int msgflg);
- long compat_sys_shmat(int shmid, compat_uptr_t shmaddr, int shmflg);
- #endif
long compat_sys_msgctl(int first, int second, void __user *uptr);
long compat_sys_shmctl(int first, int second, void __user *uptr);
long compat_sys_semtimedop(int semid, struct sembuf __user *tsems,
asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
compat_long_t addr, compat_long_t data);
+ asmlinkage long compat_sys_lookup_dcookie(u32, u32, char __user *, size_t);
/*
* epoll (fs/eventpoll.c) compat bits follow ...
*/
- struct epoll_event;
- #define compat_epoll_event epoll_event
+ struct epoll_event; /* fortunately, this one is fixed-layout */
asmlinkage long compat_sys_epoll_pwait(int epfd,
- struct compat_epoll_event __user *events,
+ struct epoll_event __user *events,
int maxevents, int timeout,
const compat_sigset_t __user *sigmask,
compat_size_t sigsetsize);
asmlinkage long compat_sys_sendfile(int out_fd, int in_fd,
compat_off_t __user *offset, compat_size_t count);
+ asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd,
+ compat_loff_t __user *offset, compat_size_t count);
asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr,
compat_stack_t __user *uoss_ptr);
#include <asm/pgtable.h>
#include <asm/processor.h>
+extern unsigned long sysctl_user_reserve_kbytes;
+extern unsigned long sysctl_admin_reserve_kbytes;
+
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
/* to align the pointer to the (next) page boundary */
#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
-#define VM_POPULATE 0x00001000
#define VM_LOCKED 0x00002000
#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
* Flags passed to show_mem() and show_free_areas() to suppress output in
* various contexts.
*/
-#define SHOW_MEM_FILTER_NODES (0x0001u) /* filter disallowed nodes */
+#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */
+#define SHOW_MEM_FILTER_PAGE_COUNT (0x0002u) /* page type count */
extern void show_free_areas(unsigned int flags);
extern bool skip_free_areas_node(unsigned int flags, int nid);
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len,
bool need_rmap_locks);
- extern unsigned long do_mremap(unsigned long addr,
- unsigned long old_len, unsigned long new_len,
- unsigned long flags, unsigned long new_addr);
extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa);
unsigned long zone_start_pfn, unsigned long *zholes_size);
extern void free_initmem(void);
+/*
+ * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
+ * into the buddy system. The freed pages will be poisoned with pattern
+ * "poison" if it's non-zero.
+ * Return pages freed into the buddy system.
+ */
+extern unsigned long free_reserved_area(unsigned long start, unsigned long end,
+ int poison, char *s);
+#ifdef CONFIG_HIGHMEM
+/*
+ * Free a highmem page into the buddy system, adjusting totalhigh_pages
+ * and totalram_pages.
+ */
+extern void free_highmem_page(struct page *page);
+#endif
+
+static inline void adjust_managed_page_count(struct page *page, long count)
+{
+ totalram_pages += count;
+}
+
+/* Free the reserved page into the buddy system, so it gets managed. */
+static inline void __free_reserved_page(struct page *page)
+{
+ ClearPageReserved(page);
+ init_page_count(page);
+ __free_page(page);
+}
+
+static inline void free_reserved_page(struct page *page)
+{
+ __free_reserved_page(page);
+ adjust_managed_page_count(page, 1);
+}
+
+static inline void mark_page_reserved(struct page *page)
+{
+ SetPageReserved(page);
+ adjust_managed_page_count(page, -1);
+}
+
+/*
+ * Default method to free all the __init memory into the buddy system.
+ * The freed pages will be poisoned with pattern "poison" if it is
+ * non-zero. Return pages freed into the buddy system.
+ */
+static inline unsigned long free_initmem_default(int poison)
+{
+ extern char __init_begin[], __init_end[];
+
+ return free_reserved_area(PAGE_ALIGN((unsigned long)&__init_begin) ,
+ ((unsigned long)&__init_end) & PAGE_MASK,
+ poison, "unused kernel");
+}
+
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
* With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its
unsigned long pfn);
int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn);
+int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
+
struct page *follow_page_mask(struct vm_area_struct *vma,
unsigned long address, unsigned int foll_flags,
#define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_mm(addr);})
#endif /* __HAVE_ARCH_GATE_AREA */
+#ifdef CONFIG_SYSCTL
+extern int sysctl_drop_caches;
int drop_caches_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
+#endif
+
unsigned long shrink_slab(struct shrink_control *shrink,
unsigned long nr_pages_scanned,
unsigned long lru_pages);
void *vmemmap_alloc_block(unsigned long size, int node);
void *vmemmap_alloc_block_buf(unsigned long size, int node);
void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
-int vmemmap_populate_basepages(struct page *start_page,
- unsigned long pages, int node);
-int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
+int vmemmap_populate_basepages(unsigned long start, unsigned long end,
+ int node);
+int vmemmap_populate(unsigned long start, unsigned long end, int node);
void vmemmap_populate_print_last(void);
#ifdef CONFIG_MEMORY_HOTPLUG
-void vmemmap_free(struct page *memmap, unsigned long nr_pages);
+void vmemmap_free(unsigned long start, unsigned long end);
#endif
void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
unsigned long size);
static inline bool page_is_guard(struct page *page) { return false; }
#endif /* CONFIG_DEBUG_PAGEALLOC */
+#if MAX_NUMNODES > 1
+void __init setup_nr_node_ids(void);
+#else
+static inline void setup_nr_node_ids(void) {}
+#endif
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
return 0;
}
- asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
- {
- struct rusage r;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_getrusage(who, (struct rusage __user *) &r);
- set_fs(old_fs);
-
- if (ret)
- return ret;
-
- if (put_compat_rusage(&r, ru))
- return -EFAULT;
-
- return 0;
- }
-
COMPAT_SYSCALL_DEFINE4(wait4,
compat_pid_t, pid,
compat_uint_t __user *, stat_addr,
}
#endif
-struct compat_sysinfo {
- s32 uptime;
- u32 loads[3];
- u32 totalram;
- u32 freeram;
- u32 sharedram;
- u32 bufferram;
- u32 totalswap;
- u32 freeswap;
- u16 procs;
- u16 pad;
- u32 totalhigh;
- u32 freehigh;
- u32 mem_unit;
- char _f[20-2*sizeof(u32)-sizeof(int)];
-};
-
-asmlinkage long
-compat_sys_sysinfo(struct compat_sysinfo __user *info)
-{
- struct sysinfo s;
-
- do_sysinfo(&s);
-
- /* Check to see if any memory value is too large for 32-bit and scale
- * down if needed
- */
- if ((s.totalram >> 32) || (s.totalswap >> 32)) {
- int bitcount = 0;
-
- while (s.mem_unit < PAGE_SIZE) {
- s.mem_unit <<= 1;
- bitcount++;
- }
-
- s.totalram >>= bitcount;
- s.freeram >>= bitcount;
- s.sharedram >>= bitcount;
- s.bufferram >>= bitcount;
- s.totalswap >>= bitcount;
- s.freeswap >>= bitcount;
- s.totalhigh >>= bitcount;
- s.freehigh >>= bitcount;
- }
-
- if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
- __put_user (s.uptime, &info->uptime) ||
- __put_user (s.loads[0], &info->loads[0]) ||
- __put_user (s.loads[1], &info->loads[1]) ||
- __put_user (s.loads[2], &info->loads[2]) ||
- __put_user (s.totalram, &info->totalram) ||
- __put_user (s.freeram, &info->freeram) ||
- __put_user (s.sharedram, &info->sharedram) ||
- __put_user (s.bufferram, &info->bufferram) ||
- __put_user (s.totalswap, &info->totalswap) ||
- __put_user (s.freeswap, &info->freeswap) ||
- __put_user (s.procs, &info->procs) ||
- __put_user (s.totalhigh, &info->totalhigh) ||
- __put_user (s.freehigh, &info->freehigh) ||
- __put_user (s.mem_unit, &info->mem_unit))
- return -EFAULT;
-
- return 0;
-}
-
COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
compat_pid_t, pid,
struct compat_timespec __user *, interval)
/*
* Make sure we are holding no locks:
*/
- debug_check_no_locks_held();
+ debug_check_no_locks_held(tsk);
/*
* We can do this unlocked here. The futex code uses this flag
* just to verify whether the pi state cleanup has been done
}
put_pid(pid);
-
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(5, ret, which, upid, infop, options, ru);
return ret;
}
ret = do_wait(&wo);
put_pid(pid);
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
return ret;
}
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
+ if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
+ return ERR_PTR(-EINVAL);
+
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
int, tls_val)
#endif
{
- long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
- asmlinkage_protect(5, ret, clone_flags, newsp,
- parent_tidptr, child_tidptr, tls_val);
- return ret;
+ return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
}
#endif
* If unsharing a user namespace must also unshare the thread.
*/
if (unshare_flags & CLONE_NEWUSER)
- unshare_flags |= CLONE_THREAD;
+ unshare_flags |= CLONE_THREAD | CLONE_FS;
/*
* If unsharing a pid namespace must also unshare the thread.
*/
#include <linux/user_namespace.h>
#include <linux/binfmts.h>
+#include <linux/sched.h>
+#include <linux/rcupdate.h>
+#include <linux/uidgid.h>
+#include <linux/cred.h>
+
#include <linux/kmsg_dump.h>
/* Move somewhere else to avoid recompiling? */
#include <generated/utsrelease.h>
system_state = SYSTEM_RESTART;
usermodehelper_disable();
device_shutdown();
- syscore_shutdown();
}
/**
{
kernel_restart_prepare(cmd);
disable_nonboot_cpus();
+ syscore_shutdown();
if (!cmd)
printk(KERN_EMERG "Restarting system.\n");
else
void kernel_halt(void)
{
kernel_shutdown_prepare(SYSTEM_HALT);
+ disable_nonboot_cpus();
syscore_shutdown();
printk(KERN_EMERG "System halted.\n");
kmsg_dump(KMSG_DUMP_HALT);
return old_fsgid;
}
+/**
+ * sys_getpid - return the thread group id of the current process
+ *
+ * Note, despite the name, this returns the tgid not the pid. The tgid and
+ * the pid are identical unless CLONE_THREAD was specified on clone() in
+ * which case the tgid is the same in all threads of the same group.
+ *
+ * This is SMP safe as current->tgid does not change.
+ */
+SYSCALL_DEFINE0(getpid)
+{
+ return task_tgid_vnr(current);
+}
+
+/* Thread ID - the internal kernel "pid" */
+SYSCALL_DEFINE0(gettid)
+{
+ return task_pid_vnr(current);
+}
+
+/*
+ * Accessing ->real_parent is not SMP-safe, it could
+ * change from under us. However, we can use a stale
+ * value of ->real_parent under rcu_read_lock(), see
+ * release_task()->call_rcu(delayed_put_task_struct).
+ */
+SYSCALL_DEFINE0(getppid)
+{
+ int pid;
+
+ rcu_read_lock();
+ pid = task_tgid_vnr(rcu_dereference(current->real_parent));
+ rcu_read_unlock();
+
+ return pid;
+}
+
+SYSCALL_DEFINE0(getuid)
+{
+ /* Only we change this so SMP safe */
+ return from_kuid_munged(current_user_ns(), current_uid());
+}
+
+SYSCALL_DEFINE0(geteuid)
+{
+ /* Only we change this so SMP safe */
+ return from_kuid_munged(current_user_ns(), current_euid());
+}
+
+SYSCALL_DEFINE0(getgid)
+{
+ /* Only we change this so SMP safe */
+ return from_kgid_munged(current_user_ns(), current_gid());
+}
+
+SYSCALL_DEFINE0(getegid)
+{
+ /* Only we change this so SMP safe */
+ return from_kgid_munged(current_user_ns(), current_egid());
+}
+
void do_sys_times(struct tms *tms)
{
cputime_t tgutime, tgstime, cutime, cstime;
return getrusage(current, who, ru);
}
+ #ifdef CONFIG_COMPAT
+ COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
+ {
+ struct rusage r;
+
+ if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
+ who != RUSAGE_THREAD)
+ return -EINVAL;
+
+ k_getrusage(current, who, &r);
+ return put_compat_rusage(&r, ru);
+ }
+ #endif
+
SYSCALL_DEFINE1(umask, int, mask)
{
mask = xchg(¤t->fs->umask, mask & S_IRWXUGO);
return mask;
}
-#ifdef CONFIG_CHECKPOINT_RESTORE
static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
{
struct fd exe;
return error;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
{
return put_user(me->clear_child_tid, tid_addr);
}
-
-#else /* CONFIG_CHECKPOINT_RESTORE */
-static int prctl_set_mm(int opt, unsigned long addr,
- unsigned long arg4, unsigned long arg5)
-{
- return -EINVAL;
-}
+#else
static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
{
return -EINVAL;
char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
-static int __orderly_poweroff(void)
+static int __orderly_poweroff(bool force)
{
- int argc;
char **argv;
static char *envp[] = {
"HOME=/",
};
int ret;
- argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
- if (argv == NULL) {
+ argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
+ if (argv) {
+ ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ argv_free(argv);
+ } else {
printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
- __func__, poweroff_cmd);
- return -ENOMEM;
+ __func__, poweroff_cmd);
+ ret = -ENOMEM;
}
- ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC,
- NULL, NULL, NULL);
- argv_free(argv);
+ if (ret && force) {
+ printk(KERN_WARNING "Failed to start orderly shutdown: "
+ "forcing the issue\n");
+ /*
+ * I guess this should try to kick off some daemon to sync and
+ * poweroff asap. Or not even bother syncing if we're doing an
+ * emergency shutdown?
+ */
+ emergency_sync();
+ kernel_power_off();
+ }
return ret;
}
+static bool poweroff_force;
+
+static void poweroff_work_func(struct work_struct *work)
+{
+ __orderly_poweroff(poweroff_force);
+}
+
+static DECLARE_WORK(poweroff_work, poweroff_work_func);
+
/**
* orderly_poweroff - Trigger an orderly system poweroff
* @force: force poweroff if command execution fails
*/
int orderly_poweroff(bool force)
{
- int ret = __orderly_poweroff();
+ if (force) /* do not override the pending "true" */
+ poweroff_force = true;
+ schedule_work(&poweroff_work);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(orderly_poweroff);
- if (ret && force) {
- printk(KERN_WARNING "Failed to start orderly shutdown: "
- "forcing the issue\n");
+/**
+ * do_sysinfo - fill in sysinfo struct
+ * @info: pointer to buffer to fill
+ */
+static int do_sysinfo(struct sysinfo *info)
+{
+ unsigned long mem_total, sav_total;
+ unsigned int mem_unit, bitcount;
+ struct timespec tp;
- /*
- * I guess this should try to kick off some daemon to sync and
- * poweroff asap. Or not even bother syncing if we're doing an
- * emergency shutdown?
- */
- emergency_sync();
- kernel_power_off();
+ memset(info, 0, sizeof(struct sysinfo));
+
+ ktime_get_ts(&tp);
+ monotonic_to_bootbased(&tp);
+ info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+
+ get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
+
+ info->procs = nr_threads;
+
+ si_meminfo(info);
+ si_swapinfo(info);
+
+ /*
+ * If the sum of all the available memory (i.e. ram + swap)
+ * is less than can be stored in a 32 bit unsigned long then
+ * we can be binary compatible with 2.2.x kernels. If not,
+ * well, in that case 2.2.x was broken anyways...
+ *
+ * -Erik Andersen <andersee@debian.org>
+ */
+
+ mem_total = info->totalram + info->totalswap;
+ if (mem_total < info->totalram || mem_total < info->totalswap)
+ goto out;
+ bitcount = 0;
+ mem_unit = info->mem_unit;
+ while (mem_unit > 1) {
+ bitcount++;
+ mem_unit >>= 1;
+ sav_total = mem_total;
+ mem_total <<= 1;
+ if (mem_total < sav_total)
+ goto out;
}
- return ret;
+ /*
+ * If mem_total did not overflow, multiply all memory values by
+ * info->mem_unit and set it to 1. This leaves things compatible
+ * with 2.2.x, and also retains compatibility with earlier 2.4.x
+ * kernels...
+ */
+
+ info->mem_unit = 1;
+ info->totalram <<= bitcount;
+ info->freeram <<= bitcount;
+ info->sharedram <<= bitcount;
+ info->bufferram <<= bitcount;
+ info->totalswap <<= bitcount;
+ info->freeswap <<= bitcount;
+ info->totalhigh <<= bitcount;
+ info->freehigh <<= bitcount;
+
+out:
+ return 0;
}
-EXPORT_SYMBOL_GPL(orderly_poweroff);
+
+SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
+{
+ struct sysinfo val;
+
+ do_sysinfo(&val);
+
+ if (copy_to_user(info, &val, sizeof(struct sysinfo)))
+ return -EFAULT;
+
+ return 0;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_sysinfo {
+ s32 uptime;
+ u32 loads[3];
+ u32 totalram;
+ u32 freeram;
+ u32 sharedram;
+ u32 bufferram;
+ u32 totalswap;
+ u32 freeswap;
+ u16 procs;
+ u16 pad;
+ u32 totalhigh;
+ u32 freehigh;
+ u32 mem_unit;
+ char _f[20-2*sizeof(u32)-sizeof(int)];
+};
+
+COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
+{
+ struct sysinfo s;
+
+ do_sysinfo(&s);
+
+ /* Check to see if any memory value is too large for 32-bit and scale
+ * down if needed
+ */
+ if ((s.totalram >> 32) || (s.totalswap >> 32)) {
+ int bitcount = 0;
+
+ while (s.mem_unit < PAGE_SIZE) {
+ s.mem_unit <<= 1;
+ bitcount++;
+ }
+
+ s.totalram >>= bitcount;
+ s.freeram >>= bitcount;
+ s.sharedram >>= bitcount;
+ s.bufferram >>= bitcount;
+ s.totalswap >>= bitcount;
+ s.freeswap >>= bitcount;
+ s.totalhigh >>= bitcount;
+ s.freehigh >>= bitcount;
+ }
+
+ if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
+ __put_user(s.uptime, &info->uptime) ||
+ __put_user(s.loads[0], &info->loads[0]) ||
+ __put_user(s.loads[1], &info->loads[1]) ||
+ __put_user(s.loads[2], &info->loads[2]) ||
+ __put_user(s.totalram, &info->totalram) ||
+ __put_user(s.freeram, &info->freeram) ||
+ __put_user(s.sharedram, &info->sharedram) ||
+ __put_user(s.bufferram, &info->bufferram) ||
+ __put_user(s.totalswap, &info->totalswap) ||
+ __put_user(s.freeswap, &info->freeswap) ||
+ __put_user(s.procs, &info->procs) ||
+ __put_user(s.totalhigh, &info->totalhigh) ||
+ __put_user(s.freehigh, &info->freehigh) ||
+ __put_user(s.mem_unit, &info->mem_unit))
+ return -EFAULT;
+
+ return 0;
+}
+#endif /* CONFIG_COMPAT */
int sysctl_overcommit_ratio = 50; /* default is 50% */
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
+unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
int heap_stack_gap = 0;
atomic_long_t mmap_pages_allocated;
}
EXPORT_SYMBOL(follow_pfn);
-DEFINE_RWLOCK(vmlist_lock);
-struct vm_struct *vmlist;
+LIST_HEAD(vmap_area_list);
void vfree(const void *addr)
{
struct vm_area_struct *vma;
/* check the cache first */
- vma = mm->mmap_cache;
+ vma = ACCESS_ONCE(mm->mmap_cache);
if (vma && vma->vm_start <= addr && vma->vm_end > addr)
return vma;
*
* MREMAP_FIXED is not supported under NOMMU conditions
*/
- unsigned long do_mremap(unsigned long addr,
+ static unsigned long do_mremap(unsigned long addr,
unsigned long old_len, unsigned long new_len,
unsigned long flags, unsigned long new_addr)
{
vma->vm_end = vma->vm_start + new_len;
return vma->vm_start;
}
- EXPORT_SYMBOL(do_mremap);
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
unsigned long, new_len, unsigned long, flags,
}
EXPORT_SYMBOL(remap_pfn_range);
+int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
+{
+ unsigned long pfn = start >> PAGE_SHIFT;
+ unsigned long vm_len = vma->vm_end - vma->vm_start;
+
+ pfn += vma->vm_pgoff;
+ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_iomap_memory);
+
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff)
{
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed;
+ unsigned long free, allowed, reserve;
vm_acct_memory(pages);
free -= totalreserve_pages;
/*
- * Leave the last 3% for root
+ * Reserve some for root
*/
if (!cap_sys_admin)
- free -= free / 32;
+ free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
if (free > pages)
return 0;
allowed = totalram_pages * sysctl_overcommit_ratio / 100;
/*
- * Leave the last 3% for root
+ * Reserve some 3% for root
*/
if (!cap_sys_admin)
- allowed -= allowed / 32;
+ allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
allowed += total_swap_pages;
- /* Don't let a single process grow too big:
- leave 3% of the size of this process for other processes */
- if (mm)
- allowed -= mm->total_vm / 32;
+ /*
+ * Don't let a single process grow so big a user can't recover
+ */
+ if (mm) {
+ reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+ allowed -= min(mm->total_vm / 32, reserve);
+ }
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
up_write(&nommu_region_sem);
return 0;
}
+
+/*
+ * Initialise sysctl_user_reserve_kbytes.
+ *
+ * This is intended to prevent a user from starting a single memory hogging
+ * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
+ * mode.
+ *
+ * The default value is min(3% of free memory, 128MB)
+ * 128MB is enough to recover with sshd/login, bash, and top/kill.
+ */
+static int __meminit init_user_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+ return 0;
+}
+module_init(init_user_reserve)
+
+/*
+ * Initialise sysctl_admin_reserve_kbytes.
+ *
+ * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
+ * to log in and kill a memory hogging process.
+ *
+ * Systems with more than 256MB will reserve 8MB, enough to recover
+ * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
+ * only reserve 3% of free pages by default.
+ */
+static int __meminit init_admin_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
+ return 0;
+}
+module_init(init_admin_reserve)