]> git.karo-electronics.de Git - karo-tx-linux.git/blob - tools/testing/selftests/vm/userfaultfd.c
userfaultfd: non-cooperative: rename *EVENT_MADVDONTNEED to *EVENT_REMOVE
[karo-tx-linux.git] / tools / testing / selftests / vm / userfaultfd.c
1 /*
2  * Stress userfaultfd syscall.
3  *
4  *  Copyright (C) 2015  Red Hat, Inc.
5  *
6  *  This work is licensed under the terms of the GNU GPL, version 2. See
7  *  the COPYING file in the top-level directory.
8  *
9  * This test allocates two virtual areas and bounces the physical
10  * memory across the two virtual areas (from area_src to area_dst)
11  * using userfaultfd.
12  *
13  * There are three threads running per CPU:
14  *
15  * 1) one per-CPU thread takes a per-page pthread_mutex in a random
16  *    page of the area_dst (while the physical page may still be in
17  *    area_src), and increments a per-page counter in the same page,
18  *    and checks its value against a verification region.
19  *
20  * 2) another per-CPU thread handles the userfaults generated by
21  *    thread 1 above. userfaultfd blocking reads or poll() modes are
22  *    exercised interleaved.
23  *
24  * 3) one last per-CPU thread transfers the memory in the background
25  *    at maximum bandwidth (if not already transferred by thread
26  *    2). Each cpu thread takes care of transferring a portion of the
27  *    area.
28  *
29  * When all threads of type 3 completed the transfer, one bounce is
30  * complete. area_src and area_dst are then swapped. All threads are
31  * respawned and so the bounce is immediately restarted in the
32  * opposite direction.
33  *
34  * per-CPU threads 1 by triggering userfaults inside
35  * pthread_mutex_lock will also verify the atomicity of the memory
36  * transfer (UFFDIO_COPY).
37  *
38  * The program takes two parameters: the amounts of physical memory in
39  * megabytes (MiB) of the area and the number of bounces to execute.
40  *
41  * # 100MiB 99999 bounces
42  * ./userfaultfd 100 99999
43  *
44  * # 1GiB 99 bounces
45  * ./userfaultfd 1000 99
46  *
47  * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers
48  * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done
49  */
50
51 #define _GNU_SOURCE
52 #include <stdio.h>
53 #include <errno.h>
54 #include <unistd.h>
55 #include <stdlib.h>
56 #include <sys/types.h>
57 #include <sys/stat.h>
58 #include <fcntl.h>
59 #include <time.h>
60 #include <signal.h>
61 #include <poll.h>
62 #include <string.h>
63 #include <sys/mman.h>
64 #include <sys/syscall.h>
65 #include <sys/ioctl.h>
66 #include <sys/wait.h>
67 #include <pthread.h>
68 #include <linux/userfaultfd.h>
69
70 #ifdef __NR_userfaultfd
71
/*
 * Geometry of the test: each area spans nr_pages * page_size bytes, and
 * every background thread copies nr_pages_per_cpu pages of it.  nr_cpus
 * sizes the per-CPU thread arrays.
 */
static unsigned long nr_cpus;
static unsigned long nr_pages;
static unsigned long nr_pages_per_cpu;
static unsigned long page_size;

/* Mode bits, tested against the current value of "bounces". */
#define BOUNCE_RANDOM		(1<<0)
#define BOUNCE_RACINGFAULTS	(1<<1)
#define BOUNCE_VERIFY		(1<<2)
#define BOUNCE_POLL		(1<<3)
/* Bounce counter; its low bits double as the mode selector above. */
static int bounces;

#ifdef HUGETLB_TEST
/* File descriptor backing the hugetlb mappings. */
static int huge_fd;
/* Address of the mapping created at file offset 0. */
static char *huge_fd_off0;
#endif
/* Expected value of the counter stored in each page. */
static unsigned long long *count_verify;
/* userfaultfd descriptor and the flags fetched from it when it was opened. */
static int uffd;
static int uffd_flags;
/* Set to 1 by stress() to make the locking threads leave their loop. */
static int finished;
/* Two descriptors per CPU: [cpu*2] is polled, [cpu*2+1] is written. */
static int *pipefd;
/* Source and destination areas of the current bounce. */
static char *area_src;
static char *area_dst;
/* One zero-filled page used as the reference for my_bcmp(). */
static char *zeropage;
/* Attributes shared by every thread the test creates. */
pthread_attr_t attr;
89
/*
 * area_mutex(area, nr): pointer to the pthread_mutex_t of page "nr" of
 * "area".  The pthread_mutex_t starts at page offset 0.
 */
#define area_mutex(___area, ___nr)					\
	((pthread_mutex_t *) ((___area) + (___nr)*page_size))
/*
 * area_count(area, nr): pointer to the counter of page "nr" of "area".
 *
 * count is placed in the page after pthread_mutex_t naturally aligned
 * to avoid non alignment faults on non-x86 archs: the address just past
 * the mutex is rounded up to a multiple of sizeof(unsigned long long).
 */
#define area_count(___area, ___nr)					\
	((volatile unsigned long long *) ((unsigned long)		\
				 ((___area) + (___nr)*page_size +	\
				  sizeof(pthread_mutex_t) +		\
				  sizeof(unsigned long long) - 1) &	\
				 ~(unsigned long)(sizeof(unsigned long long) \
						  -  1)))
104
105 #if !defined(HUGETLB_TEST) && !defined(SHMEM_TEST)
106
107 /* Anonymous memory */
108 #define EXPECTED_IOCTLS         ((1 << _UFFDIO_WAKE) | \
109                                  (1 << _UFFDIO_COPY) | \
110                                  (1 << _UFFDIO_ZEROPAGE))
111
112 static int release_pages(char *rel_area)
113 {
114         int ret = 0;
115
116         if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) {
117                 perror("madvise");
118                 ret = 1;
119         }
120
121         return ret;
122 }
123
124 static void allocate_area(void **alloc_area)
125 {
126         if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) {
127                 fprintf(stderr, "out of memory\n");
128                 *alloc_area = NULL;
129         }
130 }
131
132 #else /* HUGETLB_TEST or SHMEM_TEST */
133
134 #define EXPECTED_IOCTLS         UFFD_API_RANGE_IOCTLS_BASIC
135
136 #ifdef HUGETLB_TEST
137
138 /* HugeTLB memory */
139 static int release_pages(char *rel_area)
140 {
141         int ret = 0;
142
143         if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
144                                 rel_area == huge_fd_off0 ? 0 :
145                                 nr_pages * page_size,
146                                 nr_pages * page_size)) {
147                 perror("fallocate");
148                 ret = 1;
149         }
150
151         return ret;
152 }
153
154
155 static void allocate_area(void **alloc_area)
156 {
157         *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
158                                 MAP_PRIVATE | MAP_HUGETLB, huge_fd,
159                                 *alloc_area == area_src ? 0 :
160                                 nr_pages * page_size);
161         if (*alloc_area == MAP_FAILED) {
162                 fprintf(stderr, "mmap of hugetlbfs file failed\n");
163                 *alloc_area = NULL;
164         }
165
166         if (*alloc_area == area_src)
167                 huge_fd_off0 = *alloc_area;
168 }
169
170 #elif defined(SHMEM_TEST)
171
172 /* Shared memory */
173 static int release_pages(char *rel_area)
174 {
175         int ret = 0;
176
177         if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) {
178                 perror("madvise");
179                 ret = 1;
180         }
181
182         return ret;
183 }
184
185 static void allocate_area(void **alloc_area)
186 {
187         *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
188                            MAP_ANONYMOUS | MAP_SHARED, -1, 0);
189         if (*alloc_area == MAP_FAILED) {
190                 fprintf(stderr, "shared memory mmap failed\n");
191                 *alloc_area = NULL;
192         }
193 }
194
195 #else /* SHMEM_TEST */
196 #error "Undefined test type"
197 #endif /* HUGETLB_TEST */
198
199 #endif /* !defined(HUGETLB_TEST) && !defined(SHMEM_TEST) */
200
/*
 * Compare the first n bytes of str1 and str2, one byte at a time from the
 * lowest address upwards.
 *
 * Returns 1 as soon as a differing byte is found and 0 when all n bytes
 * are equal (including n == 0).
 */
static int my_bcmp(char *str1, char *str2, size_t n)
{
	char *end = str1 + n;

	while (str1 < end) {
		if (*str1++ != *str2++)
			return 1;
	}
	return 0;
}
209
210 static void *locking_thread(void *arg)
211 {
212         unsigned long cpu = (unsigned long) arg;
213         struct random_data rand;
214         unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
215         int32_t rand_nr;
216         unsigned long long count;
217         char randstate[64];
218         unsigned int seed;
219         time_t start;
220
221         if (bounces & BOUNCE_RANDOM) {
222                 seed = (unsigned int) time(NULL) - bounces;
223                 if (!(bounces & BOUNCE_RACINGFAULTS))
224                         seed += cpu;
225                 bzero(&rand, sizeof(rand));
226                 bzero(&randstate, sizeof(randstate));
227                 if (initstate_r(seed, randstate, sizeof(randstate), &rand))
228                         fprintf(stderr, "srandom_r error\n"), exit(1);
229         } else {
230                 page_nr = -bounces;
231                 if (!(bounces & BOUNCE_RACINGFAULTS))
232                         page_nr += cpu * nr_pages_per_cpu;
233         }
234
235         while (!finished) {
236                 if (bounces & BOUNCE_RANDOM) {
237                         if (random_r(&rand, &rand_nr))
238                                 fprintf(stderr, "random_r 1 error\n"), exit(1);
239                         page_nr = rand_nr;
240                         if (sizeof(page_nr) > sizeof(rand_nr)) {
241                                 if (random_r(&rand, &rand_nr))
242                                         fprintf(stderr, "random_r 2 error\n"), exit(1);
243                                 page_nr |= (((unsigned long) rand_nr) << 16) <<
244                                            16;
245                         }
246                 } else
247                         page_nr += 1;
248                 page_nr %= nr_pages;
249
250                 start = time(NULL);
251                 if (bounces & BOUNCE_VERIFY) {
252                         count = *area_count(area_dst, page_nr);
253                         if (!count)
254                                 fprintf(stderr,
255                                         "page_nr %lu wrong count %Lu %Lu\n",
256                                         page_nr, count,
257                                         count_verify[page_nr]), exit(1);
258
259
260                         /*
261                          * We can't use bcmp (or memcmp) because that
262                          * returns 0 erroneously if the memory is
263                          * changing under it (even if the end of the
264                          * page is never changing and always
265                          * different).
266                          */
267 #if 1
268                         if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
269                                      page_size))
270                                 fprintf(stderr,
271                                         "my_bcmp page_nr %lu wrong count %Lu %Lu\n",
272                                         page_nr, count,
273                                         count_verify[page_nr]), exit(1);
274 #else
275                         unsigned long loops;
276
277                         loops = 0;
278                         /* uncomment the below line to test with mutex */
279                         /* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
280                         while (!bcmp(area_dst + page_nr * page_size, zeropage,
281                                      page_size)) {
282                                 loops += 1;
283                                 if (loops > 10)
284                                         break;
285                         }
286                         /* uncomment below line to test with mutex */
287                         /* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
288                         if (loops) {
289                                 fprintf(stderr,
290                                         "page_nr %lu all zero thread %lu %p %lu\n",
291                                         page_nr, cpu, area_dst + page_nr * page_size,
292                                         loops);
293                                 if (loops > 10)
294                                         exit(1);
295                         }
296 #endif
297                 }
298
299                 pthread_mutex_lock(area_mutex(area_dst, page_nr));
300                 count = *area_count(area_dst, page_nr);
301                 if (count != count_verify[page_nr]) {
302                         fprintf(stderr,
303                                 "page_nr %lu memory corruption %Lu %Lu\n",
304                                 page_nr, count,
305                                 count_verify[page_nr]), exit(1);
306                 }
307                 count++;
308                 *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
309                 pthread_mutex_unlock(area_mutex(area_dst, page_nr));
310
311                 if (time(NULL) - start > 1)
312                         fprintf(stderr,
313                                 "userfault too slow %ld "
314                                 "possible false positive with overcommit\n",
315                                 time(NULL) - start);
316         }
317
318         return NULL;
319 }
320
321 static int copy_page(int ufd, unsigned long offset)
322 {
323         struct uffdio_copy uffdio_copy;
324
325         if (offset >= nr_pages * page_size)
326                 fprintf(stderr, "unexpected offset %lu\n",
327                         offset), exit(1);
328         uffdio_copy.dst = (unsigned long) area_dst + offset;
329         uffdio_copy.src = (unsigned long) area_src + offset;
330         uffdio_copy.len = page_size;
331         uffdio_copy.mode = 0;
332         uffdio_copy.copy = 0;
333         if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
334                 /* real retval in ufdio_copy.copy */
335                 if (uffdio_copy.copy != -EEXIST)
336                         fprintf(stderr, "UFFDIO_COPY error %Ld\n",
337                                 uffdio_copy.copy), exit(1);
338         } else if (uffdio_copy.copy != page_size) {
339                 fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
340                         uffdio_copy.copy), exit(1);
341         } else
342                 return 1;
343         return 0;
344 }
345
346 static void *uffd_poll_thread(void *arg)
347 {
348         unsigned long cpu = (unsigned long) arg;
349         struct pollfd pollfd[2];
350         struct uffd_msg msg;
351         struct uffdio_register uffd_reg;
352         int ret;
353         unsigned long offset;
354         char tmp_chr;
355         unsigned long userfaults = 0;
356
357         pollfd[0].fd = uffd;
358         pollfd[0].events = POLLIN;
359         pollfd[1].fd = pipefd[cpu*2];
360         pollfd[1].events = POLLIN;
361
362         for (;;) {
363                 ret = poll(pollfd, 2, -1);
364                 if (!ret)
365                         fprintf(stderr, "poll error %d\n", ret), exit(1);
366                 if (ret < 0)
367                         perror("poll"), exit(1);
368                 if (pollfd[1].revents & POLLIN) {
369                         if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
370                                 fprintf(stderr, "read pipefd error\n"),
371                                         exit(1);
372                         break;
373                 }
374                 if (!(pollfd[0].revents & POLLIN))
375                         fprintf(stderr, "pollfd[0].revents %d\n",
376                                 pollfd[0].revents), exit(1);
377                 ret = read(uffd, &msg, sizeof(msg));
378                 if (ret < 0) {
379                         if (errno == EAGAIN)
380                                 continue;
381                         perror("nonblocking read error"), exit(1);
382                 }
383                 switch (msg.event) {
384                 default:
385                         fprintf(stderr, "unexpected msg event %u\n",
386                                 msg.event), exit(1);
387                         break;
388                 case UFFD_EVENT_PAGEFAULT:
389                         if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
390                                 fprintf(stderr, "unexpected write fault\n"), exit(1);
391                         offset = (char *)(unsigned long)msg.arg.pagefault.address -
392                                 area_dst;
393                         offset &= ~(page_size-1);
394                         if (copy_page(uffd, offset))
395                                 userfaults++;
396                         break;
397                 case UFFD_EVENT_FORK:
398                         uffd = msg.arg.fork.ufd;
399                         pollfd[0].fd = uffd;
400                         break;
401                 case UFFD_EVENT_REMOVE:
402                         uffd_reg.range.start = msg.arg.remove.start;
403                         uffd_reg.range.len = msg.arg.remove.end -
404                                 msg.arg.remove.start;
405                         if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
406                                 fprintf(stderr, "remove failure\n"), exit(1);
407                         break;
408                 case UFFD_EVENT_REMAP:
409                         area_dst = (char *)(unsigned long)msg.arg.remap.to;
410                         break;
411                 }
412         }
413         return (void *)userfaults;
414 }
415
416 pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
417
418 static void *uffd_read_thread(void *arg)
419 {
420         unsigned long *this_cpu_userfaults;
421         struct uffd_msg msg;
422         unsigned long offset;
423         int ret;
424
425         this_cpu_userfaults = (unsigned long *) arg;
426         *this_cpu_userfaults = 0;
427
428         pthread_mutex_unlock(&uffd_read_mutex);
429         /* from here cancellation is ok */
430
431         for (;;) {
432                 ret = read(uffd, &msg, sizeof(msg));
433                 if (ret != sizeof(msg)) {
434                         if (ret < 0)
435                                 perror("blocking read error"), exit(1);
436                         else
437                                 fprintf(stderr, "short read\n"), exit(1);
438                 }
439                 if (msg.event != UFFD_EVENT_PAGEFAULT)
440                         fprintf(stderr, "unexpected msg event %u\n",
441                                 msg.event), exit(1);
442                 if (bounces & BOUNCE_VERIFY &&
443                     msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
444                         fprintf(stderr, "unexpected write fault\n"), exit(1);
445                 offset = (char *)(unsigned long)msg.arg.pagefault.address -
446                          area_dst;
447                 offset &= ~(page_size-1);
448                 if (copy_page(uffd, offset))
449                         (*this_cpu_userfaults)++;
450         }
451         return (void *)NULL;
452 }
453
454 static void *background_thread(void *arg)
455 {
456         unsigned long cpu = (unsigned long) arg;
457         unsigned long page_nr;
458
459         for (page_nr = cpu * nr_pages_per_cpu;
460              page_nr < (cpu+1) * nr_pages_per_cpu;
461              page_nr++)
462                 copy_page(uffd, page_nr * page_size);
463
464         return NULL;
465 }
466
467 static int stress(unsigned long *userfaults)
468 {
469         unsigned long cpu;
470         pthread_t locking_threads[nr_cpus];
471         pthread_t uffd_threads[nr_cpus];
472         pthread_t background_threads[nr_cpus];
473         void **_userfaults = (void **) userfaults;
474
475         finished = 0;
476         for (cpu = 0; cpu < nr_cpus; cpu++) {
477                 if (pthread_create(&locking_threads[cpu], &attr,
478                                    locking_thread, (void *)cpu))
479                         return 1;
480                 if (bounces & BOUNCE_POLL) {
481                         if (pthread_create(&uffd_threads[cpu], &attr,
482                                            uffd_poll_thread, (void *)cpu))
483                                 return 1;
484                 } else {
485                         if (pthread_create(&uffd_threads[cpu], &attr,
486                                            uffd_read_thread,
487                                            &_userfaults[cpu]))
488                                 return 1;
489                         pthread_mutex_lock(&uffd_read_mutex);
490                 }
491                 if (pthread_create(&background_threads[cpu], &attr,
492                                    background_thread, (void *)cpu))
493                         return 1;
494         }
495         for (cpu = 0; cpu < nr_cpus; cpu++)
496                 if (pthread_join(background_threads[cpu], NULL))
497                         return 1;
498
499         /*
500          * Be strict and immediately zap area_src, the whole area has
501          * been transferred already by the background treads. The
502          * area_src could then be faulted in in a racy way by still
503          * running uffdio_threads reading zeropages after we zapped
504          * area_src (but they're guaranteed to get -EEXIST from
505          * UFFDIO_COPY without writing zero pages into area_dst
506          * because the background threads already completed).
507          */
508         if (release_pages(area_src))
509                 return 1;
510
511         for (cpu = 0; cpu < nr_cpus; cpu++) {
512                 char c;
513                 if (bounces & BOUNCE_POLL) {
514                         if (write(pipefd[cpu*2+1], &c, 1) != 1) {
515                                 fprintf(stderr, "pipefd write error\n");
516                                 return 1;
517                         }
518                         if (pthread_join(uffd_threads[cpu], &_userfaults[cpu]))
519                                 return 1;
520                 } else {
521                         if (pthread_cancel(uffd_threads[cpu]))
522                                 return 1;
523                         if (pthread_join(uffd_threads[cpu], NULL))
524                                 return 1;
525                 }
526         }
527
528         finished = 1;
529         for (cpu = 0; cpu < nr_cpus; cpu++)
530                 if (pthread_join(locking_threads[cpu], NULL))
531                         return 1;
532
533         return 0;
534 }
535
536 static int userfaultfd_open(int features)
537 {
538         struct uffdio_api uffdio_api;
539
540         uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
541         if (uffd < 0) {
542                 fprintf(stderr,
543                         "userfaultfd syscall not available in this kernel\n");
544                 return 1;
545         }
546         uffd_flags = fcntl(uffd, F_GETFD, NULL);
547
548         uffdio_api.api = UFFD_API;
549         uffdio_api.features = features;
550         if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
551                 fprintf(stderr, "UFFDIO_API\n");
552                 return 1;
553         }
554         if (uffdio_api.api != UFFD_API) {
555                 fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api);
556                 return 1;
557         }
558
559         return 0;
560 }
561
562 /*
563  * For non-cooperative userfaultfd test we fork() a process that will
564  * generate pagefaults, will mremap the area monitored by the
565  * userfaultfd and at last this process will release the monitored
566  * area.
567  * For the anonymous and shared memory the area is divided into two
568  * parts, the first part is accessed before mremap, and the second
569  * part is accessed after mremap. Since hugetlbfs does not support
570  * mremap, the entire monitored area is accessed in a single pass for
571  * HUGETLB_TEST.
572  * The release of the pages currently generates event only for
573  * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
574  * for hugetlb and shmem.
575  */
576 static int faulting_process(void)
577 {
578         unsigned long nr;
579         unsigned long long count;
580
581 #ifndef HUGETLB_TEST
582         unsigned long split_nr_pages = (nr_pages + 1) / 2;
583 #else
584         unsigned long split_nr_pages = nr_pages;
585 #endif
586
587         for (nr = 0; nr < split_nr_pages; nr++) {
588                 count = *area_count(area_dst, nr);
589                 if (count != count_verify[nr]) {
590                         fprintf(stderr,
591                                 "nr %lu memory corruption %Lu %Lu\n",
592                                 nr, count,
593                                 count_verify[nr]), exit(1);
594                 }
595         }
596
597 #ifndef HUGETLB_TEST
598         area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
599                           MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
600         if (area_dst == MAP_FAILED)
601                 perror("mremap"), exit(1);
602
603         for (; nr < nr_pages; nr++) {
604                 count = *area_count(area_dst, nr);
605                 if (count != count_verify[nr]) {
606                         fprintf(stderr,
607                                 "nr %lu memory corruption %Lu %Lu\n",
608                                 nr, count,
609                                 count_verify[nr]), exit(1);
610                 }
611         }
612
613 #ifndef SHMEM_TEST
614         if (release_pages(area_dst))
615                 return 1;
616
617         for (nr = 0; nr < nr_pages; nr++) {
618                 if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
619                         fprintf(stderr, "nr %lu is not zero\n", nr), exit(1);
620         }
621 #endif /* SHMEM_TEST */
622
623 #endif /* HUGETLB_TEST */
624
625         return 0;
626 }
627
628 static int uffdio_zeropage(int ufd, unsigned long offset)
629 {
630         struct uffdio_zeropage uffdio_zeropage;
631         int ret;
632         unsigned long has_zeropage = EXPECTED_IOCTLS & (1 << _UFFDIO_ZEROPAGE);
633
634         if (offset >= nr_pages * page_size)
635                 fprintf(stderr, "unexpected offset %lu\n",
636                         offset), exit(1);
637         uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
638         uffdio_zeropage.range.len = page_size;
639         uffdio_zeropage.mode = 0;
640         ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
641         if (ret) {
642                 /* real retval in ufdio_zeropage.zeropage */
643                 if (has_zeropage) {
644                         if (uffdio_zeropage.zeropage == -EEXIST)
645                                 fprintf(stderr, "UFFDIO_ZEROPAGE -EEXIST\n"),
646                                         exit(1);
647                         else
648                                 fprintf(stderr, "UFFDIO_ZEROPAGE error %Ld\n",
649                                         uffdio_zeropage.zeropage), exit(1);
650                 } else {
651                         if (uffdio_zeropage.zeropage != -EINVAL)
652                                 fprintf(stderr,
653                                         "UFFDIO_ZEROPAGE not -EINVAL %Ld\n",
654                                         uffdio_zeropage.zeropage), exit(1);
655                 }
656         } else if (has_zeropage) {
657                 if (uffdio_zeropage.zeropage != page_size) {
658                         fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n",
659                                 uffdio_zeropage.zeropage), exit(1);
660                 } else
661                         return 1;
662         } else {
663                 fprintf(stderr,
664                         "UFFDIO_ZEROPAGE succeeded %Ld\n",
665                         uffdio_zeropage.zeropage), exit(1);
666         }
667
668         return 0;
669 }
670
671 /* exercise UFFDIO_ZEROPAGE */
672 static int userfaultfd_zeropage_test(void)
673 {
674         struct uffdio_register uffdio_register;
675         unsigned long expected_ioctls;
676
677         printf("testing UFFDIO_ZEROPAGE: ");
678         fflush(stdout);
679
680         if (release_pages(area_dst))
681                 return 1;
682
683         if (userfaultfd_open(0) < 0)
684                 return 1;
685         uffdio_register.range.start = (unsigned long) area_dst;
686         uffdio_register.range.len = nr_pages * page_size;
687         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
688         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
689                 fprintf(stderr, "register failure\n"), exit(1);
690
691         expected_ioctls = EXPECTED_IOCTLS;
692         if ((uffdio_register.ioctls & expected_ioctls) !=
693             expected_ioctls)
694                 fprintf(stderr,
695                         "unexpected missing ioctl for anon memory\n"),
696                         exit(1);
697
698         if (uffdio_zeropage(uffd, 0)) {
699                 if (my_bcmp(area_dst, zeropage, page_size))
700                         fprintf(stderr, "zeropage is not zero\n"), exit(1);
701         }
702
703         close(uffd);
704         printf("done.\n");
705         return 0;
706 }
707
708 static int userfaultfd_events_test(void)
709 {
710         struct uffdio_register uffdio_register;
711         unsigned long expected_ioctls;
712         unsigned long userfaults;
713         pthread_t uffd_mon;
714         int err, features;
715         pid_t pid;
716         char c;
717
718         printf("testing events (fork, remap, remove): ");
719         fflush(stdout);
720
721         if (release_pages(area_dst))
722                 return 1;
723
724         features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
725                 UFFD_FEATURE_EVENT_REMOVE;
726         if (userfaultfd_open(features) < 0)
727                 return 1;
728         fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
729
730         uffdio_register.range.start = (unsigned long) area_dst;
731         uffdio_register.range.len = nr_pages * page_size;
732         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
733         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
734                 fprintf(stderr, "register failure\n"), exit(1);
735
736         expected_ioctls = EXPECTED_IOCTLS;
737         if ((uffdio_register.ioctls & expected_ioctls) !=
738             expected_ioctls)
739                 fprintf(stderr,
740                         "unexpected missing ioctl for anon memory\n"),
741                         exit(1);
742
743         if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL))
744                 perror("uffd_poll_thread create"), exit(1);
745
746         pid = fork();
747         if (pid < 0)
748                 perror("fork"), exit(1);
749
750         if (!pid)
751                 return faulting_process();
752
753         waitpid(pid, &err, 0);
754         if (err)
755                 fprintf(stderr, "faulting process failed\n"), exit(1);
756
757         if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
758                 perror("pipe write"), exit(1);
759         if (pthread_join(uffd_mon, (void **)&userfaults))
760                 return 1;
761
762         close(uffd);
763         printf("userfaults: %ld\n", userfaults);
764
765         return userfaults != nr_pages;
766 }
767
768 static int userfaultfd_stress(void)
769 {
770         void *area;
771         char *tmp_area;
772         unsigned long nr;
773         struct uffdio_register uffdio_register;
774         unsigned long cpu;
775         int err;
776         unsigned long userfaults[nr_cpus];
777
778         allocate_area((void **)&area_src);
779         if (!area_src)
780                 return 1;
781         allocate_area((void **)&area_dst);
782         if (!area_dst)
783                 return 1;
784
785         if (userfaultfd_open(0) < 0)
786                 return 1;
787
788         count_verify = malloc(nr_pages * sizeof(unsigned long long));
789         if (!count_verify) {
790                 perror("count_verify");
791                 return 1;
792         }
793
794         for (nr = 0; nr < nr_pages; nr++) {
795                 *area_mutex(area_src, nr) = (pthread_mutex_t)
796                         PTHREAD_MUTEX_INITIALIZER;
797                 count_verify[nr] = *area_count(area_src, nr) = 1;
798                 /*
799                  * In the transition between 255 to 256, powerpc will
800                  * read out of order in my_bcmp and see both bytes as
801                  * zero, so leave a placeholder below always non-zero
802                  * after the count, to avoid my_bcmp to trigger false
803                  * positives.
804                  */
805                 *(area_count(area_src, nr) + 1) = 1;
806         }
807
808         pipefd = malloc(sizeof(int) * nr_cpus * 2);
809         if (!pipefd) {
810                 perror("pipefd");
811                 return 1;
812         }
813         for (cpu = 0; cpu < nr_cpus; cpu++) {
814                 if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
815                         perror("pipe");
816                         return 1;
817                 }
818         }
819
820         if (posix_memalign(&area, page_size, page_size)) {
821                 fprintf(stderr, "out of memory\n");
822                 return 1;
823         }
824         zeropage = area;
825         bzero(zeropage, page_size);
826
827         pthread_mutex_lock(&uffd_read_mutex);
828
829         pthread_attr_init(&attr);
830         pthread_attr_setstacksize(&attr, 16*1024*1024);
831
832         err = 0;
833         while (bounces--) {
834                 unsigned long expected_ioctls;
835
836                 printf("bounces: %d, mode:", bounces);
837                 if (bounces & BOUNCE_RANDOM)
838                         printf(" rnd");
839                 if (bounces & BOUNCE_RACINGFAULTS)
840                         printf(" racing");
841                 if (bounces & BOUNCE_VERIFY)
842                         printf(" ver");
843                 if (bounces & BOUNCE_POLL)
844                         printf(" poll");
845                 printf(", ");
846                 fflush(stdout);
847
848                 if (bounces & BOUNCE_POLL)
849                         fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
850                 else
851                         fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
852
853                 /* register */
854                 uffdio_register.range.start = (unsigned long) area_dst;
855                 uffdio_register.range.len = nr_pages * page_size;
856                 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
857                 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
858                         fprintf(stderr, "register failure\n");
859                         return 1;
860                 }
861                 expected_ioctls = EXPECTED_IOCTLS;
862                 if ((uffdio_register.ioctls & expected_ioctls) !=
863                     expected_ioctls) {
864                         fprintf(stderr,
865                                 "unexpected missing ioctl for anon memory\n");
866                         return 1;
867                 }
868
869                 /*
870                  * The madvise done previously isn't enough: some
871                  * uffd_thread could have read userfaults (one of
872                  * those already resolved by the background thread)
873                  * and it may be in the process of calling
874                  * UFFDIO_COPY. UFFDIO_COPY will read the zapped
875                  * area_src and it would map a zero page in it (of
876                  * course such a UFFDIO_COPY is perfectly safe as it'd
877                  * return -EEXIST). The problem comes at the next
878                  * bounce though: that racing UFFDIO_COPY would
879                  * generate zeropages in the area_src, so invalidating
880                  * the previous MADV_DONTNEED. Without this additional
881                  * MADV_DONTNEED those zeropages leftovers in the
882                  * area_src would lead to -EEXIST failure during the
883                  * next bounce, effectively leaving a zeropage in the
884                  * area_dst.
885                  *
886                  * Try to comment this out madvise to see the memory
887                  * corruption being caught pretty quick.
888                  *
889                  * khugepaged is also inhibited to collapse THP after
890                  * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
891                  * required to MADV_DONTNEED here.
892                  */
893                 if (release_pages(area_dst))
894                         return 1;
895
896                 /* bounce pass */
897                 if (stress(userfaults))
898                         return 1;
899
900                 /* unregister */
901                 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
902                         fprintf(stderr, "register failure\n");
903                         return 1;
904                 }
905
906                 /* verification */
907                 if (bounces & BOUNCE_VERIFY) {
908                         for (nr = 0; nr < nr_pages; nr++) {
909                                 if (*area_count(area_dst, nr) != count_verify[nr]) {
910                                         fprintf(stderr,
911                                                 "error area_count %Lu %Lu %lu\n",
912                                                 *area_count(area_src, nr),
913                                                 count_verify[nr],
914                                                 nr);
915                                         err = 1;
916                                         bounces = 0;
917                                 }
918                         }
919                 }
920
921                 /* prepare next bounce */
922                 tmp_area = area_src;
923                 area_src = area_dst;
924                 area_dst = tmp_area;
925
926                 printf("userfaults:");
927                 for (cpu = 0; cpu < nr_cpus; cpu++)
928                         printf(" %lu", userfaults[cpu]);
929                 printf("\n");
930         }
931
932         if (err)
933                 return err;
934
935         close(uffd);
936         return userfaultfd_zeropage_test() || userfaultfd_events_test();
937 }
938
939 #ifndef HUGETLB_TEST
940
941 int main(int argc, char **argv)
942 {
943         if (argc < 3)
944                 fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
945         nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
946         page_size = sysconf(_SC_PAGE_SIZE);
947         if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
948             > page_size)
949                 fprintf(stderr, "Impossible to run this test\n"), exit(2);
950         nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
951                 nr_cpus;
952         if (!nr_pages_per_cpu) {
953                 fprintf(stderr, "invalid MiB\n");
954                 fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
955         }
956         bounces = atoi(argv[2]);
957         if (bounces <= 0) {
958                 fprintf(stderr, "invalid bounces\n");
959                 fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
960         }
961         nr_pages = nr_pages_per_cpu * nr_cpus;
962         printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
963                nr_pages, nr_pages_per_cpu);
964         return userfaultfd_stress();
965 }
966
967 #else /* HUGETLB_TEST */
968
/*
 * Copied from mlock2-tests.c
 *
 * Scan /proc/meminfo for the "Hugepagesize:" line and return the value
 * found there converted from kB to bytes. Returns 0 when the file cannot
 * be opened or no such line can be parsed.
 */
unsigned long default_huge_page_size(void)
{
	FILE *meminfo;
	char *buf = NULL;
	size_t buflen = 0;
	unsigned long kb = 0;
	unsigned long bytes = 0;

	meminfo = fopen("/proc/meminfo", "r");
	if (meminfo == NULL)
		return 0;

	for (;;) {
		/* stop at end of file or on a read error */
		if (getline(&buf, &buflen, meminfo) <= 0)
			break;
		if (sscanf(buf, "Hugepagesize:       %lu kB", &kb) != 1)
			continue;
		/* the first matching line wins */
		bytes = kb * 1024;
		break;
	}

	/* getline() owns the buffer it grew; release it in every case */
	free(buf);
	fclose(meminfo);
	return bytes;
}
992
993 int main(int argc, char **argv)
994 {
995         if (argc < 4)
996                 fprintf(stderr, "Usage: <MiB> <bounces> <hugetlbfs_file>\n"),
997                                 exit(1);
998         nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
999         page_size = default_huge_page_size();
1000         if (!page_size)
1001                 fprintf(stderr, "Unable to determine huge page size\n"),
1002                                 exit(2);
1003         if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
1004             > page_size)
1005                 fprintf(stderr, "Impossible to run this test\n"), exit(2);
1006         nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
1007                 nr_cpus;
1008         if (!nr_pages_per_cpu) {
1009                 fprintf(stderr, "invalid MiB\n");
1010                 fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
1011         }
1012         bounces = atoi(argv[2]);
1013         if (bounces <= 0) {
1014                 fprintf(stderr, "invalid bounces\n");
1015                 fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
1016         }
1017         nr_pages = nr_pages_per_cpu * nr_cpus;
1018         huge_fd = open(argv[3], O_CREAT | O_RDWR, 0755);
1019         if (huge_fd < 0) {
1020                 fprintf(stderr, "Open of %s failed", argv[3]);
1021                 perror("open");
1022                 exit(1);
1023         }
1024         if (ftruncate(huge_fd, 0)) {
1025                 fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
1026                 perror("ftruncate");
1027                 exit(1);
1028         }
1029         printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
1030                nr_pages, nr_pages_per_cpu);
1031         return userfaultfd_stress();
1032 }
1033
1034 #endif
1035 #else /* __NR_userfaultfd */
1036
1037 #warning "missing __NR_userfaultfd definition"
1038
/*
 * Fallback built when __NR_userfaultfd is not defined: report that the
 * test is skipped and exit successfully.
 */
int main(void)
{
	fputs("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n",
	      stdout);
	return 0;
}
1044
1045 #endif /* __NR_userfaultfd */