karo-tx-linux.git / drivers/infiniband/hw/hfi1/user_exp_rcv.c
IB/hfi1: Use filedata rather than filepointer
1 /*
2  * Copyright(c) 2015-2017 Intel Corporation.
3  *
4  * This file is provided under a dual BSD/GPLv2 license.  When using or
5  * redistributing this file, you may do so under either license.
6  *
7  * GPL LICENSE SUMMARY
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of version 2 of the GNU General Public License as
11  * published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful, but
14  * WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * General Public License for more details.
17  *
18  * BSD LICENSE
19  *
20  * Redistribution and use in source and binary forms, with or without
21  * modification, are permitted provided that the following conditions
22  * are met:
23  *
24  *  - Redistributions of source code must retain the above copyright
25  *    notice, this list of conditions and the following disclaimer.
26  *  - Redistributions in binary form must reproduce the above copyright
27  *    notice, this list of conditions and the following disclaimer in
28  *    the documentation and/or other materials provided with the
29  *    distribution.
30  *  - Neither the name of Intel Corporation nor the names of its
31  *    contributors may be used to endorse or promote products derived
32  *    from this software without specific prior written permission.
33  *
34  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
38  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
39  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
40  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
42  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
43  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45  *
46  */
47 #include <asm/page.h>
48 #include <linux/string.h>
49
50 #include "user_exp_rcv.h"
51 #include "trace.h"
52 #include "mmu_rb.h"
53
54 struct tid_group {
55         struct list_head list;
56         unsigned base;
57         u8 size;
58         u8 used;
59         u8 map;
60 };
61
62 struct tid_rb_node {
63         struct mmu_rb_node mmu;
64         unsigned long phys;
65         struct tid_group *grp;
66         u32 rcventry;
67         dma_addr_t dma_addr;
68         bool freed;
69         unsigned npages;
70         struct page *pages[0];
71 };
72
73 struct tid_pageset {
74         u16 idx;
75         u16 count;
76 };
77
78 #define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list))
79
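/*
 * Number of pages spanned by a user buffer: one page for the starting
 * address plus one for every additional page boundary crossed. With
 * 4 KiB pages, for example, a 4 KiB buffer that starts 2 KiB into a
 * page spans two pages.
 */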
80 #define num_user_pages(vaddr, len)                                     \
81         (1 + (((((unsigned long)(vaddr) +                              \
82                  (unsigned long)(len) - 1) & PAGE_MASK) -              \
83                ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
84
85 static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
86                             struct exp_tid_set *set,
87                             struct hfi1_filedata *fd);
88 static u32 find_phys_blocks(struct page **pages, unsigned npages,
89                             struct tid_pageset *list);
90 static int set_rcvarray_entry(struct hfi1_filedata *fd, unsigned long vaddr,
91                               u32 rcventry, struct tid_group *grp,
92                               struct page **pages, unsigned npages);
93 static int tid_rb_insert(void *arg, struct mmu_rb_node *node);
94 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
95                                     struct tid_rb_node *tnode);
96 static void tid_rb_remove(void *arg, struct mmu_rb_node *node);
97 static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode);
98 static int program_rcvarray(struct hfi1_filedata *fd, unsigned long vaddr,
99                             struct tid_group *grp, struct tid_pageset *sets,
100                             unsigned start, u16 count, struct page **pages,
101                             u32 *tidlist, unsigned *tididx, unsigned *pmapped);
102 static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
103                               struct tid_group **grp);
104 static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
105
106 static struct mmu_rb_ops tid_rb_ops = {
107         .insert = tid_rb_insert,
108         .remove = tid_rb_remove,
109         .invalidate = tid_rb_invalidate
110 };
111
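/*
 * RcvArray entries are handled in adjacent pairs. Encode an entry's
 * pair index (IDX) and its position within the pair (CTRL) as tidinfo
 * bits: rcventry 5, for example, yields IDX = 2 and CTRL = 2 (the odd
 * member of pair 2).
 */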
112 static inline u32 rcventry2tidinfo(u32 rcventry)
113 {
114         u32 pair = rcventry & ~0x1;
115
116         return EXP_TID_SET(IDX, pair >> 1) |
117                 EXP_TID_SET(CTRL, 1 << (rcventry - pair));
118 }
119
120 static inline void exp_tid_group_init(struct exp_tid_set *set)
121 {
122         INIT_LIST_HEAD(&set->list);
123         set->count = 0;
124 }
125
126 static inline void tid_group_remove(struct tid_group *grp,
127                                     struct exp_tid_set *set)
128 {
129         list_del_init(&grp->list);
130         set->count--;
131 }
132
133 static inline void tid_group_add_tail(struct tid_group *grp,
134                                       struct exp_tid_set *set)
135 {
136         list_add_tail(&grp->list, &set->list);
137         set->count++;
138 }
139
140 static inline struct tid_group *tid_group_pop(struct exp_tid_set *set)
141 {
142         struct tid_group *grp =
143                 list_first_entry(&set->list, struct tid_group, list);
144         list_del_init(&grp->list);
145         set->count--;
146         return grp;
147 }
148
149 static inline void tid_group_move(struct tid_group *group,
150                                   struct exp_tid_set *s1,
151                                   struct exp_tid_set *s2)
152 {
153         tid_group_remove(group, s1);
154         tid_group_add_tail(group, s2);
155 }
156
157 /*
158  * Initialize context and file private data needed for Expected
159  * receive caching. This needs to be done after the context has
160  * been configured with the eager/expected RcvEntry counts.
161  */
162 int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd)
163 {
164         struct hfi1_ctxtdata *uctxt = fd->uctxt;
165         struct hfi1_devdata *dd = uctxt->dd;
166         unsigned tidbase;
167         int i, ret = 0;
168
169         spin_lock_init(&fd->tid_lock);
170         spin_lock_init(&fd->invalid_lock);
171
172         if (!uctxt->subctxt_cnt || !fd->subctxt) {
173                 exp_tid_group_init(&uctxt->tid_group_list);
174                 exp_tid_group_init(&uctxt->tid_used_list);
175                 exp_tid_group_init(&uctxt->tid_full_list);
176
177                 tidbase = uctxt->expected_base;
178                 for (i = 0; i < uctxt->expected_count /
179                              dd->rcv_entries.group_size; i++) {
180                         struct tid_group *grp;
181
182                         grp = kzalloc(sizeof(*grp), GFP_KERNEL);
183                         if (!grp) {
184                                 /*
185                                  * If we fail here, the groups already
186                                  * allocated will be freed by the close
187                                  * call.
188                                  */
189                                 ret = -ENOMEM;
190                                 goto done;
191                         }
192                         grp->size = dd->rcv_entries.group_size;
193                         grp->base = tidbase;
194                         tid_group_add_tail(grp, &uctxt->tid_group_list);
195                         tidbase += dd->rcv_entries.group_size;
196                 }
197         }
198
199         fd->entry_to_rb = kcalloc(uctxt->expected_count,
200                                      sizeof(struct rb_node *),
201                                      GFP_KERNEL);
202         if (!fd->entry_to_rb)
203                 return -ENOMEM;
204
205         if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
206                 fd->invalid_tid_idx = 0;
207                 fd->invalid_tids = kcalloc(uctxt->expected_count,
208                                            sizeof(*fd->invalid_tids),
209                                            GFP_KERNEL);
210                 if (!fd->invalid_tids) {
211                         ret = -ENOMEM;
212                         goto done;
213                 }
214
215                 /*
216                  * Register MMU notifier callbacks. If the registration
217                  * fails, continue without TID caching for this context.
218                  */
219                 ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops,
220                                            dd->pport->hfi1_wq,
221                                            &fd->handler);
222                 if (ret) {
223                         dd_dev_info(dd,
224                                     "Failed MMU notifier registration %d\n",
225                                     ret);
226                         ret = 0;
227                 }
228         }
229
230         /*
231          * PSM does not have a good way to separate, count, and
232          * effectively enforce a limit on RcvArray entries used by
233          * subctxts (when context sharing is used) when TID caching
234          * is enabled. To help with that, we calculate a per-process
235          * RcvArray entry share and enforce that.
236          * If TID caching is not in use, PSM deals with usage on its
237          * own. In that case, we allow any subctxt to take all of the
238          * entries.
239          *
240          * Make sure that we set the tid counts only after successful
241          * init.
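         *
         * For example (illustrative numbers only): with an
         * expected_count of 2048 and 3 subcontexts, subctxts 0 and 1
         * each get a limit of 683 entries and subctxt 2 gets 682.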
242          */
243         spin_lock(&fd->tid_lock);
244         if (uctxt->subctxt_cnt && fd->handler) {
245                 u16 remainder;
246
247                 fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
248                 remainder = uctxt->expected_count % uctxt->subctxt_cnt;
249                 if (remainder && fd->subctxt < remainder)
250                         fd->tid_limit++;
251         } else {
252                 fd->tid_limit = uctxt->expected_count;
253         }
254         spin_unlock(&fd->tid_lock);
255 done:
256         return ret;
257 }
258
259 void hfi1_user_exp_rcv_grp_free(struct hfi1_ctxtdata *uctxt)
260 {
261         struct tid_group *grp, *gptr;
262
263         list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
264                                  list) {
265                 list_del_init(&grp->list);
266                 kfree(grp);
267         }
268         hfi1_clear_tids(uctxt);
269 }
270
271 int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
272 {
273         struct hfi1_ctxtdata *uctxt = fd->uctxt;
274
275         /*
276          * The notifier would have been removed when the process's mm
277          * was freed.
278          */
279         if (fd->handler) {
280                 hfi1_mmu_rb_unregister(fd->handler);
281         } else {
282                 if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
283                         unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
284                 if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
285                         unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
286         }
287
288         kfree(fd->invalid_tids);
289         fd->invalid_tids = NULL;
290
291         kfree(fd->entry_to_rb);
292         fd->entry_to_rb = NULL;
293         return 0;
294 }
295
296 /*
297  * Write an "empty" RcvArray entry.
298  * This function exists so the TID registration code can use it
299  * to write to unused/unneeded entries and still take advantage
300  * of the WC performance improvements. The HFI will ignore this
301  * write to the RcvArray entry.
302  */
303 static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
304 {
305         /*
306          * Doing the WC fill writes only makes sense if the device is
307          * present and the RcvArray has been mapped as WC memory.
308          */
309         if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc)
310                 writeq(0, dd->rcvarray_wc + (index * 8));
311 }
312
313 /*
314  * RcvArray entry allocation for Expected Receives is done by the
315  * following algorithm:
316  *
317  * The context keeps 3 lists of groups of RcvArray entries:
318  *   1. List of empty groups - tid_group_list
319  *      This list is created during user context creation and
320  *      contains elements which describe sets (of 8) of empty
321  *      RcvArray entries.
322  *   2. List of partially used groups - tid_used_list
323  *      This list contains sets of RcvArray entries which are
324  *      not completely used up. Another mapping request could
325  *      use some or all of the remaining entries.
326  *   3. List of full groups - tid_full_list
327  *      This is the list where sets that are completely used
328  *      up go.
329  *
330  * An attempt to optimize the usage of RcvArray entries is
331  * made by finding all sets of physically contiguous pages in a
332  * user's buffer.
333  * These physically contiguous sets are further split into
334  * sizes supported by the receive engine of the HFI. The
335  * resulting sets of pages are stored in struct tid_pageset,
336  * which describes the sets as:
337  *    * .count - number of pages in this set
338  *    * .idx - starting index into struct page ** array
339  *                    of this set
340  *
341  * From this point on, the algorithm deals with the page sets
342  * described above. The number of pagesets is divided by the
343  * RcvArray group size to produce the number of full groups
344  * needed.
345  *
346  * Groups from the 3 lists are manipulated using the following
347  * rules:
348  *   1. For each set of 8 pagesets, a complete group from
349  *      tid_group_list is taken, programmed, and moved to
350  *      the tid_full_list.
351  *   2. For all remaining pagesets:
352  *      2.1 If the tid_used_list is empty and the tid_group_list
353  *          is empty, stop processing pagesets and return only
354  *          what has been programmed up to this point.
355  *      2.2 If the tid_used_list is empty and the tid_group_list
356  *          is not empty, move a group from tid_group_list to
357  *          tid_used_list.
358  *      2.3 For each group in tid_used_list, program as much as
359  *          can fit into the group. If the group becomes fully
360  *          used, move it to tid_full_list.
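 *
 * As an illustration (assuming the usual group size of 8): 26 pagesets
 * consume 3 complete groups from tid_group_list (24 entries, all moved
 * to tid_full_list), and the remaining 2 pagesets are programmed into
 * a group taken from, or already on, tid_used_list.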
361  */
362 int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
363                             struct hfi1_tid_info *tinfo)
364 {
365         int ret = 0, need_group = 0, pinned;
366         struct hfi1_ctxtdata *uctxt = fd->uctxt;
367         struct hfi1_devdata *dd = uctxt->dd;
368         unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets,
369                 tididx = 0, mapped, mapped_pages = 0;
370         unsigned long vaddr = tinfo->vaddr;
371         struct page **pages = NULL;
372         u32 *tidlist = NULL;
373         struct tid_pageset *pagesets = NULL;
374
375         /* Get the number of pages the user buffer spans */
376         npages = num_user_pages(vaddr, tinfo->length);
377         if (!npages)
378                 return -EINVAL;
379
380         if (npages > uctxt->expected_count) {
381                 dd_dev_err(dd, "Expected buffer too big\n");
382                 return -EINVAL;
383         }
384
385         /* Verify that access is OK for the user buffer */
386         if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
387                        npages * PAGE_SIZE)) {
388                 dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
389                            (void *)vaddr, npages);
390                 return -EFAULT;
391         }
392
393         pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets),
394                            GFP_KERNEL);
395         if (!pagesets)
396                 return -ENOMEM;
397
398         /* Allocate the array of struct page pointers needed for pinning */
399         pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
400         if (!pages) {
401                 ret = -ENOMEM;
402                 goto bail;
403         }
404
405         /*
406          * Pin all the pages of the user buffer. If we can't pin all the
407          * pages, accept the amount pinned so far and program only that.
408          * User space knows how to deal with partially programmed buffers.
409          */
410         if (!hfi1_can_pin_pages(dd, fd->mm, fd->tid_n_pinned, npages)) {
411                 ret = -ENOMEM;
412                 goto bail;
413         }
414
415         pinned = hfi1_acquire_user_pages(fd->mm, vaddr, npages, true, pages);
416         if (pinned <= 0) {
417                 ret = pinned;
418                 goto bail;
419         }
420         fd->tid_n_pinned += npages;
421
422         /* Find sets of physically contiguous pages */
423         npagesets = find_phys_blocks(pages, pinned, pagesets);
424
425         /*
426          * We don't need to access this under a lock since tid_used is per
427          * process and the same process cannot be in hfi1_user_exp_rcv_clear()
428          * and hfi1_user_exp_rcv_setup() at the same time.
429          */
430         spin_lock(&fd->tid_lock);
431         if (fd->tid_used + npagesets > fd->tid_limit)
432                 pageset_count = fd->tid_limit - fd->tid_used;
433         else
434                 pageset_count = npagesets;
435         spin_unlock(&fd->tid_lock);
436
437         if (!pageset_count)
438                 goto bail;
439
440         ngroups = pageset_count / dd->rcv_entries.group_size;
441         tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
442         if (!tidlist) {
443                 ret = -ENOMEM;
444                 goto nomem;
445         }
446
447         tididx = 0;
448
449         /*
450          * From this point on, we are going to be using shared (between master
451          * and subcontexts) context resources. We need to take the lock.
452          */
453         mutex_lock(&uctxt->exp_lock);
454         /*
455          * The first step is to program the RcvArray entries which are complete
456          * groups.
457          */
458         while (ngroups && uctxt->tid_group_list.count) {
459                 struct tid_group *grp =
460                         tid_group_pop(&uctxt->tid_group_list);
461
462                 ret = program_rcvarray(fd, vaddr, grp, pagesets,
463                                        pageidx, dd->rcv_entries.group_size,
464                                        pages, tidlist, &tididx, &mapped);
465                 /*
466                  * If there was a failure to program the RcvArray
467                  * entries for the entire group, reset the grp fields
468                  * and add the grp back to the free group list.
469                  */
470                 if (ret <= 0) {
471                         tid_group_add_tail(grp, &uctxt->tid_group_list);
472                         hfi1_cdbg(TID,
473                                   "Failed to program RcvArray group %d", ret);
474                         goto unlock;
475                 }
476
477                 tid_group_add_tail(grp, &uctxt->tid_full_list);
478                 ngroups--;
479                 pageidx += ret;
480                 mapped_pages += mapped;
481         }
482
483         while (pageidx < pageset_count) {
484                 struct tid_group *grp, *ptr;
485                 /*
486                  * If we don't have any partially used tid groups, check
487                  * if we have empty groups. If so, take one from there and
488                  * put it in the partially used list.
489                  */
490                 if (!uctxt->tid_used_list.count || need_group) {
491                         if (!uctxt->tid_group_list.count)
492                                 goto unlock;
493
494                         grp = tid_group_pop(&uctxt->tid_group_list);
495                         tid_group_add_tail(grp, &uctxt->tid_used_list);
496                         need_group = 0;
497                 }
498                 /*
499                  * There is an optimization opportunity here - instead of
500                  * fitting as many page sets as we can, check for a group
501                  * later on in the list that could fit all of them.
502                  */
503                 list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
504                                          list) {
505                         unsigned use = min_t(unsigned, pageset_count - pageidx,
506                                              grp->size - grp->used);
507
508                         ret = program_rcvarray(fd, vaddr, grp, pagesets,
509                                                pageidx, use, pages, tidlist,
510                                                &tididx, &mapped);
511                         if (ret < 0) {
512                                 hfi1_cdbg(TID,
513                                           "Failed to program RcvArray entries %d",
514                                           ret);
515                                 ret = -EFAULT;
516                                 goto unlock;
517                         } else if (ret > 0) {
518                                 if (grp->used == grp->size)
519                                         tid_group_move(grp,
520                                                        &uctxt->tid_used_list,
521                                                        &uctxt->tid_full_list);
522                                 pageidx += ret;
523                                 mapped_pages += mapped;
524                                 need_group = 0;
525                                 /* Check if we are done so we break out early */
526                                 if (pageidx >= pageset_count)
527                                         break;
528                         } else if (WARN_ON(ret == 0)) {
529                                 /*
530                                  * If ret is 0, we did not program any entries
531                                  * into this group, which can only happen if
532                                  * we've screwed up the accounting somewhere.
533                                  * Warn and try to continue.
534                                  */
535                                 need_group = 1;
536                         }
537                 }
538         }
539 unlock:
540         mutex_unlock(&uctxt->exp_lock);
541 nomem:
542         hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
543                   mapped_pages, ret);
544         if (tididx) {
545                 spin_lock(&fd->tid_lock);
546                 fd->tid_used += tididx;
547                 spin_unlock(&fd->tid_lock);
548                 tinfo->tidcnt = tididx;
549                 tinfo->length = mapped_pages * PAGE_SIZE;
550
551                 if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
552                                  tidlist, sizeof(tidlist[0]) * tididx)) {
553                         /*
554                          * On failure to copy to the user level, we need to undo
555                          * everything done so far so we don't leak resources.
556                          */
557                         tinfo->tidlist = (unsigned long)&tidlist;
558                         hfi1_user_exp_rcv_clear(fd, tinfo);
559                         tinfo->tidlist = 0;
560                         ret = -EFAULT;
561                         goto bail;
562                 }
563         }
564
565         /*
566          * If not everything was mapped (due to insufficient RcvArray entries,
567  * for example), unpin all unmapped pages so we can pin them next time.
568          */
569         if (mapped_pages != pinned) {
570                 hfi1_release_user_pages(fd->mm, &pages[mapped_pages],
571                                         pinned - mapped_pages,
572                                         false);
573                 fd->tid_n_pinned -= pinned - mapped_pages;
574         }
575 bail:
576         kfree(pagesets);
577         kfree(pages);
578         kfree(tidlist);
579         return ret > 0 ? 0 : ret;
580 }
581
582 int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
583                             struct hfi1_tid_info *tinfo)
584 {
585         int ret = 0;
586         struct hfi1_ctxtdata *uctxt = fd->uctxt;
587         u32 *tidinfo;
588         unsigned tididx;
589
590         if (unlikely(tinfo->tidcnt > fd->tid_used))
591                 return -EINVAL;
592
593         tidinfo = memdup_user((void __user *)(unsigned long)tinfo->tidlist,
594                               sizeof(tidinfo[0]) * tinfo->tidcnt);
595         if (IS_ERR(tidinfo))
596                 return PTR_ERR(tidinfo);
597
598         mutex_lock(&uctxt->exp_lock);
599         for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
600                 ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
601                 if (ret) {
602                         hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
603                                   ret);
604                         break;
605                 }
606         }
607         spin_lock(&fd->tid_lock);
608         fd->tid_used -= tididx;
609         spin_unlock(&fd->tid_lock);
610         tinfo->tidcnt = tididx;
611         mutex_unlock(&uctxt->exp_lock);
612
613         kfree(tidinfo);
614         return ret;
615 }
616
617 int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
618                               struct hfi1_tid_info *tinfo)
619 {
620         struct hfi1_ctxtdata *uctxt = fd->uctxt;
621         unsigned long *ev = uctxt->dd->events +
622                 (((uctxt->ctxt - uctxt->dd->first_dyn_alloc_ctxt) *
623                   HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
624         u32 *array;
625         int ret = 0;
626
627         if (!fd->invalid_tids)
628                 return -EINVAL;
629
630         /*
631          * copy_to_user() can sleep, which will leave the invalid_lock
632          * locked and cause the MMU notifier to be blocked on the lock
633          * for a long time.
634          * Copy the data to a local buffer so we can release the lock.
635          */
636         array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
637         if (!array)
638                 return -EFAULT;
639
640         spin_lock(&fd->invalid_lock);
641         if (fd->invalid_tid_idx) {
642                 memcpy(array, fd->invalid_tids, sizeof(*array) *
643                        fd->invalid_tid_idx);
644                 memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
645                        fd->invalid_tid_idx);
646                 tinfo->tidcnt = fd->invalid_tid_idx;
647                 fd->invalid_tid_idx = 0;
648                 /*
649                  * Reset the user flag while still holding the lock.
650                  * Otherwise, PSM can miss events.
651                  */
652                 clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
653         } else {
654                 tinfo->tidcnt = 0;
655         }
656         spin_unlock(&fd->invalid_lock);
657
658         if (tinfo->tidcnt) {
659                 if (copy_to_user((void __user *)tinfo->tidlist,
660                                  array, sizeof(*array) * tinfo->tidcnt))
661                         ret = -EFAULT;
662         }
663         kfree(array);
664
665         return ret;
666 }
667
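/*
 * Scan the array of pinned pages and split runs of physically
 * contiguous pages into HW-sized page sets, recorded in 'list'.
 * Returns the number of page sets produced.
 */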
668 static u32 find_phys_blocks(struct page **pages, unsigned npages,
669                             struct tid_pageset *list)
670 {
671         unsigned pagecount, pageidx, setcount = 0, i;
672         unsigned long pfn, this_pfn;
673
674         if (!npages)
675                 return 0;
676
677         /*
678          * Look for sets of physically contiguous pages in the user buffer.
679          * This will allow us to optimize Expected RcvArray entry usage by
680          * using the bigger supported sizes.
681          */
682         pfn = page_to_pfn(pages[0]);
683         for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
684                 this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;
685
686                 /*
687                  * If the pfn's are not sequential, pages are not physically
688                  * If the PFNs are not sequential, the pages are not physically
689                  */
690                 if (this_pfn != ++pfn) {
691                         /*
692                          * At this point we have to loop over the set of
693                          * physically contiguous pages and break them down into
694                          * sizes supported by the HW.
695                          * There are two main constraints:
696                          *     1. The max buffer size is MAX_EXPECTED_BUFFER.
697                          *        If the total set size is bigger than that,
698                          *        program only a MAX_EXPECTED_BUFFER chunk.
699                          *     2. The buffer size has to be a power of two. If
700                          *        it is not, round down to the closest power of
701                          *        2 and program that size.
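                         *
                         * For example (4 KiB pages, limits permitting),
                         * a run of 7 contiguous pages is programmed as
                         * sets of 4, 2 and 1 pages.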
702                          */
703                         while (pagecount) {
704                                 int maxpages = pagecount;
705                                 u32 bufsize = pagecount * PAGE_SIZE;
706
707                                 if (bufsize > MAX_EXPECTED_BUFFER)
708                                         maxpages =
709                                                 MAX_EXPECTED_BUFFER >>
710                                                 PAGE_SHIFT;
711                                 else if (!is_power_of_2(bufsize))
712                                         maxpages =
713                                                 rounddown_pow_of_two(bufsize) >>
714                                                 PAGE_SHIFT;
715
716                                 list[setcount].idx = pageidx;
717                                 list[setcount].count = maxpages;
718                                 pagecount -= maxpages;
719                                 pageidx += maxpages;
720                                 setcount++;
721                         }
722                         pageidx = i;
723                         pagecount = 1;
724                         pfn = this_pfn;
725                 } else {
726                         pagecount++;
727                 }
728         }
729         return setcount;
730 }
731
732 /**
733  * program_rcvarray() - program an RcvArray group with receive buffers
734  * @fd: filedata pointer
735  * @vaddr: starting user virtual address
736  * @grp: RcvArray group
737  * @sets: array of struct tid_pageset holding information on physically
738  *        contiguous chunks from the user buffer
739  * @start: starting index into sets array
740  * @count: number of struct tid_pageset's to program
741  * @pages: an array of struct page * for the user buffer
742  * @tidlist: the array of u32 elements where the information about the
743  *           programmed RcvArray entries is to be encoded.
744  * @tididx: starting offset into tidlist
745  * @pmapped: (output parameter) number of pages programmed into the RcvArray
746  *           entries.
747  *
748  * This function will program up to 'count' number of RcvArray entries from the
749  * group 'grp'. To make best use of write-combining writes, the function will
750  * perform writes to the unused RcvArray entries which will be ignored by the
751  * HW. Each RcvArray entry will be programmed with a physically contiguous
752  * buffer chunk from the user's virtual buffer.
753  *
754  * Return:
755  * -EINVAL if the requested count is larger than the size of the group,
756  * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
757  * number of RcvArray entries programmed.
758  */
759 static int program_rcvarray(struct hfi1_filedata *fd, unsigned long vaddr,
760                             struct tid_group *grp,
761                             struct tid_pageset *sets,
762                             unsigned start, u16 count, struct page **pages,
763                             u32 *tidlist, unsigned *tididx, unsigned *pmapped)
764 {
765         struct hfi1_ctxtdata *uctxt = fd->uctxt;
766         struct hfi1_devdata *dd = uctxt->dd;
767         u16 idx;
768         u32 tidinfo = 0, rcventry, useidx = 0;
769         int mapped = 0;
770
771         /* Count should never be larger than the group size */
772         if (count > grp->size)
773                 return -EINVAL;
774
775         /* Find the first unused entry in the group */
776         for (idx = 0; idx < grp->size; idx++) {
777                 if (!(grp->map & (1 << idx))) {
778                         useidx = idx;
779                         break;
780                 }
781                 rcv_array_wc_fill(dd, grp->base + idx);
782         }
783
784         idx = 0;
785         while (idx < count) {
786                 u16 npages, pageidx, setidx = start + idx;
787                 int ret = 0;
788
789                 /*
790                  * If this entry in the group is used, move to the next one.
791                  * If we go past the end of the group, exit the loop.
792                  */
793                 if (useidx >= grp->size) {
794                         break;
795                 } else if (grp->map & (1 << useidx)) {
796                         rcv_array_wc_fill(dd, grp->base + useidx);
797                         useidx++;
798                         continue;
799                 }
800
801                 rcventry = grp->base + useidx;
802                 npages = sets[setidx].count;
803                 pageidx = sets[setidx].idx;
804
805                 ret = set_rcvarray_entry(fd, vaddr + (pageidx * PAGE_SIZE),
806                                          rcventry, grp, pages + pageidx,
807                                          npages);
808                 if (ret)
809                         return ret;
810                 mapped += npages;
811
812                 tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
813                         EXP_TID_SET(LEN, npages);
814                 tidlist[(*tididx)++] = tidinfo;
815                 grp->used++;
816                 grp->map |= 1 << useidx++;
817                 idx++;
818         }
819
820         /* Fill the rest of the group with "blank" writes */
821         for (; useidx < grp->size; useidx++)
822                 rcv_array_wc_fill(dd, grp->base + useidx);
823         *pmapped = mapped;
824         return idx;
825 }
826
827 static int set_rcvarray_entry(struct hfi1_filedata *fd, unsigned long vaddr,
828                               u32 rcventry, struct tid_group *grp,
829                               struct page **pages, unsigned npages)
830 {
831         int ret;
832         struct hfi1_ctxtdata *uctxt = fd->uctxt;
833         struct tid_rb_node *node;
834         struct hfi1_devdata *dd = uctxt->dd;
835         dma_addr_t phys;
836
837         /*
838          * Allocate the node first so we can handle a potential
839          * failure before we've programmed anything.
840          */
841         node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
842                        GFP_KERNEL);
843         if (!node)
844                 return -ENOMEM;
845
846         phys = pci_map_single(dd->pcidev,
847                               __va(page_to_phys(pages[0])),
848                               npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
849         if (dma_mapping_error(&dd->pcidev->dev, phys)) {
850                 dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
851                            phys);
852                 kfree(node);
853                 return -EFAULT;
854         }
855
856         node->mmu.addr = vaddr;
857         node->mmu.len = npages * PAGE_SIZE;
858         node->phys = page_to_phys(pages[0]);
859         node->npages = npages;
860         node->rcventry = rcventry;
861         node->dma_addr = phys;
862         node->grp = grp;
863         node->freed = false;
864         memcpy(node->pages, pages, sizeof(struct page *) * npages);
865
866         if (!fd->handler)
867                 ret = tid_rb_insert(fd, &node->mmu);
868         else
869                 ret = hfi1_mmu_rb_insert(fd->handler, &node->mmu);
870
871         if (ret) {
872                 hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
873                           node->rcventry, node->mmu.addr, node->phys, ret);
874                 pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
875                                  PCI_DMA_FROMDEVICE);
876                 kfree(node);
877                 return -EFAULT;
878         }
879         hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
880         trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
881                                node->mmu.addr, node->phys, phys);
882         return 0;
883 }
884
885 static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
886                               struct tid_group **grp)
887 {
888         struct hfi1_ctxtdata *uctxt = fd->uctxt;
889         struct hfi1_devdata *dd = uctxt->dd;
890         struct tid_rb_node *node;
891         u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
892         u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
893
894         if (tididx >= uctxt->expected_count) {
895                 dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
896                            tididx, uctxt->ctxt);
897                 return -EINVAL;
898         }
899
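        /*
         * tidctrl selects the entry within the pair addressed by tididx:
         * 1 for the even entry, 2 for the odd one. A value of 3 would
         * address both entries of the pair and is not valid here.
         */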
900         if (tidctrl == 0x3)
901                 return -EINVAL;
902
903         rcventry = tididx + (tidctrl - 1);
904
905         node = fd->entry_to_rb[rcventry];
906         if (!node || node->rcventry != (uctxt->expected_base + rcventry))
907                 return -EBADF;
908
909         if (grp)
910                 *grp = node->grp;
911
912         if (!fd->handler)
913                 cacheless_tid_rb_remove(fd, node);
914         else
915                 hfi1_mmu_rb_remove(fd->handler, &node->mmu);
916
917         return 0;
918 }
919
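/*
 * Tear down one programmed RcvArray entry: invalidate the HW entry,
 * DMA-unmap and unpin the backing pages, and return the entry to its
 * group, moving the group back toward tid_group_list as it empties.
 */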
920 static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
921 {
922         struct hfi1_ctxtdata *uctxt = fd->uctxt;
923         struct hfi1_devdata *dd = uctxt->dd;
924
925         trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
926                                  node->npages, node->mmu.addr, node->phys,
927                                  node->dma_addr);
928
929         hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0);
930         /*
931          * Make sure the device has seen the write before we unpin the
932          * pages.
933          */
934         flush_wc();
935
936         pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len,
937                          PCI_DMA_FROMDEVICE);
938         hfi1_release_user_pages(fd->mm, node->pages, node->npages, true);
939         fd->tid_n_pinned -= node->npages;
940
941         node->grp->used--;
942         node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
943
944         if (node->grp->used == node->grp->size - 1)
945                 tid_group_move(node->grp, &uctxt->tid_full_list,
946                                &uctxt->tid_used_list);
947         else if (!node->grp->used)
948                 tid_group_move(node->grp, &uctxt->tid_used_list,
949                                &uctxt->tid_group_list);
950         kfree(node);
951 }
952
953 /*
954  * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
955  * clearing nodes in the non-cached case.
956  */
957 static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
958                             struct exp_tid_set *set,
959                             struct hfi1_filedata *fd)
960 {
961         struct tid_group *grp, *ptr;
962         int i;
963
964         list_for_each_entry_safe(grp, ptr, &set->list, list) {
965                 list_del_init(&grp->list);
966
967                 for (i = 0; i < grp->size; i++) {
968                         if (grp->map & (1 << i)) {
969                                 u16 rcventry = grp->base + i;
970                                 struct tid_rb_node *node;
971
972                                 node = fd->entry_to_rb[rcventry -
973                                                           uctxt->expected_base];
974                                 if (!node || node->rcventry != rcventry)
975                                         continue;
976
977                                 cacheless_tid_rb_remove(fd, node);
978                         }
979                 }
980         }
981 }
982
983 /*
984  * Always return 0 from this function.  A non-zero return indicates that the
985  * remove operation will be called and that memory should be unpinned.
986  * However, the driver cannot unpin out from under PSM.  Instead, retain the
987  * memory (by returning 0) and inform PSM that the memory is going away.  PSM
988  * will call back later when it has removed the memory from its list.
989  */
990 static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
991 {
992         struct hfi1_filedata *fdata = arg;
993         struct hfi1_ctxtdata *uctxt = fdata->uctxt;
994         struct tid_rb_node *node =
995                 container_of(mnode, struct tid_rb_node, mmu);
996
997         if (node->freed)
998                 return 0;
999
1000         trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr,
1001                                  node->rcventry, node->npages, node->dma_addr);
1002         node->freed = true;
1003
1004         spin_lock(&fdata->invalid_lock);
1005         if (fdata->invalid_tid_idx < uctxt->expected_count) {
1006                 fdata->invalid_tids[fdata->invalid_tid_idx] =
1007                         rcventry2tidinfo(node->rcventry - uctxt->expected_base);
1008                 fdata->invalid_tids[fdata->invalid_tid_idx] |=
1009                         EXP_TID_SET(LEN, node->npages);
1010                 if (!fdata->invalid_tid_idx) {
1011                         unsigned long *ev;
1012
1013                         /*
1014                          * hfi1_set_uevent_bits() sets a user event flag
1015                          * for all processes. Because calling into the
1016                          * driver to process TID cache invalidations is
1017                          * expensive and TID cache invalidations are
1018                          * handled on a per-process basis, we can
1019                          * optimize this to set the flag only for the
1020                          * process in question.
1021                          */
1022                         ev = uctxt->dd->events +
1023                           (((uctxt->ctxt - uctxt->dd->first_dyn_alloc_ctxt) *
1024                             HFI1_MAX_SHARED_CTXTS) + fdata->subctxt);
1025                         set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
1026                 }
1027                 fdata->invalid_tid_idx++;
1028         }
1029         spin_unlock(&fdata->invalid_lock);
1030         return 0;
1031 }
1032
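/*
 * MMU rb insert callback: record the node in the per-context
 * rcventry -> tid_rb_node lookup table (entry_to_rb).
 */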
1033 static int tid_rb_insert(void *arg, struct mmu_rb_node *node)
1034 {
1035         struct hfi1_filedata *fdata = arg;
1036         struct tid_rb_node *tnode =
1037                 container_of(node, struct tid_rb_node, mmu);
1038         u32 base = fdata->uctxt->expected_base;
1039
1040         fdata->entry_to_rb[tnode->rcventry - base] = tnode;
1041         return 0;
1042 }
1043
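/*
 * Clear the lookup table entry and free the node without going through
 * the MMU rb handler; used when no handler is registered and by the
 * remove callback itself.
 */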
1044 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
1045                                     struct tid_rb_node *tnode)
1046 {
1047         u32 base = fdata->uctxt->expected_base;
1048
1049         fdata->entry_to_rb[tnode->rcventry - base] = NULL;
1050         clear_tid_node(fdata, tnode);
1051 }
1052
1053 static void tid_rb_remove(void *arg, struct mmu_rb_node *node)
1054 {
1055         struct hfi1_filedata *fdata = arg;
1056         struct tid_rb_node *tnode =
1057                 container_of(node, struct tid_rb_node, mmu);
1058
1059         cacheless_tid_rb_remove(fdata, tnode);
1060 }