staging/rdma/hfi: fix CQ completion order issue
[karo-tx-linux.git] / drivers / staging / rdma / hfi1 / verbs.c
1 /*
2  *
3  * This file is provided under a dual BSD/GPLv2 license.  When using or
4  * redistributing this file, you may do so under either license.
5  *
6  * GPL LICENSE SUMMARY
7  *
8  * Copyright(c) 2015 Intel Corporation.
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of version 2 of the GNU General Public License as
12  * published by the Free Software Foundation.
13  *
14  * This program is distributed in the hope that it will be useful, but
15  * WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * General Public License for more details.
18  *
19  * BSD LICENSE
20  *
21  * Copyright(c) 2015 Intel Corporation.
22  *
23  * Redistribution and use in source and binary forms, with or without
24  * modification, are permitted provided that the following conditions
25  * are met:
26  *
27  *  - Redistributions of source code must retain the above copyright
28  *    notice, this list of conditions and the following disclaimer.
29  *  - Redistributions in binary form must reproduce the above copyright
30  *    notice, this list of conditions and the following disclaimer in
31  *    the documentation and/or other materials provided with the
32  *    distribution.
33  *  - Neither the name of Intel Corporation nor the names of its
34  *    contributors may be used to endorse or promote products derived
35  *    from this software without specific prior written permission.
36  *
37  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
38  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48  *
49  */
50
51 #include <rdma/ib_mad.h>
52 #include <rdma/ib_user_verbs.h>
53 #include <linux/io.h>
54 #include <linux/module.h>
55 #include <linux/utsname.h>
56 #include <linux/rculist.h>
57 #include <linux/mm.h>
58 #include <linux/random.h>
59 #include <linux/vmalloc.h>
60
61 #include "hfi.h"
62 #include "common.h"
63 #include "device.h"
64 #include "trace.h"
65 #include "qp.h"
66 #include "verbs_txreq.h"
67
68 static unsigned int hfi1_lkey_table_size = 16;
69 module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
70                    S_IRUGO);
71 MODULE_PARM_DESC(lkey_table_size,
72                  "LKEY table size in bits (2^n, 1 <= n <= 23)");
73
74 static unsigned int hfi1_max_pds = 0xFFFF;
75 module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO);
76 MODULE_PARM_DESC(max_pds,
77                  "Maximum number of protection domains to support");
78
79 static unsigned int hfi1_max_ahs = 0xFFFF;
80 module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO);
81 MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
82
83 unsigned int hfi1_max_cqes = 0x2FFFF;
84 module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO);
85 MODULE_PARM_DESC(max_cqes,
86                  "Maximum number of completion queue entries to support");
87
88 unsigned int hfi1_max_cqs = 0x1FFFF;
89 module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO);
90 MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
91
92 unsigned int hfi1_max_qp_wrs = 0x3FFF;
93 module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO);
94 MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
95
96 unsigned int hfi1_max_qps = 16384;
97 module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO);
98 MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
99
100 unsigned int hfi1_max_sges = 0x60;
101 module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO);
102 MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
103
104 unsigned int hfi1_max_mcast_grps = 16384;
105 module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO);
106 MODULE_PARM_DESC(max_mcast_grps,
107                  "Maximum number of multicast groups to support");
108
109 unsigned int hfi1_max_mcast_qp_attached = 16;
110 module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached,
111                    uint, S_IRUGO);
112 MODULE_PARM_DESC(max_mcast_qp_attached,
113                  "Maximum number of attached QPs to support");
114
115 unsigned int hfi1_max_srqs = 1024;
116 module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO);
117 MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
118
119 unsigned int hfi1_max_srq_sges = 128;
120 module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO);
121 MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
122
123 unsigned int hfi1_max_srq_wrs = 0x1FFFF;
124 module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO);
125 MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs to support");
126
127 unsigned short piothreshold;
128 module_param(piothreshold, ushort, S_IRUGO);
129 MODULE_PARM_DESC(piothreshold, "size (in bytes) at or below which PIO is used instead of SDMA");
130
131 static void verbs_sdma_complete(
132         struct sdma_txreq *cookie,
133         int status);
134
135 static int pio_wait(struct rvt_qp *qp,
136                     struct send_context *sc,
137                     struct hfi1_pkt_state *ps,
138                     u32 flag);
139
140 /* Length of buffer to create verbs txreq cache name */
141 #define TXREQ_NAME_LEN 24
142
143 /*
144  * Translate ib_wr_opcode into ib_wc_opcode.
145  */
146 const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
147         [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
148         [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
149         [IB_WR_SEND] = IB_WC_SEND,
150         [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
151         [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
152         [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
153         [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
154 };
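/*
 * Used when generating send completions: wc.opcode is taken from this
 * table, e.g. a completed IB_WR_RDMA_WRITE_WITH_IMM work request is
 * reported to the consumer as IB_WC_RDMA_WRITE.
 */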
155
156 /*
157  * Length of header by opcode, 0 --> not supported
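 * Each value is LRH (8 bytes) + BTH (12 bytes) plus the per-opcode
 * extension headers (RETH, AETH, AtomicETH, DETH, immediate data).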
158  */
159 const u8 hdr_len_by_opcode[256] = {
160         /* RC */
161         [IB_OPCODE_RC_SEND_FIRST]                     = 12 + 8,
162         [IB_OPCODE_RC_SEND_MIDDLE]                    = 12 + 8,
163         [IB_OPCODE_RC_SEND_LAST]                      = 12 + 8,
164         [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
165         [IB_OPCODE_RC_SEND_ONLY]                      = 12 + 8,
166         [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
167         [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
168         [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = 12 + 8,
169         [IB_OPCODE_RC_RDMA_WRITE_LAST]                = 12 + 8,
170         [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
171         [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
172         [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
173         [IB_OPCODE_RC_RDMA_READ_REQUEST]              = 12 + 8 + 16,
174         [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = 12 + 8 + 4,
175         [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = 12 + 8,
176         [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = 12 + 8 + 4,
177         [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = 12 + 8 + 4,
178         [IB_OPCODE_RC_ACKNOWLEDGE]                    = 12 + 8 + 4,
179         [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = 12 + 8 + 4,
180         [IB_OPCODE_RC_COMPARE_SWAP]                   = 12 + 8 + 28,
181         [IB_OPCODE_RC_FETCH_ADD]                      = 12 + 8 + 28,
182         /* UC */
183         [IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
184         [IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
185         [IB_OPCODE_UC_SEND_LAST]                      = 12 + 8,
186         [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
187         [IB_OPCODE_UC_SEND_ONLY]                      = 12 + 8,
188         [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
189         [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
190         [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = 12 + 8,
191         [IB_OPCODE_UC_RDMA_WRITE_LAST]                = 12 + 8,
192         [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
193         [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
194         [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
195         /* UD */
196         [IB_OPCODE_UD_SEND_ONLY]                      = 12 + 8 + 8,
197         [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 12
198 };
199
200 static const opcode_handler opcode_handler_tbl[256] = {
201         /* RC */
202         [IB_OPCODE_RC_SEND_FIRST]                     = &hfi1_rc_rcv,
203         [IB_OPCODE_RC_SEND_MIDDLE]                    = &hfi1_rc_rcv,
204         [IB_OPCODE_RC_SEND_LAST]                      = &hfi1_rc_rcv,
205         [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
206         [IB_OPCODE_RC_SEND_ONLY]                      = &hfi1_rc_rcv,
207         [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
208         [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = &hfi1_rc_rcv,
209         [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = &hfi1_rc_rcv,
210         [IB_OPCODE_RC_RDMA_WRITE_LAST]                = &hfi1_rc_rcv,
211         [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
212         [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = &hfi1_rc_rcv,
213         [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
214         [IB_OPCODE_RC_RDMA_READ_REQUEST]              = &hfi1_rc_rcv,
215         [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = &hfi1_rc_rcv,
216         [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = &hfi1_rc_rcv,
217         [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = &hfi1_rc_rcv,
218         [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = &hfi1_rc_rcv,
219         [IB_OPCODE_RC_ACKNOWLEDGE]                    = &hfi1_rc_rcv,
220         [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = &hfi1_rc_rcv,
221         [IB_OPCODE_RC_COMPARE_SWAP]                   = &hfi1_rc_rcv,
222         [IB_OPCODE_RC_FETCH_ADD]                      = &hfi1_rc_rcv,
223         /* UC */
224         [IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
225         [IB_OPCODE_UC_SEND_MIDDLE]                    = &hfi1_uc_rcv,
226         [IB_OPCODE_UC_SEND_LAST]                      = &hfi1_uc_rcv,
227         [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
228         [IB_OPCODE_UC_SEND_ONLY]                      = &hfi1_uc_rcv,
229         [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
230         [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = &hfi1_uc_rcv,
231         [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = &hfi1_uc_rcv,
232         [IB_OPCODE_UC_RDMA_WRITE_LAST]                = &hfi1_uc_rcv,
233         [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
234         [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = &hfi1_uc_rcv,
235         [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
236         /* UD */
237         [IB_OPCODE_UD_SEND_ONLY]                      = &hfi1_ud_rcv,
238         [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_ud_rcv,
239         /* CNP */
240         [IB_OPCODE_CNP]                               = &hfi1_cnp_rcv
241 };
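/*
 * hfi1_ib_rcv() below indexes this table with the packet's BTH opcode to
 * dispatch to the RC, UC, UD or CNP receive handler.
 */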
242
243 /*
244  * System image GUID.
245  */
246 __be64 ib_hfi1_sys_image_guid;
247
248 /**
249  * hfi1_copy_sge - copy data to SGE memory
250  * @ss: the SGE state
251  * @data: the data to copy
252  * @length: the length of the data
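 * @release: if true, release the MR reference as each SGE is consumed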
253  * @copy_last: do a separate copy of the last 8 bytes
254  */
255 void hfi1_copy_sge(
256         struct rvt_sge_state *ss,
257         void *data, u32 length,
258         int release,
259         int copy_last)
260 {
261         struct rvt_sge *sge = &ss->sge;
262         int in_last = 0;
263         int i;
264
265         if (copy_last) {
266                 if (length > 8) {
267                         length -= 8;
268                 } else {
269                         copy_last = 0;
270                         in_last = 1;
271                 }
272         }
273
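        /*
         * When copy_last is set, the first pass copies everything except the
         * trailing 8 bytes; those are then copied byte-by-byte (in_last) so
         * they are the last data to land in memory.
         */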
274 again:
275         while (length) {
276                 u32 len = sge->length;
277
278                 if (len > length)
279                         len = length;
280                 if (len > sge->sge_length)
281                         len = sge->sge_length;
282                 WARN_ON_ONCE(len == 0);
283                 if (in_last) {
284                         /* enforce byte transfer ordering */
285                         for (i = 0; i < len; i++)
286                                 ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
287                 } else {
288                         memcpy(sge->vaddr, data, len);
289                 }
290                 sge->vaddr += len;
291                 sge->length -= len;
292                 sge->sge_length -= len;
293                 if (sge->sge_length == 0) {
294                         if (release)
295                                 rvt_put_mr(sge->mr);
296                         if (--ss->num_sge)
297                                 *sge = *ss->sg_list++;
298                 } else if (sge->length == 0 && sge->mr->lkey) {
299                         if (++sge->n >= RVT_SEGSZ) {
300                                 if (++sge->m >= sge->mr->mapsz)
301                                         break;
302                                 sge->n = 0;
303                         }
304                         sge->vaddr =
305                                 sge->mr->map[sge->m]->segs[sge->n].vaddr;
306                         sge->length =
307                                 sge->mr->map[sge->m]->segs[sge->n].length;
308                 }
309                 data += len;
310                 length -= len;
311         }
312
313         if (copy_last) {
314                 copy_last = 0;
315                 in_last = 1;
316                 length = 8;
317                 goto again;
318         }
319 }
320
321 /**
322  * hfi1_skip_sge - skip over SGE memory
323  * @ss: the SGE state
324  * @length: the number of bytes to skip
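 * @release: if true, release the MR reference as each SGE is consumed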
325  */
326 void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release)
327 {
328         struct rvt_sge *sge = &ss->sge;
329
330         while (length) {
331                 u32 len = sge->length;
332
333                 if (len > length)
334                         len = length;
335                 if (len > sge->sge_length)
336                         len = sge->sge_length;
337                 WARN_ON_ONCE(len == 0);
338                 sge->vaddr += len;
339                 sge->length -= len;
340                 sge->sge_length -= len;
341                 if (sge->sge_length == 0) {
342                         if (release)
343                                 rvt_put_mr(sge->mr);
344                         if (--ss->num_sge)
345                                 *sge = *ss->sg_list++;
346                 } else if (sge->length == 0 && sge->mr->lkey) {
347                         if (++sge->n >= RVT_SEGSZ) {
348                                 if (++sge->m >= sge->mr->mapsz)
349                                         break;
350                                 sge->n = 0;
351                         }
352                         sge->vaddr =
353                                 sge->mr->map[sge->m]->segs[sge->n].vaddr;
354                         sge->length =
355                                 sge->mr->map[sge->m]->segs[sge->n].length;
356                 }
357                 length -= len;
358         }
359 }
360
361 /*
362  * Make sure the QP is ready and able to accept the given opcode.
363  */
364 static inline int qp_ok(int opcode, struct hfi1_packet *packet)
365 {
366         struct hfi1_ibport *ibp;
367
368         if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
369                 goto dropit;
370         if (((opcode & OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
371             (opcode == IB_OPCODE_CNP))
372                 return 1;
373 dropit:
374         ibp = &packet->rcd->ppd->ibport_data;
375         ibp->rvp.n_pkt_drops++;
376         return 0;
377 }
378
379
380 /**
381  * hfi1_ib_rcv - process an incoming packet
382  * @packet: data packet information
383  *
384  * This is called to process an incoming packet at interrupt level.
385  *
386  * Tlen is the length of the header + data + CRC in bytes.
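 *
 * Multicast packets are dispatched to every QP attached to the multicast
 * group; unicast packets are looked up by destination QP number under RCU.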
387  */
388 void hfi1_ib_rcv(struct hfi1_packet *packet)
389 {
390         struct hfi1_ctxtdata *rcd = packet->rcd;
391         struct hfi1_ib_header *hdr = packet->hdr;
392         u32 tlen = packet->tlen;
393         struct hfi1_pportdata *ppd = rcd->ppd;
394         struct hfi1_ibport *ibp = &ppd->ibport_data;
395         struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
396         unsigned long flags;
397         u32 qp_num;
398         int lnh;
399         u8 opcode;
400         u16 lid;
401
402         /* Check for GRH */
403         lnh = be16_to_cpu(hdr->lrh[0]) & 3;
404         if (lnh == HFI1_LRH_BTH)
405                 packet->ohdr = &hdr->u.oth;
406         else if (lnh == HFI1_LRH_GRH) {
407                 u32 vtf;
408
409                 packet->ohdr = &hdr->u.l.oth;
410                 if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
411                         goto drop;
412                 vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
413                 if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
414                         goto drop;
415                 packet->rcv_flags |= HFI1_HAS_GRH;
416         } else
417                 goto drop;
418
419         trace_input_ibhdr(rcd->dd, hdr);
420
421         opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
422         inc_opstats(tlen, &rcd->opstats->stats[opcode]);
423
424         /* Get the destination QP number. */
425         qp_num = be32_to_cpu(packet->ohdr->bth[1]) & RVT_QPN_MASK;
426         lid = be16_to_cpu(hdr->lrh[1]);
427         if (unlikely((lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
428                      (lid != be16_to_cpu(IB_LID_PERMISSIVE)))) {
429                 struct rvt_mcast *mcast;
430                 struct rvt_mcast_qp *p;
431
432                 if (lnh != HFI1_LRH_GRH)
433                         goto drop;
434                 mcast = rvt_mcast_find(&ibp->rvp, &hdr->u.l.grh.dgid);
435                 if (mcast == NULL)
436                         goto drop;
437                 list_for_each_entry_rcu(p, &mcast->qp_list, list) {
438                         packet->qp = p->qp;
439                         spin_lock_irqsave(&packet->qp->r_lock, flags);
440                         if (likely((qp_ok(opcode, packet))))
441                                 opcode_handler_tbl[opcode](packet);
442                         spin_unlock_irqrestore(&packet->qp->r_lock, flags);
443                 }
444                 /*
445                  * Notify rvt_multicast_detach() if it is waiting for us
446                  * to finish.
447                  */
448                 if (atomic_dec_return(&mcast->refcount) <= 1)
449                         wake_up(&mcast->wait);
450         } else {
451                 rcu_read_lock();
452                 packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
453                 if (!packet->qp) {
454                         rcu_read_unlock();
455                         goto drop;
456                 }
457                 spin_lock_irqsave(&packet->qp->r_lock, flags);
458                 if (likely((qp_ok(opcode, packet))))
459                         opcode_handler_tbl[opcode](packet);
460                 spin_unlock_irqrestore(&packet->qp->r_lock, flags);
461                 rcu_read_unlock();
462         }
463         return;
464
465 drop:
466         ibp->rvp.n_pkt_drops++;
467 }
468
469 /*
470  * This is called from a timer to check for QPs
471  * which need kernel memory in order to send a packet.
472  */
473 static void mem_timer(unsigned long data)
474 {
475         struct hfi1_ibdev *dev = (struct hfi1_ibdev *)data;
476         struct list_head *list = &dev->memwait;
477         struct rvt_qp *qp = NULL;
478         struct iowait *wait;
479         unsigned long flags;
480         struct hfi1_qp_priv *priv;
481
482         write_seqlock_irqsave(&dev->iowait_lock, flags);
483         if (!list_empty(list)) {
484                 wait = list_first_entry(list, struct iowait, list);
485                 qp = iowait_to_qp(wait);
486                 priv = qp->priv;
487                 list_del_init(&priv->s_iowait.list);
488                 /* refcount held until actual wake up */
489                 if (!list_empty(list))
490                         mod_timer(&dev->mem_timer, jiffies + 1);
491         }
492         write_sequnlock_irqrestore(&dev->iowait_lock, flags);
493
494         if (qp)
495                 hfi1_qp_wakeup(qp, RVT_S_WAIT_KMEM);
496 }
497
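/*
 * update_sge - advance the SGE state by @length bytes, stepping to the
 * next SGE or MR segment when the current one is exhausted.
 */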
498 void update_sge(struct rvt_sge_state *ss, u32 length)
499 {
500         struct rvt_sge *sge = &ss->sge;
501
502         sge->vaddr += length;
503         sge->length -= length;
504         sge->sge_length -= length;
505         if (sge->sge_length == 0) {
506                 if (--ss->num_sge)
507                         *sge = *ss->sg_list++;
508         } else if (sge->length == 0 && sge->mr->lkey) {
509                 if (++sge->n >= RVT_SEGSZ) {
510                         if (++sge->m >= sge->mr->mapsz)
511                                 return;
512                         sge->n = 0;
513                 }
514                 sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
515                 sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
516         }
517 }
518
519 /*
520  * This is called with progress side lock held.
521  */
522 /* New API */
523 static void verbs_sdma_complete(
524         struct sdma_txreq *cookie,
525         int status)
526 {
527         struct verbs_txreq *tx =
528                 container_of(cookie, struct verbs_txreq, txreq);
529         struct rvt_qp *qp = tx->qp;
530
531         spin_lock(&qp->s_lock);
532         if (tx->wqe)
533                 hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
534         else if (qp->ibqp.qp_type == IB_QPT_RC) {
535                 struct hfi1_ib_header *hdr;
536
537                 hdr = &tx->phdr.hdr;
538                 hfi1_rc_send_complete(qp, hdr);
539         }
540         spin_unlock(&qp->s_lock);
541
542         hfi1_put_txreq(tx);
543 }
544
545 static int wait_kmem(struct hfi1_ibdev *dev,
546                      struct rvt_qp *qp,
547                      struct hfi1_pkt_state *ps)
548 {
549         struct hfi1_qp_priv *priv = qp->priv;
550         unsigned long flags;
551         int ret = 0;
552
553         spin_lock_irqsave(&qp->s_lock, flags);
554         if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
555                 write_seqlock(&dev->iowait_lock);
556                 list_add_tail(&ps->s_txreq->txreq.list,
557                               &priv->s_iowait.tx_head);
558                 if (list_empty(&priv->s_iowait.list)) {
559                         if (list_empty(&dev->memwait))
560                                 mod_timer(&dev->mem_timer, jiffies + 1);
561                         qp->s_flags |= RVT_S_WAIT_KMEM;
562                         list_add_tail(&priv->s_iowait.list, &dev->memwait);
563                         trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
564                         atomic_inc(&qp->refcount);
565                 }
566                 write_sequnlock(&dev->iowait_lock);
567                 qp->s_flags &= ~RVT_S_BUSY;
568                 ret = -EBUSY;
569         }
570         spin_unlock_irqrestore(&qp->s_lock, flags);
571
572         return ret;
573 }
574
575 /*
576  * This routine calls txadds for each sg entry.
577  *
578  * Add failures will revert the sge cursor
579  */
580 static noinline int build_verbs_ulp_payload(
581         struct sdma_engine *sde,
582         struct rvt_sge_state *ss,
583         u32 length,
584         struct verbs_txreq *tx)
585 {
586         struct rvt_sge *sg_list = ss->sg_list;
587         struct rvt_sge sge = ss->sge;
588         u8 num_sge = ss->num_sge;
589         u32 len;
590         int ret = 0;
591
592         while (length) {
593                 len = ss->sge.length;
594                 if (len > length)
595                         len = length;
596                 if (len > ss->sge.sge_length)
597                         len = ss->sge.sge_length;
598                 WARN_ON_ONCE(len == 0);
599                 ret = sdma_txadd_kvaddr(
600                         sde->dd,
601                         &tx->txreq,
602                         ss->sge.vaddr,
603                         len);
604                 if (ret)
605                         goto bail_txadd;
606                 update_sge(ss, len);
607                 length -= len;
608         }
609         return ret;
610 bail_txadd:
611         /* unwind cursor */
612         ss->sge = sge;
613         ss->num_sge = num_sge;
614         ss->sg_list = sg_list;
615         return ret;
616 }
617
618 /*
619  * Build the number of DMA descriptors needed to send length bytes of data.
620  *
621  * NOTE: DMA mapping is held in the tx until completed in the ring or
622  *       the tx desc is freed without having been submitted to the ring
623  *
624  * This routine ensures all the helper routine calls succeed.
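 *
 * When ahdr carries no AHG updates, the PBC plus the full header is added
 * as the first kvaddr fragment; otherwise only the AHG descriptors are
 * passed to the SDMA engine along with the header size.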
625  */
626 /* New API */
627 static int build_verbs_tx_desc(
628         struct sdma_engine *sde,
629         struct rvt_sge_state *ss,
630         u32 length,
631         struct verbs_txreq *tx,
632         struct ahg_ib_header *ahdr,
633         u64 pbc)
634 {
635         int ret = 0;
636         struct hfi1_pio_header *phdr = &tx->phdr;
637         u16 hdrbytes = tx->hdr_dwords << 2;
638
639         if (!ahdr->ahgcount) {
640                 ret = sdma_txinit_ahg(
641                         &tx->txreq,
642                         ahdr->tx_flags,
643                         hdrbytes + length,
644                         ahdr->ahgidx,
645                         0,
646                         NULL,
647                         0,
648                         verbs_sdma_complete);
649                 if (ret)
650                         goto bail_txadd;
651                 phdr->pbc = cpu_to_le64(pbc);
652                 ret = sdma_txadd_kvaddr(
653                         sde->dd,
654                         &tx->txreq,
655                         phdr,
656                         hdrbytes);
657                 if (ret)
658                         goto bail_txadd;
659         } else {
660                 ret = sdma_txinit_ahg(
661                         &tx->txreq,
662                         ahdr->tx_flags,
663                         length,
664                         ahdr->ahgidx,
665                         ahdr->ahgcount,
666                         ahdr->ahgdesc,
667                         hdrbytes,
668                         verbs_sdma_complete);
669                 if (ret)
670                         goto bail_txadd;
671         }
672
673         /* add the ulp payload - if any.  ss can be NULL for acks */
674         if (ss)
675                 ret = build_verbs_ulp_payload(sde, ss, length, tx);
676 bail_txadd:
677         return ret;
678 }
679
680 int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
681                         u64 pbc)
682 {
683         struct hfi1_qp_priv *priv = qp->priv;
684         struct ahg_ib_header *ahdr = priv->s_hdr;
685         u32 hdrwords = qp->s_hdrwords;
686         struct rvt_sge_state *ss = qp->s_cur_sge;
687         u32 len = qp->s_cur_size;
688         u32 plen = hdrwords + ((len + 3) >> 2) + 2; /* includes pbc */
689         struct hfi1_ibdev *dev = ps->dev;
690         struct hfi1_pportdata *ppd = ps->ppd;
691         struct verbs_txreq *tx;
692         u64 pbc_flags = 0;
693         u8 sc5 = priv->s_sc;
694
695         int ret;
696
697         tx = ps->s_txreq;
698         if (!sdma_txreq_built(&tx->txreq)) {
699                 if (likely(pbc == 0)) {
700                         u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
701                         /* No vl15 here */
702                         /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
703                         pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
704
705                         pbc = create_pbc(ppd,
706                                          pbc_flags,
707                                          qp->srate_mbps,
708                                          vl,
709                                          plen);
710                 }
711                 tx->wqe = qp->s_wqe;
712                 ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahdr, pbc);
713                 if (unlikely(ret))
714                         goto bail_build;
715         }
716         trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
717                            &ps->s_txreq->phdr.hdr);
718         ret =  sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq);
719         if (unlikely(ret == -ECOMM))
720                 goto bail_ecomm;
721         return ret;
722
723 bail_ecomm:
724         /* The current one got "sent" */
725         return 0;
726 bail_build:
727         ret = wait_kmem(dev, qp, ps);
728         if (!ret) {
729                 /* free txreq - bad state */
730                 hfi1_put_txreq(ps->s_txreq);
731                 ps->s_txreq = NULL;
732         }
733         return ret;
734 }
735
736 /*
737  * If we are now in the error state, return zero to flush the
738  * send work request.
739  */
740 static int pio_wait(struct rvt_qp *qp,
741                     struct send_context *sc,
742                     struct hfi1_pkt_state *ps,
743                     u32 flag)
744 {
745         struct hfi1_qp_priv *priv = qp->priv;
746         struct hfi1_devdata *dd = sc->dd;
747         struct hfi1_ibdev *dev = &dd->verbs_dev;
748         unsigned long flags;
749         int ret = 0;
750
751         /*
752          * Note that as soon as want_buffer() is called and
753          * possibly before it returns, sc_piobufavail()
754          * could be called. Therefore, put QP on the I/O wait list before
755          * enabling the PIO avail interrupt.
756          */
757         spin_lock_irqsave(&qp->s_lock, flags);
758         if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
759                 write_seqlock(&dev->iowait_lock);
760                 list_add_tail(&ps->s_txreq->txreq.list,
761                               &priv->s_iowait.tx_head);
762                 if (list_empty(&priv->s_iowait.list)) {
763                         struct hfi1_ibdev *dev = &dd->verbs_dev;
764                         int was_empty;
765
766                         dev->n_piowait += !!(flag & RVT_S_WAIT_PIO);
767                         dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN);
769                         qp->s_flags |= flag;
770                         was_empty = list_empty(&sc->piowait);
771                         list_add_tail(&priv->s_iowait.list, &sc->piowait);
772                         trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
773                         atomic_inc(&qp->refcount);
774                         /* counting: only call wantpiobuf_intr if first user */
775                         if (was_empty)
776                                 hfi1_sc_wantpiobuf_intr(sc, 1);
777                 }
778                 write_sequnlock(&dev->iowait_lock);
779                 qp->s_flags &= ~RVT_S_BUSY;
780                 ret = -EBUSY;
781         }
782         spin_unlock_irqrestore(&qp->s_lock, flags);
783         return ret;
784 }
785
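/*
 * qp_to_send_context - return the kernel send context for the QP's SC,
 * or NULL if the SC maps to a VL the port does not support.
 */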
786 struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5)
787 {
788         struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
789         struct hfi1_pportdata *ppd = dd->pport + (qp->port_num - 1);
790         u8 vl;
791
792         vl = sc_to_vlt(dd, sc5);
793         if (vl >= ppd->vls_supported && vl != 15)
794                 return NULL;
795         return dd->vld[vl].sc;
796 }
797
798 static void verbs_pio_complete(void *arg, int code)
799 {
800         struct rvt_qp *qp = (struct rvt_qp *)arg;
801         struct hfi1_qp_priv *priv = qp->priv;
802
803         if (iowait_pio_dec(&priv->s_iowait))
804                 iowait_drain_wakeup(&priv->s_iowait);
805 }
806
807 int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
808                         u64 pbc)
809 {
810         struct hfi1_qp_priv *priv = qp->priv;
811         u32 hdrwords = qp->s_hdrwords;
812         struct rvt_sge_state *ss = qp->s_cur_sge;
813         u32 len = qp->s_cur_size;
814         u32 dwords = (len + 3) >> 2;
815         u32 plen = hdrwords + dwords + 2; /* includes pbc */
816         struct hfi1_pportdata *ppd = ps->ppd;
817         u32 *hdr = (u32 *)&ps->s_txreq->phdr.hdr;
818         u64 pbc_flags = 0;
819         u8 sc5;
820         unsigned long flags = 0;
821         struct send_context *sc;
822         struct pio_buf *pbuf;
823         int wc_status = IB_WC_SUCCESS;
824         int ret = 0;
825         pio_release_cb cb = NULL;
826
827         /* only RC/UC use complete */
828         switch (qp->ibqp.qp_type) {
829         case IB_QPT_RC:
830         case IB_QPT_UC:
831                 cb = verbs_pio_complete;
832                 break;
833         default:
834                 break;
835         }
836
837         /* vl15 special case taken care of in ud.c */
838         sc5 = priv->s_sc;
839         sc = qp_to_send_context(qp, sc5);
840
841         if (!sc) {
842                 ret = -EINVAL;
843                 goto bail;
844         }
845         if (likely(pbc == 0)) {
846                 u8 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
847                 /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
848                 pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
849                 pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
850         }
851         if (cb)
852                 iowait_pio_inc(&priv->s_iowait);
853         pbuf = sc_buffer_alloc(sc, plen, cb, qp);
854         if (unlikely(pbuf == NULL)) {
855                 if (cb)
856                         verbs_pio_complete(qp, 0);
857                 if (ppd->host_link_state != HLS_UP_ACTIVE) {
858                         /*
859                          * If we have filled the PIO buffers to capacity and are
860                          * not in an active state, this request is not going to
861                          * go out, so just complete it with an error; otherwise a
862                          * ULP or the core may be stuck waiting.
863                          */
864                         hfi1_cdbg(
865                                 PIO,
866                                 "alloc failed. state not active, completing");
867                         wc_status = IB_WC_GENERAL_ERR;
868                         goto pio_bail;
869                 } else {
870                         /*
871                          * This is a normal occurrence. The PIO buffers are full,
872                          * but the link is still active and we may still be able to
873                          * send, so queue the request and wait for a buffer.
874                          */
875                         hfi1_cdbg(PIO, "alloc failed. state active, queuing");
876                         ret = pio_wait(qp, sc, ps, RVT_S_WAIT_PIO);
877                         if (!ret)
878                                 /* txreq not queued - free */
879                                 goto bail;
880                         /* tx consumed in wait */
881                         return ret;
882                 }
883         }
884
885         if (len == 0) {
886                 pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords);
887         } else {
888                 if (ss) {
889                         seg_pio_copy_start(pbuf, pbc, hdr, hdrwords*4);
890                         while (len) {
891                                 void *addr = ss->sge.vaddr;
892                                 u32 slen = ss->sge.length;
893
894                                 if (slen > len)
895                                         slen = len;
896                                 update_sge(ss, slen);
897                                 seg_pio_copy_mid(pbuf, addr, slen);
898                                 len -= slen;
899                         }
900                         seg_pio_copy_end(pbuf);
901                 }
902         }
903
904         trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
905                            &ps->s_txreq->phdr.hdr);
906
907 pio_bail:
908         if (qp->s_wqe) {
909                 spin_lock_irqsave(&qp->s_lock, flags);
910                 hfi1_send_complete(qp, qp->s_wqe, wc_status);
911                 spin_unlock_irqrestore(&qp->s_lock, flags);
912         } else if (qp->ibqp.qp_type == IB_QPT_RC) {
913                 spin_lock_irqsave(&qp->s_lock, flags);
914                 hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr);
915                 spin_unlock_irqrestore(&qp->s_lock, flags);
916         }
917
918         ret = 0;
919
920 bail:
921         hfi1_put_txreq(ps->s_txreq);
922         return ret;
923 }
924
925 /*
926  * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
927  * being an entry from the ingress partition key table), return 0
928  * otherwise. Use the matching criteria for egress partition keys
929  * specified in the OPAv1 spec., section 9.11.7.
930  */
931 static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
932 {
933         u16 mkey = pkey & PKEY_LOW_15_MASK;
934         u16 ment = ent & PKEY_LOW_15_MASK;
935
936         if (mkey == ment) {
937                 /*
938                  * If pkey[15] is set (full partition member),
939                  * is bit 15 in the corresponding table element
940                  * clear (limited member)?
941                  */
942                 if (pkey & PKEY_MEMBER_MASK)
943                         return !!(ent & PKEY_MEMBER_MASK);
944                 return 1;
945         }
946         return 0;
947 }
948
949 /*
950  * egress_pkey_check - return 0 if hdr's pkey matches according to the
951  * criteria in the OPAv1 spec., section 9.11.7.
952  */
953 static inline int egress_pkey_check(struct hfi1_pportdata *ppd,
954                                     struct hfi1_ib_header *hdr,
955                                     struct rvt_qp *qp)
956 {
957         struct hfi1_qp_priv *priv = qp->priv;
958         struct hfi1_other_headers *ohdr;
959         struct hfi1_devdata *dd;
960         int i = 0;
961         u16 pkey;
962         u8 lnh, sc5 = priv->s_sc;
963
964         if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
965                 return 0;
966
967         /* locate the pkey within the headers */
968         lnh = be16_to_cpu(hdr->lrh[0]) & 3;
969         if (lnh == HFI1_LRH_GRH)
970                 ohdr = &hdr->u.l.oth;
971         else
972                 ohdr = &hdr->u.oth;
973
974         pkey = (u16)be32_to_cpu(ohdr->bth[0]);
975
976         /* If SC15, pkey[0:14] must be 0x7fff */
977         if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
978                 goto bad;
979
981         /* Is the pkey = 0x0, or 0x8000? */
982         if ((pkey & PKEY_LOW_15_MASK) == 0)
983                 goto bad;
984
985         /* The most likely matching pkey has index qp->s_pkey_index */
986         if (unlikely(!egress_pkey_matches_entry(pkey,
987                                         ppd->pkeys[qp->s_pkey_index]))) {
988                 /* no match - try the entire table */
989                 for (; i < MAX_PKEY_VALUES; i++) {
990                         if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
991                                 break;
992                 }
993         }
994
995         if (i < MAX_PKEY_VALUES)
996                 return 0;
997 bad:
998         incr_cntr64(&ppd->port_xmit_constraint_errors);
999         dd = ppd->dd;
1000         if (!(dd->err_info_xmit_constraint.status & OPA_EI_STATUS_SMASK)) {
1001                 u16 slid = be16_to_cpu(hdr->lrh[3]);
1002
1003                 dd->err_info_xmit_constraint.status |= OPA_EI_STATUS_SMASK;
1004                 dd->err_info_xmit_constraint.slid = slid;
1005                 dd->err_info_xmit_constraint.pkey = pkey;
1006         }
1007         return 1;
1008 }
1009
1010 /**
1011  * get_send_routine - choose an egress routine
1012  *
1013  * Choose an egress routine based on QP type
1014  * and size
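 *
 * Requests at or below the piothreshold module parameter are sent by PIO
 * when the QP type and opcode allow it; everything else uses SDMA.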
1015  */
1016 static inline send_routine get_send_routine(struct rvt_qp *qp,
1017                                             struct hfi1_ib_header *h)
1018 {
1019         struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
1020         struct hfi1_qp_priv *priv = qp->priv;
1021
1022         if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA)))
1023                 return dd->process_pio_send;
1024         switch (qp->ibqp.qp_type) {
1025         case IB_QPT_SMI:
1026                 return dd->process_pio_send;
1027         case IB_QPT_GSI:
1028         case IB_QPT_UD:
1029                 if (piothreshold && qp->s_cur_size <= piothreshold)
1030                         return dd->process_pio_send;
1031                 break;
1032         case IB_QPT_RC:
1033                 if (piothreshold &&
1034                     qp->s_cur_size <= min(piothreshold, qp->pmtu) &&
1035                     (BIT(get_opcode(h) & 0x1f) & rc_only_opcode) &&
1036                     iowait_sdma_pending(&priv->s_iowait) == 0)
1037                         return dd->process_pio_send;
1038                 break;
1039         case IB_QPT_UC:
1040                 if (piothreshold &&
1041                     qp->s_cur_size <= min(piothreshold, qp->pmtu) &&
1042                     (BIT(get_opcode(h) & 0x1f) & uc_only_opcode) &&
1043                     iowait_sdma_pending(&priv->s_iowait) == 0)
1044                         return dd->process_pio_send;
1045                 break;
1046         default:
1047                 break;
1048         }
1049         return dd->process_dma_send;
1050 }
1051
1052 /**
1053  * hfi1_verbs_send - send a packet
1054  * @qp: the QP to send on
1055  * @ps: the state of the packet to send
1056  *
1057  * Return zero if packet is sent or queued OK.
1058  * Return non-zero and clear qp->s_flags RVT_S_BUSY otherwise.
1059  */
1060 int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
1061 {
1062         struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
1063         send_routine sr;
1064         int ret;
1065
1066         sr = get_send_routine(qp, &ps->s_txreq->phdr.hdr);
1067         ret = egress_pkey_check(dd->pport, &ps->s_txreq->phdr.hdr, qp);
1068         if (unlikely(ret)) {
1069                 /*
1070                  * The value we are returning here does not get propagated to
1071                  * the verbs caller. Thus we need to complete the request with an
1072                  * error, otherwise the caller could be sitting waiting on the
1073                  * completion event. Only do this for PIO. SDMA has its own
1074                  * mechanism for handling the errors. So for SDMA we can just
1075                  * return.
1076                  */
1077                 if (sr == dd->process_pio_send) {
1078                         unsigned long flags;
1079
1080                         hfi1_cdbg(PIO, "%s() Failed. Completing with err",
1081                                   __func__);
1082                         spin_lock_irqsave(&qp->s_lock, flags);
1083                         hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
1084                         spin_unlock_irqrestore(&qp->s_lock, flags);
1085                 }
1086                 return -EINVAL;
1087         }
1088         return sr(qp, ps, 0);
1089 }
1090
1091 /**
1092  * hfi1_fill_device_attr - Fill in rvt dev info device attributes.
1093  * @dd: the device data structure
1094  */
1095 static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
1096 {
1097         struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
1098
1099         memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props));
1100
1101         rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
1102                         IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
1103                         IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
1104                         IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
1105         rdi->dparms.props.page_size_cap = PAGE_SIZE;
1106         rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
1107         rdi->dparms.props.vendor_part_id = dd->pcidev->device;
1108         rdi->dparms.props.hw_ver = dd->minrev;
1109         rdi->dparms.props.sys_image_guid = ib_hfi1_sys_image_guid;
1110         rdi->dparms.props.max_mr_size = ~0ULL;
1111         rdi->dparms.props.max_qp = hfi1_max_qps;
1112         rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs;
1113         rdi->dparms.props.max_sge = hfi1_max_sges;
1114         rdi->dparms.props.max_sge_rd = hfi1_max_sges;
1115         rdi->dparms.props.max_cq = hfi1_max_cqs;
1116         rdi->dparms.props.max_ah = hfi1_max_ahs;
1117         rdi->dparms.props.max_cqe = hfi1_max_cqes;
1118         rdi->dparms.props.max_mr = rdi->lkey_table.max;
1119         rdi->dparms.props.max_fmr = rdi->lkey_table.max;
1120         rdi->dparms.props.max_map_per_fmr = 32767;
1121         rdi->dparms.props.max_pd = hfi1_max_pds;
1122         rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
1123         rdi->dparms.props.max_qp_init_rd_atom = 255;
1124         rdi->dparms.props.max_srq = hfi1_max_srqs;
1125         rdi->dparms.props.max_srq_wr = hfi1_max_srq_wrs;
1126         rdi->dparms.props.max_srq_sge = hfi1_max_srq_sges;
1127         rdi->dparms.props.atomic_cap = IB_ATOMIC_GLOB;
1128         rdi->dparms.props.max_pkeys = hfi1_get_npkeys(dd);
1129         rdi->dparms.props.max_mcast_grp = hfi1_max_mcast_grps;
1130         rdi->dparms.props.max_mcast_qp_attach = hfi1_max_mcast_qp_attached;
1131         rdi->dparms.props.max_total_mcast_qp_attach =
1132                                         rdi->dparms.props.max_mcast_qp_attach *
1133                                         rdi->dparms.props.max_mcast_grp;
1134 }
1135
1136 static inline u16 opa_speed_to_ib(u16 in)
1137 {
1138         u16 out = 0;
1139
1140         if (in & OPA_LINK_SPEED_25G)
1141                 out |= IB_SPEED_EDR;
1142         if (in & OPA_LINK_SPEED_12_5G)
1143                 out |= IB_SPEED_FDR;
1144
1145         return out;
1146 }
1147
1148 /*
1149  * Convert a single OPA link width (no multiple flags) to an IB value.
1150  * A zero OPA link width means link down, which means the IB width value
1151  * is a don't care.
1152  */
1153 static inline u16 opa_width_to_ib(u16 in)
1154 {
1155         switch (in) {
1156         case OPA_LINK_WIDTH_1X:
1157         /* map 2x and 3x to 1x as they don't exist in IB */
1158         case OPA_LINK_WIDTH_2X:
1159         case OPA_LINK_WIDTH_3X:
1160                 return IB_WIDTH_1X;
1161         default: /* link down or unknown, return our largest width */
1162         case OPA_LINK_WIDTH_4X:
1163                 return IB_WIDTH_4X;
1164         }
1165 }
1166
1167 static int query_port(struct rvt_dev_info *rdi, u8 port_num,
1168                       struct ib_port_attr *props)
1169 {
1170         struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
1171         struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
1172         struct hfi1_pportdata *ppd = &dd->pport[port_num - 1];
1173         u16 lid = ppd->lid;
1174
1175         props->lid = lid ? lid : 0;
1176         props->lmc = ppd->lmc;
1177         /* OPA logical states match IB logical states */
1178         props->state = driver_lstate(ppd);
1179         props->phys_state = hfi1_ibphys_portstate(ppd);
1180         props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
1181         props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
1182         /* see rate_show() in ib core/sysfs.c */
1183         props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active);
1184         props->max_vl_num = ppd->vls_supported;
1185
1186         /* Once we are a "first class" citizen and have added the OPA MTUs to
1187          * the core we can advertise the larger MTU enum to the ULPs, for now
1188          * advertise only 4K.
1189          *
1190          * Those applications which are either OPA aware or pass the MTU enum
1191          * from the Path Records to us will get the new 8k MTU.  Those that
1192          * attempt to process the MTU enum may fail in various ways.
1193          */
1194         props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ?
1195                                       4096 : hfi1_max_mtu), IB_MTU_4096);
1196         props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu :
1197                 mtu_to_enum(ppd->ibmtu, IB_MTU_2048);
1198
1199         return 0;
1200 }
1201
1202 static int modify_device(struct ib_device *device,
1203                          int device_modify_mask,
1204                          struct ib_device_modify *device_modify)
1205 {
1206         struct hfi1_devdata *dd = dd_from_ibdev(device);
1207         unsigned i;
1208         int ret;
1209
1210         if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
1211                                    IB_DEVICE_MODIFY_NODE_DESC)) {
1212                 ret = -EOPNOTSUPP;
1213                 goto bail;
1214         }
1215
1216         if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) {
1217                 memcpy(device->node_desc, device_modify->node_desc, 64);
1218                 for (i = 0; i < dd->num_pports; i++) {
1219                         struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
1220
1221                         hfi1_node_desc_chg(ibp);
1222                 }
1223         }
1224
1225         if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) {
1226                 ib_hfi1_sys_image_guid =
1227                         cpu_to_be64(device_modify->sys_image_guid);
1228                 for (i = 0; i < dd->num_pports; i++) {
1229                         struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
1230
1231                         hfi1_sys_guid_chg(ibp);
1232                 }
1233         }
1234
1235         ret = 0;
1236
1237 bail:
1238         return ret;
1239 }
1240
1241 static int shut_down_port(struct rvt_dev_info *rdi, u8 port_num)
1242 {
1243         struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
1244         struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
1245         struct hfi1_pportdata *ppd = &dd->pport[port_num - 1];
1246         int ret;
1247
1248         set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0,
1249                              OPA_LINKDOWN_REASON_UNKNOWN);
1250         ret = set_link_state(ppd, HLS_DN_DOWNDEF);
1251         return ret;
1252 }
1253
1254 static int hfi1_get_guid_be(struct rvt_dev_info *rdi, struct rvt_ibport *rvp,
1255                             int guid_index, __be64 *guid)
1256 {
1257         struct hfi1_ibport *ibp = container_of(rvp, struct hfi1_ibport, rvp);
1258         struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1259
1260         if (guid_index == 0)
1261                 *guid = cpu_to_be64(ppd->guid);
1262         else if (guid_index < HFI1_GUIDS_PER_PORT)
1263                 *guid = ibp->guids[guid_index - 1];
1264         else
1265                 return -EINVAL;
1266
1267         return 0;
1268 }
1269
1270 /*
1271  * convert ah port,sl to sc
1272  */
1273 u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah)
1274 {
1275         struct hfi1_ibport *ibp = to_iport(ibdev, ah->port_num);
1276
1277         return ibp->sl_to_sc[ah->sl];
1278 }
1279
1280 static int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr)
1281 {
1282         struct hfi1_ibport *ibp;
1283         struct hfi1_pportdata *ppd;
1284         struct hfi1_devdata *dd;
1285         u8 sc5;
1286
1287         /* test the mapping for validity */
1288         ibp = to_iport(ibdev, ah_attr->port_num);
1289         ppd = ppd_from_ibp(ibp);
1290         sc5 = ibp->sl_to_sc[ah_attr->sl];
1291         dd = dd_from_ppd(ppd);
1292         if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
1293                 return -EINVAL;
1294         return 0;
1295 }
1296
1297 static void hfi1_notify_new_ah(struct ib_device *ibdev,
1298                                struct ib_ah_attr *ah_attr,
1299                                struct rvt_ah *ah)
1300 {
1301         struct hfi1_ibport *ibp;
1302         struct hfi1_pportdata *ppd;
1303         struct hfi1_devdata *dd;
1304         u8 sc5;
1305
1306         /*
1307          * Do not trust reading anything from rvt_ah at this point as it is not
1308          * done being set up. We can, however, modify the fields we need to set.
1309          */
1310
1311         ibp = to_iport(ibdev, ah_attr->port_num);
1312         ppd = ppd_from_ibp(ibp);
1313         sc5 = ibp->sl_to_sc[ah->attr.sl];
1314         dd = dd_from_ppd(ppd);
1315         ah->vl = sc_to_vlt(dd, sc5);
1316         if (ah->vl < num_vls || ah->vl == 15)
1317                 ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu);
1318 }
1319
1320 struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid)
1321 {
1322         struct ib_ah_attr attr;
1323         struct ib_ah *ah = ERR_PTR(-EINVAL);
1324         struct rvt_qp *qp0;
1325
1326         memset(&attr, 0, sizeof(attr));
1327         attr.dlid = dlid;
1328         attr.port_num = ppd_from_ibp(ibp)->port;
1329         rcu_read_lock();
1330         qp0 = rcu_dereference(ibp->rvp.qp[0]);
1331         if (qp0)
1332                 ah = ib_create_ah(qp0->ibqp.pd, &attr);
1333         rcu_read_unlock();
1334         return ah;
1335 }
1336
1337 /**
1338  * hfi1_get_npkeys - return the size of the PKEY table for context 0
1339  * @dd: the hfi1_ib device
1340  */
1341 unsigned hfi1_get_npkeys(struct hfi1_devdata *dd)
1342 {
1343         return ARRAY_SIZE(dd->pport[0].pkeys);
1344 }
1345
1346 static void init_ibport(struct hfi1_pportdata *ppd)
1347 {
1348         struct hfi1_ibport *ibp = &ppd->ibport_data;
1349         size_t sz = ARRAY_SIZE(ibp->sl_to_sc);
1350         int i;
1351
1352         for (i = 0; i < sz; i++) {
1353                 ibp->sl_to_sc[i] = i;
1354                 ibp->sc_to_sl[i] = i;
1355         }
1356
1357         spin_lock_init(&ibp->rvp.lock);
1358         /* Set the prefix to the default value (see ch. 4.1.1) */
1359         ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX;
1360         ibp->rvp.sm_lid = 0;
1361         /* Below should only set bits defined in OPA PortInfo.CapabilityMask */
1362         ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP |
1363                 IB_PORT_CAP_MASK_NOTICE_SUP;
1364         ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
1365         ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
1366         ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
1367         ibp->rvp.pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
1368         ibp->rvp.pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
1369
1370         RCU_INIT_POINTER(ibp->rvp.qp[0], NULL);
1371         RCU_INIT_POINTER(ibp->rvp.qp[1], NULL);
1372 }
1373
1374 /**
1375  * hfi1_register_ib_device - register our device with the infiniband core
1376  * @dd: the device data structure
1377  * Return 0 if successful, errno if unsuccessful.
1378  */
1379 int hfi1_register_ib_device(struct hfi1_devdata *dd)
1380 {
1381         struct hfi1_ibdev *dev = &dd->verbs_dev;
1382         struct ib_device *ibdev = &dev->rdi.ibdev;
1383         struct hfi1_pportdata *ppd = dd->pport;
1384         unsigned i;
1385         int ret;
1386         size_t lcpysz = IB_DEVICE_NAME_MAX;
1387
1388         for (i = 0; i < dd->num_pports; i++)
1389                 init_ibport(ppd + i);
1390
1391         /* Only need to initialize non-zero fields. */
1392
1393         setup_timer(&dev->mem_timer, mem_timer, (unsigned long)dev);
1394
1395         seqlock_init(&dev->iowait_lock);
1396         INIT_LIST_HEAD(&dev->txwait);
1397         INIT_LIST_HEAD(&dev->memwait);
1398
1399         ret = verbs_txreq_init(dev);
1400         if (ret)
1401                 goto err_verbs_txreq;
1402
1403         /*
1404          * The system image GUID is supposed to be the same for all
1405          * HFIs in a single system but since there can be other
1406          * device types in the system, we can't be sure this is unique.
1407          */
1408         if (!ib_hfi1_sys_image_guid)
1409                 ib_hfi1_sys_image_guid = cpu_to_be64(ppd->guid);
1410         lcpysz = strlcpy(ibdev->name, class_name(), lcpysz);
1411         strlcpy(ibdev->name + lcpysz, "_%d", IB_DEVICE_NAME_MAX - lcpysz);
1412         ibdev->owner = THIS_MODULE;
1413         ibdev->node_guid = cpu_to_be64(ppd->guid);
1414         ibdev->phys_port_cnt = dd->num_pports;
1415         ibdev->dma_device = &dd->pcidev->dev;
1416         ibdev->modify_device = modify_device;
1417
1418         /* keep process_mad in the driver */
1419         ibdev->process_mad = hfi1_process_mad;
1420
1421         strncpy(ibdev->node_desc, init_utsname()->nodename,
1422                 sizeof(ibdev->node_desc));
1423
1424         /*
1425          * Fill in rvt info object.
1426          */
1427         dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files;
1428         dd->verbs_dev.rdi.driver_f.get_card_name = get_card_name;
1429         dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev;
1430         dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah;
1431         dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah;
1432         dd->verbs_dev.rdi.driver_f.get_guid_be = hfi1_get_guid_be;
1433         dd->verbs_dev.rdi.driver_f.query_port_state = query_port;
1434         dd->verbs_dev.rdi.driver_f.shut_down_port = shut_down_port;
1435         dd->verbs_dev.rdi.driver_f.cap_mask_chg = hfi1_cap_mask_chg;
1436         /*
1437          * Fill in rvt info device attributes.
1438          */
1439         hfi1_fill_device_attr(dd);
1440
1441         /* queue pair */
1442         dd->verbs_dev.rdi.dparms.qp_table_size = hfi1_qp_table_size;
1443         dd->verbs_dev.rdi.dparms.qpn_start = 0;
1444         dd->verbs_dev.rdi.dparms.qpn_inc = 1;
1445         dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift;
1446         dd->verbs_dev.rdi.dparms.qpn_res_start = kdeth_qp << 16;
1447         dd->verbs_dev.rdi.dparms.qpn_res_end =
1448                 dd->verbs_dev.rdi.dparms.qpn_res_start + 65535;
1449         dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC;
1450         dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK;
1451         dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT;
1452         dd->verbs_dev.rdi.dparms.psn_modify_mask = PSN_MODIFY_MASK;
1453         dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_INTEL_OPA;
1454         dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE;
1455
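        /* per-QP callbacks the driver registers with rdmavt */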
1456         dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc;
1457         dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free;
1458         dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps;
1459         dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset;
1460         dd->verbs_dev.rdi.driver_f.do_send = hfi1_do_send;
1461         dd->verbs_dev.rdi.driver_f.schedule_send = hfi1_schedule_send;
1462         dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _hfi1_schedule_send;
1463         dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = get_pmtu_from_attr;
1464         dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp;
1465         dd->verbs_dev.rdi.driver_f.flush_qp_waiters = flush_qp_waiters;
1466         dd->verbs_dev.rdi.driver_f.stop_send_queue = stop_send_queue;
1467         dd->verbs_dev.rdi.driver_f.quiesce_qp = quiesce_qp;
1468         dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp;
1469         dd->verbs_dev.rdi.driver_f.mtu_from_qp = mtu_from_qp;
1470         dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = mtu_to_path_mtu;
1471         dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp;
1472         dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
1473         dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe;
1474
1475         /* completion queue */
1476         snprintf(dd->verbs_dev.rdi.dparms.cq_name,
1477                  sizeof(dd->verbs_dev.rdi.dparms.cq_name),
1478                  "hfi1_cq%d", dd->unit);
1479         dd->verbs_dev.rdi.dparms.node = dd->node;
1480
1481         /* misc settings */
1482         dd->verbs_dev.rdi.flags = 0; /* Let rdmavt handle it all */
1483         dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size;
1484         dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
1485         dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
1486
1487         ppd = dd->pport;
1488         for (i = 0; i < dd->num_pports; i++, ppd++)
1489                 rvt_init_port(&dd->verbs_dev.rdi,
1490                               &ppd->ibport_data.rvp,
1491                               i,
1492                               ppd->pkeys);
1493
1494         ret = rvt_register_device(&dd->verbs_dev.rdi);
1495         if (ret)
1496                 goto err_verbs_txreq;
1497
1498         ret = hfi1_verbs_register_sysfs(dd);
1499         if (ret)
1500                 goto err_class;
1501
1502         return ret;
1503
1504 err_class:
1505         rvt_unregister_device(&dd->verbs_dev.rdi);
1506 err_verbs_txreq:
1507         verbs_txreq_exit(dev);
1508         dd_dev_err(dd, "cannot register verbs: %d!\n", -ret);
1509         return ret;
1510 }
1511
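/**
 * hfi1_unregister_ib_device - undo what hfi1_register_ib_device() did
 * @dd: the device data structure
 *
 * Remove the sysfs entries, unregister from rdmavt, warn if any QPs are
 * still queued waiting for send resources, and free the txreq cache.
 */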
1512 void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
1513 {
1514         struct hfi1_ibdev *dev = &dd->verbs_dev;
1515
1516         hfi1_verbs_unregister_sysfs(dd);
1517
1518         rvt_unregister_device(&dd->verbs_dev.rdi);
1519
1520         if (!list_empty(&dev->txwait))
1521                 dd_dev_err(dd, "txwait list not empty!\n");
1522         if (!list_empty(&dev->memwait))
1523                 dd_dev_err(dd, "memwait list not empty!\n");
1524
1525         del_timer_sync(&dev->mem_timer);
1526         verbs_txreq_exit(dev);
1527 }
1528
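/*
 * hfi1_cnp_rcv - handle a received Congestion Notification Packet
 *
 * Recover the service level and the remote LID/QPN of the affected flow
 * from the packet and its QP, then hand them to process_becn().
 */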
1529 void hfi1_cnp_rcv(struct hfi1_packet *packet)
1530 {
1531         struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
1532         struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1533         struct hfi1_ib_header *hdr = packet->hdr;
1534         struct rvt_qp *qp = packet->qp;
1535         u32 lqpn, rqpn = 0;
1536         u16 rlid = 0;
1537         u8 sl, sc5, sc4_bit, svc_type;
1538         bool sc4_set = has_sc4_bit(packet);
1539
1540         switch (packet->qp->ibqp.qp_type) {
1541         case IB_QPT_UC:
1542                 rlid = qp->remote_ah_attr.dlid;
1543                 rqpn = qp->remote_qpn;
1544                 svc_type = IB_CC_SVCTYPE_UC;
1545                 break;
1546         case IB_QPT_RC:
1547                 rlid = qp->remote_ah_attr.dlid;
1548                 rqpn = qp->remote_qpn;
1549                 svc_type = IB_CC_SVCTYPE_RC;
1550                 break;
1551         case IB_QPT_SMI:
1552         case IB_QPT_GSI:
1553         case IB_QPT_UD:
1554                 svc_type = IB_CC_SVCTYPE_UD;
1555                 break;
1556         default:
1557                 ibp->rvp.n_pkt_drops++;
1558                 return;
1559         }
1560
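        /* rebuild the 5-bit SC: 4 bits from the LRH plus the separate SC4 bit */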
1561         sc4_bit = sc4_set << 4;
1562         sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
1563         sc5 |= sc4_bit;
1564         sl = ibp->sc_to_sl[sc5];
1565         lqpn = qp->ibqp.qp_num;
1566
1567         process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
1568 }