1 /*
2  * Copyright (c) 2006 QLogic, Inc. All rights reserved.
3  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33
34 #include "ipath_verbs.h"
35 #include "ips_common.h"
36
37 /* cut down ridiculously long IB macro names */
38 #define OP(x) IB_OPCODE_RC_##x
39
40 /**
41  * ipath_init_restart - initialize the qp->s_sge after a restart
42  * @qp: the QP whose SGE we're restarting
43  * @wqe: the send work queue entry to initialize the QP's SGE from
44  *
45  * The QP s_lock should be held.
46  */
47 static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
48 {
49         struct ipath_ibdev *dev;
50         u32 len;
51
52         len = ((qp->s_psn - wqe->psn) & IPS_PSN_MASK) *
53                 ib_mtu_enum_to_int(qp->path_mtu);
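        /*
         * Illustrative arithmetic (not from the original comments): with a
         * restart PSN 3 packets beyond the WQE's first PSN and a 2048-byte
         * path MTU, len = 3 * 2048 = 6144, so the SGE below is advanced
         * past the 6144 bytes already covered by earlier PSNs.
         */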
54         qp->s_sge.sge = wqe->sg_list[0];
55         qp->s_sge.sg_list = wqe->sg_list + 1;
56         qp->s_sge.num_sge = wqe->wr.num_sge;
57         ipath_skip_sge(&qp->s_sge, len);
58         qp->s_len = wqe->length - len;
59         dev = to_idev(qp->ibqp.device);
60         spin_lock(&dev->pending_lock);
61         if (list_empty(&qp->timerwait))
62                 list_add_tail(&qp->timerwait,
63                               &dev->pending[dev->pending_index]);
64         spin_unlock(&dev->pending_lock);
65 }
66
67 /**
68  * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
69  * @qp: a pointer to the QP
70  * @ohdr: a pointer to the IB header being constructed
71  * @pmtu: the path MTU
72  *
73  * Return bth0 if constructed; otherwise, return 0.
74  * Note the QP s_lock must be held.
75  */
76 u32 ipath_make_rc_ack(struct ipath_qp *qp,
77                       struct ipath_other_headers *ohdr,
78                       u32 pmtu)
79 {
80         struct ipath_sge_state *ss;
81         u32 hwords;
82         u32 len;
83         u32 bth0;
84
85         /* header size in 32-bit words LRH+BTH = (8+12)/4. */
86         hwords = 5;
87
88         /*
89          * Send a response.  Note that we are in the responder's
90          * side of the QP context.
91          */
92         switch (qp->s_ack_state) {
93         case OP(RDMA_READ_REQUEST):
94                 ss = &qp->s_rdma_sge;
95                 len = qp->s_rdma_len;
96                 if (len > pmtu) {
97                         len = pmtu;
98                         qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
99                 } else
100                         qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
101                 qp->s_rdma_len -= len;
102                 bth0 = qp->s_ack_state << 24;
103                 ohdr->u.aeth = ipath_compute_aeth(qp);
104                 hwords++;
105                 break;
106
107         case OP(RDMA_READ_RESPONSE_FIRST):
108                 qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
109                 /* FALLTHROUGH */
110         case OP(RDMA_READ_RESPONSE_MIDDLE):
111                 ss = &qp->s_rdma_sge;
112                 len = qp->s_rdma_len;
113                 if (len > pmtu)
114                         len = pmtu;
115                 else {
116                         ohdr->u.aeth = ipath_compute_aeth(qp);
117                         hwords++;
118                         qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
119                 }
120                 qp->s_rdma_len -= len;
121                 bth0 = qp->s_ack_state << 24;
122                 break;
123
124         case OP(RDMA_READ_RESPONSE_LAST):
125         case OP(RDMA_READ_RESPONSE_ONLY):
126                 /*
127                  * We have to prevent new requests from changing
128  * the r_sge state while an ipath_verbs_send()
129                  * is in progress.
130                  * Changing r_state allows the receiver
131                  * to continue processing new packets.
132                  * We do it here now instead of above so
133                  * that we are sure the packet was sent before
134                  * changing the state.
135                  */
136                 qp->r_state = OP(RDMA_READ_RESPONSE_LAST);
137                 qp->s_ack_state = OP(ACKNOWLEDGE);
138                 return 0;
139
140         case OP(COMPARE_SWAP):
141         case OP(FETCH_ADD):
142                 ss = NULL;
143                 len = 0;
144                 qp->r_state = OP(SEND_LAST);
145                 qp->s_ack_state = OP(ACKNOWLEDGE);
146                 bth0 = IB_OPCODE_ATOMIC_ACKNOWLEDGE << 24;
147                 ohdr->u.at.aeth = ipath_compute_aeth(qp);
148                 ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->s_ack_atomic);
149                 hwords += sizeof(ohdr->u.at) / 4;
150                 break;
151
152         default:
153                 /* Send a regular ACK. */
154                 ss = NULL;
155                 len = 0;
156                 qp->s_ack_state = OP(ACKNOWLEDGE);
157                 bth0 = qp->s_ack_state << 24;
158                 ohdr->u.aeth = ipath_compute_aeth(qp);
159                 hwords++;
160         }
161         qp->s_hdrwords = hwords;
162         qp->s_cur_sge = ss;
163         qp->s_cur_size = len;
164
165         return bth0;
166 }
167
168 /**
169  * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
170  * @qp: a pointer to the QP
171  * @ohdr: a pointer to the IB header being constructed
172  * @pmtu: the path MTU
173  * @bth0p: pointer to the BTH opcode word
174  * @bth2p: pointer to the BTH PSN word
175  *
176  * Return 1 if constructed; otherwise, return 0.
177  * Note the QP s_lock must be held.
178  */
179 int ipath_make_rc_req(struct ipath_qp *qp,
180                       struct ipath_other_headers *ohdr,
181                       u32 pmtu, u32 *bth0p, u32 *bth2p)
182 {
183         struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
184         struct ipath_sge_state *ss;
185         struct ipath_swqe *wqe;
186         u32 hwords;
187         u32 len;
188         u32 bth0;
189         u32 bth2;
190         char newreq;
191
192         if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) ||
193             qp->s_rnr_timeout)
194                 goto done;
195
196         /* header size in 32-bit words LRH+BTH = (8+12)/4. */
197         hwords = 5;
198         bth0 = 0;
199
200         /* Send a request. */
201         wqe = get_swqe_ptr(qp, qp->s_cur);
202         switch (qp->s_state) {
203         default:
204                 /*
205                  * Resend an old request or start a new one.
206                  *
207                  * We keep track of the current SWQE so that
208                  * we don't reset the "furthest progress" state
209                  * if we need to back up.
210                  */
211                 newreq = 0;
212                 if (qp->s_cur == qp->s_tail) {
213                         /* Check if send work queue is empty. */
214                         if (qp->s_tail == qp->s_head)
215                                 goto done;
216                         qp->s_psn = wqe->psn = qp->s_next_psn;
217                         newreq = 1;
218                 }
219                 /*
220                  * Note that we have to be careful not to modify the
221                  * original work request since we may need to resend
222                  * it.
223                  */
224                 qp->s_sge.sge = wqe->sg_list[0];
225                 qp->s_sge.sg_list = wqe->sg_list + 1;
226                 qp->s_sge.num_sge = wqe->wr.num_sge;
227                 qp->s_len = len = wqe->length;
228                 ss = &qp->s_sge;
229                 bth2 = 0;
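                /*
                 * Note on the credit checks in the cases below: s_lsn
                 * appears to be the limit SSN granted by the responder's
                 * AETH credits (see ipath_get_credit()); a value of
                 * (u32) -1 means credit flow control is not in use, so
                 * the check is skipped.
                 */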
230                 switch (wqe->wr.opcode) {
231                 case IB_WR_SEND:
232                 case IB_WR_SEND_WITH_IMM:
233                         /* If no credit, return. */
234                         if (qp->s_lsn != (u32) -1 &&
235                             ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
236                                 goto done;
237                         wqe->lpsn = wqe->psn;
238                         if (len > pmtu) {
239                                 wqe->lpsn += (len - 1) / pmtu;
240                                 qp->s_state = OP(SEND_FIRST);
241                                 len = pmtu;
242                                 break;
243                         }
244                         if (wqe->wr.opcode == IB_WR_SEND)
245                                 qp->s_state = OP(SEND_ONLY);
246                         else {
247                                 qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
248                                 /* Immediate data comes after the BTH */
249                                 ohdr->u.imm_data = wqe->wr.imm_data;
250                                 hwords += 1;
251                         }
252                         if (wqe->wr.send_flags & IB_SEND_SOLICITED)
253                                 bth0 |= 1 << 23;
254                         bth2 = 1 << 31; /* Request ACK. */
255                         if (++qp->s_cur == qp->s_size)
256                                 qp->s_cur = 0;
257                         break;
258
259                 case IB_WR_RDMA_WRITE:
260                         if (newreq && qp->s_lsn != (u32) -1)
261                                 qp->s_lsn++;
262                         /* FALLTHROUGH */
263                 case IB_WR_RDMA_WRITE_WITH_IMM:
264                         /* If no credit, return. */
265                         if (qp->s_lsn != (u32) -1 &&
266                             ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
267                                 goto done;
268                         ohdr->u.rc.reth.vaddr =
269                                 cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
270                         ohdr->u.rc.reth.rkey =
271                                 cpu_to_be32(wqe->wr.wr.rdma.rkey);
272                         ohdr->u.rc.reth.length = cpu_to_be32(len);
273                         hwords += sizeof(struct ib_reth) / 4;
274                         wqe->lpsn = wqe->psn;
275                         if (len > pmtu) {
276                                 wqe->lpsn += (len - 1) / pmtu;
277                                 qp->s_state = OP(RDMA_WRITE_FIRST);
278                                 len = pmtu;
279                                 break;
280                         }
281                         if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
282                                 qp->s_state = OP(RDMA_WRITE_ONLY);
283                         else {
284                                 qp->s_state =
285                                         OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
286                                 /* Immediate data comes after RETH */
287                                 ohdr->u.rc.imm_data = wqe->wr.imm_data;
288                                 hwords += 1;
289                                 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
290                                         bth0 |= 1 << 23;
291                         }
292                         bth2 = 1 << 31; /* Request ACK. */
293                         if (++qp->s_cur == qp->s_size)
294                                 qp->s_cur = 0;
295                         break;
296
297                 case IB_WR_RDMA_READ:
298                         ohdr->u.rc.reth.vaddr =
299                                 cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
300                         ohdr->u.rc.reth.rkey =
301                                 cpu_to_be32(wqe->wr.wr.rdma.rkey);
302                         ohdr->u.rc.reth.length = cpu_to_be32(len);
303                         qp->s_state = OP(RDMA_READ_REQUEST);
304                         hwords += sizeof(ohdr->u.rc.reth) / 4;
305                         if (newreq) {
306                                 if (qp->s_lsn != (u32) -1)
307                                         qp->s_lsn++;
308                                 /*
309                                  * Adjust s_next_psn to count the
310                                  * expected number of responses.
311                                  */
312                                 if (len > pmtu)
313                                         qp->s_next_psn += (len - 1) / pmtu;
314                                 wqe->lpsn = qp->s_next_psn++;
315                         }
316                         ss = NULL;
317                         len = 0;
318                         if (++qp->s_cur == qp->s_size)
319                                 qp->s_cur = 0;
320                         break;
321
322                 case IB_WR_ATOMIC_CMP_AND_SWP:
323                 case IB_WR_ATOMIC_FETCH_AND_ADD:
324                         if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP)
325                                 qp->s_state = OP(COMPARE_SWAP);
326                         else
327                                 qp->s_state = OP(FETCH_ADD);
328                         ohdr->u.atomic_eth.vaddr = cpu_to_be64(
329                                 wqe->wr.wr.atomic.remote_addr);
330                         ohdr->u.atomic_eth.rkey = cpu_to_be32(
331                                 wqe->wr.wr.atomic.rkey);
332                         ohdr->u.atomic_eth.swap_data = cpu_to_be64(
333                                 wqe->wr.wr.atomic.swap);
334                         ohdr->u.atomic_eth.compare_data = cpu_to_be64(
335                                 wqe->wr.wr.atomic.compare_add);
336                         hwords += sizeof(struct ib_atomic_eth) / 4;
337                         if (newreq) {
338                                 if (qp->s_lsn != (u32) -1)
339                                         qp->s_lsn++;
340                                 wqe->lpsn = wqe->psn;
341                         }
342                         if (++qp->s_cur == qp->s_size)
343                                 qp->s_cur = 0;
344                         ss = NULL;
345                         len = 0;
346                         break;
347
348                 default:
349                         goto done;
350                 }
351                 if (newreq) {
352                         qp->s_tail++;
353                         if (qp->s_tail >= qp->s_size)
354                                 qp->s_tail = 0;
355                 }
356                 bth2 |= qp->s_psn++ & IPS_PSN_MASK;
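                /*
                 * The check below keeps s_next_psn pointing at the next
                 * unused PSN; when resending, s_psn can lag behind
                 * s_next_psn and must not move it backwards.
                 */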
357                 if ((int)(qp->s_psn - qp->s_next_psn) > 0)
358                         qp->s_next_psn = qp->s_psn;
359                 spin_lock(&dev->pending_lock);
360                 if (list_empty(&qp->timerwait))
361                         list_add_tail(&qp->timerwait,
362                                       &dev->pending[dev->pending_index]);
363                 spin_unlock(&dev->pending_lock);
364                 break;
365
366         case OP(RDMA_READ_RESPONSE_FIRST):
367                 /*
368                  * This case can only happen if a send is restarted.  See
369                  * ipath_restart_rc().
370                  */
371                 ipath_init_restart(qp, wqe);
372                 /* FALLTHROUGH */
373         case OP(SEND_FIRST):
374                 qp->s_state = OP(SEND_MIDDLE);
375                 /* FALLTHROUGH */
376         case OP(SEND_MIDDLE):
377                 bth2 = qp->s_psn++ & IPS_PSN_MASK;
378                 if ((int)(qp->s_psn - qp->s_next_psn) > 0)
379                         qp->s_next_psn = qp->s_psn;
380                 ss = &qp->s_sge;
381                 len = qp->s_len;
382                 if (len > pmtu) {
383                         /*
384                          * Request an ACK every 1/2 MB to avoid retransmit
385                          * timeouts.
386                          */
387                         if (((wqe->length - len) % (512 * 1024)) == 0)
388                                 bth2 |= 1 << 31;
389                         len = pmtu;
390                         break;
391                 }
392                 if (wqe->wr.opcode == IB_WR_SEND)
393                         qp->s_state = OP(SEND_LAST);
394                 else {
395                         qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
396                         /* Immediate data comes after the BTH */
397                         ohdr->u.imm_data = wqe->wr.imm_data;
398                         hwords += 1;
399                 }
400                 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
401                         bth0 |= 1 << 23;
402                 bth2 |= 1 << 31;        /* Request ACK. */
403                 qp->s_cur++;
404                 if (qp->s_cur >= qp->s_size)
405                         qp->s_cur = 0;
406                 break;
407
408         case OP(RDMA_READ_RESPONSE_LAST):
409                 /*
410                  * This case can only happen if a RDMA write is restarted.
411                  * See ipath_restart_rc().
412                  */
413                 ipath_init_restart(qp, wqe);
414                 /* FALLTHROUGH */
415         case OP(RDMA_WRITE_FIRST):
416                 qp->s_state = OP(RDMA_WRITE_MIDDLE);
417                 /* FALLTHROUGH */
418         case OP(RDMA_WRITE_MIDDLE):
419                 bth2 = qp->s_psn++ & IPS_PSN_MASK;
420                 if ((int)(qp->s_psn - qp->s_next_psn) > 0)
421                         qp->s_next_psn = qp->s_psn;
422                 ss = &qp->s_sge;
423                 len = qp->s_len;
424                 if (len > pmtu) {
425                         /*
426                          * Request an ACK every 1/2 MB to avoid retransmit
427                          * timeouts.
428                          */
429                         if (((wqe->length - len) % (512 * 1024)) == 0)
430                                 bth2 |= 1 << 31;
431                         len = pmtu;
432                         break;
433                 }
434                 if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
435                         qp->s_state = OP(RDMA_WRITE_LAST);
436                 else {
437                         qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
438                         /* Immediate data comes after the BTH */
439                         ohdr->u.imm_data = wqe->wr.imm_data;
440                         hwords += 1;
441                         if (wqe->wr.send_flags & IB_SEND_SOLICITED)
442                                 bth0 |= 1 << 23;
443                 }
444                 bth2 |= 1 << 31;        /* Request ACK. */
445                 qp->s_cur++;
446                 if (qp->s_cur >= qp->s_size)
447                         qp->s_cur = 0;
448                 break;
449
450         case OP(RDMA_READ_RESPONSE_MIDDLE):
451                 /*
452                  * This case can only happen if a RDMA read is restarted.
453                  * See ipath_restart_rc().
454                  */
455                 ipath_init_restart(qp, wqe);
456                 len = ((qp->s_psn - wqe->psn) & IPS_PSN_MASK) * pmtu;
457                 ohdr->u.rc.reth.vaddr =
458                         cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
459                 ohdr->u.rc.reth.rkey =
460                         cpu_to_be32(wqe->wr.wr.rdma.rkey);
461                 ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
462                 qp->s_state = OP(RDMA_READ_REQUEST);
463                 hwords += sizeof(ohdr->u.rc.reth) / 4;
464                 bth2 = qp->s_psn++ & IPS_PSN_MASK;
465                 if ((int)(qp->s_psn - qp->s_next_psn) > 0)
466                         qp->s_next_psn = qp->s_psn;
467                 ss = NULL;
468                 len = 0;
469                 qp->s_cur++;
470                 if (qp->s_cur == qp->s_size)
471                         qp->s_cur = 0;
472                 break;
473
474         case OP(RDMA_READ_REQUEST):
475         case OP(COMPARE_SWAP):
476         case OP(FETCH_ADD):
477                 /*
478                  * We shouldn't start anything new until this request is
479                  * finished.  The ACK will handle rescheduling us.  XXX The
480                  * number of outstanding ones is negotiated at connection
481                  * setup time (see pg. 258,289)?  XXX Also, if we support
482                  * multiple outstanding requests, we need to check the WQE
483                  * IB_SEND_FENCE flag and not send a new request if a RDMA
484                  * read or atomic is pending.
485                  */
486                 goto done;
487         }
488         qp->s_len -= len;
489         qp->s_hdrwords = hwords;
490         qp->s_cur_sge = ss;
491         qp->s_cur_size = len;
492         *bth0p = bth0 | (qp->s_state << 24);
493         *bth2p = bth2;
494         return 1;
495
496 done:
497         return 0;
498 }
499
500 /**
501  * send_rc_ack - Construct an ACK packet and send it
502  * @qp: a pointer to the QP
503  *
504  * This is called from ipath_rc_rcv() and only uses the receive
505  * side QP state.
506  * Note that RDMA reads are handled in the send side QP state and tasklet.
507  */
508 static void send_rc_ack(struct ipath_qp *qp)
509 {
510         struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
511         u16 lrh0;
512         u32 bth0;
513         u32 hwords;
514         struct ipath_ib_header hdr;
515         struct ipath_other_headers *ohdr;
516
517         /* Construct the header. */
518         ohdr = &hdr.u.oth;
519         lrh0 = IPS_LRH_BTH;
520         /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
521         hwords = 6;
522         if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
523                 hwords += ipath_make_grh(dev, &hdr.u.l.grh,
524                                          &qp->remote_ah_attr.grh,
525                                          hwords, 0);
526                 ohdr = &hdr.u.l.oth;
527                 lrh0 = IPS_LRH_GRH;
528         }
529         bth0 = ipath_layer_get_pkey(dev->dd, qp->s_pkey_index);
530         ohdr->u.aeth = ipath_compute_aeth(qp);
531         if (qp->s_ack_state >= OP(COMPARE_SWAP)) {
532                 bth0 |= IB_OPCODE_ATOMIC_ACKNOWLEDGE << 24;
533                 ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->s_ack_atomic);
534                 hwords += sizeof(ohdr->u.at.atomic_ack_eth) / 4;
535         } else
536                 bth0 |= OP(ACKNOWLEDGE) << 24;
537         lrh0 |= qp->remote_ah_attr.sl << 4;
538         hdr.lrh[0] = cpu_to_be16(lrh0);
539         hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
540         hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
541         hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd));
542         ohdr->bth[0] = cpu_to_be32(bth0);
543         ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
544         ohdr->bth[2] = cpu_to_be32(qp->s_ack_psn & IPS_PSN_MASK);
545
546         /*
547          * If we can send the ACK, clear the ACK state.
548          */
549         if (ipath_verbs_send(dev->dd, hwords, (u32 *) &hdr, 0, NULL) == 0) {
550                 qp->s_ack_state = OP(ACKNOWLEDGE);
551                 dev->n_rc_qacks++;
552                 dev->n_unicast_xmit++;
553         }
554 }
555
556 /**
557  * reset_psn - reset the QP state to send starting from PSN
558  * @qp: the QP
559  * @psn: the packet sequence number to restart at
560  *
561  * This is called when restarting a request (see ipath_restart_rc() and
562  * do_rc_ack()) to reposition the send engine at the given PSN.
563  * Called at interrupt level with the QP s_lock held.
564  */
565 static void reset_psn(struct ipath_qp *qp, u32 psn)
566 {
567         u32 n = qp->s_last;
568         struct ipath_swqe *wqe = get_swqe_ptr(qp, n);
569         u32 opcode;
570
571         qp->s_cur = n;
572
573         /*
574          * If we are starting the request from the beginning,
575          * let the normal send code handle initialization.
576          */
577         if (ipath_cmp24(psn, wqe->psn) <= 0) {
578                 qp->s_state = OP(SEND_LAST);
579                 goto done;
580         }
581
582         /* Find the work request opcode corresponding to the given PSN. */
583         opcode = wqe->wr.opcode;
584         for (;;) {
585                 int diff;
586
587                 if (++n == qp->s_size)
588                         n = 0;
589                 if (n == qp->s_tail)
590                         break;
591                 wqe = get_swqe_ptr(qp, n);
592                 diff = ipath_cmp24(psn, wqe->psn);
593                 if (diff < 0)
594                         break;
595                 qp->s_cur = n;
596                 /*
597                  * If we are starting the request from the beginning,
598                  * let the normal send code handle initialization.
599                  */
600                 if (diff == 0) {
601                         qp->s_state = OP(SEND_LAST);
602                         goto done;
603                 }
604                 opcode = wqe->wr.opcode;
605         }
606
607         /*
608          * Set the state to restart in the middle of a request.
609          * Don't change the s_sge, s_cur_sge, or s_cur_size.
610          * See ipath_do_rc_send().
611          */
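        /*
         * The RDMA_READ_RESPONSE_* values stored in s_state below are not
         * opcodes to transmit; they serve as internal markers so that
         * ipath_make_rc_req() calls ipath_init_restart() and resumes in
         * the middle of the request (see the matching cases there).
         */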
612         switch (opcode) {
613         case IB_WR_SEND:
614         case IB_WR_SEND_WITH_IMM:
615                 qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
616                 break;
617
618         case IB_WR_RDMA_WRITE:
619         case IB_WR_RDMA_WRITE_WITH_IMM:
620                 qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
621                 break;
622
623         case IB_WR_RDMA_READ:
624                 qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
625                 break;
626
627         default:
628                 /*
629                  * This case shouldn't happen since there is only
630                  * one PSN per request.
631                  */
632                 qp->s_state = OP(SEND_LAST);
633         }
634 done:
635         qp->s_psn = psn;
636 }
637
638 /**
639  * ipath_restart_rc - back up requester to resend the last un-ACKed request
640  * @qp: the QP to restart
641  * @psn: packet sequence number for the request
642  * @wc: the work completion to fill in if the retry count is exhausted
643  *
644  * The QP s_lock should be held.
645  */
646 void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc)
647 {
648         struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
649         struct ipath_ibdev *dev;
650
651         /*
652          * If there are no requests pending, we are done.
653          */
654         if (ipath_cmp24(psn, qp->s_next_psn) >= 0 ||
655             qp->s_last == qp->s_tail)
656                 goto done;
657
658         if (qp->s_retry == 0) {
659                 wc->wr_id = wqe->wr.wr_id;
660                 wc->status = IB_WC_RETRY_EXC_ERR;
661                 wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
662                 wc->vendor_err = 0;
663                 wc->byte_len = 0;
664                 wc->qp_num = qp->ibqp.qp_num;
665                 wc->src_qp = qp->remote_qpn;
666                 wc->pkey_index = 0;
667                 wc->slid = qp->remote_ah_attr.dlid;
668                 wc->sl = qp->remote_ah_attr.sl;
669                 wc->dlid_path_bits = 0;
670                 wc->port_num = 0;
671                 ipath_sqerror_qp(qp, wc);
672                 goto bail;
673         }
674         qp->s_retry--;
675
676         /*
677          * Remove the QP from the timeout queue.
678          * Note: it may already have been removed by ipath_ib_timer().
679          */
680         dev = to_idev(qp->ibqp.device);
681         spin_lock(&dev->pending_lock);
682         if (!list_empty(&qp->timerwait))
683                 list_del_init(&qp->timerwait);
684         spin_unlock(&dev->pending_lock);
685
686         if (wqe->wr.opcode == IB_WR_RDMA_READ)
687                 dev->n_rc_resends++;
688         else
689                 dev->n_rc_resends += (int)qp->s_psn - (int)psn;
690
691         reset_psn(qp, psn);
692
693 done:
694         tasklet_hi_schedule(&qp->s_task);
695
696 bail:
697         return;
698 }
699
700 /**
701  * do_rc_ack - process an incoming RC ACK
702  * @qp: the QP the ACK came in on
703  * @psn: the packet sequence number of the ACK
704  * @opcode: the opcode of the incoming response packet
705  *
706  * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
707  * for the given QP.
708  * Called at interrupt level with the QP s_lock held.
709  * Returns 1 if OK, 0 if current operation should be aborted (NAK).
710  */
711 static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
712 {
713         struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
714         struct ib_wc wc;
715         struct ipath_swqe *wqe;
716         int ret = 0;
717
718         /*
719          * Remove the QP from the timeout queue (or RNR timeout queue).
720          * If ipath_ib_timer() has already removed it,
721          * it's OK since we hold the QP s_lock and ipath_restart_rc()
722          * just won't find anything to restart if we ACK everything.
723          */
724         spin_lock(&dev->pending_lock);
725         if (!list_empty(&qp->timerwait))
726                 list_del_init(&qp->timerwait);
727         spin_unlock(&dev->pending_lock);
728
729         /*
730          * Note that NAKs implicitly ACK outstanding SEND and RDMA write
731          * requests and implicitly NAK RDMA read and atomic requests issued
732          * before the NAK'ed request.  The MSN won't include the NAK'ed
733  * request but will include the ACK'ed request(s).
734          */
735         wqe = get_swqe_ptr(qp, qp->s_last);
736
737         /* Nothing is pending to ACK/NAK. */
738         if (qp->s_last == qp->s_tail)
739                 goto bail;
740
741         /*
742          * The MSN might be for a later WQE than the PSN indicates so
743          * only complete WQEs that the PSN finishes.
744          */
745         while (ipath_cmp24(psn, wqe->lpsn) >= 0) {
746                 /* If we are ACKing a WQE, the MSN should be >= the SSN. */
747                 if (ipath_cmp24(aeth, wqe->ssn) < 0)
748                         break;
749                 /*
750                  * If this request is a RDMA read or atomic, and the ACK is
751                  * for a later operation, this ACK NAKs the RDMA read or
752                  * atomic.  In other words, only a RDMA_READ_LAST or ONLY
753                  * can ACK a RDMA read and likewise for atomic ops.  Note
754                  * that the NAK case can only happen if relaxed ordering is
755                  * used and requests are sent after an RDMA read or atomic
756                  * is sent but before the response is received.
757                  */
758                 if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
759                      opcode != OP(RDMA_READ_RESPONSE_LAST)) ||
760                     ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
761                       wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
762                      (opcode != OP(ATOMIC_ACKNOWLEDGE) ||
763                       ipath_cmp24(wqe->psn, psn) != 0))) {
764                         /*
765                          * The last valid PSN seen is the previous
766                          * request's.
767                          */
768                         qp->s_last_psn = wqe->psn - 1;
769                         /* Retry this request. */
770                         ipath_restart_rc(qp, wqe->psn, &wc);
771                         /*
772                          * No need to process the ACK/NAK since we are
773                          * restarting an earlier request.
774                          */
775                         goto bail;
776                 }
777                 /* Post a send completion queue entry if requested. */
778                 if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) ||
779                     (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
780                         wc.wr_id = wqe->wr.wr_id;
781                         wc.status = IB_WC_SUCCESS;
782                         wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
783                         wc.vendor_err = 0;
784                         wc.byte_len = wqe->length;
785                         wc.qp_num = qp->ibqp.qp_num;
786                         wc.src_qp = qp->remote_qpn;
787                         wc.pkey_index = 0;
788                         wc.slid = qp->remote_ah_attr.dlid;
789                         wc.sl = qp->remote_ah_attr.sl;
790                         wc.dlid_path_bits = 0;
791                         wc.port_num = 0;
792                         ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
793                 }
794                 qp->s_retry = qp->s_retry_cnt;
795                 /*
796                  * If we are completing a request which is in the process of
797                  * being resent, we can stop resending it since we know the
798                  * responder has already seen it.
799                  */
800                 if (qp->s_last == qp->s_cur) {
801                         if (++qp->s_cur >= qp->s_size)
802                                 qp->s_cur = 0;
803                         wqe = get_swqe_ptr(qp, qp->s_cur);
804                         qp->s_state = OP(SEND_LAST);
805                         qp->s_psn = wqe->psn;
806                 }
807                 if (++qp->s_last >= qp->s_size)
808                         qp->s_last = 0;
809                 wqe = get_swqe_ptr(qp, qp->s_last);
810                 if (qp->s_last == qp->s_tail)
811                         break;
812         }
813
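        /*
         * The AETH syndrome is the top byte of the AETH: its high three
         * bits (aeth >> 29) distinguish ACK (0), RNR NAK (1), reserved (2)
         * and NAK (3), while the low five syndrome bits carry the credit
         * count, RNR timer index, or NAK code, extracted below with
         * IPS_AETH_CREDIT_SHIFT/IPS_AETH_CREDIT_MASK.
         */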
814         switch (aeth >> 29) {
815         case 0:         /* ACK */
816                 dev->n_rc_acks++;
817                 /* If this is a partial ACK, reset the retransmit timer. */
818                 if (qp->s_last != qp->s_tail) {
819                         spin_lock(&dev->pending_lock);
820                         list_add_tail(&qp->timerwait,
821                                       &dev->pending[dev->pending_index]);
822                         spin_unlock(&dev->pending_lock);
823                 }
824                 ipath_get_credit(qp, aeth);
825                 qp->s_rnr_retry = qp->s_rnr_retry_cnt;
826                 qp->s_retry = qp->s_retry_cnt;
827                 qp->s_last_psn = psn;
828                 ret = 1;
829                 goto bail;
830
831         case 1:         /* RNR NAK */
832                 dev->n_rnr_naks++;
833                 if (qp->s_rnr_retry == 0) {
834                         if (qp->s_last == qp->s_tail)
835                                 goto bail;
836
837                         wc.status = IB_WC_RNR_RETRY_EXC_ERR;
838                         goto class_b;
839                 }
840                 if (qp->s_rnr_retry_cnt < 7)
841                         qp->s_rnr_retry--;
842                 if (qp->s_last == qp->s_tail)
843                         goto bail;
844
845                 /* The last valid PSN is the previous PSN. */
846                 qp->s_last_psn = psn - 1;
847
848                 dev->n_rc_resends += (int)qp->s_psn - (int)psn;
849
850                 reset_psn(qp, psn);
851
852                 qp->s_rnr_timeout =
853                         ib_ipath_rnr_table[(aeth >> IPS_AETH_CREDIT_SHIFT) &
854                                            IPS_AETH_CREDIT_MASK];
855                 ipath_insert_rnr_queue(qp);
856                 goto bail;
857
858         case 3:         /* NAK */
859                 /* The last valid PSN seen is the previous request's. */
860                 if (qp->s_last != qp->s_tail)
861                         qp->s_last_psn = wqe->psn - 1;
862                 switch ((aeth >> IPS_AETH_CREDIT_SHIFT) &
863                         IPS_AETH_CREDIT_MASK) {
864                 case 0: /* PSN sequence error */
865                         dev->n_seq_naks++;
866                         /*
867                          * Back up to the responder's expected PSN.  XXX
868                          * Note that we might get a NAK in the middle of an
869                          * RDMA READ response which terminates the RDMA
870                          * READ.
871                          */
872                         if (qp->s_last == qp->s_tail)
873                                 break;
874
875                         if (ipath_cmp24(psn, wqe->psn) < 0)
876                                 break;
877
878                         /* Retry the request. */
879                         ipath_restart_rc(qp, psn, &wc);
880                         break;
881
882                 case 1: /* Invalid Request */
883                         wc.status = IB_WC_REM_INV_REQ_ERR;
884                         dev->n_other_naks++;
885                         goto class_b;
886
887                 case 2: /* Remote Access Error */
888                         wc.status = IB_WC_REM_ACCESS_ERR;
889                         dev->n_other_naks++;
890                         goto class_b;
891
892                 case 3: /* Remote Operation Error */
893                         wc.status = IB_WC_REM_OP_ERR;
894                         dev->n_other_naks++;
895                 class_b:
896                         wc.wr_id = wqe->wr.wr_id;
897                         wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
898                         wc.vendor_err = 0;
899                         wc.byte_len = 0;
900                         wc.qp_num = qp->ibqp.qp_num;
901                         wc.src_qp = qp->remote_qpn;
902                         wc.pkey_index = 0;
903                         wc.slid = qp->remote_ah_attr.dlid;
904                         wc.sl = qp->remote_ah_attr.sl;
905                         wc.dlid_path_bits = 0;
906                         wc.port_num = 0;
907                         ipath_sqerror_qp(qp, &wc);
908                         break;
909
910                 default:
911                         /* Ignore other reserved NAK error codes */
912                         goto reserved;
913                 }
914                 qp->s_rnr_retry = qp->s_rnr_retry_cnt;
915                 goto bail;
916
917         default:                /* 2: reserved */
918         reserved:
919                 /* Ignore reserved NAK codes. */
920                 goto bail;
921         }
922
923 bail:
924         return ret;
925 }
926
927 /**
928  * ipath_rc_rcv_resp - process an incoming RC response packet
929  * @dev: the device this packet came in on
930  * @ohdr: the other headers for this packet
931  * @data: the packet data
932  * @tlen: the packet length
933  * @qp: the QP for this packet
934  * @opcode: the opcode for this packet
935  * @psn: the packet sequence number for this packet
936  * @hdrsize: the header length
937  * @pmtu: the path MTU
938  * @header_in_data: true if part of the header data is in the data buffer
939  *
940  * This is called from ipath_rc_rcv() to process an incoming RC response
941  * packet for the given QP.
942  * Called at interrupt level.
943  */
944 static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
945                                      struct ipath_other_headers *ohdr,
946                                      void *data, u32 tlen,
947                                      struct ipath_qp *qp,
948                                      u32 opcode,
949                                      u32 psn, u32 hdrsize, u32 pmtu,
950                                      int header_in_data)
951 {
952         unsigned long flags;
953         struct ib_wc wc;
954         int diff;
955         u32 pad;
956         u32 aeth;
957
958         spin_lock_irqsave(&qp->s_lock, flags);
959
960         /* Ignore invalid responses. */
961         if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
962                 goto ack_done;
963
964         /* Ignore duplicate responses. */
965         diff = ipath_cmp24(psn, qp->s_last_psn);
966         if (unlikely(diff <= 0)) {
967                 /* Update credits for "ghost" ACKs */
968                 if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
969                         if (!header_in_data)
970                                 aeth = be32_to_cpu(ohdr->u.aeth);
971                         else {
972                                 aeth = be32_to_cpu(((__be32 *) data)[0]);
973                                 data += sizeof(__be32);
974                         }
975                         if ((aeth >> 29) == 0)
976                                 ipath_get_credit(qp, aeth);
977                 }
978                 goto ack_done;
979         }
980
981         switch (opcode) {
982         case OP(ACKNOWLEDGE):
983         case OP(ATOMIC_ACKNOWLEDGE):
984         case OP(RDMA_READ_RESPONSE_FIRST):
985                 if (!header_in_data)
986                         aeth = be32_to_cpu(ohdr->u.aeth);
987                 else {
988                         aeth = be32_to_cpu(((__be32 *) data)[0]);
989                         data += sizeof(__be32);
990                 }
991                 if (opcode == OP(ATOMIC_ACKNOWLEDGE))
992                         *(u64 *) qp->s_sge.sge.vaddr = *(u64 *) data;
993                 if (!do_rc_ack(qp, aeth, psn, opcode) ||
994                     opcode != OP(RDMA_READ_RESPONSE_FIRST))
995                         goto ack_done;
996                 hdrsize += 4;
997                 /*
998                  * do_rc_ack() has already checked the PSN so skip
999                  * the sequence check.
1000                  */
1001                 goto rdma_read;
1002
1003         case OP(RDMA_READ_RESPONSE_MIDDLE):
1004                 /* no AETH, no ACK */
1005                 if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
1006                         dev->n_rdma_seq++;
1007                         ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
1008                         goto ack_done;
1009                 }
1010         rdma_read:
1011                 if (unlikely(qp->s_state != OP(RDMA_READ_REQUEST)))
1012                         goto ack_done;
1013                 if (unlikely(tlen != (hdrsize + pmtu + 4)))
1014                         goto ack_done;
1015                 if (unlikely(pmtu >= qp->s_len))
1016                         goto ack_done;
1017                 /* We got a response so update the timeout. */
1018                 if (unlikely(qp->s_last == qp->s_tail ||
1019                              get_swqe_ptr(qp, qp->s_last)->wr.opcode !=
1020                              IB_WR_RDMA_READ))
1021                         goto ack_done;
1022                 spin_lock(&dev->pending_lock);
1023                 if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))
1024                         list_move_tail(&qp->timerwait,
1025                                        &dev->pending[dev->pending_index]);
1026                 spin_unlock(&dev->pending_lock);
1027                 /*
1028                  * Update the RDMA receive state but do the copy w/o
1029                  * holding the locks and blocking interrupts.
1030                  * XXX Yet another place that affects relaxed RDMA order
1031                  * since we don't want s_sge modified.
1032                  */
1033                 qp->s_len -= pmtu;
1034                 qp->s_last_psn = psn;
1035                 spin_unlock_irqrestore(&qp->s_lock, flags);
1036                 ipath_copy_sge(&qp->s_sge, data, pmtu);
1037                 goto bail;
1038
1039         case OP(RDMA_READ_RESPONSE_LAST):
1040                 /* ACKs READ req. */
1041                 if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
1042                         dev->n_rdma_seq++;
1043                         ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
1044                         goto ack_done;
1045                 }
1046                 /* FALLTHROUGH */
1047         case OP(RDMA_READ_RESPONSE_ONLY):
1048                 if (unlikely(qp->s_state != OP(RDMA_READ_REQUEST)))
1049                         goto ack_done;
1050                 /*
1051                  * Get the number of bytes the message was padded by.
1052                  */
1053                 pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1054                 /*
1055                  * Check that the data size is >= 1 && <= pmtu.
1056                  * Remember to account for the AETH header (4) and
1057                  * ICRC (4).
1058                  */
1059                 if (unlikely(tlen <= (hdrsize + pad + 8))) {
1060                         /* XXX Need to generate an error CQ entry. */
1061                         goto ack_done;
1062                 }
1063                 tlen -= hdrsize + pad + 8;
1064                 if (unlikely(tlen != qp->s_len)) {
1065                         /* XXX Need to generate an error CQ entry. */
1066                         goto ack_done;
1067                 }
1068                 if (!header_in_data)
1069                         aeth = be32_to_cpu(ohdr->u.aeth);
1070                 else {
1071                         aeth = be32_to_cpu(((__be32 *) data)[0]);
1072                         data += sizeof(__be32);
1073                 }
1074                 ipath_copy_sge(&qp->s_sge, data, tlen);
1075                 if (do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST))) {
1076                         /*
1077                          * Change the state so we continue
1078                          * processing new requests and wake up the
1079                          * tasklet if there are posted sends.
1080                          */
1081                         qp->s_state = OP(SEND_LAST);
1082                         if (qp->s_tail != qp->s_head)
1083                                 tasklet_hi_schedule(&qp->s_task);
1084                 }
1085                 goto ack_done;
1086         }
1087
1088 ack_done:
1089         spin_unlock_irqrestore(&qp->s_lock, flags);
1090 bail:
1091         return;
1092 }
1093
1094 /**
1095  * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
1096  * @dev: the device this packet came in on
1097  * @ohdr: the other headers for this packet
1098  * @data: the packet data
1099  * @qp: the QP for this packet
1100  * @opcode: the opcode for this packet
1101  * @psn: the packet sequence number for this packet
1102  * @diff: the difference between the PSN and the expected PSN
1103  * @header_in_data: true if part of the header data is in the data buffer
1104  *
1105  * This is called from ipath_rc_rcv() to process an unexpected
1106  * incoming RC packet for the given QP.
1107  * Called at interrupt level.
1108  * Return 1 if no more processing is needed; otherwise return 0 to have
1109  * a response scheduled, leaving the QP s_lock held for the caller.
1110  */
1111 static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
1112                                      struct ipath_other_headers *ohdr,
1113                                      void *data,
1114                                      struct ipath_qp *qp,
1115                                      u32 opcode,
1116                                      u32 psn,
1117                                      int diff,
1118                                      int header_in_data)
1119 {
1120         struct ib_reth *reth;
1121
1122         if (diff > 0) {
1123                 /*
1124                  * Packet sequence error.
1125                  * A NAK will ACK earlier sends and RDMA writes.
1126                  * Don't queue the NAK if a RDMA read, atomic, or
1127                  * NAK is pending though.
1128                  */
1129                 spin_lock(&qp->s_lock);
1130                 if ((qp->s_ack_state >= OP(RDMA_READ_REQUEST) &&
1131                      qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) ||
1132                     qp->s_nak_state != 0) {
1133                         spin_unlock(&qp->s_lock);
1134                         goto done;
1135                 }
1136                 qp->s_ack_state = OP(SEND_ONLY);
1137                 qp->s_nak_state = IB_NAK_PSN_ERROR;
1138                 /* Use the expected PSN. */
1139                 qp->s_ack_psn = qp->r_psn;
1140                 goto resched;
1141         }
1142
1143         /*
1144          * Handle a duplicate request.  Don't re-execute SEND, RDMA
1145          * write or atomic op.  Don't NAK errors, just silently drop
1146          * the duplicate request.  Note that r_sge, r_len, and
1147          * r_rcv_len may be in use so don't modify them.
1148          *
1149          * We are supposed to ACK the earliest duplicate PSN but we
1150          * can coalesce an outstanding duplicate ACK.  We have to
1151          * send the earliest so that RDMA reads can be restarted at
1152          * the requester's expected PSN.
1153          */
1154         spin_lock(&qp->s_lock);
1155         if (qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE &&
1156             ipath_cmp24(psn, qp->s_ack_psn) >= 0) {
1157                 if (qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST)
1158                         qp->s_ack_psn = psn;
1159                 spin_unlock(&qp->s_lock);
1160                 goto done;
1161         }
1162         switch (opcode) {
1163         case OP(RDMA_READ_REQUEST):
1164                 /*
1165                  * We have to be careful to not change s_rdma_sge
1166                  * while ipath_do_rc_send() is using it and not
1167                  * holding the s_lock.
1168                  */
1169                 if (qp->s_ack_state != OP(ACKNOWLEDGE) &&
1170                     qp->s_ack_state >= IB_OPCODE_RDMA_READ_REQUEST) {
1171                         spin_unlock(&qp->s_lock);
1172                         dev->n_rdma_dup_busy++;
1173                         goto done;
1174                 }
1175                 /* RETH comes after BTH */
1176                 if (!header_in_data)
1177                         reth = &ohdr->u.rc.reth;
1178                 else {
1179                         reth = (struct ib_reth *)data;
1180                         data += sizeof(*reth);
1181                 }
1182                 qp->s_rdma_len = be32_to_cpu(reth->length);
1183                 if (qp->s_rdma_len != 0) {
1184                         u32 rkey = be32_to_cpu(reth->rkey);
1185                         u64 vaddr = be64_to_cpu(reth->vaddr);
1186                         int ok;
1187
1188                         /*
1189                          * Address range must be a subset of the original
1190                          * request and start on pmtu boundaries.
1191                          */
1192                         ok = ipath_rkey_ok(dev, &qp->s_rdma_sge,
1193                                            qp->s_rdma_len, vaddr, rkey,
1194                                            IB_ACCESS_REMOTE_READ);
1195                         if (unlikely(!ok))
1196                                 goto done;
1197                 } else {
1198                         qp->s_rdma_sge.sg_list = NULL;
1199                         qp->s_rdma_sge.num_sge = 0;
1200                         qp->s_rdma_sge.sge.mr = NULL;
1201                         qp->s_rdma_sge.sge.vaddr = NULL;
1202                         qp->s_rdma_sge.sge.length = 0;
1203                         qp->s_rdma_sge.sge.sge_length = 0;
1204                 }
1205                 break;
1206
1207         case OP(COMPARE_SWAP):
1208         case OP(FETCH_ADD):
1209                 /*
1210                  * Check for the PSN of the last atomic operation
1211                  * performed and resend the result if found.
1212                  */
1213                 if ((psn & IPS_PSN_MASK) != qp->r_atomic_psn) {
1214                         spin_unlock(&qp->s_lock);
1215                         goto done;
1216                 }
1217                 qp->s_ack_atomic = qp->r_atomic_data;
1218                 break;
1219         }
1220         qp->s_ack_state = opcode;
1221         qp->s_nak_state = 0;
1222         qp->s_ack_psn = psn;
1223 resched:
1224         return 0;
1225
1226 done:
1227         return 1;
1228 }
1229
1230 /**
1231  * ipath_rc_rcv - process an incoming RC packet
1232  * @dev: the device this packet came in on
1233  * @hdr: the header of this packet
1234  * @has_grh: true if the header has a GRH
1235  * @data: the packet data
1236  * @tlen: the packet length
1237  * @qp: the QP for this packet
1238  *
1239  * This is called from ipath_qp_rcv() to process an incoming RC packet
1240  * for the given QP.
1241  * Called at interrupt level.
1242  */
1243 void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
1244                   int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
1245 {
1246         struct ipath_other_headers *ohdr;
1247         u32 opcode;
1248         u32 hdrsize;
1249         u32 psn;
1250         u32 pad;
1251         unsigned long flags;
1252         struct ib_wc wc;
1253         u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
1254         int diff;
1255         struct ib_reth *reth;
1256         int header_in_data;
1257
1258         /* Check for GRH */
1259         if (!has_grh) {
1260                 ohdr = &hdr->u.oth;
1261                 hdrsize = 8 + 12;       /* LRH + BTH */
1262                 psn = be32_to_cpu(ohdr->bth[2]);
1263                 header_in_data = 0;
1264         } else {
1265                 ohdr = &hdr->u.l.oth;
1266                 hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
1267                 /*
1268                  * The header with GRH is 60 bytes and the core driver sets
1269                  * the eager header buffer size to 56 bytes so the last 4
1270                  * bytes of the BTH header (PSN) are in the data buffer.
1271                  */
1272                 header_in_data =
1273                         ipath_layer_get_rcvhdrentsize(dev->dd) == 16;
1274                 if (header_in_data) {
1275                         psn = be32_to_cpu(((__be32 *) data)[0]);
1276                         data += sizeof(__be32);
1277                 } else
1278                         psn = be32_to_cpu(ohdr->bth[2]);
1279         }
1280
1281         /*
1282          * Process responses (ACKs) before anything else.  Note that the
1283          * packet sequence number will be for something in the send work
1284          * queue rather than the expected receive packet sequence number.
1285          * In other words, this QP is the requester.
1286          */
1287         opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
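        /*
         * The RC response opcodes (RDMA_READ_RESPONSE_FIRST through
         * ATOMIC_ACKNOWLEDGE) are contiguous in the IB opcode encoding, so
         * a simple range test is enough to route responses to
         * ipath_rc_rcv_resp(); everything else is handled as a request
         * below.
         */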
1288         if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1289             opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
1290                 ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
1291                                   hdrsize, pmtu, header_in_data);
1292                 goto bail;
1293         }
1294
1295         spin_lock_irqsave(&qp->r_rq.lock, flags);
1296
1297         /* Compute 24 bits worth of difference. */
1298         diff = ipath_cmp24(psn, qp->r_psn);
1299         if (unlikely(diff)) {
1300                 if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
1301                                        psn, diff, header_in_data))
1302                         goto done;
1303                 goto resched;
1304         }
1305
1306         /* Check for opcode sequence errors. */
1307         switch (qp->r_state) {
1308         case OP(SEND_FIRST):
1309         case OP(SEND_MIDDLE):
1310                 if (opcode == OP(SEND_MIDDLE) ||
1311                     opcode == OP(SEND_LAST) ||
1312                     opcode == OP(SEND_LAST_WITH_IMMEDIATE))
1313                         break;
1314         nack_inv:
1315         /*
1316          * A NAK will ACK earlier sends and RDMA writes.  Don't queue the
1317          * NAK if an RDMA read, atomic, or NAK is pending though.
1318          */
1319         spin_lock(&qp->s_lock);
1320         if (qp->s_ack_state >= OP(RDMA_READ_REQUEST) &&
1321             qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) {
1322                 spin_unlock(&qp->s_lock);
1323                 goto done;
1324         }
1325         /* XXX Flush WQEs */
1326         qp->state = IB_QPS_ERR;
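             /*
              * SEND_ONLY is effectively a placeholder here; any opcode
              * below RDMA_READ_REQUEST just marks that an ACK or NAK is
              * due with no read or atomic data attached.
              */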
1327         qp->s_ack_state = OP(SEND_ONLY);
1328         qp->s_nak_state = IB_NAK_INVALID_REQUEST;
1329         qp->s_ack_psn = qp->r_psn;
1330         goto resched;
1331
1332         case OP(RDMA_WRITE_FIRST):
1333         case OP(RDMA_WRITE_MIDDLE):
1334                 if (opcode == OP(RDMA_WRITE_MIDDLE) ||
1335                     opcode == OP(RDMA_WRITE_LAST) ||
1336                     opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
1337                         break;
1338                 goto nack_inv;
1339
1340         case OP(RDMA_READ_REQUEST):
1341         case OP(COMPARE_SWAP):
1342         case OP(FETCH_ADD):
1343                 /*
1344                  * Drop all new requests until a response has been sent.  A
1345                  * new request then ACKs the RDMA response we sent.  Relaxed
1346                  * ordering would allow new requests to be processed but we
1347                  * would need to keep a queue of rwqe's for all that are in
1348                  * progress.  Note that we can't RNR NAK this request since
1349                  * the RDMA READ or atomic response is already queued to be
1350                  * sent (unless we implement a response send queue).
1351                  */
1352                 goto done;
1353
1354         default:
1355                 if (opcode == OP(SEND_MIDDLE) ||
1356                     opcode == OP(SEND_LAST) ||
1357                     opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
1358                     opcode == OP(RDMA_WRITE_MIDDLE) ||
1359                     opcode == OP(RDMA_WRITE_LAST) ||
1360                     opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
1361                         goto nack_inv;
1362                 break;
1363         }
1364
1365         wc.imm_data = 0;
1366         wc.wc_flags = 0;
1367
1368         /* OK, process the packet. */
1369         switch (opcode) {
1370         case OP(SEND_FIRST):
1371                 if (!ipath_get_rwqe(qp, 0)) {
1372                 rnr_nak:
1373                         /*
1374                          * An RNR NAK will ACK earlier sends and RDMA writes.
1375                          * Don't queue the NAK if an RDMA read or atomic
1376                          * is pending though.
1377                          */
1378                         spin_lock(&qp->s_lock);
1379                         if (qp->s_ack_state >=
1380                             OP(RDMA_READ_REQUEST) &&
1381                             qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) {
1382                                 spin_unlock(&qp->s_lock);
1383                                 goto done;
1384                         }
1385                         qp->s_ack_state = OP(SEND_ONLY);
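                             /*
                              * The low five bits of the RNR NAK code carry
                              * the minimum time the requester should wait
                              * before retrying, from s_min_rnr_timer.
                              */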
1386                         qp->s_nak_state = IB_RNR_NAK | qp->s_min_rnr_timer;
1387                         qp->s_ack_psn = qp->r_psn;
1388                         goto resched;
1389                 }
1390                 qp->r_rcv_len = 0;
1391                 /* FALLTHROUGH */
1392         case OP(SEND_MIDDLE):
1393         case OP(RDMA_WRITE_MIDDLE):
1394         send_middle:
1395                 /* Check for an invalid length (not a full PMTU) or posted rwqe overrun. */
1396                 if (unlikely(tlen != (hdrsize + pmtu + 4)))
1397                         goto nack_inv;
1398                 qp->r_rcv_len += pmtu;
1399                 if (unlikely(qp->r_rcv_len > qp->r_len))
1400                         goto nack_inv;
1401                 ipath_copy_sge(&qp->r_sge, data, pmtu);
1402                 break;
1403
1404         case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
1405                 /* consume RWQE */
1406                 if (!ipath_get_rwqe(qp, 1))
1407                         goto rnr_nak;
1408                 goto send_last_imm;
1409
1410         case OP(SEND_ONLY):
1411         case OP(SEND_ONLY_WITH_IMMEDIATE):
1412                 if (!ipath_get_rwqe(qp, 0))
1413                         goto rnr_nak;
1414                 qp->r_rcv_len = 0;
1415                 if (opcode == OP(SEND_ONLY))
1416                         goto send_last;
1417                 /* FALLTHROUGH */
1418         case OP(SEND_LAST_WITH_IMMEDIATE):
1419         send_last_imm:
1420                 if (header_in_data) {
1421                         wc.imm_data = *(__be32 *) data;
1422                         data += sizeof(__be32);
1423                 } else {
1424                         /* Immediate data comes after BTH */
1425                         wc.imm_data = ohdr->u.imm_data;
1426                 }
1427                 hdrsize += 4;
1428                 wc.wc_flags = IB_WC_WITH_IMM;
1429                 /* FALLTHROUGH */
1430         case OP(SEND_LAST):
1431         case OP(RDMA_WRITE_LAST):
1432         send_last:
1433                 /* Get the number of bytes the message was padded by. */
1434                 pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1435                 /* Check for invalid length. */
1436                 /* XXX LAST len should be >= 1 */
1437                 if (unlikely(tlen < (hdrsize + pad + 4)))
1438                         goto nack_inv;
1439                 /* Don't count the CRC. */
1440                 tlen -= (hdrsize + pad + 4);
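                     /*
                      * tlen is now just this packet's payload; adding
                      * r_rcv_len (bytes from any earlier FIRST/MIDDLE
                      * packets) gives the total message length.
                      */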
1441                 wc.byte_len = tlen + qp->r_rcv_len;
1442                 if (unlikely(wc.byte_len > qp->r_len))
1443                         goto nack_inv;
1444                 ipath_copy_sge(&qp->r_sge, data, tlen);
1445                 atomic_inc(&qp->msn);
1446                 if (opcode == OP(RDMA_WRITE_LAST) ||
1447                     opcode == OP(RDMA_WRITE_ONLY))
1448                         break;
1449                 wc.wr_id = qp->r_wr_id;
1450                 wc.status = IB_WC_SUCCESS;
1451                 wc.opcode = IB_WC_RECV;
1452                 wc.vendor_err = 0;
1453                 wc.qp_num = qp->ibqp.qp_num;
1454                 wc.src_qp = qp->remote_qpn;
1455                 wc.pkey_index = 0;
1456                 wc.slid = qp->remote_ah_attr.dlid;
1457                 wc.sl = qp->remote_ah_attr.sl;
1458                 wc.dlid_path_bits = 0;
1459                 wc.port_num = 0;
1460                 /* Signal completion event if the solicited bit is set. */
1461                 ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
1462                                (ohdr->bth[0] &
1463                                 __constant_cpu_to_be32(1 << 23)) != 0);
1464                 break;
1465
1466         case OP(RDMA_WRITE_FIRST):
1467         case OP(RDMA_WRITE_ONLY):
1468         case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
1469                 /* consume RWQE */
1470                 /* RETH comes after BTH */
1471                 if (!header_in_data)
1472                         reth = &ohdr->u.rc.reth;
1473                 else {
1474                         reth = (struct ib_reth *)data;
1475                         data += sizeof(*reth);
1476                 }
1477                 hdrsize += sizeof(*reth);
1478                 qp->r_len = be32_to_cpu(reth->length);
1479                 qp->r_rcv_len = 0;
1480                 if (qp->r_len != 0) {
1481                         u32 rkey = be32_to_cpu(reth->rkey);
1482                         u64 vaddr = be64_to_cpu(reth->vaddr);
1483                         int ok;
1484
1485                         /* Check rkey & NAK */
1486                         ok = ipath_rkey_ok(dev, &qp->r_sge,
1487                                            qp->r_len, vaddr, rkey,
1488                                            IB_ACCESS_REMOTE_WRITE);
1489                         if (unlikely(!ok)) {
1490                         nack_acc:
1491                                 /*
1492                                  * A NAK will ACK earlier sends and RDMA
1493                                  * writes.  Don't queue the NAK if an RDMA
1494                                  * read, atomic, or NAK is pending though.
1495                                  */
1496                                 spin_lock(&qp->s_lock);
1497                                 if (qp->s_ack_state >=
1498                                     OP(RDMA_READ_REQUEST) &&
1499                                     qp->s_ack_state !=
1500                                     IB_OPCODE_ACKNOWLEDGE) {
1501                                         spin_unlock(&qp->s_lock);
1502                                         goto done;
1503                                 }
1504                                 /* XXX Flush WQEs */
1505                                 qp->state = IB_QPS_ERR;
1506                                 qp->s_ack_state = OP(RDMA_WRITE_ONLY);
1507                                 qp->s_nak_state =
1508                                         IB_NAK_REMOTE_ACCESS_ERROR;
1509                                 qp->s_ack_psn = qp->r_psn;
1510                                 goto resched;
1511                         }
1512                 } else {
1513                         qp->r_sge.sg_list = NULL;
1514                         qp->r_sge.sge.mr = NULL;
1515                         qp->r_sge.sge.vaddr = NULL;
1516                         qp->r_sge.sge.length = 0;
1517                         qp->r_sge.sge.sge_length = 0;
1518                 }
1519                 if (unlikely(!(qp->qp_access_flags &
1520                                IB_ACCESS_REMOTE_WRITE)))
1521                         goto nack_acc;
1522                 if (opcode == OP(RDMA_WRITE_FIRST))
1523                         goto send_middle;
1524                 else if (opcode == OP(RDMA_WRITE_ONLY))
1525                         goto send_last;
1526                 if (!ipath_get_rwqe(qp, 1))
1527                         goto rnr_nak;
1528                 goto send_last_imm;
1529
1530         case OP(RDMA_READ_REQUEST):
1531                 /* RETH comes after BTH */
1532                 if (!header_in_data)
1533                         reth = &ohdr->u.rc.reth;
1534                 else {
1535                         reth = (struct ib_reth *)data;
1536                         data += sizeof(*reth);
1537                 }
1538                 spin_lock(&qp->s_lock);
1539                 if (qp->s_ack_state != OP(ACKNOWLEDGE) &&
1540                     qp->s_ack_state >= IB_OPCODE_RDMA_READ_REQUEST) {
1541                         spin_unlock(&qp->s_lock);
1542                         goto done;
1543                 }
1544                 qp->s_rdma_len = be32_to_cpu(reth->length);
1545                 if (qp->s_rdma_len != 0) {
1546                         u32 rkey = be32_to_cpu(reth->rkey);
1547                         u64 vaddr = be64_to_cpu(reth->vaddr);
1548                         int ok;
1549
1550                         /* Check rkey & NAK */
1551                         ok = ipath_rkey_ok(dev, &qp->s_rdma_sge,
1552                                            qp->s_rdma_len, vaddr, rkey,
1553                                            IB_ACCESS_REMOTE_READ);
1554                         if (unlikely(!ok)) {
1555                                 spin_unlock(&qp->s_lock);
1556                                 goto nack_acc;
1557                         }
1558                         /*
1559                          * Update the next expected PSN.  We add 1 later
1560                          * below, so only add the remainder here.
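                              * For example, an 8192 byte READ with a 2048
                              * byte path MTU spans four response packets,
                              * so r_psn advances by 3 here and 1 more below.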
1561                          */
1562                         if (qp->s_rdma_len > pmtu)
1563                                 qp->r_psn += (qp->s_rdma_len - 1) / pmtu;
1564                 } else {
1565                         qp->s_rdma_sge.sg_list = NULL;
1566                         qp->s_rdma_sge.num_sge = 0;
1567                         qp->s_rdma_sge.sge.mr = NULL;
1568                         qp->s_rdma_sge.sge.vaddr = NULL;
1569                         qp->s_rdma_sge.sge.length = 0;
1570                         qp->s_rdma_sge.sge.sge_length = 0;
1571                 }
1572                 if (unlikely(!(qp->qp_access_flags &
1573                                IB_ACCESS_REMOTE_READ)))
1574                         goto nack_acc;
1575                 /*
1576                  * We need to increment the MSN here instead of when we
1577                  * finish sending the result since a duplicate request would
1578                  * increment it more than once.
1579                  */
1580                 atomic_inc(&qp->msn);
1581                 qp->s_ack_state = opcode;
1582                 qp->s_nak_state = 0;
1583                 qp->s_ack_psn = psn;
1584                 qp->r_psn++;
1585                 qp->r_state = opcode;
1586                 goto rdmadone;
1587
1588         case OP(COMPARE_SWAP):
1589         case OP(FETCH_ADD): {
1590                 struct ib_atomic_eth *ateth;
1591                 u64 vaddr;
1592                 u64 sdata;
1593                 u32 rkey;
1594
1595                 if (!header_in_data)
1596                         ateth = &ohdr->u.atomic_eth;
1597                 else {
1598                         ateth = (struct ib_atomic_eth *)data;
1599                         data += sizeof(*ateth);
1600                 }
1601                 vaddr = be64_to_cpu(ateth->vaddr);
1602                 if (unlikely(vaddr & (sizeof(u64) - 1)))
1603                         goto nack_inv;
1604                 rkey = be32_to_cpu(ateth->rkey);
1605                 /* Check rkey & NAK */
1606                 if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge,
1607                                             sizeof(u64), vaddr, rkey,
1608                                             IB_ACCESS_REMOTE_ATOMIC)))
1609                         goto nack_acc;
1610                 if (unlikely(!(qp->qp_access_flags &
1611                                IB_ACCESS_REMOTE_ATOMIC)))
1612                         goto nack_acc;
1613                 /* Perform atomic OP and save result. */
1614                 sdata = be64_to_cpu(ateth->swap_data);
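                     /*
                      * The atomic is performed by the CPU, so take a
                      * device-wide lock to keep the read-modify-write from
                      * interleaving with atomics targeting the same
                      * address from other QPs.
                      */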
1615                 spin_lock(&dev->pending_lock);
1616                 qp->r_atomic_data = *(u64 *) qp->r_sge.sge.vaddr;
1617                 if (opcode == OP(FETCH_ADD))
1618                         *(u64 *) qp->r_sge.sge.vaddr =
1619                                 qp->r_atomic_data + sdata;
1620                 else if (qp->r_atomic_data ==
1621                          be64_to_cpu(ateth->compare_data))
1622                         *(u64 *) qp->r_sge.sge.vaddr = sdata;
1623                 spin_unlock(&dev->pending_lock);
1624                 atomic_inc(&qp->msn);
1625                 qp->r_atomic_psn = psn & IPS_PSN_MASK;
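                     /*
                      * Bit 31 of the PSN word is the BTH AckReq bit; force
                      * it so the "send an ACK" path below runs, since an
                      * atomic response must always be returned.
                      */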
1626                 psn |= 1 << 31;
1627                 break;
1628         }
1629
1630         default:
1631                 /* Drop packet for unknown opcodes. */
1632                 goto done;
1633         }
1634         qp->r_psn++;
1635         qp->r_state = opcode;
1636         /* Send an ACK if requested or required. */
1637         if (psn & (1 << 31)) {
1638                 /*
1639                  * Coalesce ACKs unless there is an RDMA READ or
1640                  * ATOMIC pending.
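                      * A pending plain ACK can simply be overwritten with
                      * the newer PSN; a queued RDMA READ or atomic response
                      * must not be, so in that case no new ACK is queued
                      * here.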
1641                  */
1642                 spin_lock(&qp->s_lock);
1643                 if (qp->s_ack_state == OP(ACKNOWLEDGE) ||
1644                     qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST) {
1645                         qp->s_ack_state = opcode;
1646                         qp->s_nak_state = 0;
1647                         qp->s_ack_psn = psn;
1648                         qp->s_ack_atomic = qp->r_atomic_data;
1649                         goto resched;
1650                 }
1651                 spin_unlock(&qp->s_lock);
1652         }
1653 done:
1654         spin_unlock_irqrestore(&qp->r_rq.lock, flags);
1655         goto bail;
1656
1657 resched:
1658         /*
1659          * Try to send ACK right away but not if ipath_do_rc_send() is
1660          * active.
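              * send_rc_ack() can build a plain ACK, NAK, or atomic ACK
              * inline; an RDMA READ response carries payload and has to
              * be sent from the tasklet instead.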
1661          */
1662         if (qp->s_hdrwords == 0 &&
1663             (qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST ||
1664              qp->s_ack_state >= IB_OPCODE_COMPARE_SWAP))
1665                 send_rc_ack(qp);
1666
1667 rdmadone:
1668         spin_unlock(&qp->s_lock);
1669         spin_unlock_irqrestore(&qp->r_rq.lock, flags);
1670
1671         /* Call ipath_do_rc_send() in another thread. */
1672         tasklet_hi_schedule(&qp->s_task);
1673
1674 bail:
1675         return;
1676 }