/*******************************************************************************
 *
 * Intel Ethernet Controller XL710 Family Linux Driver
 * Copyright(c) 2013 - 2016 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * The full GNU General Public License is included in this distribution in
 * the file called "COPYING".
 *
 * Contact Information:
 * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
 * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
 *
 ******************************************************************************/
#include <linux/prefetch.h>
#include <net/busy_poll.h>
#include "i40e.h"
#include "i40e_prototype.h"
static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size,
				u32 td_tag)
{
	return cpu_to_le64(I40E_TX_DESC_DTYPE_DATA |
			   ((u64)td_cmd << I40E_TXD_QW1_CMD_SHIFT) |
			   ((u64)td_offset << I40E_TXD_QW1_OFFSET_SHIFT) |
			   ((u64)size << I40E_TXD_QW1_TX_BUF_SZ_SHIFT) |
			   ((u64)td_tag << I40E_TXD_QW1_L2TAG1_SHIFT));
}
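/* Usage sketch (values are illustrative, not taken from this driver): the
 * final data descriptor of a 512-byte, no-offload, untagged frame would be
 * written as
 *
 *	tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TXD_CMD, 0, 512, 0);
 *
 * i.e. the DATA descriptor type ORed with EOP|RS in the command field and
 * the buffer size packed into the BUF_SZ field of the quadword.
 */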
#define I40E_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS)

/**
 * i40e_fdir - Generate a Flow Director descriptor based on fdata
 * @tx_ring: Tx ring to send buffer on
 * @fdata: Flow director filter data
 * @add: Indicate if we are adding a rule or deleting one
 *
 **/
static void i40e_fdir(struct i40e_ring *tx_ring,
		      struct i40e_fdir_filter *fdata, bool add)
{
	struct i40e_filter_program_desc *fdir_desc;
	struct i40e_pf *pf = tx_ring->vsi->back;
	u32 flex_ptype, dtype_cmd;
	u16 i;

	/* grab the next descriptor */
	i = tx_ring->next_to_use;
	fdir_desc = I40E_TX_FDIRDESC(tx_ring, i);

	i++;
	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
	flex_ptype = I40E_TXD_FLTR_QW0_QINDEX_MASK &
		     (fdata->q_index << I40E_TXD_FLTR_QW0_QINDEX_SHIFT);

	flex_ptype |= I40E_TXD_FLTR_QW0_FLEXOFF_MASK &
		      (fdata->flex_off << I40E_TXD_FLTR_QW0_FLEXOFF_SHIFT);

	flex_ptype |= I40E_TXD_FLTR_QW0_PCTYPE_MASK &
		      (fdata->pctype << I40E_TXD_FLTR_QW0_PCTYPE_SHIFT);

	/* Use LAN VSI Id if not programmed by user */
	flex_ptype |= I40E_TXD_FLTR_QW0_DEST_VSI_MASK &
		      ((u32)(fdata->dest_vsi ? : pf->vsi[pf->lan_vsi]->id) <<
		       I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT);
	dtype_cmd = I40E_TX_DESC_DTYPE_FILTER_PROG;

	dtype_cmd |= add ?
		     I40E_FILTER_PROGRAM_DESC_PCMD_ADD_UPDATE <<
		     I40E_TXD_FLTR_QW1_PCMD_SHIFT :
		     I40E_FILTER_PROGRAM_DESC_PCMD_REMOVE <<
		     I40E_TXD_FLTR_QW1_PCMD_SHIFT;
	dtype_cmd |= I40E_TXD_FLTR_QW1_DEST_MASK &
		     (fdata->dest_ctl << I40E_TXD_FLTR_QW1_DEST_SHIFT);

	dtype_cmd |= I40E_TXD_FLTR_QW1_FD_STATUS_MASK &
		     (fdata->fd_status << I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT);

	if (fdata->cnt_index) {
		dtype_cmd |= I40E_TXD_FLTR_QW1_CNT_ENA_MASK;
		dtype_cmd |= I40E_TXD_FLTR_QW1_CNTINDEX_MASK &
			     ((u32)fdata->cnt_index <<
			      I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT);
	}

	fdir_desc->qindex_flex_ptype_vsi = cpu_to_le32(flex_ptype);
	fdir_desc->rsvd = cpu_to_le32(0);
	fdir_desc->dtype_cmd_cntindex = cpu_to_le32(dtype_cmd);
	fdir_desc->fd_id = cpu_to_le32(fdata->fd_id);
}
#define I40E_FD_CLEAN_DELAY 10
/**
 * i40e_program_fdir_filter - Program a Flow Director filter
 * @fdir_data: Packet data that will be filter parameters
 * @raw_packet: the pre-allocated packet buffer for FDir
 * @pf: The PF pointer
 * @add: True for add/update, False for remove
 **/
static int i40e_program_fdir_filter(struct i40e_fdir_filter *fdir_data,
				    u8 *raw_packet, struct i40e_pf *pf,
				    bool add)
{
	struct i40e_tx_buffer *tx_buf, *first;
	struct i40e_tx_desc *tx_desc;
	struct i40e_ring *tx_ring;
	struct i40e_vsi *vsi;
	struct device *dev;
	dma_addr_t dma;
	u32 td_cmd = 0;
	u16 i;

	/* find existing FDIR VSI */
	vsi = i40e_find_vsi_by_type(pf, I40E_VSI_FDIR);
	if (!vsi)
		return -ENOENT;

	tx_ring = vsi->tx_rings[0];
	dev = tx_ring->dev;
	/* we need two descriptors to add/del a filter and we can wait */
	for (i = I40E_FD_CLEAN_DELAY; I40E_DESC_UNUSED(tx_ring) < 2; i--) {
		if (!i)
			return -EAGAIN;
		msleep_interruptible(1);
	}

	dma = dma_map_single(dev, raw_packet,
			     I40E_FDIR_MAX_RAW_PACKET_SIZE, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, dma))
		goto dma_fail;
	/* grab the next descriptor */
	i = tx_ring->next_to_use;
	first = &tx_ring->tx_bi[i];
	i40e_fdir(tx_ring, fdir_data, add);

	/* Now program a dummy descriptor */
	i = tx_ring->next_to_use;
	tx_desc = I40E_TX_DESC(tx_ring, i);
	tx_buf = &tx_ring->tx_bi[i];

	tx_ring->next_to_use = ((i + 1) < tx_ring->count) ? i + 1 : 0;

	memset(tx_buf, 0, sizeof(struct i40e_tx_buffer));
	/* record length, and DMA address */
	dma_unmap_len_set(tx_buf, len, I40E_FDIR_MAX_RAW_PACKET_SIZE);
	dma_unmap_addr_set(tx_buf, dma, dma);

	tx_desc->buffer_addr = cpu_to_le64(dma);
	td_cmd = I40E_TXD_CMD | I40E_TX_DESC_CMD_DUMMY;

	tx_buf->tx_flags = I40E_TX_FLAGS_FD_SB;
	tx_buf->raw_buf = (void *)raw_packet;

	tx_desc->cmd_type_offset_bsz =
		build_ctob(td_cmd, 0, I40E_FDIR_MAX_RAW_PACKET_SIZE, 0);
	/* Force memory writes to complete before letting h/w
	 * know there are new descriptors to fetch.
	 */
	wmb();

	/* Mark the data descriptor to be watched */
	first->next_to_watch = tx_desc;

	writel(tx_ring->next_to_use, tx_ring->tail);
	return 0;

dma_fail:
	return -1;
}
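/* Programming a sideband filter therefore consumes two slots on the FDIR
 * ring: the filter program descriptor built by i40e_fdir() above, then this
 * dummy data descriptor carrying the raw packet the hardware parses for the
 * flow fields. The first slot's tx_buffer watches the dummy descriptor, so
 * both are reclaimed together on completion.
 */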
#define IP_HEADER_OFFSET 14
#define I40E_UDPIP_DUMMY_PACKET_LEN 42
/**
 * i40e_add_del_fdir_udpv4 - Add/Remove UDPv4 filters
 * @vsi: pointer to the targeted VSI
 * @fd_data: the flow director data required for the FDir descriptor
 * @add: true adds a filter, false removes it
 *
 * Returns 0 if the filters were successfully added or removed
 **/
static int i40e_add_del_fdir_udpv4(struct i40e_vsi *vsi,
				   struct i40e_fdir_filter *fd_data,
				   bool add)
{
	struct i40e_pf *pf = vsi->back;
	struct udphdr *udp;
	struct iphdr *ip;
	u8 *raw_packet;
	int ret;
	static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
		0x45, 0, 0, 0x1c, 0, 0, 0x40, 0, 0x40, 0x11, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

	raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
	if (!raw_packet)
		return -ENOMEM;
	memcpy(raw_packet, packet, I40E_UDPIP_DUMMY_PACKET_LEN);
	ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
	udp = (struct udphdr *)(raw_packet + IP_HEADER_OFFSET
	      + sizeof(struct iphdr));

	ip->daddr = fd_data->dst_ip;
	udp->dest = fd_data->dst_port;
	ip->saddr = fd_data->src_ip;
	udp->source = fd_data->src_port;

	fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_UDP;
	ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
	if (ret) {
		dev_info(&pf->pdev->dev,
			 "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
			 fd_data->pctype, fd_data->fd_id, ret);
		/* Free the packet buffer since it wasn't added to the ring */
		kfree(raw_packet);
		return -EOPNOTSUPP;
	} else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
		if (add)
			dev_info(&pf->pdev->dev,
				 "Filter OK for PCTYPE %d loc = %d\n",
				 fd_data->pctype, fd_data->fd_id);
		else
			dev_info(&pf->pdev->dev,
				 "Filter deleted for PCTYPE %d loc = %d\n",
				 fd_data->pctype, fd_data->fd_id);
	}

	return 0;
}
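/* For reference, the interesting bytes of the dummy packet above: offset 12
 * is the Ethertype (0x0800, IPv4), offset 14 is 0x45 (IPv4, 5-word header),
 * offsets 16-17 are the total length (0x001c = 28, IP plus UDP headers) and
 * offset 23 is the protocol (0x11, UDP). Only the addresses and ports are
 * patched in at run time; everything else stays zero.
 */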
#define I40E_TCPIP_DUMMY_PACKET_LEN 54
/**
 * i40e_add_del_fdir_tcpv4 - Add/Remove TCPv4 filters
 * @vsi: pointer to the targeted VSI
 * @fd_data: the flow director data required for the FDir descriptor
 * @add: true adds a filter, false removes it
 *
 * Returns 0 if the filters were successfully added or removed
 **/
static int i40e_add_del_fdir_tcpv4(struct i40e_vsi *vsi,
				   struct i40e_fdir_filter *fd_data,
				   bool add)
{
	struct i40e_pf *pf = vsi->back;
	struct tcphdr *tcp;
	struct iphdr *ip;
	u8 *raw_packet;
	int ret;
	/* Dummy packet */
	static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
		0x45, 0, 0, 0x28, 0, 0, 0x40, 0, 0x40, 0x6, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80, 0x11,
		0x0, 0x72, 0, 0, 0, 0};

	raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
	if (!raw_packet)
		return -ENOMEM;
	memcpy(raw_packet, packet, I40E_TCPIP_DUMMY_PACKET_LEN);
	ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
	tcp = (struct tcphdr *)(raw_packet + IP_HEADER_OFFSET
	      + sizeof(struct iphdr));

	ip->daddr = fd_data->dst_ip;
	tcp->dest = fd_data->dst_port;
	ip->saddr = fd_data->src_ip;
	tcp->source = fd_data->src_port;
	if (add) {
		pf->fd_tcp_rule++;
		if ((pf->flags & I40E_FLAG_FD_ATR_ENABLED) &&
		    I40E_DEBUG_FD & pf->hw.debug_mask)
			dev_info(&pf->pdev->dev, "Forcing ATR off, sideband rules for TCP/IPv4 flow being applied\n");
		pf->hw_disabled_flags |= I40E_FLAG_FD_ATR_ENABLED;
	} else {
		pf->fd_tcp_rule = (pf->fd_tcp_rule > 0) ?
				  (pf->fd_tcp_rule - 1) : 0;
		if (pf->fd_tcp_rule == 0) {
			if ((pf->flags & I40E_FLAG_FD_ATR_ENABLED) &&
			    I40E_DEBUG_FD & pf->hw.debug_mask)
				dev_info(&pf->pdev->dev, "ATR re-enabled due to no sideband TCP/IPv4 rules\n");
			pf->hw_disabled_flags &= ~I40E_FLAG_FD_ATR_ENABLED;
		}
	}
	fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_TCP;
	ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
	if (ret) {
		dev_info(&pf->pdev->dev,
			 "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
			 fd_data->pctype, fd_data->fd_id, ret);
		/* Free the packet buffer since it wasn't added to the ring */
		kfree(raw_packet);
		return -EOPNOTSUPP;
	} else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
		if (add)
			dev_info(&pf->pdev->dev,
				 "Filter OK for PCTYPE %d loc = %d\n",
				 fd_data->pctype, fd_data->fd_id);
		else
			dev_info(&pf->pdev->dev,
				 "Filter deleted for PCTYPE %d loc = %d\n",
				 fd_data->pctype, fd_data->fd_id);
	}

	return 0;
}
#define I40E_IP_DUMMY_PACKET_LEN 34
/**
 * i40e_add_del_fdir_ipv4 - Add/Remove IPv4 Flow Director filters for
 * a specific flow spec
 * @vsi: pointer to the targeted VSI
 * @fd_data: the flow director data required for the FDir descriptor
 * @add: true adds a filter, false removes it
 *
 * Returns 0 if the filters were successfully added or removed
 **/
static int i40e_add_del_fdir_ipv4(struct i40e_vsi *vsi,
				  struct i40e_fdir_filter *fd_data,
				  bool add)
{
	struct i40e_pf *pf = vsi->back;
	struct iphdr *ip;
	u8 *raw_packet;
	int ret;
	int i;
	static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
		0x45, 0, 0, 0x14, 0, 0, 0x40, 0, 0x40, 0x10, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0};
	for (i = I40E_FILTER_PCTYPE_NONF_IPV4_OTHER;
	     i <= I40E_FILTER_PCTYPE_FRAG_IPV4; i++) {
		raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
		if (!raw_packet)
			return -ENOMEM;
		memcpy(raw_packet, packet, I40E_IP_DUMMY_PACKET_LEN);
		ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);

		ip->saddr = fd_data->src_ip;
		ip->daddr = fd_data->dst_ip;
		ip->protocol = 0;

		fd_data->pctype = i;
		ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);

		if (ret) {
			dev_info(&pf->pdev->dev,
				 "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
				 fd_data->pctype, fd_data->fd_id, ret);
			/* The packet buffer wasn't added to the ring so we
			 * need to free it now.
			 */
			kfree(raw_packet);
			return -EOPNOTSUPP;
		} else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
			if (add)
				dev_info(&pf->pdev->dev,
					 "Filter OK for PCTYPE %d loc = %d\n",
					 fd_data->pctype, fd_data->fd_id);
			else
				dev_info(&pf->pdev->dev,
					 "Filter deleted for PCTYPE %d loc = %d\n",
					 fd_data->pctype, fd_data->fd_id);
		}
	}

	return 0;
}
/**
 * i40e_add_del_fdir - Build raw packets to add/del fdir filter
 * @vsi: pointer to the targeted VSI
 * @input: flow director filter data
 * @add: true adds a filter, false removes it
 *
 **/
int i40e_add_del_fdir(struct i40e_vsi *vsi,
		      struct i40e_fdir_filter *input, bool add)
{
	struct i40e_pf *pf = vsi->back;
	int ret;
	switch (input->flow_type & ~FLOW_EXT) {
	case TCP_V4_FLOW:
		ret = i40e_add_del_fdir_tcpv4(vsi, input, add);
		break;
	case UDP_V4_FLOW:
		ret = i40e_add_del_fdir_udpv4(vsi, input, add);
		break;
	case IP_USER_FLOW:
		switch (input->ip4_proto) {
		case IPPROTO_TCP:
			ret = i40e_add_del_fdir_tcpv4(vsi, input, add);
			break;
		case IPPROTO_UDP:
			ret = i40e_add_del_fdir_udpv4(vsi, input, add);
			break;
		case IPPROTO_IP:
			ret = i40e_add_del_fdir_ipv4(vsi, input, add);
			break;
		default:
			/* We cannot support masking based on protocol */
			goto unsupported_flow;
		}
		break;
	default:
unsupported_flow:
		dev_info(&pf->pdev->dev, "Could not specify spec type %d\n",
			 input->flow_type);
		ret = -EINVAL;
	}
	/* The buffer allocated here will normally be freed by
	 * i40e_clean_fdir_tx_irq() as it reclaims resources after transmit
	 * completion. In the event of an error adding the buffer to the FDIR
	 * ring, it will immediately be freed. It may also be freed by
	 * i40e_clean_tx_ring() when closing the VSI.
	 */
	return ret;
}
/**
 * i40e_fd_handle_status - check the Programming Status for FD
 * @rx_ring: the Rx ring for this descriptor
 * @rx_desc: the Rx descriptor for programming Status, not a packet descriptor.
 * @prog_id: the id originally used for programming
 *
 * This is used to verify if the FD programming or invalidation
 * requested by SW to the HW is successful or not and take actions accordingly.
 **/
static void i40e_fd_handle_status(struct i40e_ring *rx_ring,
				  union i40e_rx_desc *rx_desc, u8 prog_id)
{
	struct i40e_pf *pf = rx_ring->vsi->back;
	struct pci_dev *pdev = pf->pdev;
	u32 fcnt_prog, fcnt_avail;
	u32 error;
	u64 qw;
	qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
	error = (qw & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >>
		I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT;

	if (error == BIT(I40E_RX_PROG_STATUS_DESC_FD_TBL_FULL_SHIFT)) {
		pf->fd_inv = le32_to_cpu(rx_desc->wb.qword0.hi_dword.fd_id);
		if ((rx_desc->wb.qword0.hi_dword.fd_id != 0) ||
		    (I40E_DEBUG_FD & pf->hw.debug_mask))
			dev_warn(&pdev->dev, "ntuple filter loc = %d, could not be added\n",
				 pf->fd_inv);
		/* Check if the programming error is for ATR.
		 * If so, auto disable ATR and set a state for
		 * flush in progress. Next time we come here if flush is in
		 * progress do nothing, once flush is complete the state will
		 * be cleared.
		 */
		if (test_bit(__I40E_FD_FLUSH_REQUESTED, &pf->state))
			return;

		pf->fd_add_err++;
		/* store the current atr filter count */
		pf->fd_atr_cnt = i40e_get_current_atr_cnt(pf);

		if ((rx_desc->wb.qword0.hi_dword.fd_id == 0) &&
		    (pf->hw_disabled_flags & I40E_FLAG_FD_SB_ENABLED)) {
			pf->hw_disabled_flags |= I40E_FLAG_FD_ATR_ENABLED;
			set_bit(__I40E_FD_FLUSH_REQUESTED, &pf->state);
		}
		/* filter programming failed most likely due to table full */
		fcnt_prog = i40e_get_global_fd_count(pf);
		fcnt_avail = pf->fdir_pf_filter_count;
		/* If ATR is running fcnt_prog can quickly change,
		 * if we are very close to full, it makes sense to disable
		 * FD ATR/SB and then re-enable it when there is room.
		 */
		if (fcnt_prog >= (fcnt_avail - I40E_FDIR_BUFFER_FULL_MARGIN)) {
			if ((pf->flags & I40E_FLAG_FD_SB_ENABLED) &&
			    !(pf->hw_disabled_flags &
			      I40E_FLAG_FD_SB_ENABLED)) {
				if (I40E_DEBUG_FD & pf->hw.debug_mask)
					dev_warn(&pdev->dev, "FD filter space full, new ntuple rules will not be added\n");
				pf->hw_disabled_flags |=
						       I40E_FLAG_FD_SB_ENABLED;
			}
		}
	} else if (error == BIT(I40E_RX_PROG_STATUS_DESC_NO_FD_ENTRY_SHIFT)) {
		if (I40E_DEBUG_FD & pf->hw.debug_mask)
			dev_info(&pdev->dev, "ntuple filter fd_id = %d, could not be removed\n",
				 rx_desc->wb.qword0.hi_dword.fd_id);
	}
}
/**
 * i40e_unmap_and_free_tx_resource - Release a Tx buffer
 * @ring: the ring that owns the buffer
 * @tx_buffer: the buffer to free
 **/
static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
					    struct i40e_tx_buffer *tx_buffer)
{
	if (tx_buffer->skb) {
		if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB)
			kfree(tx_buffer->raw_buf);
		else
			dev_kfree_skb_any(tx_buffer->skb);
		if (dma_unmap_len(tx_buffer, len))
			dma_unmap_single(ring->dev,
					 dma_unmap_addr(tx_buffer, dma),
					 dma_unmap_len(tx_buffer, len),
					 DMA_TO_DEVICE);
	} else if (dma_unmap_len(tx_buffer, len)) {
		dma_unmap_page(ring->dev,
			       dma_unmap_addr(tx_buffer, dma),
			       dma_unmap_len(tx_buffer, len),
			       DMA_TO_DEVICE);
	}

	tx_buffer->next_to_watch = NULL;
	tx_buffer->skb = NULL;
	dma_unmap_len_set(tx_buffer, len, 0);
	/* tx_buffer must be completely set up in the transmit path */
}
/**
 * i40e_clean_tx_ring - Free any empty Tx buffers
 * @tx_ring: ring to be cleaned
 **/
void i40e_clean_tx_ring(struct i40e_ring *tx_ring)
{
	unsigned long bi_size;
	u16 i;

	/* ring already cleared, nothing to do */
	if (!tx_ring->tx_bi)
		return;

	/* Free all the Tx ring sk_buffs */
	for (i = 0; i < tx_ring->count; i++)
		i40e_unmap_and_free_tx_resource(tx_ring, &tx_ring->tx_bi[i]);

	bi_size = sizeof(struct i40e_tx_buffer) * tx_ring->count;
	memset(tx_ring->tx_bi, 0, bi_size);

	/* Zero out the descriptor ring */
	memset(tx_ring->desc, 0, tx_ring->size);

	tx_ring->next_to_use = 0;
	tx_ring->next_to_clean = 0;

	if (!tx_ring->netdev)
		return;

	/* cleanup Tx queue statistics */
	netdev_tx_reset_queue(txring_txq(tx_ring));
}
/**
 * i40e_free_tx_resources - Free Tx resources per queue
 * @tx_ring: Tx descriptor ring for a specific queue
 *
 * Free all transmit software resources
 **/
void i40e_free_tx_resources(struct i40e_ring *tx_ring)
{
	i40e_clean_tx_ring(tx_ring);
	kfree(tx_ring->tx_bi);
	tx_ring->tx_bi = NULL;

	if (tx_ring->desc) {
		dma_free_coherent(tx_ring->dev, tx_ring->size,
				  tx_ring->desc, tx_ring->dma);
		tx_ring->desc = NULL;
	}
}
/**
 * i40e_get_tx_pending - how many tx descriptors not processed
 * @ring: the ring of descriptors
 * @in_sw: is tx_pending being checked in SW or HW
 *
 * Since there is no access to the ring head register
 * in XL710, we need to use our local copies
 **/
u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw)
{
	u32 head, tail;

	if (!in_sw)
		head = i40e_get_head(ring);
	else
		head = ring->next_to_clean;
	tail = readl(ring->tail);

	if (head != tail)
		return (head < tail) ?
			tail - head : (tail + ring->count - head);

	return 0;
}

#define WB_STRIDE 4
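/* Example of the wrap handling above: on a 512-descriptor ring, head = 500
 * and tail = 10 leaves 10 + 512 - 500 = 22 descriptors outstanding, while
 * head = 10 and tail = 500 leaves 490. (Numbers are illustrative.)
 */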
/**
 * i40e_clean_tx_irq - Reclaim resources after transmit completes
 * @vsi: the VSI we care about
 * @tx_ring: Tx ring to clean
 * @napi_budget: Used to determine if we are in netpoll
 *
 * Returns true if there's any budget left (e.g. the clean is finished)
 **/
static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
			      struct i40e_ring *tx_ring, int napi_budget)
{
	u16 i = tx_ring->next_to_clean;
	struct i40e_tx_buffer *tx_buf;
	struct i40e_tx_desc *tx_head;
	struct i40e_tx_desc *tx_desc;
	unsigned int total_bytes = 0, total_packets = 0;
	unsigned int budget = vsi->work_limit;

	tx_buf = &tx_ring->tx_bi[i];
	tx_desc = I40E_TX_DESC(tx_ring, i);
	i -= tx_ring->count;

	tx_head = I40E_TX_DESC(tx_ring, i40e_get_head(tx_ring));
	do {
		struct i40e_tx_desc *eop_desc = tx_buf->next_to_watch;

		/* if next_to_watch is not set then there is no work pending */
		if (!eop_desc)
			break;

		/* prevent any other reads prior to eop_desc */
		read_barrier_depends();

		/* we have caught up to head, no work left to do */
		if (tx_head == tx_desc)
			break;

		/* clear next_to_watch to prevent false hangs */
		tx_buf->next_to_watch = NULL;

		/* update the statistics for this packet */
		total_bytes += tx_buf->bytecount;
		total_packets += tx_buf->gso_segs;

		/* free the skb */
		napi_consume_skb(tx_buf->skb, napi_budget);

		/* unmap skb header data */
		dma_unmap_single(tx_ring->dev,
				 dma_unmap_addr(tx_buf, dma),
				 dma_unmap_len(tx_buf, len),
				 DMA_TO_DEVICE);

		/* clear tx_buffer data */
		tx_buf->skb = NULL;
		dma_unmap_len_set(tx_buf, len, 0);
		/* unmap remaining buffers */
		while (tx_desc != eop_desc) {
			tx_buf++;
			tx_desc++;
			i++;
			if (unlikely(!i)) {
				i -= tx_ring->count;
				tx_buf = tx_ring->tx_bi;
				tx_desc = I40E_TX_DESC(tx_ring, 0);
			}

			/* unmap any remaining paged data */
			if (dma_unmap_len(tx_buf, len)) {
				dma_unmap_page(tx_ring->dev,
					       dma_unmap_addr(tx_buf, dma),
					       dma_unmap_len(tx_buf, len),
					       DMA_TO_DEVICE);
				dma_unmap_len_set(tx_buf, len, 0);
			}
		}
		/* move us one more past the eop_desc for start of next pkt */
		tx_buf++;
		tx_desc++;
		i++;
		if (unlikely(!i)) {
			i -= tx_ring->count;
			tx_buf = tx_ring->tx_bi;
			tx_desc = I40E_TX_DESC(tx_ring, 0);
		}

		prefetch(tx_desc);

		/* update budget accounting */
		budget--;
	} while (likely(budget));
	i += tx_ring->count;
	tx_ring->next_to_clean = i;
	u64_stats_update_begin(&tx_ring->syncp);
	tx_ring->stats.bytes += total_bytes;
	tx_ring->stats.packets += total_packets;
	u64_stats_update_end(&tx_ring->syncp);
	tx_ring->q_vector->tx.total_bytes += total_bytes;
	tx_ring->q_vector->tx.total_packets += total_packets;
	if (tx_ring->flags & I40E_TXR_FLAGS_WB_ON_ITR) {
		/* check to see if there are < 4 descriptors
		 * waiting to be written back, then kick the hardware to force
		 * them to be written back in case we stay in NAPI.
		 * In this mode on X722 we do not enable Interrupt.
		 */
		unsigned int j = i40e_get_tx_pending(tx_ring, false);

		if (budget &&
		    ((j / WB_STRIDE) == 0) && (j > 0) &&
		    !test_bit(__I40E_DOWN, &vsi->state) &&
		    (I40E_DESC_UNUSED(tx_ring) != tx_ring->count))
			tx_ring->arm_wb = true;
	}
	/* notify netdev of completed buffers */
	netdev_tx_completed_queue(txring_txq(tx_ring),
				  total_packets, total_bytes);

#define TX_WAKE_THRESHOLD (DESC_NEEDED * 2)
	if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) &&
		     (I40E_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
		/* Make sure that anybody stopping the queue after this
		 * sees the new next_to_clean.
		 */
		smp_mb();
		if (__netif_subqueue_stopped(tx_ring->netdev,
					     tx_ring->queue_index) &&
		    !test_bit(__I40E_DOWN, &vsi->state)) {
			netif_wake_subqueue(tx_ring->netdev,
					    tx_ring->queue_index);
			++tx_ring->tx_stats.restart_queue;
		}
	}

	return !!budget;
}
/**
 * i40e_enable_wb_on_itr - Arm hardware to do a wb, interrupts are not enabled
 * @vsi: the VSI we care about
 * @q_vector: the vector on which to enable writeback
 *
 **/
static void i40e_enable_wb_on_itr(struct i40e_vsi *vsi,
				  struct i40e_q_vector *q_vector)
{
	u16 flags = q_vector->tx.ring[0].flags;
	u32 val;

	if (!(flags & I40E_TXR_FLAGS_WB_ON_ITR))
		return;

	if (q_vector->arm_wb_state)
		return;

	if (vsi->back->flags & I40E_FLAG_MSIX_ENABLED) {
		val = I40E_PFINT_DYN_CTLN_WB_ON_ITR_MASK |
		      I40E_PFINT_DYN_CTLN_ITR_INDX_MASK; /* set noitr */

		wr32(&vsi->back->hw,
		     I40E_PFINT_DYN_CTLN(q_vector->v_idx + vsi->base_vector - 1),
		     val);
	} else {
		val = I40E_PFINT_DYN_CTL0_WB_ON_ITR_MASK |
		      I40E_PFINT_DYN_CTL0_ITR_INDX_MASK; /* set noitr */

		wr32(&vsi->back->hw, I40E_PFINT_DYN_CTL0, val);
	}
	q_vector->arm_wb_state = true;
}
/**
 * i40e_force_wb - Issue SW Interrupt so HW does a wb
 * @vsi: the VSI we care about
 * @q_vector: the vector on which to force writeback
 *
 **/
void i40e_force_wb(struct i40e_vsi *vsi, struct i40e_q_vector *q_vector)
{
	if (vsi->back->flags & I40E_FLAG_MSIX_ENABLED) {
		u32 val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
			  I40E_PFINT_DYN_CTLN_ITR_INDX_MASK | /* set noitr */
			  I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK |
			  I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK;
			  /* allow 00 to be written to the index */

		wr32(&vsi->back->hw,
		     I40E_PFINT_DYN_CTLN(q_vector->v_idx +
					 vsi->base_vector - 1), val);
	} else {
		u32 val = I40E_PFINT_DYN_CTL0_INTENA_MASK |
			  I40E_PFINT_DYN_CTL0_ITR_INDX_MASK | /* set noitr */
			  I40E_PFINT_DYN_CTL0_SWINT_TRIG_MASK |
			  I40E_PFINT_DYN_CTL0_SW_ITR_INDX_ENA_MASK;
			  /* allow 00 to be written to the index */

		wr32(&vsi->back->hw, I40E_PFINT_DYN_CTL0, val);
	}
}
/**
 * i40e_set_new_dynamic_itr - Find new ITR level
 * @rc: structure containing ring performance data
 *
 * Returns true if ITR changed, false if not
 *
 * Stores a new ITR value based on packets and byte counts during
 * the last interrupt. The advantage of per interrupt computation
 * is faster updates and more accurate ITR for the current traffic
 * pattern. Constants in this function were computed based on
 * theoretical maximum wire speed and thresholds were set based on
 * testing data as well as attempting to minimize response time
 * while increasing bulk throughput.
 **/
static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc)
{
	enum i40e_latency_range new_latency_range = rc->latency_range;
	struct i40e_q_vector *qv = rc->ring->q_vector;
	u32 new_itr = rc->itr;
	int bytes_per_int;
	int usecs;

	if (rc->total_packets == 0 || !rc->itr)
		return false;
	/* simple throttlerate management
	 *   0-10MB/s   lowest (50000 ints/s)
	 *  10-20MB/s   low    (20000 ints/s)
	 *  20-1249MB/s bulk   (18000 ints/s)
	 *  > 40000 Rx packets per second (8000 ints/s)
	 *
	 * The math works out because the divisor is in 10^(-6) which
	 * turns the bytes/us input value into MB/s values, but
	 * make sure to use usecs, as the register values written
	 * are in 2 usec increments in the ITR registers, and make sure
	 * to use the smoothed values that the countdown timer gives us.
	 */
	usecs = (rc->itr << 1) * ITR_COUNTDOWN_START;
	bytes_per_int = rc->total_bytes / usecs;

	switch (new_latency_range) {
	case I40E_LOWEST_LATENCY:
		if (bytes_per_int > 10)
			new_latency_range = I40E_LOW_LATENCY;
		break;
	case I40E_LOW_LATENCY:
		if (bytes_per_int > 20)
			new_latency_range = I40E_BULK_LATENCY;
		else if (bytes_per_int <= 10)
			new_latency_range = I40E_LOWEST_LATENCY;
		break;
	case I40E_BULK_LATENCY:
	case I40E_ULTRA_LATENCY:
	default:
		if (bytes_per_int <= 20)
			new_latency_range = I40E_LOW_LATENCY;
		break;
	}
	/* this is to adjust RX more aggressively when streaming small
	 * packets. The value of 40000 was picked as it is just beyond
	 * what the hardware can receive per second if in low latency
	 * mode.
	 */
#define RX_ULTRA_PACKET_RATE 40000

	if ((((rc->total_packets * 1000000) / usecs) > RX_ULTRA_PACKET_RATE) &&
	    (&qv->rx == rc))
		new_latency_range = I40E_ULTRA_LATENCY;

	rc->latency_range = new_latency_range;

	switch (new_latency_range) {
	case I40E_LOWEST_LATENCY:
		new_itr = I40E_ITR_50K;
		break;
	case I40E_LOW_LATENCY:
		new_itr = I40E_ITR_20K;
		break;
	case I40E_BULK_LATENCY:
		new_itr = I40E_ITR_18K;
		break;
	case I40E_ULTRA_LATENCY:
		new_itr = I40E_ITR_8K;
		break;
	default:
		break;
	}

	rc->total_bytes = 0;
	rc->total_packets = 0;

	if (new_itr != rc->itr) {
		rc->itr = new_itr;
		return true;
	}

	return false;
}
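/* Worked example of the throttle math above (numbers are illustrative and
 * assume an ITR_COUNTDOWN_START of 100): with rc->itr = I40E_ITR_20K
 * (0x19, i.e. 25 * 2 = 50 usecs between interrupts), usecs evaluates to
 * (0x19 << 1) * 100 = 5000. If 150000 bytes arrived in that window,
 * bytes_per_int = 150000 / 5000 = 30 MB/s, which moves a ring sitting in
 * I40E_LOW_LATENCY up to I40E_BULK_LATENCY and thus to I40E_ITR_18K.
 */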
/**
 * i40e_clean_programming_status - clean the programming status descriptor
 * @rx_ring: the rx ring that has this descriptor
 * @rx_desc: the rx descriptor written back by HW
 *
 * Flow director should handle FD_FILTER_STATUS to check its filter programming
 * status being successful or not and take actions accordingly. FCoE should
 * handle its context/filter programming/invalidation status and take actions.
 *
 **/
static void i40e_clean_programming_status(struct i40e_ring *rx_ring,
					  union i40e_rx_desc *rx_desc)
{
	u64 qw;
	u8 id;

	qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
	id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
		  I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT;

	if (id == I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS)
		i40e_fd_handle_status(rx_ring, rx_desc, id);
#ifdef I40E_FCOE
	else if ((id == I40E_RX_PROG_STATUS_DESC_FCOE_CTXT_PROG_STATUS) ||
		 (id == I40E_RX_PROG_STATUS_DESC_FCOE_CTXT_INVL_STATUS))
		i40e_fcoe_handle_status(rx_ring, rx_desc, id);
#endif
}
/**
 * i40e_setup_tx_descriptors - Allocate the Tx descriptors
 * @tx_ring: the tx ring to set up
 *
 * Return 0 on success, negative on error
 **/
int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
{
	struct device *dev = tx_ring->dev;
	int bi_size;

	if (!dev)
		return -ENOMEM;

	/* warn if we are about to overwrite the pointer */
	WARN_ON(tx_ring->tx_bi);
	bi_size = sizeof(struct i40e_tx_buffer) * tx_ring->count;
	tx_ring->tx_bi = kzalloc(bi_size, GFP_KERNEL);
	if (!tx_ring->tx_bi)
		goto err;

	/* round up to nearest 4K */
	tx_ring->size = tx_ring->count * sizeof(struct i40e_tx_desc);
	/* add u32 for head writeback, align after this takes care of
	 * guaranteeing this is at least one cache line in size
	 */
	tx_ring->size += sizeof(u32);
	tx_ring->size = ALIGN(tx_ring->size, 4096);
	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
					   &tx_ring->dma, GFP_KERNEL);
	if (!tx_ring->desc) {
		dev_info(dev, "Unable to allocate memory for the Tx descriptor ring, size=%d\n",
			 tx_ring->size);
		goto err;
	}

	tx_ring->next_to_use = 0;
	tx_ring->next_to_clean = 0;
	return 0;

err:
	kfree(tx_ring->tx_bi);
	tx_ring->tx_bi = NULL;
	return -ENOMEM;
}
/**
 * i40e_clean_rx_ring - Free Rx buffers
 * @rx_ring: ring to be cleaned
 **/
void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
{
	unsigned long bi_size;
	u16 i;

	/* ring already cleared, nothing to do */
	if (!rx_ring->rx_bi)
		return;

	if (rx_ring->skb) {
		dev_kfree_skb(rx_ring->skb);
		rx_ring->skb = NULL;
	}

	/* Free all the Rx ring sk_buffs */
	for (i = 0; i < rx_ring->count; i++) {
		struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];

		if (!rx_bi->page)
			continue;

		/* Invalidate cache lines that may have been written to by
		 * device so that we avoid corrupting memory.
		 */
		dma_sync_single_range_for_cpu(rx_ring->dev,
					      rx_bi->dma,
					      rx_bi->page_offset,
					      I40E_RXBUFFER_2048,
					      DMA_FROM_DEVICE);

		/* free resources associated with mapping */
		dma_unmap_page_attrs(rx_ring->dev, rx_bi->dma,
				     PAGE_SIZE,
				     DMA_FROM_DEVICE,
				     I40E_RX_DMA_ATTR);
		__free_pages(rx_bi->page, 0);

		rx_bi->page = NULL;
		rx_bi->page_offset = 0;
	}

	bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
	memset(rx_ring->rx_bi, 0, bi_size);

	/* Zero out the descriptor ring */
	memset(rx_ring->desc, 0, rx_ring->size);

	rx_ring->next_to_alloc = 0;
	rx_ring->next_to_clean = 0;
	rx_ring->next_to_use = 0;
}
/**
 * i40e_free_rx_resources - Free Rx resources
 * @rx_ring: ring to clean the resources from
 *
 * Free all receive software resources
 **/
void i40e_free_rx_resources(struct i40e_ring *rx_ring)
{
	i40e_clean_rx_ring(rx_ring);
	kfree(rx_ring->rx_bi);
	rx_ring->rx_bi = NULL;

	if (rx_ring->desc) {
		dma_free_coherent(rx_ring->dev, rx_ring->size,
				  rx_ring->desc, rx_ring->dma);
		rx_ring->desc = NULL;
	}
}
/**
 * i40e_setup_rx_descriptors - Allocate Rx descriptors
 * @rx_ring: Rx descriptor ring (for a specific queue) to setup
 *
 * Returns 0 on success, negative on failure
 **/
int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
{
	struct device *dev = rx_ring->dev;
	int bi_size;

	/* warn if we are about to overwrite the pointer */
	WARN_ON(rx_ring->rx_bi);
	bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
	rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
	if (!rx_ring->rx_bi)
		goto err;

	u64_stats_init(&rx_ring->syncp);

	/* Round up to nearest 4K */
	rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc);
	rx_ring->size = ALIGN(rx_ring->size, 4096);
	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
					   &rx_ring->dma, GFP_KERNEL);

	if (!rx_ring->desc) {
		dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
			 rx_ring->size);
		goto err;
	}

	rx_ring->next_to_alloc = 0;
	rx_ring->next_to_clean = 0;
	rx_ring->next_to_use = 0;

	return 0;
err:
	kfree(rx_ring->rx_bi);
	rx_ring->rx_bi = NULL;
	return -ENOMEM;
}
/**
 * i40e_release_rx_desc - Store the new tail and head values
 * @rx_ring: ring to bump
 * @val: new head index
 **/
static inline void i40e_release_rx_desc(struct i40e_ring *rx_ring, u32 val)
{
	rx_ring->next_to_use = val;

	/* update next to alloc since we have filled the ring */
	rx_ring->next_to_alloc = val;

	/* Force memory writes to complete before letting h/w
	 * know there are new descriptors to fetch. (Only
	 * applicable for weak-ordered memory model archs,
	 * such as IA-64).
	 */
	wmb();
	writel(val, rx_ring->tail);
}
/**
 * i40e_alloc_mapped_page - recycle or make a new page
 * @rx_ring: ring to use
 * @bi: rx_buffer struct to modify
 *
 * Returns true if the page was successfully allocated or
 * reused.
 **/
static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
				   struct i40e_rx_buffer *bi)
{
	struct page *page = bi->page;
	dma_addr_t dma;

	/* since we are recycling buffers we should seldom need to alloc */
	if (likely(page)) {
		rx_ring->rx_stats.page_reuse_count++;
		return true;
	}

	/* alloc new page for storage */
	page = dev_alloc_page();
	if (unlikely(!page)) {
		rx_ring->rx_stats.alloc_page_failed++;
		return false;
	}

	/* map page for use */
	dma = dma_map_page_attrs(rx_ring->dev, page, 0,
				 PAGE_SIZE,
				 DMA_FROM_DEVICE,
				 I40E_RX_DMA_ATTR);

	/* if mapping failed free memory back to system since
	 * there isn't much point in holding memory we can't use
	 */
	if (dma_mapping_error(rx_ring->dev, dma)) {
		__free_pages(page, 0);
		rx_ring->rx_stats.alloc_page_failed++;
		return false;
	}

	bi->dma = dma;
	bi->page = page;
	bi->page_offset = 0;

	return true;
}
/**
 * i40e_receive_skb - Send a completed packet up the stack
 * @rx_ring: rx ring in play
 * @skb: packet to send up
 * @vlan_tag: vlan tag for packet
 **/
static void i40e_receive_skb(struct i40e_ring *rx_ring,
			     struct sk_buff *skb, u16 vlan_tag)
{
	struct i40e_q_vector *q_vector = rx_ring->q_vector;

	if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
	    (vlan_tag & VLAN_VID_MASK))
		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);

	napi_gro_receive(&q_vector->napi, skb);
}
/**
 * i40e_alloc_rx_buffers - Replace used receive buffers
 * @rx_ring: ring to place buffers on
 * @cleaned_count: number of buffers to replace
 *
 * Returns false if all allocations were successful, true if any fail
 **/
bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
{
	u16 ntu = rx_ring->next_to_use;
	union i40e_rx_desc *rx_desc;
	struct i40e_rx_buffer *bi;

	/* do nothing if no valid netdev defined */
	if (!rx_ring->netdev || !cleaned_count)
		return false;

	rx_desc = I40E_RX_DESC(rx_ring, ntu);
	bi = &rx_ring->rx_bi[ntu];

	do {
		if (!i40e_alloc_mapped_page(rx_ring, bi))
			goto no_buffers;

		/* sync the buffer for use by the device */
		dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
						 bi->page_offset,
						 I40E_RXBUFFER_2048,
						 DMA_FROM_DEVICE);

		/* Refresh the desc even if buffer_addrs didn't change
		 * because each write-back erases this info.
		 */
		rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);

		rx_desc++;
		bi++;
		ntu++;
		if (unlikely(ntu == rx_ring->count)) {
			rx_desc = I40E_RX_DESC(rx_ring, 0);
			bi = rx_ring->rx_bi;
			ntu = 0;
		}

		/* clear the status bits for the next_to_use descriptor */
		rx_desc->wb.qword1.status_error_len = 0;

		cleaned_count--;
	} while (cleaned_count);

	if (rx_ring->next_to_use != ntu)
		i40e_release_rx_desc(rx_ring, ntu);

	return false;

no_buffers:
	if (rx_ring->next_to_use != ntu)
		i40e_release_rx_desc(rx_ring, ntu);

	/* make sure to come back via polling to try again after
	 * allocation failure
	 */
	return true;
}
/**
 * i40e_rx_checksum - Indicate in skb if hw indicated a good cksum
 * @vsi: the VSI we care about
 * @skb: skb currently being received and modified
 * @rx_desc: the receive descriptor
 *
 * skb->protocol must be set before this function is called
 **/
static inline void i40e_rx_checksum(struct i40e_vsi *vsi,
				    struct sk_buff *skb,
				    union i40e_rx_desc *rx_desc)
{
	struct i40e_rx_ptype_decoded decoded;
	u32 rx_error, rx_status;
	bool ipv4, ipv6;
	u8 ptype;
	u64 qword;

	qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
	ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >> I40E_RXD_QW1_PTYPE_SHIFT;
	rx_error = (qword & I40E_RXD_QW1_ERROR_MASK) >>
		   I40E_RXD_QW1_ERROR_SHIFT;
	rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >>
		    I40E_RXD_QW1_STATUS_SHIFT;
	decoded = decode_rx_desc_ptype(ptype);

	skb->ip_summed = CHECKSUM_NONE;

	skb_checksum_none_assert(skb);

	/* Rx csum enabled and ip headers found? */
	if (!(vsi->netdev->features & NETIF_F_RXCSUM))
		return;

	/* did the hardware decode the packet and checksum? */
	if (!(rx_status & BIT(I40E_RX_DESC_STATUS_L3L4P_SHIFT)))
		return;

	/* both known and outer_ip must be set for the below code to work */
	if (!(decoded.known && decoded.outer_ip))
		return;

	ipv4 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) &&
	       (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4);
	ipv6 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) &&
	       (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6);

	if (ipv4 &&
	    (rx_error & (BIT(I40E_RX_DESC_ERROR_IPE_SHIFT) |
			 BIT(I40E_RX_DESC_ERROR_EIPE_SHIFT))))
		goto checksum_fail;

	/* likely incorrect csum if alternate IP extension headers found */
	if (ipv6 &&
	    rx_status & BIT(I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))
		/* don't increment checksum err here, non-fatal err */
		return;

	/* there was some L4 error, count error and punt packet to the stack */
	if (rx_error & BIT(I40E_RX_DESC_ERROR_L4E_SHIFT))
		goto checksum_fail;

	/* handle packets that were not able to be checksummed due
	 * to arrival speed, in this case the stack can compute
	 * the csum.
	 */
	if (rx_error & BIT(I40E_RX_DESC_ERROR_PPRS_SHIFT))
		return;

	/* If there is an outer header present that might contain a checksum
	 * we need to bump the checksum level by 1 to reflect the fact that
	 * we are indicating we validated the inner checksum.
	 */
	if (decoded.tunnel_type >= I40E_RX_PTYPE_TUNNEL_IP_GRENAT)
		skb->csum_level = 1;

	/* Only report checksum unnecessary for TCP, UDP, or SCTP */
	switch (decoded.inner_prot) {
	case I40E_RX_PTYPE_INNER_PROT_TCP:
	case I40E_RX_PTYPE_INNER_PROT_UDP:
	case I40E_RX_PTYPE_INNER_PROT_SCTP:
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		/* fall through */
	default:
		break;
	}

	return;

checksum_fail:
	vsi->back->hw_csum_rx_error++;
}
/**
 * i40e_ptype_to_htype - get a hash type
 * @ptype: the ptype value from the descriptor
 *
 * Returns a hash type to be used by skb_set_hash
 **/
static inline int i40e_ptype_to_htype(u8 ptype)
{
	struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype);

	if (!decoded.known)
		return PKT_HASH_TYPE_NONE;

	if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
	    decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4)
		return PKT_HASH_TYPE_L4;
	else if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
		 decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3)
		return PKT_HASH_TYPE_L3;
	else
		return PKT_HASH_TYPE_L2;
}
/**
 * i40e_rx_hash - set the hash value in the skb
 * @ring: descriptor ring
 * @rx_desc: specific descriptor
 * @skb: skb currently being received and modified
 * @rx_ptype: Rx packet type
 **/
static inline void i40e_rx_hash(struct i40e_ring *ring,
				union i40e_rx_desc *rx_desc,
				struct sk_buff *skb,
				u8 rx_ptype)
{
	u32 hash;
	const __le64 rss_mask =
		cpu_to_le64((u64)I40E_RX_DESC_FLTSTAT_RSS_HASH <<
			    I40E_RX_DESC_STATUS_FLTSTAT_SHIFT);

	if (!(ring->netdev->features & NETIF_F_RXHASH))
		return;

	if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) {
		hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss);
		skb_set_hash(skb, hash, i40e_ptype_to_htype(rx_ptype));
	}
}
/**
 * i40e_process_skb_fields - Populate skb header fields from Rx descriptor
 * @rx_ring: rx descriptor ring packet is being transacted on
 * @rx_desc: pointer to the EOP Rx descriptor
 * @skb: pointer to current skb being populated
 * @rx_ptype: the packet type decoded by hardware
 *
 * This function checks the ring, descriptor, and packet information in
 * order to populate the hash, checksum, VLAN, protocol, and
 * other fields within the skb.
 **/
void i40e_process_skb_fields(struct i40e_ring *rx_ring,
			     union i40e_rx_desc *rx_desc, struct sk_buff *skb,
			     u8 rx_ptype)
{
	u64 qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
	u32 rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >>
			I40E_RXD_QW1_STATUS_SHIFT;
	u32 tsynvalid = rx_status & I40E_RXD_QW1_STATUS_TSYNVALID_MASK;
	u32 tsyn = (rx_status & I40E_RXD_QW1_STATUS_TSYNINDX_MASK) >>
		   I40E_RXD_QW1_STATUS_TSYNINDX_SHIFT;

	if (unlikely(tsynvalid))
		i40e_ptp_rx_hwtstamp(rx_ring->vsi->back, skb, tsyn);

	i40e_rx_hash(rx_ring, rx_desc, skb, rx_ptype);

	/* modifies the skb - consumes the enet header */
	skb->protocol = eth_type_trans(skb, rx_ring->netdev);

	i40e_rx_checksum(rx_ring->vsi, skb, rx_desc);

	skb_record_rx_queue(skb, rx_ring->queue_index);
}
/**
 * i40e_cleanup_headers - Correct empty headers
 * @rx_ring: rx descriptor ring packet is being transacted on
 * @skb: pointer to current skb being fixed
 *
 * Also address the case where we are pulling data in on pages only
 * and as such no data is present in the skb header.
 *
 * In addition if skb is not at least 60 bytes we need to pad it so that
 * it is large enough to qualify as a valid Ethernet frame.
 *
 * Returns true if an error was encountered and skb was freed.
 **/
static bool i40e_cleanup_headers(struct i40e_ring *rx_ring, struct sk_buff *skb)
{
	/* if eth_skb_pad returns an error the skb was freed */
	if (eth_skb_pad(skb))
		return true;

	return false;
}
/**
 * i40e_reuse_rx_page - page flip buffer and store it back on the ring
 * @rx_ring: rx descriptor ring to store buffers on
 * @old_buff: donor buffer to have page reused
 *
 * Synchronizes page for reuse by the adapter
 **/
static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
			       struct i40e_rx_buffer *old_buff)
{
	struct i40e_rx_buffer *new_buff;
	u16 nta = rx_ring->next_to_alloc;

	new_buff = &rx_ring->rx_bi[nta];

	/* update, and store next to alloc */
	nta++;
	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;

	/* transfer page from old buffer to new buffer */
	*new_buff = *old_buff;
}
/**
 * i40e_page_is_reusable - check if any reuse is possible
 * @page: page struct to check
 *
 * A page is not reusable if it was allocated under low memory
 * conditions, or it's not in the same NUMA node as this CPU.
 */
static inline bool i40e_page_is_reusable(struct page *page)
{
	return (page_to_nid(page) == numa_mem_id()) &&
		!page_is_pfmemalloc(page);
}
/**
 * i40e_can_reuse_rx_page - Determine if this page can be reused by
 * the adapter for another receive
 *
 * @rx_buffer: buffer containing the page
 * @page: page address from rx_buffer
 * @truesize: actual size of the buffer in this page
 *
 * If page is reusable, rx_buffer->page_offset is adjusted to point to
 * an unused region in the page.
 *
 * For small pages, @truesize will be a constant value, half the size
 * of the memory at page. We'll attempt to alternate between high and
 * low halves of the page, with one half ready for use by the hardware
 * and the other half being consumed by the stack. We use the page
 * ref count to determine whether the stack has finished consuming the
 * portion of this page that was passed up with a previous packet. If
 * the page ref count is >1, we'll assume the "other" half page is
 * still busy, and this page cannot be reused.
 *
 * For larger pages, @truesize will be the actual space used by the
 * received packet (adjusted upward to an even multiple of the cache
 * line size). This will advance through the page by the amount
 * actually consumed by the received packets while there is still
 * space for a buffer. Each region of larger pages will be used at
 * most once, after which the page will not be reused.
 *
 * In either case, if the page is reusable its refcount is increased.
 **/
static bool i40e_can_reuse_rx_page(struct i40e_rx_buffer *rx_buffer,
				   struct page *page,
				   const unsigned int truesize)
{
#if (PAGE_SIZE >= 8192)
	unsigned int last_offset = PAGE_SIZE - I40E_RXBUFFER_2048;
#endif

	/* Is any reuse possible? */
	if (unlikely(!i40e_page_is_reusable(page)))
		return false;

#if (PAGE_SIZE < 8192)
	/* if we are only owner of page we can reuse it */
	if (unlikely(page_count(page) != 1))
		return false;

	/* flip page offset to other buffer */
	rx_buffer->page_offset ^= truesize;
#else
	/* move offset up to the next cache line */
	rx_buffer->page_offset += truesize;

	if (rx_buffer->page_offset > last_offset)
		return false;
#endif

	/* Inc ref count on page before passing it up to the stack */
	get_page(page);

	return true;
}
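/* Concretely: with 4K pages and truesize = I40E_RXBUFFER_2048, page_offset
 * just toggles between 0 and 2048 (offset ^= 2048), so the hardware and the
 * stack alternate halves of the page. With 8K or larger pages the offset
 * instead walks forward by truesize until it passes
 * PAGE_SIZE - I40E_RXBUFFER_2048, after which the page is retired.
 */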
/**
 * i40e_add_rx_frag - Add contents of Rx buffer to sk_buff
 * @rx_ring: rx descriptor ring to transact packets on
 * @rx_buffer: buffer containing page to add
 * @size: packet length from rx_desc
 * @skb: sk_buff to place the data into
 *
 * This function will add the data contained in rx_buffer->page to the skb.
 * This is done either through a direct copy if the data in the buffer is
 * less than the skb header size, otherwise it will just attach the page as
 * a frag to the skb.
 *
 * The function will then update the page offset if necessary and return
 * true if the buffer can be reused by the adapter.
 **/
static bool i40e_add_rx_frag(struct i40e_ring *rx_ring,
			     struct i40e_rx_buffer *rx_buffer,
			     unsigned int size,
			     struct sk_buff *skb)
{
	struct page *page = rx_buffer->page;
	unsigned char *va = page_address(page) + rx_buffer->page_offset;
#if (PAGE_SIZE < 8192)
	unsigned int truesize = I40E_RXBUFFER_2048;
#else
	unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
#endif
	unsigned int pull_len;

	if (unlikely(skb_is_nonlinear(skb)))
		goto add_tail_frag;
	/* will the data fit in the skb we allocated? if so, just
	 * copy it as it is pretty small anyway
	 */
	if (size <= I40E_RX_HDR_SIZE) {
		memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));

		/* page is reusable, we can reuse buffer as-is */
		if (likely(i40e_page_is_reusable(page)))
			return true;

		/* this page cannot be reused so discard it */
		__free_pages(page, 0);
		return false;
	}

	/* we need the header to contain the greater of either
	 * ETH_HLEN or 60 bytes if the skb->len is less than
	 * 60 for skb_pad.
	 */
	pull_len = eth_get_headlen(va, I40E_RX_HDR_SIZE);

	/* align pull length to size of long to optimize
	 * memcpy performance
	 */
	memcpy(__skb_put(skb, pull_len), va, ALIGN(pull_len, sizeof(long)));

	/* update all of the pointers */
	va += pull_len;
	size -= pull_len;

add_tail_frag:
	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
			(unsigned long)va & ~PAGE_MASK, size, truesize);

	return i40e_can_reuse_rx_page(rx_buffer, page, truesize);
}
/**
 * i40e_fetch_rx_buffer - Allocate skb and populate it
 * @rx_ring: rx descriptor ring to transact packets on
 * @rx_desc: descriptor containing info written by hardware
 *
 * This function allocates an skb on the fly, and populates it with the page
 * data from the current receive descriptor, taking care to set up the skb
 * correctly, as well as handling calling the page recycle function if
 * necessary.
 */
static inline
struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
				     union i40e_rx_desc *rx_desc,
				     struct sk_buff *skb)
{
	u64 local_status_error_len =
		le64_to_cpu(rx_desc->wb.qword1.status_error_len);
	unsigned int size =
		(local_status_error_len & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
		I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
	struct i40e_rx_buffer *rx_buffer;
	struct page *page;

	rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean];
	page = rx_buffer->page;
	prefetchw(page);
	if (likely(!skb)) {
		void *page_addr = page_address(page) + rx_buffer->page_offset;

		/* prefetch first cache line of first page */
		prefetch(page_addr);
#if L1_CACHE_BYTES < 128
		prefetch(page_addr + L1_CACHE_BYTES);
#endif

		/* allocate a skb to store the frags */
		skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
				       I40E_RX_HDR_SIZE,
				       GFP_ATOMIC | __GFP_NOWARN);
		if (unlikely(!skb)) {
			rx_ring->rx_stats.alloc_buff_failed++;
			return NULL;
		}

		/* we will be copying header into skb->data in
		 * pskb_may_pull so it is in our interest to prefetch
		 * it now to avoid a possible cache miss
		 */
		prefetchw(skb->data);
	}

	/* we are reusing so sync this buffer for CPU use */
	dma_sync_single_range_for_cpu(rx_ring->dev,
				      rx_buffer->dma,
				      rx_buffer->page_offset,
				      size,
				      DMA_FROM_DEVICE);

	/* pull page into skb */
	if (i40e_add_rx_frag(rx_ring, rx_buffer, size, skb)) {
		/* hand second half of page back to the ring */
		i40e_reuse_rx_page(rx_ring, rx_buffer);
		rx_ring->rx_stats.page_reuse_count++;
	} else {
		/* we are not reusing the buffer so unmap it */
		dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
				     DMA_FROM_DEVICE, I40E_RX_DMA_ATTR);
	}

	/* clear contents of buffer_info */
	rx_buffer->page = NULL;

	return skb;
}
/**
 * i40e_is_non_eop - process handling of non-EOP buffers
 * @rx_ring: Rx ring being processed
 * @rx_desc: Rx descriptor for current buffer
 * @skb: Current socket buffer containing buffer in progress
 *
 * This function updates next to clean. If the buffer is an EOP buffer
 * this function exits returning false, otherwise it will place the
 * sk_buff in the next buffer to be chained and return true indicating
 * that this is in fact a non-EOP buffer.
 **/
static bool i40e_is_non_eop(struct i40e_ring *rx_ring,
			    union i40e_rx_desc *rx_desc,
			    struct sk_buff *skb)
{
	u32 ntc = rx_ring->next_to_clean + 1;

	/* fetch, update, and store next to clean */
	ntc = (ntc < rx_ring->count) ? ntc : 0;
	rx_ring->next_to_clean = ntc;

	prefetch(I40E_RX_DESC(rx_ring, ntc));

#define staterrlen rx_desc->wb.qword1.status_error_len
	if (unlikely(i40e_rx_is_programming_status(le64_to_cpu(staterrlen)))) {
		i40e_clean_programming_status(rx_ring, rx_desc);
		return true;
	}
	/* if we are the last buffer then there is nothing else to do */
#define I40E_RXD_EOF BIT(I40E_RX_DESC_STATUS_EOF_SHIFT)
	if (likely(i40e_test_staterr(rx_desc, I40E_RXD_EOF)))
		return false;

	rx_ring->rx_stats.non_eop_descs++;

	return true;
}
/**
 * i40e_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
 * @rx_ring: rx descriptor ring to transact packets on
 * @budget: Total limit on number of packets to process
 *
 * This function provides a "bounce buffer" approach to Rx interrupt
 * processing. The advantage to this is that on systems that have
 * expensive overhead for IOMMU access this provides a means of avoiding
 * it by maintaining the mapping of the page to the system.
 *
 * Returns amount of work completed
 **/
static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
{
	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
	struct sk_buff *skb = rx_ring->skb;
	u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
	bool failure = false;

	while (likely(total_rx_packets < budget)) {
		union i40e_rx_desc *rx_desc;
		u16 vlan_tag;
		u8 rx_ptype;
		u64 qword;

		/* return some buffers to hardware, one at a time is too slow */
		if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
			failure = failure ||
				  i40e_alloc_rx_buffers(rx_ring, cleaned_count);
			cleaned_count = 0;
		}

		rx_desc = I40E_RX_DESC(rx_ring, rx_ring->next_to_clean);

		/* status_error_len will always be zero for unused descriptors
		 * because it's cleared in cleanup, and overlaps with hdr_addr
		 * which is always zero because packet split isn't used, if the
		 * hardware wrote DD then it will be non-zero
		 */
		if (!i40e_test_staterr(rx_desc,
				       BIT(I40E_RX_DESC_STATUS_DD_SHIFT)))
			break;

		/* This memory barrier is needed to keep us from reading
		 * any other fields out of the rx_desc until we know the
		 * DD bit is set.
		 */
		dma_rmb();
		skb = i40e_fetch_rx_buffer(rx_ring, rx_desc, skb);
		if (!skb)
			break;

		cleaned_count++;

		if (i40e_is_non_eop(rx_ring, rx_desc, skb))
			continue;

		/* ERR_MASK will only have valid bits if EOP set, and
		 * what we are doing here is actually checking
		 * I40E_RX_DESC_ERROR_RXE_SHIFT, since it is the zeroth bit in
		 * the error field
		 */
		if (unlikely(i40e_test_staterr(rx_desc,
					       BIT(I40E_RXD_QW1_ERROR_SHIFT)))) {
			dev_kfree_skb_any(skb);
			skb = NULL;
			continue;
		}

		if (i40e_cleanup_headers(rx_ring, skb)) {
			skb = NULL;
			continue;
		}

		/* probably a little skewed due to removing CRC */
		total_rx_bytes += skb->len;

		qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
		rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >>
			   I40E_RXD_QW1_PTYPE_SHIFT;

		/* populate checksum, VLAN, and protocol */
		i40e_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);

#ifdef I40E_FCOE
		if (unlikely(
		    i40e_rx_is_fcoe(rx_ptype) &&
		    !i40e_fcoe_handle_offload(rx_ring, rx_desc, skb))) {
			dev_kfree_skb_any(skb);
			continue;
		}
#endif
		vlan_tag = (qword & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) ?
			   le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1) : 0;

		i40e_receive_skb(rx_ring, skb, vlan_tag);
		skb = NULL;

		/* update budget accounting */
		total_rx_packets++;
	}

	rx_ring->skb = skb;

	u64_stats_update_begin(&rx_ring->syncp);
	rx_ring->stats.packets += total_rx_packets;
	rx_ring->stats.bytes += total_rx_bytes;
	u64_stats_update_end(&rx_ring->syncp);
	rx_ring->q_vector->rx.total_packets += total_rx_packets;
	rx_ring->q_vector->rx.total_bytes += total_rx_bytes;

	/* guarantee a trip back through this routine if there was a failure */
	return failure ? budget : total_rx_packets;
}
static u32 i40e_buildreg_itr(const int type, const u16 itr)
{
	u32 val;

	val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
	      /* Don't clear PBA because that can cause lost interrupts that
	       * came in while we were cleaning/polling
	       */
	      (type << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) |
	      (itr << I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT);

	return val;
}
/* a small macro to shorten up some long lines */
#define INTREG I40E_PFINT_DYN_CTLN
static inline int get_rx_itr(struct i40e_vsi *vsi, int idx)
{
	return vsi->rx_rings[idx]->rx_itr_setting;
}

static inline int get_tx_itr(struct i40e_vsi *vsi, int idx)
{
	return vsi->tx_rings[idx]->tx_itr_setting;
}
/**
 * i40e_update_enable_itr - Update itr and re-enable MSIX interrupt
 * @vsi: the VSI we care about
 * @q_vector: q_vector for which itr is being updated and interrupt enabled
 *
 **/
static inline void i40e_update_enable_itr(struct i40e_vsi *vsi,
					  struct i40e_q_vector *q_vector)
{
	struct i40e_hw *hw = &vsi->back->hw;
	bool rx = false, tx = false;
	u32 rxval, txval;
	int vector;
	int idx = q_vector->v_idx;
	int rx_itr_setting, tx_itr_setting;

	vector = (q_vector->v_idx + vsi->base_vector);

	/* avoid dynamic calculation if in countdown mode OR if
	 * all dynamic is disabled
	 */
	rxval = txval = i40e_buildreg_itr(I40E_ITR_NONE, 0);

	rx_itr_setting = get_rx_itr(vsi, idx);
	tx_itr_setting = get_tx_itr(vsi, idx);

	if (q_vector->itr_countdown > 0 ||
	    (!ITR_IS_DYNAMIC(rx_itr_setting) &&
	     !ITR_IS_DYNAMIC(tx_itr_setting))) {
		goto enable_int;
	}
	if (ITR_IS_DYNAMIC(rx_itr_setting)) {
		rx = i40e_set_new_dynamic_itr(&q_vector->rx);
		rxval = i40e_buildreg_itr(I40E_RX_ITR, q_vector->rx.itr);
	}

	if (ITR_IS_DYNAMIC(tx_itr_setting)) {
		tx = i40e_set_new_dynamic_itr(&q_vector->tx);
		txval = i40e_buildreg_itr(I40E_TX_ITR, q_vector->tx.itr);
	}

	if (rx || tx) {
		/* get the higher of the two ITR adjustments and
		 * use the same value for both ITR registers
		 * when in adaptive mode (Rx and/or Tx)
		 */
		u16 itr = max(q_vector->tx.itr, q_vector->rx.itr);

		q_vector->tx.itr = q_vector->rx.itr = itr;
		txval = i40e_buildreg_itr(I40E_TX_ITR, itr);
		tx = true;
		rxval = i40e_buildreg_itr(I40E_RX_ITR, itr);
		rx = true;
	}
	/* only need to enable the interrupt once, but need
	 * to possibly update both ITR values
	 */
	if (rx) {
		/* set the INTENA_MSK_MASK so that this first write
		 * won't actually enable the interrupt, instead just
		 * updating the ITR (it's bit 31 PF and VF)
		 */
		rxval |= BIT(31);
		/* don't check _DOWN because interrupt isn't being enabled */
		wr32(hw, INTREG(vector - 1), rxval);
	}

enable_int:
	if (!test_bit(__I40E_DOWN, &vsi->state))
		wr32(hw, INTREG(vector - 1), txval);

	if (q_vector->itr_countdown)
		q_vector->itr_countdown--;
	else
		q_vector->itr_countdown = ITR_COUNTDOWN_START;
}
/**
 * i40e_napi_poll - NAPI polling Rx/Tx cleanup routine
 * @napi: napi struct with our devices info in it
 * @budget: amount of work driver is allowed to do this pass, in packets
 *
 * This function will clean all queues associated with a q_vector.
 *
 * Returns the amount of work done
 **/
int i40e_napi_poll(struct napi_struct *napi, int budget)
{
	struct i40e_q_vector *q_vector =
			       container_of(napi, struct i40e_q_vector, napi);
	struct i40e_vsi *vsi = q_vector->vsi;
	struct i40e_ring *ring;
	bool clean_complete = true;
	bool arm_wb = false;
	int budget_per_ring;
	int work_done = 0;

	if (test_bit(__I40E_DOWN, &vsi->state)) {
		napi_complete(napi);
		return 0;
	}

	/* Clear hung_detected bit */
	clear_bit(I40E_Q_VECTOR_HUNG_DETECT, &q_vector->hung_detected);
	/* Since the actual Tx work is minimal, we can give the Tx a larger
	 * budget and be more aggressive about cleaning up the Tx descriptors.
	 */
	i40e_for_each_ring(ring, q_vector->tx) {
		if (!i40e_clean_tx_irq(vsi, ring, budget)) {
			clean_complete = false;
			continue;
		}
		arm_wb |= ring->arm_wb;
		ring->arm_wb = false;
	}

	/* Handle case where we are called by netpoll with a budget of 0 */
	if (budget <= 0)
		goto tx_only;

	/* We attempt to distribute budget to each Rx queue fairly, but don't
	 * allow the budget to go below 1 because that would exit polling early.
	 */
	budget_per_ring = max(budget/q_vector->num_ringpairs, 1);

	i40e_for_each_ring(ring, q_vector->rx) {
		int cleaned = i40e_clean_rx_irq(ring, budget_per_ring);

		work_done += cleaned;
		/* if we clean as many as budgeted, we must not be done */
		if (cleaned >= budget_per_ring)
			clean_complete = false;
	}
	/* If work not completed, return budget and polling will return */
	if (!clean_complete) {
		const cpumask_t *aff_mask = &q_vector->affinity_mask;
		int cpu_id = smp_processor_id();

		/* It is possible that the interrupt affinity has changed but,
		 * if the cpu is pegged at 100%, polling will never exit while
		 * traffic continues and the interrupt will be stuck on this
		 * cpu. We check to make sure affinity is correct before we
		 * continue to poll, otherwise we must stop polling so the
		 * interrupt can move to the correct cpu.
		 */
		if (likely(cpumask_test_cpu(cpu_id, aff_mask) ||
			   !(vsi->back->flags & I40E_FLAG_MSIX_ENABLED))) {
tx_only:
			if (arm_wb) {
				q_vector->tx.ring[0].tx_stats.tx_force_wb++;
				i40e_enable_wb_on_itr(vsi, q_vector);
			}
			return budget;
		}
	}

	if (vsi->back->flags & I40E_TXR_FLAGS_WB_ON_ITR)
		q_vector->arm_wb_state = false;
	/* Work is done so exit the polling mode and re-enable the interrupt */
	napi_complete_done(napi, work_done);

	/* If we're prematurely stopping polling to fix the interrupt
	 * affinity we want to make sure polling starts back up so we
	 * issue a call to i40e_force_wb which triggers a SW interrupt.
	 */
	if (!clean_complete)
		i40e_force_wb(vsi, q_vector);
	else if (!(vsi->back->flags & I40E_FLAG_MSIX_ENABLED))
		i40e_irq_dynamic_enable_icr0(vsi->back, false);
	else
		i40e_update_enable_itr(vsi, q_vector);

	return min(work_done, budget - 1);
}
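/* Budget sketch (illustrative): with the stock NAPI budget of 64 and a
 * q_vector serving two ring pairs, each Rx ring may clean up to
 * max(64 / 2, 1) = 32 packets per poll. The final min(work_done, budget - 1)
 * keeps the value returned after napi_complete_done() strictly below the
 * budget, as the NAPI contract requires.
 */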
/**
 * i40e_atr - Add a Flow Director ATR filter
 * @tx_ring: ring to add programming descriptor to
 * @skb: send buffer
 * @tx_flags: send tx flags
 **/
static void i40e_atr(struct i40e_ring *tx_ring, struct sk_buff *skb,
		     u32 tx_flags)
{
	struct i40e_filter_program_desc *fdir_desc;
	struct i40e_pf *pf = tx_ring->vsi->back;
	union {
		unsigned char *network;
		struct iphdr *ipv4;
		struct ipv6hdr *ipv6;
	} hdr;
	struct tcphdr *th;
	unsigned int hlen;
	u32 flex_ptype, dtype_cmd;
	int l4_proto;
	u16 i;

	/* make sure ATR is enabled */
	if (!(pf->flags & I40E_FLAG_FD_ATR_ENABLED))
		return;

	if ((pf->hw_disabled_flags & I40E_FLAG_FD_ATR_ENABLED))
		return;

	/* if sampling is disabled do nothing */
	if (!tx_ring->atr_sample_rate)
		return;

	/* Currently only IPv4/IPv6 with TCP is supported */
	if (!(tx_flags & (I40E_TX_FLAGS_IPV4 | I40E_TX_FLAGS_IPV6)))
		return;

	/* snag network header to get L4 type and address */
	hdr.network = (tx_flags & I40E_TX_FLAGS_UDP_TUNNEL) ?
		      skb_inner_network_header(skb) : skb_network_header(skb);

	/* Note: tx_flags gets modified to reflect inner protocols in
	 * tx_enable_csum function if encap is enabled.
	 */
	if (tx_flags & I40E_TX_FLAGS_IPV4) {
		/* access ihl as u8 to avoid unaligned access on ia64 */
		hlen = (hdr.network[0] & 0x0F) << 2;
		l4_proto = hdr.ipv4->protocol;
	} else {
		hlen = hdr.network - skb->data;
		l4_proto = ipv6_find_hdr(skb, &hlen, IPPROTO_TCP, NULL, NULL);
		hlen -= hdr.network - skb->data;
	}
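
	/* The low nibble of the first IPv4 byte is the header length in
	 * 32-bit words, so the << 2 above converts it to bytes; a header
	 * with no options has IHL = 5, giving hlen = 5 << 2 = 20.
	 */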

	if (l4_proto != IPPROTO_TCP)
		return;

	th = (struct tcphdr *)(hdr.network + hlen);

	/* Due to lack of space, no more new filters can be programmed */
	if (th->syn && (pf->hw_disabled_flags & I40E_FLAG_FD_ATR_ENABLED))
		return;
	if ((pf->flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE) &&
	    (!(pf->hw_disabled_flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE))) {
		/* HW ATR eviction will take care of removing filters on FIN
		 * and RST packets.
		 */
		if (th->fin || th->rst)
			return;
	}

	tx_ring->atr_count++;

	/* sample on all syn/fin/rst packets or once every atr sample rate */
	if (!th->fin &&
	    !th->syn &&
	    !th->rst &&
	    (tx_ring->atr_count < tx_ring->atr_sample_rate))
		return;

	tx_ring->atr_count = 0;

	/* grab the next descriptor */
	i = tx_ring->next_to_use;
	fdir_desc = I40E_TX_FDIRDESC(tx_ring, i);

	i++;
	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;

	flex_ptype = (tx_ring->queue_index << I40E_TXD_FLTR_QW0_QINDEX_SHIFT) &
		      I40E_TXD_FLTR_QW0_QINDEX_MASK;
	flex_ptype |= (tx_flags & I40E_TX_FLAGS_IPV4) ?
		      (I40E_FILTER_PCTYPE_NONF_IPV4_TCP <<
		       I40E_TXD_FLTR_QW0_PCTYPE_SHIFT) :
		      (I40E_FILTER_PCTYPE_NONF_IPV6_TCP <<
		       I40E_TXD_FLTR_QW0_PCTYPE_SHIFT);

	flex_ptype |= tx_ring->vsi->id << I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT;

	dtype_cmd = I40E_TX_DESC_DTYPE_FILTER_PROG;

	dtype_cmd |= (th->fin || th->rst) ?
		     (I40E_FILTER_PROGRAM_DESC_PCMD_REMOVE <<
		      I40E_TXD_FLTR_QW1_PCMD_SHIFT) :
		     (I40E_FILTER_PROGRAM_DESC_PCMD_ADD_UPDATE <<
		      I40E_TXD_FLTR_QW1_PCMD_SHIFT);

	dtype_cmd |= I40E_FILTER_PROGRAM_DESC_DEST_DIRECT_PACKET_QINDEX <<
		     I40E_TXD_FLTR_QW1_DEST_SHIFT;

	dtype_cmd |= I40E_FILTER_PROGRAM_DESC_FD_STATUS_FD_ID <<
		     I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT;

	dtype_cmd |= I40E_TXD_FLTR_QW1_CNT_ENA_MASK;
	if (!(tx_flags & I40E_TX_FLAGS_UDP_TUNNEL))
		dtype_cmd |=
			((u32)I40E_FD_ATR_STAT_IDX(pf->hw.pf_id) <<
			 I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) &
			I40E_TXD_FLTR_QW1_CNTINDEX_MASK;
	else
		dtype_cmd |=
			((u32)I40E_FD_ATR_TUNNEL_STAT_IDX(pf->hw.pf_id) <<
			 I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) &
			I40E_TXD_FLTR_QW1_CNTINDEX_MASK;

	if ((pf->flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE) &&
	    (!(pf->hw_disabled_flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE)))
		dtype_cmd |= I40E_TXD_FLTR_QW1_ATR_MASK;

	fdir_desc->qindex_flex_ptype_vsi = cpu_to_le32(flex_ptype);
	fdir_desc->rsvd = cpu_to_le32(0);
	fdir_desc->dtype_cmd_cntindex = cpu_to_le32(dtype_cmd);
	fdir_desc->fd_id = cpu_to_le32(0);
}

/**
 * i40e_tx_prepare_vlan_flags - prepare generic TX VLAN tagging flags for HW
 * @skb:     send buffer
 * @tx_ring: ring to send buffer on
 * @flags:   the tx flags to be set
 *
 * Checks the skb and sets up correspondingly several generic transmit flags
 * related to VLAN tagging for the HW, such as VLAN, DCB, etc.
 *
 * Returns an error code to indicate the frame should be dropped upon error,
 * otherwise returns 0 to indicate the flags have been set properly.
 **/
#ifdef I40E_FCOE
inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
				      struct i40e_ring *tx_ring,
				      u32 *flags)
#else
static inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
					     struct i40e_ring *tx_ring,
					     u32 *flags)
#endif
{
	__be16 protocol = skb->protocol;
	u32 tx_flags = 0;

	if (protocol == htons(ETH_P_8021Q) &&
	    !(tx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) {
		/* When HW VLAN acceleration is turned off by the user the
		 * stack sets the protocol to 8021q so that the driver
		 * can take any steps required to support the SW only
		 * VLAN handling.  In our case the driver doesn't need
		 * to take any further steps so just set the protocol
		 * to the encapsulated ethertype.
		 */
		skb->protocol = vlan_get_protocol(skb);
		goto out;
	}

	/* if we have a HW VLAN tag being added, default to the HW one */
	if (skb_vlan_tag_present(skb)) {
		tx_flags |= skb_vlan_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT;
		tx_flags |= I40E_TX_FLAGS_HW_VLAN;
	/* else if it is a SW VLAN, check the next protocol and store the tag */
	} else if (protocol == htons(ETH_P_8021Q)) {
		struct vlan_hdr *vhdr, _vhdr;

		vhdr = skb_header_pointer(skb, ETH_HLEN, sizeof(_vhdr), &_vhdr);
		if (!vhdr)
			return -EINVAL;

		protocol = vhdr->h_vlan_encapsulated_proto;
		tx_flags |= ntohs(vhdr->h_vlan_TCI) << I40E_TX_FLAGS_VLAN_SHIFT;
		tx_flags |= I40E_TX_FLAGS_SW_VLAN;
	}

	if (!(tx_ring->vsi->back->flags & I40E_FLAG_DCB_ENABLED))
		goto out;

	/* Insert 802.1p priority into VLAN header */
	if ((tx_flags & (I40E_TX_FLAGS_HW_VLAN | I40E_TX_FLAGS_SW_VLAN)) ||
	    (skb->priority != TC_PRIO_CONTROL)) {
		tx_flags &= ~I40E_TX_FLAGS_VLAN_PRIO_MASK;
		tx_flags |= (skb->priority & 0x7) <<
				I40E_TX_FLAGS_VLAN_PRIO_SHIFT;
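
		/* Assuming the flag layout in i40e.h keeps the tag in the
		 * upper 16 bits, the 3-bit priority lands in bits 13-15 of
		 * the TCI once shifted back out of tx_flags below; e.g.
		 * priority 5 on VLAN ID 100 gives (5 << 13) | 100 = 0xA064.
		 */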
		if (tx_flags & I40E_TX_FLAGS_SW_VLAN) {
			struct vlan_ethhdr *vhdr;
			int rc;

			rc = skb_cow_head(skb, 0);
			if (rc < 0)
				return rc;
			vhdr = (struct vlan_ethhdr *)skb->data;
			vhdr->h_vlan_TCI = htons(tx_flags >>
						 I40E_TX_FLAGS_VLAN_SHIFT);
		} else {
			tx_flags |= I40E_TX_FLAGS_HW_VLAN;
		}
	}

out:
	*flags = tx_flags;
	return 0;
}

/**
 * i40e_tso - set up the tso context descriptor
 * @first:    pointer to first Tx buffer for xmit
 * @hdr_len:  ptr to the size of the packet header
 * @cd_type_cmd_tso_mss: Quad Word 1
 *
 * Returns 0 if no TSO can happen, 1 if TSO is happening, or a negative error
 **/
static int i40e_tso(struct i40e_tx_buffer *first, u8 *hdr_len,
		    u64 *cd_type_cmd_tso_mss)
{
	struct sk_buff *skb = first->skb;
	u64 cd_cmd, cd_tso_len, cd_mss;
	union {
		struct iphdr *v4;
		struct ipv6hdr *v6;
		unsigned char *hdr;
	} ip;
	union {
		struct tcphdr *tcp;
		struct udphdr *udp;
		unsigned char *hdr;
	} l4;
	u32 paylen, l4_offset;
	u16 gso_segs, gso_size;
	int err;

	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;

	if (!skb_is_gso(skb))
		return 0;

	err = skb_cow_head(skb, 0);
	if (err < 0)
		return err;

	ip.hdr = skb_network_header(skb);
	l4.hdr = skb_transport_header(skb);

	/* initialize outer IP header fields */
	if (ip.v4->version == 4) {
		ip.v4->tot_len = 0;
		ip.v4->check = 0;
	} else {
		ip.v6->payload_len = 0;
	}

	if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
					 SKB_GSO_GRE_CSUM |
					 SKB_GSO_IPXIP4 |
					 SKB_GSO_IPXIP6 |
					 SKB_GSO_UDP_TUNNEL |
					 SKB_GSO_UDP_TUNNEL_CSUM)) {
		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) {
			l4.udp->len = 0;

			/* determine offset of outer transport header */
			l4_offset = l4.hdr - skb->data;

			/* remove payload length from outer checksum */
			paylen = skb->len - l4_offset;
			csum_replace_by_diff(&l4.udp->check,
					     (__force __wsum)htonl(paylen));
		}

		/* reset pointers to inner headers */
		ip.hdr = skb_inner_network_header(skb);
		l4.hdr = skb_inner_transport_header(skb);

		/* initialize inner IP header fields */
		if (ip.v4->version == 4) {
			ip.v4->tot_len = 0;
			ip.v4->check = 0;
		} else {
			ip.v6->payload_len = 0;
		}
	}

	/* determine offset of inner transport header */
	l4_offset = l4.hdr - skb->data;

	/* remove payload length from inner checksum */
	paylen = skb->len - l4_offset;
	csum_replace_by_diff(&l4.tcp->check, (__force __wsum)htonl(paylen));

	/* compute length of segmentation header */
	*hdr_len = (l4.tcp->doff * 4) + l4_offset;
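
	/* e.g. for plain IPv4/TCP: a 14-byte Ethernet header plus a 20-byte
	 * IP header puts l4_offset at 34, and a 20-byte TCP header (doff = 5)
	 * makes the segmentation header 5 * 4 + 34 = 54 bytes.
	 */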

	/* pull values out of skb_shinfo */
	gso_size = skb_shinfo(skb)->gso_size;
	gso_segs = skb_shinfo(skb)->gso_segs;

	/* update GSO size and bytecount with header size */
	first->gso_segs = gso_segs;
	first->bytecount += (first->gso_segs - 1) * *hdr_len;
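
	/* bytecount now reflects what actually hits the wire: e.g. a TSO skb
	 * with gso_segs = 45 and a 54-byte header gains 44 * 54 = 2376 bytes
	 * of replicated headers, which is what the BQL accounting in
	 * netdev_tx_sent_queue() will see.
	 */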

	/* find the field values */
	cd_cmd = I40E_TX_CTX_DESC_TSO;
	cd_tso_len = skb->len - *hdr_len;
	cd_mss = gso_size;
	*cd_type_cmd_tso_mss |= (cd_cmd << I40E_TXD_CTX_QW1_CMD_SHIFT) |
				(cd_tso_len << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
				(cd_mss << I40E_TXD_CTX_QW1_MSS_SHIFT);

	return 1;
}

/**
 * i40e_tsyn - set up the tsyn context descriptor
 * @tx_ring:  ptr to the ring to send
 * @skb:      ptr to the skb we're sending
 * @tx_flags: the collected send information
 * @cd_type_cmd_tso_mss: Quad Word 1
 *
 * Returns 0 if no Tx timestamp can happen and 1 if the timestamp will happen
 **/
static int i40e_tsyn(struct i40e_ring *tx_ring, struct sk_buff *skb,
		     u32 tx_flags, u64 *cd_type_cmd_tso_mss)
{
	struct i40e_pf *pf;

	if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)))
		return 0;

	/* Tx timestamps cannot be sampled when doing TSO */
	if (tx_flags & I40E_TX_FLAGS_TSO)
		return 0;

	/* only timestamp the outbound packet if the user has requested it and
	 * we are not already transmitting a packet to be timestamped
	 */
	pf = i40e_netdev_to_pf(tx_ring->netdev);
	if (!(pf->flags & I40E_FLAG_PTP))
		return 0;

	if (pf->ptp_tx &&
	    !test_and_set_bit_lock(__I40E_PTP_TX_IN_PROGRESS, &pf->state)) {
		skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
		pf->ptp_tx_skb = skb_get(skb);
	} else {
		return 0;
	}

	*cd_type_cmd_tso_mss |= (u64)I40E_TX_CTX_DESC_TSYN <<
				I40E_TXD_CTX_QW1_CMD_SHIFT;

	return 1;
}

/**
 * i40e_tx_enable_csum - Enable Tx checksum offloads
 * @skb: send buffer
 * @tx_flags: pointer to Tx flags currently set
 * @td_cmd: Tx descriptor command bits to set
 * @td_offset: Tx descriptor header offsets to set
 * @tx_ring: Tx descriptor ring
 * @cd_tunneling: ptr to context desc bits
 **/
static int i40e_tx_enable_csum(struct sk_buff *skb, u32 *tx_flags,
			       u32 *td_cmd, u32 *td_offset,
			       struct i40e_ring *tx_ring,
			       u32 *cd_tunneling)
{
	union {
		struct iphdr *v4;
		struct ipv6hdr *v6;
		unsigned char *hdr;
	} ip;
	union {
		struct tcphdr *tcp;
		struct udphdr *udp;
		unsigned char *hdr;
	} l4;
	unsigned char *exthdr;
	u32 offset, cmd = 0;
	__be16 frag_off;
	u8 l4_proto = 0;

	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;

	ip.hdr = skb_network_header(skb);
	l4.hdr = skb_transport_header(skb);

	/* compute outer L2 header size */
	offset = ((ip.hdr - skb->data) / 2) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
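
	/* MACLEN is encoded in 2-byte words, so a standard 14-byte Ethernet
	 * header becomes 14 / 2 = 7 (two more words when a VLAN tag is
	 * present).
	 */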

	if (skb->encapsulation) {
		u32 tunnel = 0;
		/* define outer network header type */
		if (*tx_flags & I40E_TX_FLAGS_IPV4) {
			tunnel |= (*tx_flags & I40E_TX_FLAGS_TSO) ?
				  I40E_TX_CTX_EXT_IP_IPV4 :
				  I40E_TX_CTX_EXT_IP_IPV4_NO_CSUM;

			l4_proto = ip.v4->protocol;
		} else if (*tx_flags & I40E_TX_FLAGS_IPV6) {
			tunnel |= I40E_TX_CTX_EXT_IP_IPV6;

			exthdr = ip.hdr + sizeof(*ip.v6);
			l4_proto = ip.v6->nexthdr;
			if (l4.hdr != exthdr)
				ipv6_skip_exthdr(skb, exthdr - skb->data,
						 &l4_proto, &frag_off);
		}

		/* define outer transport */
		switch (l4_proto) {
		case IPPROTO_UDP:
			tunnel |= I40E_TXD_CTX_UDP_TUNNELING;
			*tx_flags |= I40E_TX_FLAGS_UDP_TUNNEL;
			break;
		case IPPROTO_GRE:
			tunnel |= I40E_TXD_CTX_GRE_TUNNELING;
			*tx_flags |= I40E_TX_FLAGS_UDP_TUNNEL;
			break;
		case IPPROTO_IPIP:
		case IPPROTO_IPV6:
			*tx_flags |= I40E_TX_FLAGS_UDP_TUNNEL;
			l4.hdr = skb_inner_network_header(skb);
			break;
		default:
			if (*tx_flags & I40E_TX_FLAGS_TSO)
				return -1;

			skb_checksum_help(skb);
			return 0;
		}

		/* compute outer L3 header size */
		tunnel |= ((l4.hdr - ip.hdr) / 4) <<
			  I40E_TXD_CTX_QW0_EXT_IPLEN_SHIFT;

		/* switch IP header pointer from outer to inner header */
		ip.hdr = skb_inner_network_header(skb);

		/* compute tunnel header size */
		tunnel |= ((ip.hdr - l4.hdr) / 2) <<
			  I40E_TXD_CTX_QW0_NATLEN_SHIFT;

		/* indicate if we need to offload outer UDP header */
		if ((*tx_flags & I40E_TX_FLAGS_TSO) &&
		    !(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM))
			tunnel |= I40E_TXD_CTX_QW0_L4T_CS_MASK;

		/* record tunnel offload values */
		*cd_tunneling |= tunnel;

		/* switch L4 header pointer from outer to inner */
		l4.hdr = skb_inner_transport_header(skb);
		l4_proto = 0;

		/* reset type as we transition from outer to inner headers */
		*tx_flags &= ~(I40E_TX_FLAGS_IPV4 | I40E_TX_FLAGS_IPV6);
		if (ip.v4->version == 4)
			*tx_flags |= I40E_TX_FLAGS_IPV4;
		if (ip.v6->version == 6)
			*tx_flags |= I40E_TX_FLAGS_IPV6;
	}

	/* Enable IP checksum offloads */
	if (*tx_flags & I40E_TX_FLAGS_IPV4) {
		l4_proto = ip.v4->protocol;
		/* the stack computes the IP header already, the only time we
		 * need the hardware to recompute it is in the case of TSO.
		 */
		cmd |= (*tx_flags & I40E_TX_FLAGS_TSO) ?
		       I40E_TX_DESC_CMD_IIPT_IPV4_CSUM :
		       I40E_TX_DESC_CMD_IIPT_IPV4;
	} else if (*tx_flags & I40E_TX_FLAGS_IPV6) {
		cmd |= I40E_TX_DESC_CMD_IIPT_IPV6;

		exthdr = ip.hdr + sizeof(*ip.v6);
		l4_proto = ip.v6->nexthdr;
		if (l4.hdr != exthdr)
			ipv6_skip_exthdr(skb, exthdr - skb->data,
					 &l4_proto, &frag_off);
	}

	/* compute inner L3 header size */
	offset |= ((l4.hdr - ip.hdr) / 4) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
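
	/* IPLEN is encoded in 4-byte dwords: a 20-byte IPv4 header with no
	 * options becomes 20 / 4 = 5, a bare 40-byte IPv6 header becomes 10.
	 */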

	/* Enable L4 checksum offloads */
	switch (l4_proto) {
	case IPPROTO_TCP:
		/* enable checksum offloads */
		cmd |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
		offset |= l4.tcp->doff << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
		break;
	case IPPROTO_SCTP:
		/* enable SCTP checksum offload */
		cmd |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
		offset |= (sizeof(struct sctphdr) >> 2) <<
			  I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
		break;
	case IPPROTO_UDP:
		/* enable UDP checksum offload */
		cmd |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
		offset |= (sizeof(struct udphdr) >> 2) <<
			  I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
		break;
	default:
		if (*tx_flags & I40E_TX_FLAGS_TSO)
			return -1;
		skb_checksum_help(skb);
		return 0;
	}

	*td_cmd |= cmd;
	*td_offset |= offset;

	return 1;
}

/**
 * i40e_create_tx_ctx - Build the Tx context descriptor
 * @tx_ring:  ring to create the descriptor on
 * @cd_type_cmd_tso_mss: Quad Word 1
 * @cd_tunneling: Quad Word 0 - bits 0-31
 * @cd_l2tag2: Quad Word 0 - bits 32-63
 **/
static void i40e_create_tx_ctx(struct i40e_ring *tx_ring,
			       const u64 cd_type_cmd_tso_mss,
			       const u32 cd_tunneling, const u32 cd_l2tag2)
{
	struct i40e_tx_context_desc *context_desc;
	int i = tx_ring->next_to_use;

	if ((cd_type_cmd_tso_mss == I40E_TX_DESC_DTYPE_CONTEXT) &&
	    !cd_tunneling && !cd_l2tag2)
		return;

	/* grab the next descriptor */
	context_desc = I40E_TX_CTXTDESC(tx_ring, i);

	i++;
	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;

	/* cpu_to_le32 and assign to struct fields */
	context_desc->tunneling_params = cpu_to_le32(cd_tunneling);
	context_desc->l2tag2 = cpu_to_le16(cd_l2tag2);
	context_desc->rsvd = cpu_to_le16(0);
	context_desc->type_cmd_tso_mss = cpu_to_le64(cd_type_cmd_tso_mss);
}

/**
 * __i40e_maybe_stop_tx - 2nd level check for tx stop conditions
 * @tx_ring: the ring to be checked
 * @size: the number of descriptors we want to assure are available
 *
 * Returns -EBUSY if a stop is needed, else 0
 **/
int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
{
	netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index);
	/* Memory barrier before checking head and tail */
	smp_mb();
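
	/* This barrier pairs with the Tx cleanup path: the stopped state must
	 * be visible before we re-read the free-descriptor count, so either
	 * we observe the room the cleaner just made or the cleaner observes
	 * the stopped queue and restarts it; otherwise the queue could stall.
	 */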

	/* Check again in case another CPU has just made room available. */
	if (likely(I40E_DESC_UNUSED(tx_ring) < size))
		return -EBUSY;

	/* A reprieve! - use start_queue because it doesn't call schedule */
	netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index);
	++tx_ring->tx_stats.restart_queue;
	return 0;
}

/**
 * __i40e_chk_linearize - Check if there are more than 8 buffers per packet
 * @skb:      send buffer
 *
 * Note: Our HW can't DMA more than 8 buffers to build a packet on the wire
 * and so we need to figure out the cases where we need to linearize the skb.
 *
 * For TSO we need to count the TSO header and segment payload separately.
 * As such we need to check cases where we have 7 fragments or more as we
 * can potentially require 9 DMA transactions, 1 for the TSO header, 1 for
 * the segment payload in the first descriptor, and another 7 for the
 * fragments.
 **/
bool __i40e_chk_linearize(struct sk_buff *skb)
{
	const struct skb_frag_struct *frag, *stale;
	int nr_frags, sum;

	/* no need to check if number of frags is less than 7 */
	nr_frags = skb_shinfo(skb)->nr_frags;
	if (nr_frags < (I40E_MAX_BUFFER_TXD - 1))
		return false;

	/* We need to walk through the list and validate that each group
	 * of 6 fragments totals at least gso_size.
	 */
	nr_frags -= I40E_MAX_BUFFER_TXD - 2;
	frag = &skb_shinfo(skb)->frags[0];

	/* Initialize size to the negative value of gso_size minus 1.  We
	 * use this as the worst case scenario in which the frag ahead
	 * of us only provides one byte which is why we are limited to 6
	 * descriptors for a single transmit as the header and previous
	 * fragment are already consuming 2 descriptors.
	 */
	sum = 1 - skb_shinfo(skb)->gso_size;

	/* Add size of frags 0 through 4 to create our initial sum */
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);
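
	/* Worked example, assuming gso_size = 6000 and six 1-byte frags ahead
	 * of the large ones: the window starts at sum = 1 - 6000 + 5 = -5994,
	 * stays negative when the sixth tiny frag is added below, and the
	 * function returns true so the skb is linearized before mapping.
	 */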

	/* Walk through fragments adding latest fragment, testing it, and
	 * then removing stale fragments from the sum.
	 */
	stale = &skb_shinfo(skb)->frags[0];
	for (;;) {
		sum += skb_frag_size(frag++);

		/* if sum is negative we failed to make sufficient progress */
		if (sum < 0)
			return true;

		if (!nr_frags--)
			break;

		sum -= skb_frag_size(stale++);
	}

	return false;
}

/**
 * i40e_tx_map - Build the Tx descriptor
 * @tx_ring:  ring to send buffer on
 * @skb:      send buffer
 * @first:    first buffer info buffer to use
 * @tx_flags: collected send information
 * @hdr_len:  size of the packet header
 * @td_cmd:   the command field in the descriptor
 * @td_offset: offset for checksum or crc
 **/
#ifdef I40E_FCOE
inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
			struct i40e_tx_buffer *first, u32 tx_flags,
			const u8 hdr_len, u32 td_cmd, u32 td_offset)
#else
static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
			       struct i40e_tx_buffer *first, u32 tx_flags,
			       const u8 hdr_len, u32 td_cmd, u32 td_offset)
#endif
{
	unsigned int data_len = skb->data_len;
	unsigned int size = skb_headlen(skb);
	struct skb_frag_struct *frag;
	struct i40e_tx_buffer *tx_bi;
	struct i40e_tx_desc *tx_desc;
	u16 i = tx_ring->next_to_use;
	u32 td_tag = 0;
	dma_addr_t dma;
	u16 desc_count = 1;

	if (tx_flags & I40E_TX_FLAGS_HW_VLAN) {
		td_cmd |= I40E_TX_DESC_CMD_IL2TAG1;
		td_tag = (tx_flags & I40E_TX_FLAGS_VLAN_MASK) >>
			 I40E_TX_FLAGS_VLAN_SHIFT;
	}

	first->tx_flags = tx_flags;

	dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);

	tx_desc = I40E_TX_DESC(tx_ring, i);
	tx_bi = first;

	for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
		unsigned int max_data = I40E_MAX_DATA_PER_TXD_ALIGNED;

		if (dma_mapping_error(tx_ring->dev, dma))
			goto dma_error;

		/* record length, and DMA address */
		dma_unmap_len_set(tx_bi, len, size);
		dma_unmap_addr_set(tx_bi, dma, dma);

		/* align size to end of page */
		max_data += -dma & (I40E_MAX_READ_REQ_SIZE - 1);
		tx_desc->buffer_addr = cpu_to_le64(dma);
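
		/* The adjustment above, -dma & (I40E_MAX_READ_REQ_SIZE - 1),
		 * is the distance from dma to the next 4K read-request
		 * boundary; e.g. a dma ending in 0x1ff0 adds 0x10 so the
		 * first chunk stops exactly on that boundary.
		 */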

		while (unlikely(size > I40E_MAX_DATA_PER_TXD)) {
			tx_desc->cmd_type_offset_bsz =
				build_ctob(td_cmd, td_offset,
					   max_data, td_tag);

			tx_desc++;
			i++;
			desc_count++;

			if (i == tx_ring->count) {
				tx_desc = I40E_TX_DESC(tx_ring, 0);
				i = 0;
			}

			dma += max_data;
			size -= max_data;

			max_data = I40E_MAX_DATA_PER_TXD_ALIGNED;
			tx_desc->buffer_addr = cpu_to_le64(dma);
		}

		if (likely(!data_len))
			break;

		tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset,
							  size, td_tag);

		tx_desc++;
		i++;
		desc_count++;

		if (i == tx_ring->count) {
			tx_desc = I40E_TX_DESC(tx_ring, 0);
			i = 0;
		}

		size = skb_frag_size(frag);
		data_len -= size;

		dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
				       DMA_TO_DEVICE);

		tx_bi = &tx_ring->tx_bi[i];
	}

	netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount);

	i++;
	if (i == tx_ring->count)
		i = 0;

	tx_ring->next_to_use = i;

	i40e_maybe_stop_tx(tx_ring, DESC_NEEDED);

	/* write last descriptor with EOP bit */
	td_cmd |= I40E_TX_DESC_CMD_EOP;

	/* We can OR these values together as they both are checked against
	 * 4 below and at this point desc_count will be used as a boolean value
	 * after this if/else block.
	 */
	desc_count |= ++tx_ring->packet_stride;

	/* Algorithm to optimize tail and RS bit setting:
	 * if queue is stopped
	 *	mark RS bit
	 *	reset packet counter
	 * else if xmit_more is supported and is true
	 *	advance packet counter to 4
	 *	reset desc_count to 0
	 *
	 * if desc_count >= 4
	 *	mark RS bit
	 *	reset packet counter
	 * if desc_count > 0
	 *	update tail
	 *
	 * Note: If there are fewer than 4 descriptors
	 * pending and interrupts were disabled the service task will
	 * trigger a force WB.
	 */
	if (netif_xmit_stopped(txring_txq(tx_ring))) {
		goto do_rs;
	} else if (skb->xmit_more) {
		/* set stride to arm on next packet and reset desc_count */
		tx_ring->packet_stride = WB_STRIDE;
		desc_count = 0;
	} else if (desc_count >= WB_STRIDE) {
do_rs:
		/* write last descriptor with RS bit set */
		td_cmd |= I40E_TX_DESC_CMD_RS;
		tx_ring->packet_stride = 0;
	}

	tx_desc->cmd_type_offset_bsz =
			build_ctob(td_cmd, td_offset, size, td_tag);

	/* Force memory writes to complete before letting h/w know there
	 * are new descriptors to fetch.
	 *
	 * We also use this memory barrier to make certain all of the
	 * status bits have been updated before next_to_watch is written.
	 */
	wmb();

	/* set next_to_watch value indicating a packet is present */
	first->next_to_watch = tx_desc;

	/* notify HW of packet */
	if (desc_count) {
		writel(i, tx_ring->tail);

		/* we need this if more than one processor can write to our
		 * tail at a time; it synchronizes IO on IA64/Altix systems
		 */
		mmiowb();
	}

	return;

dma_error:
	dev_info(tx_ring->dev, "TX DMA map failed\n");

	/* clear dma mappings for failed tx_bi map */
	for (;;) {
		tx_bi = &tx_ring->tx_bi[i];
		i40e_unmap_and_free_tx_resource(tx_ring, tx_bi);
		if (tx_bi == first)
			break;
		if (i == 0)
			i = tx_ring->count;
		i--;
	}

	tx_ring->next_to_use = i;
}

/**
 * i40e_xmit_frame_ring - Sends buffer on Tx ring
 * @skb:     send buffer
 * @tx_ring: ring to send buffer on
 *
 * Returns NETDEV_TX_OK if sent, else an error code
 **/
static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff *skb,
					struct i40e_ring *tx_ring)
{
	u64 cd_type_cmd_tso_mss = I40E_TX_DESC_DTYPE_CONTEXT;
	u32 cd_tunneling = 0, cd_l2tag2 = 0;
	struct i40e_tx_buffer *first;
	u32 td_offset = 0;
	int tso, count;
	u32 tx_flags = 0;
	__be16 protocol;
	u32 td_cmd = 0;
	u8 hdr_len = 0;
	int tsyn;

	/* prefetch the data, we'll need it later */
	prefetch(skb->data);

	count = i40e_xmit_descriptor_count(skb);
	if (i40e_chk_linearize(skb, count)) {
		if (__skb_linearize(skb)) {
			dev_kfree_skb_any(skb);
			return NETDEV_TX_OK;
		}
		count = i40e_txd_use_count(skb->len);
		tx_ring->tx_stats.tx_linearize++;
	}

	/* need: 1 descriptor per page * PAGE_SIZE/I40E_MAX_DATA_PER_TXD,
	 *       + 1 desc for skb_head_len/I40E_MAX_DATA_PER_TXD,
	 *       + 4 desc gap to avoid the cache line where head is,
	 *       + 1 desc for context descriptor,
	 * otherwise try next time
	 */
	if (i40e_maybe_stop_tx(tx_ring, count + 4 + 1)) {
		tx_ring->tx_stats.tx_busy++;
		return NETDEV_TX_BUSY;
	}

	/* record the location of the first descriptor for this packet */
	first = &tx_ring->tx_bi[tx_ring->next_to_use];
	first->skb = skb;
	first->bytecount = skb->len;
	first->gso_segs = 1;

	/* prepare the xmit flags */
	if (i40e_tx_prepare_vlan_flags(skb, tx_ring, &tx_flags))
		goto out_drop;

	/* obtain protocol of skb */
	protocol = vlan_get_protocol(skb);

	/* setup IPv4/IPv6 offloads */
	if (protocol == htons(ETH_P_IP))
		tx_flags |= I40E_TX_FLAGS_IPV4;
	else if (protocol == htons(ETH_P_IPV6))
		tx_flags |= I40E_TX_FLAGS_IPV6;

	tso = i40e_tso(first, &hdr_len, &cd_type_cmd_tso_mss);

	if (tso < 0)
		goto out_drop;
	else if (tso)
		tx_flags |= I40E_TX_FLAGS_TSO;

	/* Always offload the checksum, since it's in the data descriptor */
	tso = i40e_tx_enable_csum(skb, &tx_flags, &td_cmd, &td_offset,
				  tx_ring, &cd_tunneling);
	if (tso < 0)
		goto out_drop;

	tsyn = i40e_tsyn(tx_ring, skb, tx_flags, &cd_type_cmd_tso_mss);

	if (tsyn)
		tx_flags |= I40E_TX_FLAGS_TSYN;

	skb_tx_timestamp(skb);

	/* always enable CRC insertion offload */
	td_cmd |= I40E_TX_DESC_CMD_ICRC;

	i40e_create_tx_ctx(tx_ring, cd_type_cmd_tso_mss,
			   cd_tunneling, cd_l2tag2);

	/* Add Flow Director ATR if it's enabled.
	 *
	 * NOTE: this must always be directly before the data descriptor.
	 */
	i40e_atr(tx_ring, skb, tx_flags);

	i40e_tx_map(tx_ring, skb, first, tx_flags, hdr_len,
		    td_cmd, td_offset);

	return NETDEV_TX_OK;

out_drop:
	dev_kfree_skb_any(first->skb);
	first->skb = NULL;
	return NETDEV_TX_OK;
}

/**
 * i40e_lan_xmit_frame - Selects the correct VSI and Tx queue to send buffer
 * @skb:    send buffer
 * @netdev: network interface device structure
 *
 * Returns NETDEV_TX_OK if sent, else an error code
 **/
netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
{
	struct i40e_netdev_priv *np = netdev_priv(netdev);
	struct i40e_vsi *vsi = np->vsi;
	struct i40e_ring *tx_ring = vsi->tx_rings[skb->queue_mapping];

	/* hardware can't handle really short frames, hardware padding works
	 * beyond this point
	 */
	if (skb_put_padto(skb, I40E_MIN_TX_LEN))
		return NETDEV_TX_OK;

	return i40e_xmit_frame_ring(skb, tx_ring);
}