[qca-nss-sfe] Use HW csum in TX if possible.

Change-Id: I7857b24e1fa0e240a41109b6c4ce0625293a8154
Signed-off-by: Ratheesh Kannoth <quic_rkannoth@quicinc.com>
diff --git a/sfe_ipv4.c b/sfe_ipv4.c
index 0e565d9..cf04e12 100644
--- a/sfe_ipv4.c
+++ b/sfe_ipv4.c
@@ -748,7 +748,9 @@
 	}
 
 	/*
-	 * Validate ip csum
+	 * Validate ip csum if necessary. If ip_summed is set to CHECKSUM_UNNECESSARY, it is assumed
+	 * that the L3 checksum is validated by the Rx interface or the tunnel interface that has
+	 * generated the packet.
 	 */
 	iph = (struct iphdr *)skb->data;
 	if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY) && (ip_fast_csum((u8 *)iph, iph->ihl))) {
@@ -1080,6 +1082,7 @@
 		original_cm->dscp = msg->dscp_rule.flow_dscp << SFE_IPV4_DSCP_SHIFT;
 		original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK;
 	}
+
 #ifdef CONFIG_NF_FLOW_COOKIE
 	original_cm->flow_cookie = 0;
 #endif
@@ -1089,8 +1092,19 @@
 	} else {
 		original_cm->flow_accel = 1;
 	}
-
 #endif
+	/*
+	 * If l2_features are disabled and flow uses l2 features such as macvlan/bridge/pppoe/vlan,
+	 * bottom interfaces are expected to be disabled in the flow rule and always top interfaces
+	 * are used. In such cases, do not use HW csum offload. csum offload is used only when we
+	 * are sending directly to the destination interface that supports it.
+	 */
+	if (likely(dest_dev->features & NETIF_F_HW_CSUM)) {
+		if ((msg->conn_rule.return_top_interface_num == msg->conn_rule.return_interface_num) ||
+			(msg->rule_flags & SFE_RULE_CREATE_FLAG_USE_RETURN_BOTTOM_INTERFACE)) {
+			 original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD;
+		}
+	}
 
 	/*
 	 * For the non-arp interface, we don't write L2 HDR.
@@ -1168,6 +1182,18 @@
 	}
 
 #endif
+	/*
+	 * If l2_features are disabled and flow uses l2 features such as macvlan/bridge/pppoe/vlan,
+	 * bottom interfaces are expected to be disabled in the flow rule and always top interfaces
+	 * are used. In such cases, do not use HW csum offload. csum offload is used for the reply
+	 * direction only when we are sending directly to an interface (src_dev here) that supports it.
+	 */
+	if (likely(src_dev->features & NETIF_F_HW_CSUM)) {
+		if ((msg->conn_rule.flow_top_interface_num == msg->conn_rule.flow_interface_num) ||
+			(msg->rule_flags & SFE_RULE_CREATE_FLAG_USE_FLOW_BOTTOM_INTERFACE)) {
+			 reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD;
+		}
+	}
 
 	/*
 	 * For the non-arp interface, we don't write L2 HDR.
diff --git a/sfe_ipv4.h b/sfe_ipv4.h
index 01baf2e..bdfd72d 100644
--- a/sfe_ipv4.h
+++ b/sfe_ipv4.h
@@ -53,6 +53,8 @@
 					/* remark priority of SKB */
 #define SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK (1<<6)
 					/* remark DSCP of packet */
+#define SFE_IPV4_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD (1<<7)
+					/* use HW checksum offload on xmit */
 
 /*
  * IPv4 connection matching structure.
diff --git a/sfe_ipv4_tcp.c b/sfe_ipv4_tcp.c
index df8df0c..b9dcdce 100644
--- a/sfe_ipv4_tcp.c
+++ b/sfe_ipv4_tcp.c
@@ -127,6 +127,7 @@
 	u32 flags;
 	struct net_device *xmit_dev;
 	bool ret;
+	bool hw_csum;
 
 	/*
 	 * Is our packet too short to contain a valid UDP header?
@@ -496,6 +497,13 @@
 	iph->ttl = ttl - 1;
 
 	/*
+	 * Enable HW csum only if the rx checksum is verified and the xmit interface is CSUM offload capable.
+	 * Note: If the L4 csum was not verified at Rx (and so may be incorrect), we (router) must update it
+	 * incrementally here so that HW does not re-calculate it and replace a bad L4 csum with a valid one.
+	 */
+	hw_csum = !!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD) && (skb->ip_summed == CHECKSUM_UNNECESSARY);
+
+	/*
 	 * Do we have to perform translations of the source address/port?
 	 */
 	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) {
@@ -505,19 +513,17 @@
 		iph->saddr = cm->xlate_src_ip;
 		tcph->source = cm->xlate_src_port;
 
-		/*
-		 * Do we have a non-zero UDP checksum?  If we do then we need
-		 * to update it.
-		 */
-		tcp_csum = tcph->check;
-		if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) {
-			sum = tcp_csum + cm->xlate_src_partial_csum_adjustment;
-		} else {
-			sum = tcp_csum + cm->xlate_src_csum_adjustment;
-		}
+		if (unlikely(!hw_csum)) {
+			tcp_csum = tcph->check;
+			if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) {
+				sum = tcp_csum + cm->xlate_src_partial_csum_adjustment;
+			} else {
+				sum = tcp_csum + cm->xlate_src_csum_adjustment;
+			}
 
-		sum = (sum & 0xffff) + (sum >> 16);
-		tcph->check = (u16)sum;
+			sum = (sum & 0xffff) + (sum >> 16);
+			tcph->check = (u16)sum;
+		}
 	}
 
 	/*
@@ -530,25 +536,30 @@
 		iph->daddr = cm->xlate_dest_ip;
 		tcph->dest = cm->xlate_dest_port;
 
-		/*
-		 * Do we have a non-zero UDP checksum?  If we do then we need
-		 * to update it.
-		 */
-		tcp_csum = tcph->check;
-		if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) {
-			sum = tcp_csum + cm->xlate_dest_partial_csum_adjustment;
-		} else {
-			sum = tcp_csum + cm->xlate_dest_csum_adjustment;
-		}
+		if (unlikely(!hw_csum)) {
+			tcp_csum = tcph->check;
+			if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) {
+				sum = tcp_csum + cm->xlate_dest_partial_csum_adjustment;
+			} else {
+				sum = tcp_csum + cm->xlate_dest_csum_adjustment;
+			}
 
-		sum = (sum & 0xffff) + (sum >> 16);
-		tcph->check = (u16)sum;
+			sum = (sum & 0xffff) + (sum >> 16);
+			tcph->check = (u16)sum;
+		}
 	}
 
 	/*
-	 * Replace the IP checksum.
+	 * With HW checksum offload, mark the skb CHECKSUM_PARTIAL to request checksumming from the xmit
+	 * interface. Otherwise write the full L3 checksum here (any L4 checksum adjustment was applied
+	 * incrementally above) and set CHECKSUM_UNNECESSARY so it is not recalculated later in the path.
 	 */
-	iph->check = sfe_ipv4_gen_ip_csum(iph);
+	if (likely(hw_csum)) {
+		skb->ip_summed = CHECKSUM_PARTIAL;
+	} else {
+		iph->check = sfe_ipv4_gen_ip_csum(iph);
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
 
 	/*
 	 * Update traffic stats.
diff --git a/sfe_ipv4_udp.c b/sfe_ipv4_udp.c
index a13c923..1d1a4df 100644
--- a/sfe_ipv4_udp.c
+++ b/sfe_ipv4_udp.c
@@ -45,6 +45,7 @@
 	u8 ttl;
 	struct net_device *xmit_dev;
 	bool ret;
+	bool hw_csum;
 
 	/*
 	 * Is our packet too short to contain a valid UDP header?
@@ -197,6 +198,13 @@
 	iph->ttl = ttl - 1;
 
 	/*
+	 * Enable HW csum only if the rx checksum is verified and the xmit interface is CSUM offload capable.
+	 * Note: If the L4 csum was not verified at Rx (and so may be incorrect), we (router) must update it
+	 * incrementally here so that HW does not re-calculate it and replace a bad L4 csum with a valid one.
+	 */
+	hw_csum = !!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD) && (skb->ip_summed == CHECKSUM_UNNECESSARY);
+
+	/*
 	 * Do we have to perform translations of the source address/port?
 	 */
 	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) {
@@ -209,18 +217,20 @@
 		 * Do we have a non-zero UDP checksum?  If we do then we need
 		 * to update it.
 		 */
-		udp_csum = udph->check;
-		if (likely(udp_csum)) {
-			u32 sum;
+		if (unlikely(!hw_csum)) {
+			udp_csum = udph->check;
+			if (likely(udp_csum)) {
+				u32 sum;
 
-			if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) {
-				sum = udp_csum + cm->xlate_src_partial_csum_adjustment;
-			} else {
-				sum = udp_csum + cm->xlate_src_csum_adjustment;
+				if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) {
+					sum = udp_csum + cm->xlate_src_partial_csum_adjustment;
+				} else {
+					sum = udp_csum + cm->xlate_src_csum_adjustment;
+				}
+
+				sum = (sum & 0xffff) + (sum >> 16);
+				udph->check = (u16)sum;
 			}
-
-			sum = (sum & 0xffff) + (sum >> 16);
-			udph->check = (u16)sum;
 		}
 	}
 
@@ -237,25 +247,38 @@
 		 * Do we have a non-zero UDP checksum?  If we do then we need
 		 * to update it.
 		 */
-		udp_csum = udph->check;
-		if (likely(udp_csum)) {
-			u32 sum;
+		if (unlikely(!hw_csum)) {
+			udp_csum = udph->check;
+			if (likely(udp_csum)) {
+				u32 sum;
 
-			if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) {
-				sum = udp_csum + cm->xlate_dest_partial_csum_adjustment;
-			} else {
-				sum = udp_csum + cm->xlate_dest_csum_adjustment;
+				/*
+				 * TODO: Use a common API for below incremental checksum calculation
+				 * for IPv4/IPv6 UDP/TCP
+				 */
+				if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) {
+					sum = udp_csum + cm->xlate_dest_partial_csum_adjustment;
+				} else {
+					sum = udp_csum + cm->xlate_dest_csum_adjustment;
+				}
+
+				sum = (sum & 0xffff) + (sum >> 16);
+				udph->check = (u16)sum;
 			}
-
-			sum = (sum & 0xffff) + (sum >> 16);
-			udph->check = (u16)sum;
 		}
 	}
 
 	/*
-	 * Replace the IP checksum.
+	 * With HW checksum offload, mark the skb CHECKSUM_PARTIAL to request checksumming from the xmit
+	 * interface. Otherwise write the full L3 checksum here (any L4 checksum adjustment was applied
+	 * incrementally above) and set CHECKSUM_UNNECESSARY so it is not recalculated later in the path.
 	 */
-	iph->check = sfe_ipv4_gen_ip_csum(iph);
+	if (likely(hw_csum)) {
+		skb->ip_summed = CHECKSUM_PARTIAL;
+	} else {
+		iph->check = sfe_ipv4_gen_ip_csum(iph);
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
 
 	/*
 	 * Update traffic stats.
diff --git a/sfe_ipv6.c b/sfe_ipv6.c
index 50c35b6..e53cef5 100644
--- a/sfe_ipv6.c
+++ b/sfe_ipv6.c
@@ -1075,6 +1075,18 @@
 		original_cm->flow_accel = 1;
 	}
 #endif
+	/*
+	 * If l2_features are disabled and flow uses l2 features such as macvlan/bridge/pppoe/vlan,
+	 * bottom interfaces are expected to be disabled in the flow rule and always top interfaces
+	 * are used. In such cases, do not use HW csum offload. csum offload is used only when we
+	 * are sending directly to the destination interface that supports it.
+	 */
+	if (likely(dest_dev->features & NETIF_F_HW_CSUM)) {
+		if ((msg->conn_rule.return_top_interface_num == msg->conn_rule.return_interface_num) ||
+			(msg->rule_flags & SFE_RULE_CREATE_FLAG_USE_RETURN_BOTTOM_INTERFACE)) {
+			 original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD;
+		}
+	}
 
 	/*
 	 * For the non-arp interface, we don't write L2 HDR.
@@ -1149,6 +1161,19 @@
 	}
 #endif
 	/*
+	 * If l2_features are disabled and flow uses l2 features such as macvlan/bridge/pppoe/vlan,
+	 * bottom interfaces are expected to be disabled in the flow rule and always top interfaces
+	 * are used. In such cases, do not use HW csum offload. csum offload is used for the reply
+	 * direction only when we are sending directly to an interface (src_dev here) that supports it.
+	 */
+	if (likely(src_dev->features & NETIF_F_HW_CSUM)) {
+		if ((msg->conn_rule.flow_top_interface_num == msg->conn_rule.flow_interface_num) ||
+			(msg->rule_flags & SFE_RULE_CREATE_FLAG_USE_FLOW_BOTTOM_INTERFACE)) {
+			 reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD;
+		}
+	}
+
+	/*
 	 * For the non-arp interface, we don't write L2 HDR.
 	 */
 	if (!(src_dev->flags & IFF_NOARP)) {
diff --git a/sfe_ipv6.h b/sfe_ipv6.h
index d3812f1..e9c58bd 100644
--- a/sfe_ipv6.h
+++ b/sfe_ipv6.h
@@ -66,6 +66,8 @@
 					/* remark priority of SKB */
 #define SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK (1<<6)
 					/* remark DSCP of packet */
+#define SFE_IPV6_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD (1<<7)
+					/* use HW checksum offload on xmit */
 
 /*
  * IPv6 connection matching structure.
diff --git a/sfe_ipv6_tcp.c b/sfe_ipv6_tcp.c
index c936116..04870e9 100644
--- a/sfe_ipv6_tcp.c
+++ b/sfe_ipv6_tcp.c
@@ -126,6 +126,7 @@
 	u32 flags;
 	struct net_device *xmit_dev;
 	bool ret;
+	bool hw_csum;
 
 	/*
 	 * Is our packet too short to contain a valid UDP header?
@@ -504,6 +505,13 @@
 	iph->hop_limit -= 1;
 
 	/*
+	 * Enable HW csum only if the rx checksum is verified and the xmit interface is CSUM offload capable.
+	 * Note: If the L4 csum was not verified at Rx (and so may be incorrect), we (router) must update it
+	 * incrementally here so that HW does not re-calculate it and replace a bad L4 csum with a valid one.
+	 */
+	hw_csum = !!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD) && (skb->ip_summed == CHECKSUM_UNNECESSARY);
+
+	/*
 	 * Do we have to perform translations of the source address/port?
 	 */
 	if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC)) {
@@ -516,14 +524,12 @@
 		iph->saddr.s6_addr32[3] = cm->xlate_src_ip[0].addr[3];
 		tcph->source = cm->xlate_src_port;
 
-		/*
-		 * Do we have a non-zero UDP checksum?  If we do then we need
-		 * to update it.
-		 */
-		tcp_csum = tcph->check;
-		sum = tcp_csum + cm->xlate_src_csum_adjustment;
-		sum = (sum & 0xffff) + (sum >> 16);
-		tcph->check = (u16)sum;
+		if (unlikely(!hw_csum)) {
+			tcp_csum = tcph->check;
+			sum = tcp_csum + cm->xlate_src_csum_adjustment;
+			sum = (sum & 0xffff) + (sum >> 16);
+			tcph->check = (u16)sum;
+		}
 	}
 
 	/*
@@ -539,14 +545,23 @@
 		iph->daddr.s6_addr32[3] = cm->xlate_dest_ip[0].addr[3];
 		tcph->dest = cm->xlate_dest_port;
 
-		/*
-		 * Do we have a non-zero UDP checksum?  If we do then we need
-		 * to update it.
-		 */
-		tcp_csum = tcph->check;
-		sum = tcp_csum + cm->xlate_dest_csum_adjustment;
-		sum = (sum & 0xffff) + (sum >> 16);
-		tcph->check = (u16)sum;
+		if (unlikely(!hw_csum)) {
+			tcp_csum = tcph->check;
+			sum = tcp_csum + cm->xlate_dest_csum_adjustment;
+			sum = (sum & 0xffff) + (sum >> 16);
+			tcph->check = (u16)sum;
+		}
+	}
+
+	/*
+	 * With HW checksum offload, mark the skb CHECKSUM_PARTIAL to request L4 checksumming from the xmit
+	 * interface. Otherwise any L4 checksum adjustment was already applied incrementally above, so set
+	 * CHECKSUM_UNNECESSARY to ensure the checksum is not recalculated further in the packet path.
+	 */
+	if (likely(hw_csum)) {
+		skb->ip_summed = CHECKSUM_PARTIAL;
+	} else {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
 	}
 
 	/*
diff --git a/sfe_ipv6_udp.c b/sfe_ipv6_udp.c
index 559425e..2523255 100644
--- a/sfe_ipv6_udp.c
+++ b/sfe_ipv6_udp.c
@@ -44,6 +44,7 @@
 	struct sfe_ipv6_connection_match *cm;
 	struct net_device *xmit_dev;
 	bool ret;
+	bool hw_csum;
 
 	/*
 	 * Is our packet too short to contain a valid UDP header?
@@ -198,6 +199,13 @@
 	iph->hop_limit -= 1;
 
 	/*
+	 * Enable HW csum only if the rx checksum is verified and the xmit interface is CSUM offload capable.
+	 * Note: If the L4 csum was not verified at Rx (and so may be incorrect), we (router) must update it
+	 * incrementally here so that HW does not re-calculate it and replace a bad L4 csum with a valid one.
+	 */
+	hw_csum = !!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD) && (skb->ip_summed == CHECKSUM_UNNECESSARY);
+
+	/*
 	 * Do we have to perform translations of the source address/port?
 	 */
 	if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC)) {
@@ -213,11 +221,13 @@
 		 * Do we have a non-zero UDP checksum?  If we do then we need
 		 * to update it.
 		 */
-		udp_csum = udph->check;
-		if (likely(udp_csum)) {
-			u32 sum = udp_csum + cm->xlate_src_csum_adjustment;
-			sum = (sum & 0xffff) + (sum >> 16);
-			udph->check = (u16)sum;
+		if (unlikely(!hw_csum)) {
+			udp_csum = udph->check;
+			if (likely(udp_csum)) {
+				u32 sum = udp_csum + cm->xlate_src_csum_adjustment;
+				sum = (sum & 0xffff) + (sum >> 16);
+				udph->check = (u16)sum;
+			}
 		}
 	}
 
@@ -237,15 +247,28 @@
 		 * Do we have a non-zero UDP checksum?  If we do then we need
 		 * to update it.
 		 */
-		udp_csum = udph->check;
-		if (likely(udp_csum)) {
-			u32 sum = udp_csum + cm->xlate_dest_csum_adjustment;
-			sum = (sum & 0xffff) + (sum >> 16);
-			udph->check = (u16)sum;
+		if (unlikely(!hw_csum)) {
+			udp_csum = udph->check;
+			if (likely(udp_csum)) {
+				u32 sum = udp_csum + cm->xlate_dest_csum_adjustment;
+				sum = (sum & 0xffff) + (sum >> 16);
+				udph->check = (u16)sum;
+			}
 		}
 	}
 
 	/*
+	 * With HW checksum offload, mark the skb CHECKSUM_PARTIAL to request L4 checksumming from the xmit
+	 * interface. Otherwise any L4 checksum adjustment was already applied incrementally above, so set
+	 * CHECKSUM_UNNECESSARY to ensure the checksum is not recalculated further in the packet path.
+	 */
+	if (likely(hw_csum)) {
+		skb->ip_summed = CHECKSUM_PARTIAL;
+	} else {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+
+	/*
 	 * Update traffic stats.
 	 */
 	atomic_inc(&cm->rx_packet_count);