From 30f421d1ea49c2999430da4f815c52ebf431e0cc Mon Sep 17 00:00:00 2001 From: ywc689 Date: Mon, 11 Dec 2023 18:06:57 +0800 Subject: [PATCH 1/2] ipvs: toa enhancements 1. Do not insert toa data in syn packets any more. 2. Compact tcp option space or remove some options when inserting toa data fails, and then try again. 3. Emit a warning log for audit when toa insertion finally fails. Signed-off-by: ywc689 --- src/ipvs/ip_vs_proto_tcp.c | 184 +++++++++++++++++++++++++++---------- 1 file changed, 136 insertions(+), 48 deletions(-) diff --git a/src/ipvs/ip_vs_proto_tcp.c b/src/ipvs/ip_vs_proto_tcp.c index 75e5713a..8c7edb50 100644 --- a/src/ipvs/ip_vs_proto_tcp.c +++ b/src/ipvs/ip_vs_proto_tcp.c @@ -306,46 +306,129 @@ static void tcp_in_remove_ts(struct tcphdr *tcph) } } -/* use NOP option to replace TCP_OLEN_IP4_ADDR and TCP_OLEN_IP6_ADDR opt */ -static void tcp_in_remove_toa(struct tcphdr *tcph, int af) +/* + * Remove NOP and TOA options present in the mbuf and compact option space. + * If there is still not enough space, trim more options except for the protected ones. + * + * Return the trimmed length on success, otherwise dpvs error num on failure. + * */ +static int tcp_in_prune_options(int af, int reqlen, struct rte_mbuf *mbuf, struct tcphdr *tcph) { - unsigned char *ptr; - int len, i; - uint32_t tcp_opt_len = af == AF_INET ? 
TCP_OLEN_IP4_ADDR : TCP_OLEN_IP6_ADDR; + unsigned char *ptr, *fast, *slow; + const unsigned char *l3hdr, *payload; + int i, optlen; + unsigned int pruned; + uint8_t opcode, opsize; + uint64_t opts_protected; + const uint8_t opts_maxlen[64] = { + [2] = 4, [3] = 3, [4] = 2, + [8] = 10, [30] = 40, [34] = 18 + }; ptr = (unsigned char *)(tcph + 1); - len = (tcph->doff << 2) - sizeof(struct tcphdr); + fast = slow = ptr; + optlen = (tcph->doff << 2) - sizeof(struct tcphdr); + payload = ptr + optlen; - while (len > 0) { - int opcode = *ptr++; - int opsize; + if (optlen < reqlen) /* make no sense to do anything */ + return 0; + while (optlen > 0) { + opcode = *ptr++; switch (opcode) { case TCP_OPT_EOL: - return; + goto fini; case TCP_OPT_NOP: - len--; + fast++; + optlen--; continue; default: opsize = *ptr++; - if (opsize < 2) /* silly options */ - return; - if (opsize > len) - return; /* partial options */ - if ((opcode == TCP_OPT_ADDR) && (opsize == tcp_opt_len)) { - for (i = 0; i < tcp_opt_len; i++) { - *(ptr - 2 + i) = TCP_OPT_NOP; + if (opsize < 2) /* silly options */ + goto fini; + if (opsize > optlen) /* partial options */ + goto fini; + if (opcode == TCP_OPT_ADDR) { + fast += opsize; + } else { + for (i = 0; i < opsize; i++) { + if (slow != fast) + *slow = *fast; + slow++; + fast++; } - /* DON'T RETURN - * keep search other TCP_OPT_ADDR ,and clear them. - * See https://github.com/iqiyi/dpvs/pull/925 for more detail. 
*/ } ptr += opsize - 2; - len -= opsize; + optlen -= opsize; break; } } + +fini: + pruned = payload - slow; + if (pruned < reqlen) { + /* further trim the options, the tcp functionality relies on unprotected + * options may get hurt, refer to: + * https://www.iana.org/assignments/tcp-parameters/tcp-parameters.xhtml + * #tcp-parameters-1 + * */ + ptr = slow; + slow = fast = (unsigned char *)(tcph + 1); + if (tcph->syn) + opts_protected = (1ULL << 2) | (1ULL << 3) | (1ULL << 4) /* MSS, WS, SACKP */ + | (1ULL << 8) | (1ULL << 30) | (1ULL << 34); /* TS, MPTCP, TFO */ + else + opts_protected = (1ULL << 8); /* TS, drop SACK, MPTCP DSS/REMOVE_ADDR */ + while (fast < ptr) { + opcode = *fast; + opsize = *(fast + 1); + if (opcode < 64 && ((1ULL << opcode) & opts_protected) + && (opsize <= opts_maxlen[opcode])) { + for (i = 0; i < opsize; i++) + *slow++ = *fast++; + opts_protected ^= (1ULL << opcode); + } else { + fast += opsize; + pruned += opsize; + if (pruned >= reqlen) { + while (fast < ptr) + *slow++ = *fast++; + break; + } + } + } + pruned = payload - slow; + } + if (pruned > 0) { + while (pruned & 0x3) { /* 4-bytes alignment for tcp options */ + *slow++ = 0; + pruned--; + } + if (!pruned) + return 0; + /* trim the packet */ + l3hdr = rte_pktmbuf_mtod(mbuf, void *); + if (unlikely(mbuf_may_pull(mbuf, mbuf->pkt_len) != 0)) { + memset(slow, 0, pruned); + return EDPVS_INVPKT; + } + if (unlikely(payload - l3hdr > mbuf->pkt_len)) { + memset(slow, 0, pruned); + return EDPVS_INVPKT; + } + memmove(slow, payload, mbuf->pkt_len - (payload - l3hdr)); + rte_pktmbuf_trim(mbuf, pruned); + tcph->doff -= (pruned >> 2); + if (af == AF_INET) + ((struct rte_ipv4_hdr *)l3hdr)->total_length = + htons(ntohs(((struct rte_ipv4_hdr *)l3hdr)->total_length) - pruned); + else + ((struct rte_ipv6_hdr *)l3hdr)->payload_len = + htons(ntohs(((struct rte_ipv6_hdr *)l3hdr)->payload_len) - pruned); + return pruned; + } + return 0; } static int tcp_in_add_proxy_proto(struct dp_vs_conn *conn, struct 
rte_mbuf *mbuf, @@ -797,15 +880,12 @@ static int tcp_fnat_in_handler(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { struct tcphdr *th; - /* af/mbuf may be changed for nat64 which in af is ipv6 and out is ipv4 */ - int iaf, oaf; - int iphdrlen; + int af; /* outbound af */ + int iphdrlen, toalen; int err, pp_hdr_shift = 0; - iaf = tuplehash_in(conn).af; - oaf = tuplehash_out(conn).af; - - iphdrlen = ((AF_INET6 == oaf) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); + af = tuplehash_out(conn).af; + iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); if (mbuf_may_pull(mbuf, iphdrlen + sizeof(*th)) != 0) return EDPVS_INVPKT; @@ -819,41 +899,49 @@ static int tcp_fnat_in_handler(struct dp_vs_proto *proto, /* * for SYN packet - * 1. remove tcp timestamp option - * laddress for different client have diff timestamp. - * 2. save original TCP sequence for seq-adjust later. - * since TCP option will be change. - * 3. add TOA option - * so that RS with TOA module can get real client IP. + * 1. remove tcp timestamp option, + * laddrs for different clients have diff timestamp. + * 2. save original TCP sequence for seq-adjust later + * since TCP option will be changed. 
*/ if (th->syn && !th->ack) { tcp_in_remove_ts(th); - tcp_in_init_seq(conn, mbuf, th); - if (PROXY_PROTOCOL_V1 != PROXY_PROTOCOL_VERSION(conn->pp_version) - && PROXY_PROTOCOL_V2 != PROXY_PROTOCOL_VERSION(conn->pp_version)) { - if (unlikely(tcp_in_add_toa(conn, mbuf, th) != EDPVS_OK)) { - tcp_in_remove_toa(th, iaf); - } - } } - /* add toa/proxy_proto to first data packet */ + /* Add toa/proxy_protocol to the first data packet */ if (ntohl(th->ack_seq) == conn->fnat_seq.fdata_seq && !th->syn && !th->rst /*&& !th->fin*/) { if (PROXY_PROTOCOL_V2 == PROXY_PROTOCOL_VERSION(conn->pp_version) || PROXY_PROTOCOL_V1 == PROXY_PROTOCOL_VERSION(conn->pp_version)) { if (conn->fnat_seq.isn - conn->fnat_seq.delta + 1 == ntohl(th->seq)) { - /* avoid inserting repetitive ppdata when the first rs ack delayed */ + /* avoid inserting repetitive proxy protocol data + * when the first rs ack is delayed */ err = tcp_in_add_proxy_proto(conn, mbuf, th, iphdrlen, &pp_hdr_shift); if (unlikely(EDPVS_OK != err)) RTE_LOG(INFO, IPVS, "%s: insert proxy protocol fail -- %s\n", __func__, dpvs_strerror(err)); th = ((void *)th) + pp_hdr_shift; } - } else { - if (unlikely(tcp_in_add_toa(conn, mbuf, th) != EDPVS_OK)) { - tcp_in_remove_toa(th, iaf); + } else { /* use toa */ + err = tcp_in_add_toa(conn, mbuf, th); + if (unlikely(EDPVS_OK != err)) { + toalen = tuplehash_in(conn).af == AF_INET ? TCP_OLEN_IP4_ADDR : TCP_OLEN_IP6_ADDR; + if (tcp_in_prune_options(af, toalen, mbuf, th) >= toalen + && (EDPVS_NOROOM == err || EDPVS_FRAG == err)) { + err = tcp_in_add_toa(conn, mbuf, th); + } + if (EDPVS_OK != err) { + char caddrbuf[64], vaddrbuf[64], laddrbuf[64], daddrbuf[64]; + const char *caddr, *vaddr, *laddr, *daddr; + caddr = inet_ntop(conn->af, &conn->caddr, caddrbuf, sizeof(caddrbuf)) ? caddrbuf : "::"; + vaddr = inet_ntop(conn->af, &conn->vaddr, vaddrbuf, sizeof(vaddrbuf)) ? vaddrbuf : "::"; + laddr = inet_ntop(af, &conn->laddr, laddrbuf, sizeof(laddrbuf)) ? 
laddrbuf : "::"; + daddr = inet_ntop(af, &conn->daddr, daddrbuf, sizeof(daddrbuf)) ? daddrbuf : "::"; + RTE_LOG(WARNING, IPVS, "TOA add failed(%s): [%s]:%d -> [%s]:%d; [%s]:%d -> [%s]:%d\n", + dpvs_strerror(err), caddr, htons(conn->cport), vaddr, htons(conn->vport), + laddr, htons(conn->lport), daddr, htons(conn->dport)); + } } } } @@ -864,7 +952,7 @@ static int tcp_fnat_in_handler(struct dp_vs_proto *proto, th->source = conn->lport; th->dest = conn->dport; - return tcp_send_csum(oaf, iphdrlen, th, conn, mbuf, conn->in_dev); + return tcp_send_csum(af, iphdrlen, th, conn, mbuf, conn->in_dev); } static int tcp_fnat_out_handler(struct dp_vs_proto *proto, From 58a0e7528a972da16966e6294c04be909874ce28 Mon Sep 17 00:00:00 2001 From: ywc689 Date: Tue, 12 Dec 2023 10:35:31 +0800 Subject: [PATCH 2/2] patch: update dcdn-toa.patch Signed-off-by: ywc689 --- patch/dcdn-toa.patch | 47 +++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/patch/dcdn-toa.patch b/patch/dcdn-toa.patch index 5107fcd4..3d9c84a9 100644 --- a/patch/dcdn-toa.patch +++ b/patch/dcdn-toa.patch @@ -1,32 +1,31 @@ -From 55e8e5da2b4b0893d36cb3f621bedf9833c4ea50 Mon Sep 17 00:00:00 2001 +From cee6889685240558ebea795615539b7289070842 Mon Sep 17 00:00:00 2001 From: wangyetong Date: Thu, 14 Sep 2023 15:33:42 +0800 Subject: [PATCH] added dcdn toa --- - include/ipvs/conn.h | 5 +++++ + include/ipvs/conn.h | 4 ++++ include/ipvs/proto_tcp.h | 2 ++ - src/ipvs/ip_vs_proto_tcp.c | 54 +++++++++++++++++++++++++++++++++++++++++++++- + src/ipvs/ip_vs_proto_tcp.c | 55 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 60 insertions(+), 1 deletion(-) diff --git a/include/ipvs/conn.h b/include/ipvs/conn.h -index fa0bdeb..88dcb44 100644 +index 843721e..78fb0ba 100644 --- a/include/ipvs/conn.h +++ b/include/ipvs/conn.h -@@ -166,6 +166,11 @@ struct dp_vs_conn { - /* flag for gfwip */ - bool outwall; +@@ -167,6 +167,10 @@ struct dp_vs_conn { + /* connection redirect 
in fnat/snat/nat modes */ + struct dp_vs_redirect *redirect; + /* dcdn toa found or not */ + bool dcdn_found; + /* dcdn toa address */ + struct in_addr dcdn_addr; -+ } __rte_cache_aligned; /* for syn-proxy to save all ack packet in conn before rs's syn-ack arrives */ diff --git a/include/ipvs/proto_tcp.h b/include/ipvs/proto_tcp.h -index 9f5162a..41d5646 100644 +index 3d1515a..f0cf50c 100644 --- a/include/ipvs/proto_tcp.h +++ b/include/ipvs/proto_tcp.h @@ -28,6 +28,7 @@ enum { @@ -46,11 +45,11 @@ index 9f5162a..41d5646 100644 #define TCP_OLEN_TSTAMP_ALIGNED 12 #define TCP_OLEN_SACK_BASE 2 diff --git a/src/ipvs/ip_vs_proto_tcp.c b/src/ipvs/ip_vs_proto_tcp.c -index cbb7cb2..2cd889a 100644 +index 6acbbca..5b185fa 100644 --- a/src/ipvs/ip_vs_proto_tcp.c +++ b/src/ipvs/ip_vs_proto_tcp.c -@@ -305,6 +305,43 @@ static void tcp_in_remove_ts(struct tcphdr *tcph) - } +@@ -441,6 +441,43 @@ static int tcp_in_add_proxy_proto(struct dp_vs_conn *conn, struct rte_mbuf *mbuf + return proxy_proto_insert(&ppinfo, conn, mbuf, tcph, hdr_shift); } +/* check dcdn toa option */ @@ -90,10 +89,10 @@ index cbb7cb2..2cd889a 100644 + return EDPVS_NOTEXIST; +} + - static inline int tcp_in_add_toa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, + static int tcp_in_add_toa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, struct tcphdr *tcph) { -@@ -382,7 +419,10 @@ static inline int tcp_in_add_toa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, +@@ -518,7 +555,10 @@ static int tcp_in_add_toa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, if (conn->af == AF_INET) { struct tcpopt_ip4_addr *toa_ip4 = (struct tcpopt_ip4_addr *)(tcph + 1); @@ -105,21 +104,18 @@ index cbb7cb2..2cd889a 100644 } else { struct tcpopt_ip6_addr *toa_ip6 = (struct tcpopt_ip6_addr *)(tcph + 1); -@@ -694,9 +734,13 @@ static int tcp_fnat_in_handler(struct dp_vs_proto *proto, - struct dp_vs_conn *conn, struct rte_mbuf *mbuf) - { - struct tcphdr *th; +@@ -842,6 +882,10 @@ static int tcp_fnat_in_handler(struct dp_vs_proto *proto, + 
int af; /* outbound af */ + int iphdrlen; + int err, pp_hdr_shift = 0; + struct in_addr dcdn_addr; - /* af/mbuf may be changed for nat64 which in af is ipv6 and out is ipv4 */ - int af = tuplehash_out(conn).af; - int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); +#ifdef CONFIG_DPVS_IPVS_DEBUG + char dcdn_buf[64]; +#endif - if (mbuf_may_pull(mbuf, iphdrlen + sizeof(*th)) != 0) - return EDPVS_INVPKT; -@@ -720,6 +764,14 @@ static int tcp_fnat_in_handler(struct dp_vs_proto *proto, + af = tuplehash_out(conn).af; + iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); +@@ -866,6 +910,15 @@ static int tcp_fnat_in_handler(struct dp_vs_proto *proto, if (th->syn && !th->ack) { tcp_in_remove_ts(th); tcp_in_init_seq(conn, mbuf, th); @@ -131,9 +127,10 @@ index cbb7cb2..2cd889a 100644 + RTE_LOG(DEBUG, IPVS, "get dcdn toa addr %s\n", dcdn_buf); +#endif + } - tcp_in_add_toa(conn, mbuf, th); ++ tcp_in_add_toa(conn, mbuf, th); } + /* Add toa/proxy_protocol to the first data packet */ -- 1.8.3.1