[NETD-BPF#3] Relocate Tethering/bpf_progs and rename bpf_tethering_headers
The folder is currently used by tethering offload only. Because netd.c
and clatd.c will be moved into it, move the folder up a tier.
Also, rename bpf_tethering_headers to bpf_connectivity_headers so that
connectivity code other than tethering is justified to use it.
Bug: 202086915
Test: atest FrameworksNetTests
Change-Id: I95943c6e909f1fdca12604ef0c55d67c39ca686b
diff --git a/bpf_progs/Android.bp b/bpf_progs/Android.bp
new file mode 100644
index 0000000..a52a63c
--- /dev/null
+++ b/bpf_progs/Android.bp
@@ -0,0 +1,60 @@
+//
+// Copyright (C) 2020 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+//
+// struct definitions shared with JNI
+//
+package {
+ default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+cc_library_headers {
+ name: "bpf_connectivity_headers",
+ vendor_available: false,
+ host_supported: false,
+ export_include_dirs: ["."],
+ cflags: [
+ "-Wall",
+ "-Werror",
+ ],
+ sdk_version: "30",
+ min_sdk_version: "30",
+ apex_available: ["com.android.tethering"],
+ visibility: [
+ "//packages/modules/Connectivity/Tethering",
+ ],
+}
+
+//
+// bpf kernel programs
+//
+bpf {
+ name: "offload.o",
+ srcs: ["offload.c"],
+ cflags: [
+ "-Wall",
+ "-Werror",
+ ],
+}
+
+bpf {
+ name: "test.o",
+ srcs: ["test.c"],
+ cflags: [
+ "-Wall",
+ "-Werror",
+ ],
+}
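+
+// Note: each 'bpf' module above compiles its single src with the BPF toolchain into
+// a standalone .o (there is no link step) for the bpfloader to load and pin at boot.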
diff --git a/bpf_progs/bpf_net_helpers.h b/bpf_progs/bpf_net_helpers.h
new file mode 100644
index 0000000..c798580
--- /dev/null
+++ b/bpf_progs/bpf_net_helpers.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2019 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <linux/bpf.h>
+#include <linux/if_packet.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+// this returns 0 iff skb->sk is NULL
+static uint64_t (*bpf_get_socket_cookie)(struct __sk_buff* skb) = (void*)BPF_FUNC_get_socket_cookie;
+
+static uint32_t (*bpf_get_socket_uid)(struct __sk_buff* skb) = (void*)BPF_FUNC_get_socket_uid;
+
+static int (*bpf_skb_pull_data)(struct __sk_buff* skb, __u32 len) = (void*)BPF_FUNC_skb_pull_data;
+
+static int (*bpf_skb_load_bytes)(struct __sk_buff* skb, int off, void* to,
+ int len) = (void*)BPF_FUNC_skb_load_bytes;
+
+static int (*bpf_skb_store_bytes)(struct __sk_buff* skb, __u32 offset, const void* from, __u32 len,
+ __u64 flags) = (void*)BPF_FUNC_skb_store_bytes;
+
+static int64_t (*bpf_csum_diff)(__be32* from, __u32 from_size, __be32* to, __u32 to_size,
+ __wsum seed) = (void*)BPF_FUNC_csum_diff;
+
+static int64_t (*bpf_csum_update)(struct __sk_buff* skb, __wsum csum) = (void*)BPF_FUNC_csum_update;
+
+static int (*bpf_skb_change_proto)(struct __sk_buff* skb, __be16 proto,
+ __u64 flags) = (void*)BPF_FUNC_skb_change_proto;
+static int (*bpf_l3_csum_replace)(struct __sk_buff* skb, __u32 offset, __u64 from, __u64 to,
+ __u64 flags) = (void*)BPF_FUNC_l3_csum_replace;
+static int (*bpf_l4_csum_replace)(struct __sk_buff* skb, __u32 offset, __u64 from, __u64 to,
+ __u64 flags) = (void*)BPF_FUNC_l4_csum_replace;
+static int (*bpf_redirect)(__u32 ifindex, __u64 flags) = (void*)BPF_FUNC_redirect;
+static int (*bpf_redirect_map)(const struct bpf_map_def* map, __u32 key,
+ __u64 flags) = (void*)BPF_FUNC_redirect_map;
+
+static int (*bpf_skb_change_head)(struct __sk_buff* skb, __u32 head_room,
+ __u64 flags) = (void*)BPF_FUNC_skb_change_head;
+static int (*bpf_skb_adjust_room)(struct __sk_buff* skb, __s32 len_diff, __u32 mode,
+ __u64 flags) = (void*)BPF_FUNC_skb_adjust_room;
+
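+// Note: each declaration above is a function pointer whose value is the numeric
+// BPF_FUNC_* helper id from <linux/bpf.h>; when the program is loaded, calls through
+// these pointers become direct invocations of the matching in-kernel helper.
+// Illustrative use (not part of this change):
+//   int rc = bpf_skb_pull_data(skb, 64);  // ask the kernel to linearize 64 bytes
+//   if (rc) { /* pull failed; skb unchanged */ }
+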
+// Android only supports little endian architectures
+#define htons(x) (__builtin_constant_p(x) ? ___constant_swab16(x) : __builtin_bswap16(x))
+#define htonl(x) (__builtin_constant_p(x) ? ___constant_swab32(x) : __builtin_bswap32(x))
+#define ntohs(x) htons(x)
+#define ntohl(x) htonl(x)
+
+static inline __always_inline __unused bool is_received_skb(struct __sk_buff* skb) {
+ return skb->pkt_type == PACKET_HOST || skb->pkt_type == PACKET_BROADCAST ||
+ skb->pkt_type == PACKET_MULTICAST;
+}
+
+// try to make the first 'len' header bytes readable via direct packet access
+static inline __always_inline void try_make_readable(struct __sk_buff* skb, int len) {
+ if (len > skb->len) len = skb->len;
+ if (skb->data_end - skb->data < len) bpf_skb_pull_data(skb, len);
+}
diff --git a/bpf_progs/bpf_tethering.h b/bpf_progs/bpf_tethering.h
new file mode 100644
index 0000000..b0ec8f6
--- /dev/null
+++ b/bpf_progs/bpf_tethering.h
@@ -0,0 +1,217 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <linux/if.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+
+// Common definitions for BPF code in the tethering mainline module.
+// These definitions are available to:
+// - The BPF programs in bpf_progs/
+// - JNI code that depends on the bpf_connectivity_headers library.
+
+#define BPF_TETHER_ERRORS \
+ ERR(INVALID_IP_VERSION) \
+ ERR(LOW_TTL) \
+ ERR(INVALID_TCP_HEADER) \
+ ERR(TCP_CONTROL_PACKET) \
+ ERR(NON_GLOBAL_SRC) \
+ ERR(NON_GLOBAL_DST) \
+ ERR(LOCAL_SRC_DST) \
+ ERR(NO_STATS_ENTRY) \
+ ERR(NO_LIMIT_ENTRY) \
+ ERR(BELOW_IPV4_MTU) \
+ ERR(BELOW_IPV6_MTU) \
+ ERR(LIMIT_REACHED) \
+ ERR(CHANGE_HEAD_FAILED) \
+ ERR(TOO_SHORT) \
+ ERR(HAS_IP_OPTIONS) \
+ ERR(IS_IP_FRAG) \
+ ERR(CHECKSUM) \
+ ERR(NON_TCP_UDP) \
+ ERR(NON_TCP) \
+ ERR(SHORT_L4_HEADER) \
+ ERR(SHORT_TCP_HEADER) \
+ ERR(SHORT_UDP_HEADER) \
+ ERR(UDP_CSUM_ZERO) \
+ ERR(TRUNCATED_IPV4) \
+ ERR(_MAX)
+
+#define ERR(x) BPF_TETHER_ERR_ ##x,
+enum {
+ BPF_TETHER_ERRORS
+};
+#undef ERR
+
+#define ERR(x) #x,
+static const char *bpf_tether_errors[] = {
+ BPF_TETHER_ERRORS
+};
+#undef ERR
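+
+// Illustration (not part of this change): the X-macro above expands once into enum
+// values and once into a parallel name table, ie. roughly:
+//   enum { BPF_TETHER_ERR_INVALID_IP_VERSION, BPF_TETHER_ERR_LOW_TTL, /* ... */ };
+//   static const char *bpf_tether_errors[] = { "INVALID_IP_VERSION", "LOW_TTL", /* ... */ };
+// so code reading the error counter map can translate an index i into a name via
+// (i < BPF_TETHER_ERR__MAX) ? bpf_tether_errors[i] : "?".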
+
+// This header file is shared by eBPF kernel programs (C) and netd (C++) and
+// some of the maps are also accessed directly from Java mainline module code.
+//
+// Hence: explicitly pad all relevant structures and assert that their size
+// is the sum of the sizes of their fields.
+#define STRUCT_SIZE(name, size) _Static_assert(sizeof(name) == (size), "Incorrect struct size.")
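+
+// For example, without the explicit 'uint8_t zero[2]' padding in TetherDownstream6Key
+// below, the compiler would silently insert two pad bytes to align the following
+// in6_addr, and any reader computing offsets by summing field sizes would be off by
+// two; the _Static_assert turns any such size drift into a compile-time error.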
+
+
+#define BPF_PATH_TETHER BPF_PATH "tethering/"
+
+#define TETHER_STATS_MAP_PATH BPF_PATH_TETHER "map_offload_tether_stats_map"
+
+typedef uint32_t TetherStatsKey; // upstream ifindex
+
+typedef struct {
+ uint64_t rxPackets;
+ uint64_t rxBytes;
+ uint64_t rxErrors;
+ uint64_t txPackets;
+ uint64_t txBytes;
+ uint64_t txErrors;
+} TetherStatsValue;
+STRUCT_SIZE(TetherStatsValue, 6 * 8); // 48
+
+#define TETHER_LIMIT_MAP_PATH BPF_PATH_TETHER "map_offload_tether_limit_map"
+
+typedef uint32_t TetherLimitKey; // upstream ifindex
+typedef uint64_t TetherLimitValue; // in bytes
+
+#define TETHER_DOWNSTREAM6_TC_PROG_RAWIP_NAME "prog_offload_schedcls_tether_downstream6_rawip"
+#define TETHER_DOWNSTREAM6_TC_PROG_ETHER_NAME "prog_offload_schedcls_tether_downstream6_ether"
+
+#define TETHER_DOWNSTREAM6_TC_PROG_RAWIP_PATH BPF_PATH_TETHER TETHER_DOWNSTREAM6_TC_PROG_RAWIP_NAME
+#define TETHER_DOWNSTREAM6_TC_PROG_ETHER_PATH BPF_PATH_TETHER TETHER_DOWNSTREAM6_TC_PROG_ETHER_NAME
+
+#define TETHER_DOWNSTREAM6_MAP_PATH BPF_PATH_TETHER "map_offload_tether_downstream6_map"
+
+// For now tethering offload only needs to support downstreams that use 6-byte MAC addresses,
+// because all downstream types that are currently supported (WiFi, USB, Bluetooth and
+// Ethernet) have 6-byte MAC addresses.
+
+typedef struct {
+ uint32_t iif; // The input interface index
+ uint8_t dstMac[ETH_ALEN]; // destination ethernet mac address (zeroed iff rawip ingress)
+ uint8_t zero[2]; // zero pad for 8 byte alignment
+ struct in6_addr neigh6; // The destination IPv6 address
+} TetherDownstream6Key;
+STRUCT_SIZE(TetherDownstream6Key, 4 + 6 + 2 + 16); // 28
+
+typedef struct {
+ uint32_t oif; // The output interface to redirect to
+ struct ethhdr macHeader; // includes dst/src mac and ethertype (zeroed iff rawip egress)
+ uint16_t pmtu; // The maximum L3 output path/route mtu
+} Tether6Value;
+STRUCT_SIZE(Tether6Value, 4 + 14 + 2); // 20
+
+#define TETHER_DOWNSTREAM64_MAP_PATH BPF_PATH_TETHER "map_offload_tether_downstream64_map"
+
+typedef struct {
+ uint32_t iif; // The input interface index
+ uint8_t dstMac[ETH_ALEN]; // destination ethernet mac address (zeroed iff rawip ingress)
+ uint16_t l4Proto; // IPPROTO_TCP/UDP/...
+ struct in6_addr src6; // source &
+ struct in6_addr dst6; // destination IPv6 addresses
+ __be16 srcPort; // source &
+ __be16 dstPort; // destination tcp/udp/... ports
+} TetherDownstream64Key;
+STRUCT_SIZE(TetherDownstream64Key, 4 + 6 + 2 + 16 + 16 + 2 + 2); // 48
+
+typedef struct {
+ uint32_t oif; // The output interface to redirect to
+ struct ethhdr macHeader; // includes dst/src mac and ethertype (zeroed iff rawip egress)
+ uint16_t pmtu; // The maximum L3 output path/route mtu
+ struct in_addr src4; // source &
+ struct in_addr dst4; // destination IPv4 addresses
+ __be16 srcPort; // source &
+ __be16 outPort; // destination tcp/udp/... ports
+ uint64_t lastUsed; // Kernel updates on each use with bpf_ktime_get_boot_ns()
+} TetherDownstream64Value;
+STRUCT_SIZE(TetherDownstream64Value, 4 + 14 + 2 + 4 + 4 + 2 + 2 + 8); // 40
+
+#define TETHER_UPSTREAM6_TC_PROG_RAWIP_NAME "prog_offload_schedcls_tether_upstream6_rawip"
+#define TETHER_UPSTREAM6_TC_PROG_ETHER_NAME "prog_offload_schedcls_tether_upstream6_ether"
+
+#define TETHER_UPSTREAM6_TC_PROG_RAWIP_PATH BPF_PATH_TETHER TETHER_UPSTREAM6_TC_PROG_RAWIP_NAME
+#define TETHER_UPSTREAM6_TC_PROG_ETHER_PATH BPF_PATH_TETHER TETHER_UPSTREAM6_TC_PROG_ETHER_NAME
+
+#define TETHER_UPSTREAM6_MAP_PATH BPF_PATH_TETHER "map_offload_tether_upstream6_map"
+
+typedef struct {
+ uint32_t iif; // The input interface index
+ uint8_t dstMac[ETH_ALEN]; // destination ethernet mac address (zeroed iff rawip ingress)
+ uint8_t zero[2]; // zero pad for 8 byte alignment
+ // TODO: extend this to include src ip /64 subnet
+} TetherUpstream6Key;
+STRUCT_SIZE(TetherUpstream6Key, 12);
+
+#define TETHER_DOWNSTREAM4_TC_PROG_RAWIP_NAME "prog_offload_schedcls_tether_downstream4_rawip"
+#define TETHER_DOWNSTREAM4_TC_PROG_ETHER_NAME "prog_offload_schedcls_tether_downstream4_ether"
+
+#define TETHER_DOWNSTREAM4_TC_PROG_RAWIP_PATH BPF_PATH_TETHER TETHER_DOWNSTREAM4_TC_PROG_RAWIP_NAME
+#define TETHER_DOWNSTREAM4_TC_PROG_ETHER_PATH BPF_PATH_TETHER TETHER_DOWNSTREAM4_TC_PROG_ETHER_NAME
+
+#define TETHER_DOWNSTREAM4_MAP_PATH BPF_PATH_TETHER "map_offload_tether_downstream4_map"
+
+
+#define TETHER_UPSTREAM4_TC_PROG_RAWIP_NAME "prog_offload_schedcls_tether_upstream4_rawip"
+#define TETHER_UPSTREAM4_TC_PROG_ETHER_NAME "prog_offload_schedcls_tether_upstream4_ether"
+
+#define TETHER_UPSTREAM4_TC_PROG_RAWIP_PATH BPF_PATH_TETHER TETHER_UPSTREAM4_TC_PROG_RAWIP_NAME
+#define TETHER_UPSTREAM4_TC_PROG_ETHER_PATH BPF_PATH_TETHER TETHER_UPSTREAM4_TC_PROG_ETHER_NAME
+
+#define TETHER_UPSTREAM4_MAP_PATH BPF_PATH_TETHER "map_offload_tether_upstream4_map"
+
+typedef struct {
+ uint32_t iif; // The input interface index
+ uint8_t dstMac[ETH_ALEN]; // destination ethernet mac address (zeroed iff rawip ingress)
+ uint16_t l4Proto; // IPPROTO_TCP/UDP/...
+ struct in_addr src4; // source &
+ struct in_addr dst4; // destination IPv4 addresses
+ __be16 srcPort; // source &
+ __be16 dstPort; // destination TCP/UDP/... ports
+} Tether4Key;
+STRUCT_SIZE(Tether4Key, 4 + 6 + 2 + 4 + 4 + 2 + 2); // 24
+
+typedef struct {
+ uint32_t oif; // The output interface to redirect to
+ struct ethhdr macHeader; // includes dst/src mac and ethertype (zeroed iff rawip egress)
+ uint16_t pmtu; // Maximum L3 output path/route mtu
+ struct in6_addr src46; // source & (always IPv4 mapped for downstream)
+ struct in6_addr dst46; // destination IP addresses (may be IPv4 mapped or IPv6 for upstream)
+ __be16 srcPort; // source &
+ __be16 dstPort; // destination tcp/udp/... ports
+ uint64_t last_used; // Kernel updates on each use with bpf_ktime_get_boot_ns()
+} Tether4Value;
+STRUCT_SIZE(Tether4Value, 4 + 14 + 2 + 16 + 16 + 2 + 2 + 8); // 64
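+
+// Note: src46/dst46 hold IPv4-mapped IPv6 addresses (::ffff:a.b.c.d) when the
+// translated address is IPv4, so the IPv4 address sits in s6_addr32[3] - which is
+// how do_forward4() in offload.c reads it, eg. v->dst46.s6_addr32[3].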
+
+#define TETHER_DOWNSTREAM_XDP_PROG_RAWIP_NAME "prog_offload_xdp_tether_downstream_rawip"
+#define TETHER_DOWNSTREAM_XDP_PROG_ETHER_NAME "prog_offload_xdp_tether_downstream_ether"
+
+#define TETHER_DOWNSTREAM_XDP_PROG_RAWIP_PATH BPF_PATH_TETHER TETHER_DOWNSTREAM_XDP_PROG_RAWIP_NAME
+#define TETHER_DOWNSTREAM_XDP_PROG_ETHER_PATH BPF_PATH_TETHER TETHER_DOWNSTREAM_XDP_PROG_ETHER_NAME
+
+#define TETHER_UPSTREAM_XDP_PROG_RAWIP_NAME "prog_offload_xdp_tether_upstream_rawip"
+#define TETHER_UPSTREAM_XDP_PROG_ETHER_NAME "prog_offload_xdp_tether_upstream_ether"
+
+#define TETHER_UPSTREAM_XDP_PROG_RAWIP_PATH BPF_PATH_TETHER TETHER_UPSTREAM_XDP_PROG_RAWIP_NAME
+#define TETHER_UPSTREAM_XDP_PROG_ETHER_PATH BPF_PATH_TETHER TETHER_UPSTREAM_XDP_PROG_ETHER_NAME
+
+#undef STRUCT_SIZE
diff --git a/bpf_progs/offload.c b/bpf_progs/offload.c
new file mode 100644
index 0000000..977e918
--- /dev/null
+++ b/bpf_progs/offload.c
@@ -0,0 +1,848 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <linux/if.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/pkt_cls.h>
+#include <linux/tcp.h>
+
+// bionic kernel uapi linux/udp.h header is munged...
+#define __kernel_udphdr udphdr
+#include <linux/udp.h>
+
+// The resulting .o needs to load on the Android S bpfloader v0.2
+#define BPFLOADER_MIN_VER 2u
+
+#include "bpf_helpers.h"
+#include "bpf_net_helpers.h"
+#include "bpf_tethering.h"
+
+// From kernel:include/net/ip.h
+#define IP_DF 0x4000 // Flag: "Don't Fragment"
+
+// ----- Helper functions for offsets to fields -----
+
+// They all assume simple IP packets:
+// - no VLAN ethernet tags
+// - no IPv4 options (see IPV4_HLEN/TCP4_OFFSET/UDP4_OFFSET)
+// - no IPv6 extension headers
+// - no TCP options (see TCP_HLEN)
+
+//#define ETH_HLEN sizeof(struct ethhdr)
+#define IP4_HLEN sizeof(struct iphdr)
+#define IP6_HLEN sizeof(struct ipv6hdr)
+#define TCP_HLEN sizeof(struct tcphdr)
+#define UDP_HLEN sizeof(struct udphdr)
+
+// Offsets from beginning of L4 (TCP/UDP) header
+#define TCP_OFFSET(field) offsetof(struct tcphdr, field)
+#define UDP_OFFSET(field) offsetof(struct udphdr, field)
+
+// Offsets from beginning of L3 (IPv4) header
+#define IP4_OFFSET(field) offsetof(struct iphdr, field)
+#define IP4_TCP_OFFSET(field) (IP4_HLEN + TCP_OFFSET(field))
+#define IP4_UDP_OFFSET(field) (IP4_HLEN + UDP_OFFSET(field))
+
+// Offsets from beginning of L3 (IPv6) header
+#define IP6_OFFSET(field) offsetof(struct ipv6hdr, field)
+#define IP6_TCP_OFFSET(field) (IP6_HLEN + TCP_OFFSET(field))
+#define IP6_UDP_OFFSET(field) (IP6_HLEN + UDP_OFFSET(field))
+
+// Offsets from beginning of L2 (ie. Ethernet) header (which must be present)
+#define ETH_IP4_OFFSET(field) (ETH_HLEN + IP4_OFFSET(field))
+#define ETH_IP4_TCP_OFFSET(field) (ETH_HLEN + IP4_TCP_OFFSET(field))
+#define ETH_IP4_UDP_OFFSET(field) (ETH_HLEN + IP4_UDP_OFFSET(field))
+#define ETH_IP6_OFFSET(field) (ETH_HLEN + IP6_OFFSET(field))
+#define ETH_IP6_TCP_OFFSET(field) (ETH_HLEN + IP6_TCP_OFFSET(field))
+#define ETH_IP6_UDP_OFFSET(field) (ETH_HLEN + IP6_UDP_OFFSET(field))
+
+// ----- Tethering Error Counters -----
+
+DEFINE_BPF_MAP_GRW(tether_error_map, ARRAY, uint32_t, uint32_t, BPF_TETHER_ERR__MAX,
+ AID_NETWORK_STACK)
+
+#define COUNT_AND_RETURN(counter, ret) do { \
+ uint32_t code = BPF_TETHER_ERR_ ## counter; \
+ uint32_t *count = bpf_tether_error_map_lookup_elem(&code); \
+ if (count) __sync_fetch_and_add(count, 1); \
+ return ret; \
+} while(0)
+
+#define TC_DROP(counter) COUNT_AND_RETURN(counter, TC_ACT_SHOT)
+#define TC_PUNT(counter) COUNT_AND_RETURN(counter, TC_ACT_PIPE)
+
+#define XDP_DROP(counter) COUNT_AND_RETURN(counter, XDP_DROP)
+#define XDP_PUNT(counter) COUNT_AND_RETURN(counter, XDP_PASS)
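+
+// For example, TC_PUNT(LOW_TTL) expands to: atomically increment the
+// BPF_TETHER_ERR_LOW_TTL slot of tether_error_map, then 'return TC_ACT_PIPE;',
+// handing the packet back to the regular stack instead of dropping it.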
+
+// ----- Tethering Data Stats and Limits -----
+
+// Tethering stats, indexed by upstream interface.
+DEFINE_BPF_MAP_GRW(tether_stats_map, HASH, TetherStatsKey, TetherStatsValue, 16, AID_NETWORK_STACK)
+
+// Tethering data limit, indexed by upstream interface.
+// (tethering allowed when stats[iif].rxBytes + stats[iif].txBytes < limit[iif])
+DEFINE_BPF_MAP_GRW(tether_limit_map, HASH, TetherLimitKey, TetherLimitValue, 16, AID_NETWORK_STACK)
+
+// ----- IPv6 Support -----
+
+DEFINE_BPF_MAP_GRW(tether_downstream6_map, HASH, TetherDownstream6Key, Tether6Value, 64,
+ AID_NETWORK_STACK)
+
+DEFINE_BPF_MAP_GRW(tether_downstream64_map, HASH, TetherDownstream64Key, TetherDownstream64Value,
+ 1024, AID_NETWORK_STACK)
+
+DEFINE_BPF_MAP_GRW(tether_upstream6_map, HASH, TetherUpstream6Key, Tether6Value, 64,
+ AID_NETWORK_STACK)
+
+static inline __always_inline int do_forward6(struct __sk_buff* skb, const bool is_ethernet,
+ const bool downstream) {
+ // Must be meta-ethernet IPv6 frame
+ if (skb->protocol != htons(ETH_P_IPV6)) return TC_ACT_PIPE;
+
+ // Require ethernet dst mac address to be our unicast address.
+ if (is_ethernet && (skb->pkt_type != PACKET_HOST)) return TC_ACT_PIPE;
+
+ const int l2_header_size = is_ethernet ? sizeof(struct ethhdr) : 0;
+
+ // Since the program never writes via DPA (direct packet access) auto-pull/unclone logic does
+ // not trigger and thus we need to manually make sure we can read packet headers via DPA.
+ // Note: this is a blind best effort pull, which may fail or pull less - this doesn't matter.
+    // It has to be done early because it will invalidate any skb->data/data_end derived
+    // pointers.
+ try_make_readable(skb, l2_header_size + IP6_HLEN + TCP_HLEN);
+
+ void* data = (void*)(long)skb->data;
+ const void* data_end = (void*)(long)skb->data_end;
+ struct ethhdr* eth = is_ethernet ? data : NULL; // used iff is_ethernet
+ struct ipv6hdr* ip6 = is_ethernet ? (void*)(eth + 1) : data;
+
+ // Must have (ethernet and) ipv6 header
+ if (data + l2_header_size + sizeof(*ip6) > data_end) return TC_ACT_PIPE;
+
+ // Ethertype - if present - must be IPv6
+ if (is_ethernet && (eth->h_proto != htons(ETH_P_IPV6))) return TC_ACT_PIPE;
+
+ // IP version must be 6
+ if (ip6->version != 6) TC_PUNT(INVALID_IP_VERSION);
+
+    // Cannot decrement during forwarding if it is already zero or would become zero;
+    // let the kernel's stack handle these cases and generate appropriate ICMP errors.
+ if (ip6->hop_limit <= 1) TC_PUNT(LOW_TTL);
+
+ // If hardware offload is running and programming flows based on conntrack entries,
+ // try not to interfere with it.
+ if (ip6->nexthdr == IPPROTO_TCP) {
+ struct tcphdr* tcph = (void*)(ip6 + 1);
+
+ // Make sure we can get at the tcp header
+ if (data + l2_header_size + sizeof(*ip6) + sizeof(*tcph) > data_end)
+ TC_PUNT(INVALID_TCP_HEADER);
+
+ // Do not offload TCP packets with any one of the SYN/FIN/RST flags
+ if (tcph->syn || tcph->fin || tcph->rst) TC_PUNT(TCP_CONTROL_PACKET);
+ }
+
+ // Protect against forwarding packets sourced from ::1 or fe80::/64 or other weirdness.
+ __be32 src32 = ip6->saddr.s6_addr32[0];
+    if (src32 != htonl(0x0064ff9b) &&                  // 64:ff9b::/32 incl. XLAT464 WKP
+ (src32 & htonl(0xe0000000)) != htonl(0x20000000)) // 2000::/3 Global Unicast
+ TC_PUNT(NON_GLOBAL_SRC);
+
+ // Protect against forwarding packets destined to ::1 or fe80::/64 or other weirdness.
+ __be32 dst32 = ip6->daddr.s6_addr32[0];
+    if (dst32 != htonl(0x0064ff9b) &&                  // 64:ff9b::/32 incl. XLAT464 WKP
+ (dst32 & htonl(0xe0000000)) != htonl(0x20000000)) // 2000::/3 Global Unicast
+ TC_PUNT(NON_GLOBAL_DST);
+
+ // In the upstream direction do not forward traffic within the same /64 subnet.
+ if (!downstream && (src32 == dst32) && (ip6->saddr.s6_addr32[1] == ip6->daddr.s6_addr32[1]))
+ TC_PUNT(LOCAL_SRC_DST);
+
+ TetherDownstream6Key kd = {
+ .iif = skb->ifindex,
+ .neigh6 = ip6->daddr,
+ };
+
+ TetherUpstream6Key ku = {
+ .iif = skb->ifindex,
+ };
+ if (is_ethernet) __builtin_memcpy(downstream ? kd.dstMac : ku.dstMac, eth->h_dest, ETH_ALEN);
+
+ Tether6Value* v = downstream ? bpf_tether_downstream6_map_lookup_elem(&kd)
+ : bpf_tether_upstream6_map_lookup_elem(&ku);
+
+ // If we don't find any offload information then simply let the core stack handle it...
+ if (!v) return TC_ACT_PIPE;
+
+ uint32_t stat_and_limit_k = downstream ? skb->ifindex : v->oif;
+
+ TetherStatsValue* stat_v = bpf_tether_stats_map_lookup_elem(&stat_and_limit_k);
+
+ // If we don't have anywhere to put stats, then abort...
+ if (!stat_v) TC_PUNT(NO_STATS_ENTRY);
+
+ uint64_t* limit_v = bpf_tether_limit_map_lookup_elem(&stat_and_limit_k);
+
+ // If we don't have a limit, then abort...
+ if (!limit_v) TC_PUNT(NO_LIMIT_ENTRY);
+
+    // The required IPv6 minimum MTU is 1280; below that it is unclear what to do, so abort...
+ if (v->pmtu < IPV6_MIN_MTU) TC_PUNT(BELOW_IPV6_MTU);
+
+ // Approximate handling of TCP/IPv6 overhead for incoming LRO/GRO packets: default
+ // outbound path mtu of 1500 is not necessarily correct, but worst case we simply
+    // undercount, which is still better than not accounting for this overhead at all.
+ // Note: this really shouldn't be device/path mtu at all, but rather should be
+ // derived from this particular connection's mss (ie. from gro segment size).
+ // This would require a much newer kernel with newer ebpf accessors.
+ // (This is also blindly assuming 12 bytes of tcp timestamp option in tcp header)
+ uint64_t packets = 1;
+ uint64_t bytes = skb->len;
+ if (bytes > v->pmtu) {
+ const int tcp_overhead = sizeof(struct ipv6hdr) + sizeof(struct tcphdr) + 12;
+ const int mss = v->pmtu - tcp_overhead;
+ const uint64_t payload = bytes - tcp_overhead;
+ packets = (payload + mss - 1) / mss;
+ bytes = tcp_overhead * packets + payload;
+ }
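+    // Worked example (illustrative numbers): a 4000 byte GRO'ed TCP/IPv6 skb with
+    // pmtu 1500 gives tcp_overhead = 40 + 20 + 12 = 72, mss = 1428, payload = 3928,
+    // packets = ceil(3928 / 1428) = 3, and thus bytes = 3 * 72 + 3928 = 4144.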
+
+ // Are we past the limit? If so, then abort...
+    // Note: will not overflow, since a u64 byte counter covers 936 years even at 5Gbps.
+ // Do not drop here. Offload is just that, whenever we fail to handle
+ // a packet we let the core stack deal with things.
+ // (The core stack needs to handle limits correctly anyway,
+ // since we don't offload all traffic in both directions)
+ if (stat_v->rxBytes + stat_v->txBytes + bytes > *limit_v) TC_PUNT(LIMIT_REACHED);
+
+ if (!is_ethernet) {
+ // Try to inject an ethernet header, and simply return if we fail.
+ // We do this even if TX interface is RAWIP and thus does not need an ethernet header,
+ // because this is easier and the kernel will strip extraneous ethernet header.
+ if (bpf_skb_change_head(skb, sizeof(struct ethhdr), /*flags*/ 0)) {
+ __sync_fetch_and_add(downstream ? &stat_v->rxErrors : &stat_v->txErrors, 1);
+ TC_PUNT(CHANGE_HEAD_FAILED);
+ }
+
+ // bpf_skb_change_head() invalidates all pointers - reload them
+ data = (void*)(long)skb->data;
+ data_end = (void*)(long)skb->data_end;
+ eth = data;
+ ip6 = (void*)(eth + 1);
+
+ // I do not believe this can ever happen, but keep the verifier happy...
+ if (data + sizeof(struct ethhdr) + sizeof(*ip6) > data_end) {
+ __sync_fetch_and_add(downstream ? &stat_v->rxErrors : &stat_v->txErrors, 1);
+ TC_DROP(TOO_SHORT);
+ }
+    }
+
+ // At this point we always have an ethernet header - which will get stripped by the
+ // kernel during transmit through a rawip interface. ie. 'eth' pointer is valid.
+ // Additionally note that 'is_ethernet' and 'l2_header_size' are no longer correct.
+
+ // CHECKSUM_COMPLETE is a 16-bit one's complement sum,
+    // thus corrections for it need to be done in 16-bit chunks at even offsets.
+ // IPv6 nexthdr is at offset 6, while hop limit is at offset 7
+ uint8_t old_hl = ip6->hop_limit;
+ --ip6->hop_limit;
+ uint8_t new_hl = ip6->hop_limit;
+
+ // bpf_csum_update() always succeeds if the skb is CHECKSUM_COMPLETE and returns an error
+ // (-ENOTSUPP) if it isn't.
+ bpf_csum_update(skb, 0xFFFF - ntohs(old_hl) + ntohs(new_hl));
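+    // (On little endian - the only case on Android - the hop limit byte at odd offset 7
+    // lands in the high byte of its 16-bit checksum word, which is what the ntohs() byte
+    // swaps account for; since new_hl == old_hl - 1 the correction is always 0xFEFF,
+    // ie. the one's complement of +0x0100.)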
+
+ __sync_fetch_and_add(downstream ? &stat_v->rxPackets : &stat_v->txPackets, packets);
+ __sync_fetch_and_add(downstream ? &stat_v->rxBytes : &stat_v->txBytes, bytes);
+
+ // Overwrite any mac header with the new one
+ // For a rawip tx interface it will simply be a bunch of zeroes and later stripped.
+ *eth = v->macHeader;
+
+ // Redirect to forwarded interface.
+ //
+ // Note that bpf_redirect() cannot fail unless you pass invalid flags.
+ // The redirect actually happens after the ebpf program has already terminated,
+ // and can fail for example for mtu reasons at that point in time, but there's nothing
+ // we can do about it here.
+ return bpf_redirect(v->oif, 0 /* this is effectively BPF_F_EGRESS */);
+}
+
+DEFINE_BPF_PROG("schedcls/tether_downstream6_ether", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_downstream6_ether)
+(struct __sk_buff* skb) {
+ return do_forward6(skb, /* is_ethernet */ true, /* downstream */ true);
+}
+
+DEFINE_BPF_PROG("schedcls/tether_upstream6_ether", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_upstream6_ether)
+(struct __sk_buff* skb) {
+ return do_forward6(skb, /* is_ethernet */ true, /* downstream */ false);
+}
+
+// Note: section names must be unique to prevent programs from appending to each other,
+// so instead the bpf loader will strip everything past the final $ symbol when actually
+// pinning the program into the filesystem.
+//
+// bpf_skb_change_head() is only present on 4.14+ and 2 trivial kernel patches are needed:
+// ANDROID: net: bpf: Allow TC programs to call BPF_FUNC_skb_change_head
+// ANDROID: net: bpf: permit redirect from ingress L3 to egress L2 devices at near max mtu
+// (the first of those has already been upstreamed)
+//
+// 5.4 kernel support was only added to Android Common Kernel in R,
+// and thus a 5.4 kernel always supports this.
+//
+// Hence, these mandatory (must load successfully) implementations for 5.4+ kernels:
+DEFINE_BPF_PROG_KVER("schedcls/tether_downstream6_rawip$5_4", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_downstream6_rawip_5_4, KVER(5, 4, 0))
+(struct __sk_buff* skb) {
+ return do_forward6(skb, /* is_ethernet */ false, /* downstream */ true);
+}
+
+DEFINE_BPF_PROG_KVER("schedcls/tether_upstream6_rawip$5_4", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_upstream6_rawip_5_4, KVER(5, 4, 0))
+(struct __sk_buff* skb) {
+ return do_forward6(skb, /* is_ethernet */ false, /* downstream */ false);
+}
+
+// and these identical optional (may fail to load) implementations for [4.14..5.4) patched kernels:
+DEFINE_OPTIONAL_BPF_PROG_KVER_RANGE("schedcls/tether_downstream6_rawip$4_14",
+ AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_downstream6_rawip_4_14,
+ KVER(4, 14, 0), KVER(5, 4, 0))
+(struct __sk_buff* skb) {
+ return do_forward6(skb, /* is_ethernet */ false, /* downstream */ true);
+}
+
+DEFINE_OPTIONAL_BPF_PROG_KVER_RANGE("schedcls/tether_upstream6_rawip$4_14",
+ AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_upstream6_rawip_4_14,
+ KVER(4, 14, 0), KVER(5, 4, 0))
+(struct __sk_buff* skb) {
+ return do_forward6(skb, /* is_ethernet */ false, /* downstream */ false);
+}
+
+// and define no-op stubs for [4.9,4.14) and unpatched [4.14,5.4) kernels.
+// (if the above real 4.14+ program loaded successfully, then bpfloader will have already pinned
+// it at the same location this one would be pinned at and will thus skip loading this stub)
+DEFINE_BPF_PROG_KVER_RANGE("schedcls/tether_downstream6_rawip$stub", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_downstream6_rawip_stub, KVER_NONE, KVER(5, 4, 0))
+(struct __sk_buff* skb) {
+ return TC_ACT_PIPE;
+}
+
+DEFINE_BPF_PROG_KVER_RANGE("schedcls/tether_upstream6_rawip$stub", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_upstream6_rawip_stub, KVER_NONE, KVER(5, 4, 0))
+(struct __sk_buff* skb) {
+ return TC_ACT_PIPE;
+}
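+
+// Example of the '$' convention: the $5_4, $4_14 and $stub rawip variants above all
+// pin at TETHER_DOWNSTREAM6_TC_PROG_RAWIP_PATH (resp. the upstream equivalent); the
+// bpfloader keeps whichever eligible variant it successfully loads first.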
+
+// ----- IPv4 Support -----
+
+DEFINE_BPF_MAP_GRW(tether_downstream4_map, HASH, Tether4Key, Tether4Value, 1024, AID_NETWORK_STACK)
+
+DEFINE_BPF_MAP_GRW(tether_upstream4_map, HASH, Tether4Key, Tether4Value, 1024, AID_NETWORK_STACK)
+
+static inline __always_inline int do_forward4(struct __sk_buff* skb, const bool is_ethernet,
+ const bool downstream, const bool updatetime) {
+ // Require ethernet dst mac address to be our unicast address.
+ if (is_ethernet && (skb->pkt_type != PACKET_HOST)) return TC_ACT_PIPE;
+
+ // Must be meta-ethernet IPv4 frame
+ if (skb->protocol != htons(ETH_P_IP)) return TC_ACT_PIPE;
+
+ const int l2_header_size = is_ethernet ? sizeof(struct ethhdr) : 0;
+
+ // Since the program never writes via DPA (direct packet access) auto-pull/unclone logic does
+ // not trigger and thus we need to manually make sure we can read packet headers via DPA.
+ // Note: this is a blind best effort pull, which may fail or pull less - this doesn't matter.
+    // It has to be done early because it will invalidate any skb->data/data_end derived
+    // pointers.
+ try_make_readable(skb, l2_header_size + IP4_HLEN + TCP_HLEN);
+
+ void* data = (void*)(long)skb->data;
+ const void* data_end = (void*)(long)skb->data_end;
+ struct ethhdr* eth = is_ethernet ? data : NULL; // used iff is_ethernet
+ struct iphdr* ip = is_ethernet ? (void*)(eth + 1) : data;
+
+ // Must have (ethernet and) ipv4 header
+ if (data + l2_header_size + sizeof(*ip) > data_end) return TC_ACT_PIPE;
+
+ // Ethertype - if present - must be IPv4
+ if (is_ethernet && (eth->h_proto != htons(ETH_P_IP))) return TC_ACT_PIPE;
+
+ // IP version must be 4
+ if (ip->version != 4) TC_PUNT(INVALID_IP_VERSION);
+
+ // We cannot handle IP options, just standard 20 byte == 5 dword minimal IPv4 header
+ if (ip->ihl != 5) TC_PUNT(HAS_IP_OPTIONS);
+
+ // Calculate the IPv4 one's complement checksum of the IPv4 header.
+ __wsum sum4 = 0;
+ for (int i = 0; i < sizeof(*ip) / sizeof(__u16); ++i) {
+ sum4 += ((__u16*)ip)[i];
+ }
+    // Note that sum4 is guaranteed to be non-zero by virtue of ip->version == 4
+ sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse u32 into range 1 .. 0x1FFFE
+ sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse any potential carry into u16
+ // for a correct checksum we should get *a* zero, but sum4 must be positive, ie 0xFFFF
+ if (sum4 != 0xFFFF) TC_PUNT(CHECKSUM);
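+    // e.g. (illustrative) a raw 32-bit sum of 0x1FFFE folds to 0x0001 + 0xFFFE = 0xFFFF,
+    // the one's complement representation of zero, so such a header verifies.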
+
+ // Minimum IPv4 total length is the size of the header
+ if (ntohs(ip->tot_len) < sizeof(*ip)) TC_PUNT(TRUNCATED_IPV4);
+
+ // We are incapable of dealing with IPv4 fragments
+ if (ip->frag_off & ~htons(IP_DF)) TC_PUNT(IS_IP_FRAG);
+
+    // Cannot decrement during forwarding if it is already zero or would become zero;
+    // let the kernel's stack handle these cases and generate appropriate ICMP errors.
+ if (ip->ttl <= 1) TC_PUNT(LOW_TTL);
+
+    // If we cannot update the 'last_used' field due to lack of the bpf_ktime_get_boot_ns()
+    // helper, then it is not safe to offload UDP due to the small conntrack timeouts; in that
+    // situation we can only support TCP. This also has the nice side benefit of using a
+    // separate error counter, making it obvious which version of the program is loaded.
+ if (!updatetime && ip->protocol != IPPROTO_TCP) TC_PUNT(NON_TCP);
+
+ // We do not support offloading anything besides IPv4 TCP and UDP, due to need for NAT,
+ // but no need to check this if !updatetime due to check immediately above.
+ if (updatetime && (ip->protocol != IPPROTO_TCP) && (ip->protocol != IPPROTO_UDP))
+ TC_PUNT(NON_TCP_UDP);
+
+ // We want to make sure that the compiler will, in the !updatetime case, entirely optimize
+ // out all the non-tcp logic. Also note that at this point is_udp === !is_tcp.
+ const bool is_tcp = !updatetime || (ip->protocol == IPPROTO_TCP);
+
+ // This is a bit of a hack to make things easier on the bpf verifier.
+ // (In particular I believe the Linux 4.14 kernel's verifier can get confused later on about
+ // what offsets into the packet are valid and can spuriously reject the program, this is
+ // because it fails to realize that is_tcp && !is_tcp is impossible)
+ //
+ // For both TCP & UDP we'll need to read and modify the src/dst ports, which so happen to
+ // always be in the first 4 bytes of the L4 header. Additionally for UDP we'll need access
+ // to the checksum field which is in bytes 7 and 8. While for TCP we'll need to read the
+ // TCP flags (at offset 13) and access to the checksum field (2 bytes at offset 16).
+ // As such we *always* need access to at least 8 bytes.
+ if (data + l2_header_size + sizeof(*ip) + 8 > data_end) TC_PUNT(SHORT_L4_HEADER);
+
+ struct tcphdr* tcph = is_tcp ? (void*)(ip + 1) : NULL;
+ struct udphdr* udph = is_tcp ? NULL : (void*)(ip + 1);
+
+ if (is_tcp) {
+ // Make sure we can get at the tcp header
+ if (data + l2_header_size + sizeof(*ip) + sizeof(*tcph) > data_end)
+ TC_PUNT(SHORT_TCP_HEADER);
+
+ // If hardware offload is running and programming flows based on conntrack entries, try not
+ // to interfere with it, so do not offload TCP packets with any one of the SYN/FIN/RST flags
+ if (tcph->syn || tcph->fin || tcph->rst) TC_PUNT(TCP_CONTROL_PACKET);
+ } else { // UDP
+ // Make sure we can get at the udp header
+ if (data + l2_header_size + sizeof(*ip) + sizeof(*udph) > data_end)
+ TC_PUNT(SHORT_UDP_HEADER);
+
+ // Skip handling of CHECKSUM_COMPLETE packets with udp checksum zero due to need for
+ // additional updating of skb->csum (this could be fixed up manually with more effort).
+ //
+ // Note that the in-kernel implementation of 'int64_t bpf_csum_update(skb, u32 csum)' is:
+ // if (skb->ip_summed == CHECKSUM_COMPLETE)
+ // return (skb->csum = csum_add(skb->csum, csum));
+ // else
+ // return -ENOTSUPP;
+ //
+ // So this will punt any CHECKSUM_COMPLETE packet with a zero UDP checksum,
+ // and leave all other packets unaffected (since it just at most adds zero to skb->csum).
+ //
+ // In practice this should almost never trigger because most nics do not generate
+ // CHECKSUM_COMPLETE packets on receive - especially so for nics/drivers on a phone.
+ //
+ // Additionally since we're forwarding, in most cases the value of the skb->csum field
+ // shouldn't matter (it's not used by physical nic egress).
+ //
+ // It only matters if we're ingressing through a CHECKSUM_COMPLETE capable nic
+ // and egressing through a virtual interface looping back to the kernel itself
+ // (ie. something like veth) where the CHECKSUM_COMPLETE/skb->csum can get reused
+ // on ingress.
+ //
+ // If we were in the kernel we'd simply probably call
+ // void skb_checksum_complete_unset(struct sk_buff *skb) {
+ // if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE;
+ // }
+ // here instead. Perhaps there should be a bpf helper for that?
+ if (!udph->check && (bpf_csum_update(skb, 0) >= 0)) TC_PUNT(UDP_CSUM_ZERO);
+ }
+
+ Tether4Key k = {
+ .iif = skb->ifindex,
+ .l4Proto = ip->protocol,
+ .src4.s_addr = ip->saddr,
+ .dst4.s_addr = ip->daddr,
+ .srcPort = is_tcp ? tcph->source : udph->source,
+ .dstPort = is_tcp ? tcph->dest : udph->dest,
+ };
+ if (is_ethernet) __builtin_memcpy(k.dstMac, eth->h_dest, ETH_ALEN);
+
+ Tether4Value* v = downstream ? bpf_tether_downstream4_map_lookup_elem(&k)
+ : bpf_tether_upstream4_map_lookup_elem(&k);
+
+ // If we don't find any offload information then simply let the core stack handle it...
+ if (!v) return TC_ACT_PIPE;
+
+ uint32_t stat_and_limit_k = downstream ? skb->ifindex : v->oif;
+
+ TetherStatsValue* stat_v = bpf_tether_stats_map_lookup_elem(&stat_and_limit_k);
+
+ // If we don't have anywhere to put stats, then abort...
+ if (!stat_v) TC_PUNT(NO_STATS_ENTRY);
+
+ uint64_t* limit_v = bpf_tether_limit_map_lookup_elem(&stat_and_limit_k);
+
+ // If we don't have a limit, then abort...
+ if (!limit_v) TC_PUNT(NO_LIMIT_ENTRY);
+
+    // The required IPv4 minimum MTU is 68; below that it is unclear what to do, so abort...
+ if (v->pmtu < 68) TC_PUNT(BELOW_IPV4_MTU);
+
+ // Approximate handling of TCP/IPv4 overhead for incoming LRO/GRO packets: default
+ // outbound path mtu of 1500 is not necessarily correct, but worst case we simply
+    // undercount, which is still better than not accounting for this overhead at all.
+ // Note: this really shouldn't be device/path mtu at all, but rather should be
+ // derived from this particular connection's mss (ie. from gro segment size).
+ // This would require a much newer kernel with newer ebpf accessors.
+ // (This is also blindly assuming 12 bytes of tcp timestamp option in tcp header)
+ uint64_t packets = 1;
+ uint64_t bytes = skb->len;
+ if (bytes > v->pmtu) {
+ const int tcp_overhead = sizeof(struct iphdr) + sizeof(struct tcphdr) + 12;
+ const int mss = v->pmtu - tcp_overhead;
+ const uint64_t payload = bytes - tcp_overhead;
+ packets = (payload + mss - 1) / mss;
+ bytes = tcp_overhead * packets + payload;
+ }
+
+ // Are we past the limit? If so, then abort...
+    // Note: will not overflow, since a u64 byte counter covers 936 years even at 5Gbps.
+ // Do not drop here. Offload is just that, whenever we fail to handle
+ // a packet we let the core stack deal with things.
+ // (The core stack needs to handle limits correctly anyway,
+ // since we don't offload all traffic in both directions)
+ if (stat_v->rxBytes + stat_v->txBytes + bytes > *limit_v) TC_PUNT(LIMIT_REACHED);
+
+ if (!is_ethernet) {
+ // Try to inject an ethernet header, and simply return if we fail.
+ // We do this even if TX interface is RAWIP and thus does not need an ethernet header,
+ // because this is easier and the kernel will strip extraneous ethernet header.
+ if (bpf_skb_change_head(skb, sizeof(struct ethhdr), /*flags*/ 0)) {
+ __sync_fetch_and_add(downstream ? &stat_v->rxErrors : &stat_v->txErrors, 1);
+ TC_PUNT(CHANGE_HEAD_FAILED);
+ }
+
+ // bpf_skb_change_head() invalidates all pointers - reload them
+ data = (void*)(long)skb->data;
+ data_end = (void*)(long)skb->data_end;
+ eth = data;
+ ip = (void*)(eth + 1);
+ tcph = is_tcp ? (void*)(ip + 1) : NULL;
+ udph = is_tcp ? NULL : (void*)(ip + 1);
+
+ // I do not believe this can ever happen, but keep the verifier happy...
+ if (data + sizeof(struct ethhdr) + sizeof(*ip) + (is_tcp ? sizeof(*tcph) : sizeof(*udph)) > data_end) {
+ __sync_fetch_and_add(downstream ? &stat_v->rxErrors : &stat_v->txErrors, 1);
+ TC_DROP(TOO_SHORT);
+ }
+    }
+
+ // At this point we always have an ethernet header - which will get stripped by the
+ // kernel during transmit through a rawip interface. ie. 'eth' pointer is valid.
+ // Additionally note that 'is_ethernet' and 'l2_header_size' are no longer correct.
+
+ // Overwrite any mac header with the new one
+ // For a rawip tx interface it will simply be a bunch of zeroes and later stripped.
+ *eth = v->macHeader;
+
+ // Decrement the IPv4 TTL, we already know it's greater than 1.
+ // u8 TTL field is followed by u8 protocol to make a u16 for ipv4 header checksum update.
+ // Since we're keeping the ipv4 checksum valid (which means the checksum of the entire
+ // ipv4 header remains 0), the overall checksum of the entire packet does not change.
+ const int sz2 = sizeof(__be16);
+ const __be16 old_ttl_proto = *(__be16 *)&ip->ttl;
+ const __be16 new_ttl_proto = old_ttl_proto - htons(0x0100);
+ bpf_l3_csum_replace(skb, ETH_IP4_OFFSET(check), old_ttl_proto, new_ttl_proto, sz2);
+ bpf_skb_store_bytes(skb, ETH_IP4_OFFSET(ttl), &new_ttl_proto, sz2, 0);
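+    // e.g. (illustrative) TTL 64 with protocol TCP (6): the big endian u16 0x4006 becomes
+    // 0x3F06 - TTL 63 with the protocol byte unchanged - and the ipv4 header checksum is
+    // patched with the same old/new pair via bpf_l3_csum_replace() above.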
+
+ const int l4_offs_csum = is_tcp ? ETH_IP4_TCP_OFFSET(check) : ETH_IP4_UDP_OFFSET(check);
+ const int sz4 = sizeof(__be32);
+    // A UDP checksum of 0 is special: with BPF_F_MARK_MANGLED_0 a stored csum of 0 is left
+    // unmodified, and a recomputed csum of 0 is written back as 0xFFFF instead.
+ const int l4_flags = is_tcp ? 0 : BPF_F_MARK_MANGLED_0;
+ const __be32 old_daddr = k.dst4.s_addr;
+ const __be32 old_saddr = k.src4.s_addr;
+ const __be32 new_daddr = v->dst46.s6_addr32[3];
+ const __be32 new_saddr = v->src46.s6_addr32[3];
+
+ bpf_l4_csum_replace(skb, l4_offs_csum, old_daddr, new_daddr, sz4 | BPF_F_PSEUDO_HDR | l4_flags);
+ bpf_l3_csum_replace(skb, ETH_IP4_OFFSET(check), old_daddr, new_daddr, sz4);
+ bpf_skb_store_bytes(skb, ETH_IP4_OFFSET(daddr), &new_daddr, sz4, 0);
+
+ bpf_l4_csum_replace(skb, l4_offs_csum, old_saddr, new_saddr, sz4 | BPF_F_PSEUDO_HDR | l4_flags);
+ bpf_l3_csum_replace(skb, ETH_IP4_OFFSET(check), old_saddr, new_saddr, sz4);
+ bpf_skb_store_bytes(skb, ETH_IP4_OFFSET(saddr), &new_saddr, sz4, 0);
+
+ // The offsets for TCP and UDP ports: source (u16 @ L4 offset 0) & dest (u16 @ L4 offset 2) are
+ // actually the same, so the compiler should just optimize them both down to a constant.
+ bpf_l4_csum_replace(skb, l4_offs_csum, k.srcPort, v->srcPort, sz2 | l4_flags);
+ bpf_skb_store_bytes(skb, is_tcp ? ETH_IP4_TCP_OFFSET(source) : ETH_IP4_UDP_OFFSET(source),
+ &v->srcPort, sz2, 0);
+
+ bpf_l4_csum_replace(skb, l4_offs_csum, k.dstPort, v->dstPort, sz2 | l4_flags);
+ bpf_skb_store_bytes(skb, is_tcp ? ETH_IP4_TCP_OFFSET(dest) : ETH_IP4_UDP_OFFSET(dest),
+ &v->dstPort, sz2, 0);
+
+ // This requires the bpf_ktime_get_boot_ns() helper which was added in 5.8,
+ // and backported to all Android Common Kernel 4.14+ trees.
+ if (updatetime) v->last_used = bpf_ktime_get_boot_ns();
+
+ __sync_fetch_and_add(downstream ? &stat_v->rxPackets : &stat_v->txPackets, packets);
+ __sync_fetch_and_add(downstream ? &stat_v->rxBytes : &stat_v->txBytes, bytes);
+
+ // Redirect to forwarded interface.
+ //
+ // Note that bpf_redirect() cannot fail unless you pass invalid flags.
+ // The redirect actually happens after the ebpf program has already terminated,
+ // and can fail for example for mtu reasons at that point in time, but there's nothing
+ // we can do about it here.
+ return bpf_redirect(v->oif, 0 /* this is effectively BPF_F_EGRESS */);
+}
+
+// Full featured (required) implementations for 5.8+ kernels (these are S+ by definition)
+
+DEFINE_BPF_PROG_KVER("schedcls/tether_downstream4_rawip$5_8", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_downstream4_rawip_5_8, KVER(5, 8, 0))
+(struct __sk_buff* skb) {
+ return do_forward4(skb, /* is_ethernet */ false, /* downstream */ true, /* updatetime */ true);
+}
+
+DEFINE_BPF_PROG_KVER("schedcls/tether_upstream4_rawip$5_8", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_upstream4_rawip_5_8, KVER(5, 8, 0))
+(struct __sk_buff* skb) {
+ return do_forward4(skb, /* is_ethernet */ false, /* downstream */ false, /* updatetime */ true);
+}
+
+DEFINE_BPF_PROG_KVER("schedcls/tether_downstream4_ether$5_8", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_downstream4_ether_5_8, KVER(5, 8, 0))
+(struct __sk_buff* skb) {
+ return do_forward4(skb, /* is_ethernet */ true, /* downstream */ true, /* updatetime */ true);
+}
+
+DEFINE_BPF_PROG_KVER("schedcls/tether_upstream4_ether$5_8", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_upstream4_ether_5_8, KVER(5, 8, 0))
+(struct __sk_buff* skb) {
+ return do_forward4(skb, /* is_ethernet */ true, /* downstream */ false, /* updatetime */ true);
+}
+
+// Full featured (optional) implementations for 4.14-S, 4.19-S & 5.4-S kernels
+// (optional, because we need to be able to fall back for 4.14/4.19/5.4 pre-S kernels)
+
+DEFINE_OPTIONAL_BPF_PROG_KVER_RANGE("schedcls/tether_downstream4_rawip$opt",
+ AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_downstream4_rawip_opt,
+ KVER(4, 14, 0), KVER(5, 8, 0))
+(struct __sk_buff* skb) {
+ return do_forward4(skb, /* is_ethernet */ false, /* downstream */ true, /* updatetime */ true);
+}
+
+DEFINE_OPTIONAL_BPF_PROG_KVER_RANGE("schedcls/tether_upstream4_rawip$opt",
+ AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_upstream4_rawip_opt,
+ KVER(4, 14, 0), KVER(5, 8, 0))
+(struct __sk_buff* skb) {
+ return do_forward4(skb, /* is_ethernet */ false, /* downstream */ false, /* updatetime */ true);
+}
+
+DEFINE_OPTIONAL_BPF_PROG_KVER_RANGE("schedcls/tether_downstream4_ether$opt",
+ AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_downstream4_ether_opt,
+ KVER(4, 14, 0), KVER(5, 8, 0))
+(struct __sk_buff* skb) {
+ return do_forward4(skb, /* is_ethernet */ true, /* downstream */ true, /* updatetime */ true);
+}
+
+DEFINE_OPTIONAL_BPF_PROG_KVER_RANGE("schedcls/tether_upstream4_ether$opt",
+ AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_upstream4_ether_opt,
+ KVER(4, 14, 0), KVER(5, 8, 0))
+(struct __sk_buff* skb) {
+ return do_forward4(skb, /* is_ethernet */ true, /* downstream */ false, /* updatetime */ true);
+}
+
+// Partial (TCP-only: will not update 'last_used' field) implementations for 4.14+ kernels.
+// These will be loaded only if the above optional ones failed (loading of *these* must succeed
+// for 5.4+, since that is always an R patched kernel).
+//
+// [Note: as a result TCP connections will not have their conntrack timeout refreshed; however,
+// since /proc/sys/net/netfilter/nf_conntrack_tcp_timeout_established defaults to 432000 seconds,
+// in practice this means they'll break only after 5 days. This seems an acceptable trade-off.
+//
+// Additionally, the kernel/tests change "net-test: add bpf_ktime_get_ns / bpf_ktime_get_boot_ns
+// tests" - which enforces and documents the required kernel cherrypicks - makes it pretty
+// unlikely that many devices upgrading to S will end up relying on these fallback programs.]
+
+// RAWIP: Required for 5.4-R kernels -- which always support bpf_skb_change_head().
+
+DEFINE_BPF_PROG_KVER_RANGE("schedcls/tether_downstream4_rawip$5_4", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_downstream4_rawip_5_4, KVER(5, 4, 0), KVER(5, 8, 0))
+(struct __sk_buff* skb) {
+ return do_forward4(skb, /* is_ethernet */ false, /* downstream */ true, /* updatetime */ false);
+}
+
+DEFINE_BPF_PROG_KVER_RANGE("schedcls/tether_upstream4_rawip$5_4", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_upstream4_rawip_5_4, KVER(5, 4, 0), KVER(5, 8, 0))
+(struct __sk_buff* skb) {
+ return do_forward4(skb, /* is_ethernet */ false, /* downstream */ false, /* updatetime */ false);
+}
+
+// RAWIP: Optional for 4.14/4.19 (R) kernels -- which support bpf_skb_change_head().
+// [Note: fallback for 4.14/4.19 (P/Q) kernels is below in stub section]
+
+DEFINE_OPTIONAL_BPF_PROG_KVER_RANGE("schedcls/tether_downstream4_rawip$4_14",
+ AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_downstream4_rawip_4_14,
+ KVER(4, 14, 0), KVER(5, 4, 0))
+(struct __sk_buff* skb) {
+ return do_forward4(skb, /* is_ethernet */ false, /* downstream */ true, /* updatetime */ false);
+}
+
+DEFINE_OPTIONAL_BPF_PROG_KVER_RANGE("schedcls/tether_upstream4_rawip$4_14",
+ AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_upstream4_rawip_4_14,
+ KVER(4, 14, 0), KVER(5, 4, 0))
+(struct __sk_buff* skb) {
+ return do_forward4(skb, /* is_ethernet */ false, /* downstream */ false, /* updatetime */ false);
+}
+
+// ETHER: Required for 4.14-Q/R, 4.19-Q/R & 5.4-R kernels.
+
+DEFINE_BPF_PROG_KVER_RANGE("schedcls/tether_downstream4_ether$4_14", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_downstream4_ether_4_14, KVER(4, 14, 0), KVER(5, 8, 0))
+(struct __sk_buff* skb) {
+ return do_forward4(skb, /* is_ethernet */ true, /* downstream */ true, /* updatetime */ false);
+}
+
+DEFINE_BPF_PROG_KVER_RANGE("schedcls/tether_upstream4_ether$4_14", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_upstream4_ether_4_14, KVER(4, 14, 0), KVER(5, 8, 0))
+(struct __sk_buff* skb) {
+ return do_forward4(skb, /* is_ethernet */ true, /* downstream */ false, /* updatetime */ false);
+}
+
+// Placeholder (no-op) implementations for older Q kernels
+
+// RAWIP: 4.9-P/Q, 4.14-P/Q & 4.19-Q kernels -- without bpf_skb_change_head() for tc programs
+
+DEFINE_BPF_PROG_KVER_RANGE("schedcls/tether_downstream4_rawip$stub", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_downstream4_rawip_stub, KVER_NONE, KVER(5, 4, 0))
+(struct __sk_buff* skb) {
+ return TC_ACT_PIPE;
+}
+
+DEFINE_BPF_PROG_KVER_RANGE("schedcls/tether_upstream4_rawip$stub", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_upstream4_rawip_stub, KVER_NONE, KVER(5, 4, 0))
+(struct __sk_buff* skb) {
+ return TC_ACT_PIPE;
+}
+
+// ETHER: 4.9-P/Q kernel
+
+DEFINE_BPF_PROG_KVER_RANGE("schedcls/tether_downstream4_ether$stub", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_downstream4_ether_stub, KVER_NONE, KVER(4, 14, 0))
+(struct __sk_buff* skb) {
+ return TC_ACT_PIPE;
+}
+
+DEFINE_BPF_PROG_KVER_RANGE("schedcls/tether_upstream4_ether$stub", AID_ROOT, AID_NETWORK_STACK,
+ sched_cls_tether_upstream4_ether_stub, KVER_NONE, KVER(4, 14, 0))
+(struct __sk_buff* skb) {
+ return TC_ACT_PIPE;
+}
+
+// ----- XDP Support -----
+
+DEFINE_BPF_MAP_GRW(tether_dev_map, DEVMAP_HASH, uint32_t, uint32_t, 64, AID_NETWORK_STACK)
+
+static inline __always_inline int do_xdp_forward6(struct xdp_md *ctx, const bool is_ethernet,
+ const bool downstream) {
+ return XDP_PASS;
+}
+
+static inline __always_inline int do_xdp_forward4(struct xdp_md *ctx, const bool is_ethernet,
+ const bool downstream) {
+ return XDP_PASS;
+}
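+
+// Note: the IPv6/IPv4 XDP forwarders above are placeholders - they currently pass
+// every packet up to the stack, so only the dispatch scaffolding below is live;
+// presumably the real XDP fast path is filled in by a later change.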
+
+static inline __always_inline int do_xdp_forward_ether(struct xdp_md *ctx, const bool downstream) {
+ const void* data = (void*)(long)ctx->data;
+ const void* data_end = (void*)(long)ctx->data_end;
+ const struct ethhdr* eth = data;
+
+ // Make sure we actually have an ethernet header
+ if ((void*)(eth + 1) > data_end) return XDP_PASS;
+
+ if (eth->h_proto == htons(ETH_P_IPV6))
+ return do_xdp_forward6(ctx, /* is_ethernet */ true, downstream);
+ if (eth->h_proto == htons(ETH_P_IP))
+ return do_xdp_forward4(ctx, /* is_ethernet */ true, downstream);
+
+ // Anything else we don't know how to handle...
+ return XDP_PASS;
+}
+
+static inline __always_inline int do_xdp_forward_rawip(struct xdp_md *ctx, const bool downstream) {
+ const void* data = (void*)(long)ctx->data;
+ const void* data_end = (void*)(long)ctx->data_end;
+
+ // The top nibble of both IPv4 and IPv6 headers is the IP version.
+ if (data_end - data < 1) return XDP_PASS;
+ const uint8_t v = (*(uint8_t*)data) >> 4;
+
+ if (v == 6) return do_xdp_forward6(ctx, /* is_ethernet */ false, downstream);
+ if (v == 4) return do_xdp_forward4(ctx, /* is_ethernet */ false, downstream);
+
+ // Anything else we don't know how to handle...
+ return XDP_PASS;
+}
+
+#define DEFINE_XDP_PROG(str, func) \
+ DEFINE_BPF_PROG_KVER(str, AID_ROOT, AID_NETWORK_STACK, func, KVER(5, 9, 0))(struct xdp_md *ctx)
+
+DEFINE_XDP_PROG("xdp/tether_downstream_ether",
+ xdp_tether_downstream_ether) {
+ return do_xdp_forward_ether(ctx, /* downstream */ true);
+}
+
+DEFINE_XDP_PROG("xdp/tether_downstream_rawip",
+ xdp_tether_downstream_rawip) {
+ return do_xdp_forward_rawip(ctx, /* downstream */ true);
+}
+
+DEFINE_XDP_PROG("xdp/tether_upstream_ether",
+ xdp_tether_upstream_ether) {
+ return do_xdp_forward_ether(ctx, /* downstream */ false);
+}
+
+DEFINE_XDP_PROG("xdp/tether_upstream_rawip",
+ xdp_tether_upstream_rawip) {
+ return do_xdp_forward_rawip(ctx, /* downstream */ false);
+}
+
+LICENSE("Apache 2.0");
+CRITICAL("tethering");
diff --git a/bpf_progs/test.c b/bpf_progs/test.c
new file mode 100644
index 0000000..a76e346
--- /dev/null
+++ b/bpf_progs/test.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+
+// The resulting .o needs to load on the Android S bpfloader v0.2
+#define BPFLOADER_MIN_VER 2u
+
+#include "bpf_helpers.h"
+#include "bpf_net_helpers.h"
+#include "bpf_tethering.h"
+
+// Used only by TetheringPrivilegedTests, not by production code.
+DEFINE_BPF_MAP_GRW(tether_downstream6_map, HASH, TetherDownstream6Key, Tether6Value, 16,
+ AID_NETWORK_STACK)
+
+DEFINE_BPF_PROG_KVER("xdp/drop_ipv4_udp_ether", AID_ROOT, AID_NETWORK_STACK,
+ xdp_test, KVER(5, 9, 0))
+(struct xdp_md *ctx) {
+ void *data = (void *)(long)ctx->data;
+ void *data_end = (void *)(long)ctx->data_end;
+
+ struct ethhdr *eth = data;
+ int hsize = sizeof(*eth);
+
+ struct iphdr *ip = data + hsize;
+ hsize += sizeof(struct iphdr);
+
+ if (data + hsize > data_end) return XDP_PASS;
+ if (eth->h_proto != htons(ETH_P_IP)) return XDP_PASS;
+ if (ip->protocol == IPPROTO_UDP) return XDP_DROP;
+ return XDP_PASS;
+}
+
+LICENSE("Apache 2.0");