From e560cb8f9beb0778fbc5325c924817258a91ab58 Mon Sep 17 00:00:00 2001 From: Jiping Ma Date: Thu, 11 Apr 2024 07:19:13 -0400 Subject: [PATCH] kernel: ipv6: remove max_size check inline with ipv4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit resolves the following issues in StarlingX. 1. "Route cache is full: consider increasing sysctl net.ipv6.route.max_size" logs are generated in large quantities in kernel.log. 2. When this happens, the network is unreachable for some time. We cherry-pick commit af6d10345ca7 ("ipv6: remove max_size check inline with ipv4") to resolve the aforementioned issues. * commit af6d10345ca7 ("ipv6: remove max_size check inline with ipv4") https://git.yoctoproject.org/linux-yocto/commit/?id=af6d10345ca7 Verification: - build-pkgs; build-iso; install and boot up on aio-sx lab. - The issue cannot be reproduced on our side, but it has been verified by the customer. - The network benchmark test will be done by the test team to make sure there is no impact on system performance. 
Closes-Bug: 2061043 Change-Id: I6890ff023d33d6a5840a9a135d9445c57a6d8622 Signed-off-by: Jiping Ma --- ...move-max_size-check-inline-with-ipv4.patch | 207 ++++++++++++++++++ kernel-rt/debian/patches/series | 1 + ...move-max_size-check-inline-with-ipv4.patch | 207 ++++++++++++++++++ kernel-std/debian/patches/series | 2 +- 4 files changed, 416 insertions(+), 1 deletion(-) create mode 100644 kernel-rt/debian/patches/0084-ipv6-remove-max_size-check-inline-with-ipv4.patch create mode 100644 kernel-std/debian/patches/0076-ipv6-remove-max_size-check-inline-with-ipv4.patch diff --git a/kernel-rt/debian/patches/0084-ipv6-remove-max_size-check-inline-with-ipv4.patch b/kernel-rt/debian/patches/0084-ipv6-remove-max_size-check-inline-with-ipv4.patch new file mode 100644 index 00000000..30d24266 --- /dev/null +++ b/kernel-rt/debian/patches/0084-ipv6-remove-max_size-check-inline-with-ipv4.patch @@ -0,0 +1,207 @@ +From a616a8c8e5e479cc01a752f93a9887ed51bb150e Mon Sep 17 00:00:00 2001 +From: Jon Maxwell +Date: Thu, 12 Jan 2023 12:25:32 +1100 +Subject: [PATCH] ipv6: remove max_size check inline with ipv4 + +In ip6_dst_gc() replace: + + if (entries > gc_thresh) + +With: + + if (entries > ops->gc_thresh) + +Sending Ipv6 packets in a loop via a raw socket triggers an issue where a +route is cloned by ip6_rt_cache_alloc() for each packet sent. This quickly +consumes the Ipv6 max_size threshold which defaults to 4096 resulting in +these warnings: + +[1] 99.187805] dst_alloc: 7728 callbacks suppressed +[2] Route cache is full: consider increasing sysctl net.ipv6.route.max_size. +. +. +[300] Route cache is full: consider increasing sysctl net.ipv6.route.max_size. + +When this happens the packet is dropped and sendto() gets a network is +unreachable error: + +remaining pkt 200557 errno 101 +remaining pkt 196462 errno 101 +. +. +remaining pkt 126821 errno 101 + +Implement David Aherns suggestion to remove max_size check seeing that Ipv6 +has a GC to manage memory usage. 
Ipv4 already does not check max_size. + +Here are some memory comparisons for Ipv4 vs Ipv6 with the patch: + +Test by running 5 instances of a program that sends UDP packets to a raw +socket 5000000 times. Compare Ipv4 and Ipv6 performance with a similar +program. + +Ipv4: + +Before test: + +MemFree: 29427108 kB +Slab: 237612 kB + +ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 2881 3990 192 42 2 : tunables 0 0 0 + +During test: + +MemFree: 29417608 kB +Slab: 247712 kB + +ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 44394 44394 192 42 2 : tunables 0 0 0 + +After test: + +MemFree: 29422308 kB +Slab: 238104 kB + +ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 + +Ipv6 with patch: + +Errno 101 errors are not observed anymore with the patch. + +Before test: + +MemFree: 29422308 kB +Slab: 238104 kB + +ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 + +During Test: + +MemFree: 29431516 kB +Slab: 240940 kB + +ip6_dst_cache 11980 12064 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 + +After Test: + +MemFree: 29441816 kB +Slab: 238132 kB + +ip6_dst_cache 1902 2432 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 + +Tested-by: Andrea Mayer +Signed-off-by: Jon Maxwell +Reviewed-by: David Ahern +Link: https://lore.kernel.org/r/20230112012532.311021-1-jmaxwell37@gmail.com +Signed-off-by: Jakub Kicinski +(cherry picked from commit af6d10345ca76670c1b7c37799f0d5576ccef277) +Signed-off-by: Jiping Ma +--- + include/net/dst_ops.h | 2 +- + net/core/dst.c | 8 ++------ + net/ipv6/route.c | 13 +++++-------- + 3 files 
changed, 8 insertions(+), 15 deletions(-) + +diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h +index 88ff7bb2bb9b..632086b2f644 100644 +--- a/include/net/dst_ops.h ++++ b/include/net/dst_ops.h +@@ -16,7 +16,7 @@ struct dst_ops { + unsigned short family; + unsigned int gc_thresh; + +- int (*gc)(struct dst_ops *ops); ++ void (*gc)(struct dst_ops *ops); + struct dst_entry * (*check)(struct dst_entry *, __u32 cookie); + unsigned int (*default_advmss)(const struct dst_entry *); + unsigned int (*mtu)(const struct dst_entry *); +diff --git a/net/core/dst.c b/net/core/dst.c +index fb3bcba87744..453ec8aafc4a 100644 +--- a/net/core/dst.c ++++ b/net/core/dst.c +@@ -83,12 +83,8 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, + + if (ops->gc && + !(flags & DST_NOCOUNT) && +- dst_entries_get_fast(ops) > ops->gc_thresh) { +- if (ops->gc(ops)) { +- pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); +- return NULL; +- } +- } ++ dst_entries_get_fast(ops) > ops->gc_thresh) ++ ops->gc(ops); + + dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); + if (!dst) +diff --git a/net/ipv6/route.c b/net/ipv6/route.c +index a6d5c99f65a3..b23e42efb3df 100644 +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -89,7 +89,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *); + static void ip6_dst_destroy(struct dst_entry *); + static void ip6_dst_ifdown(struct dst_entry *, + struct net_device *dev, int how); +-static int ip6_dst_gc(struct dst_ops *ops); ++static void ip6_dst_gc(struct dst_ops *ops); + + static int ip6_pkt_discard(struct sk_buff *skb); + static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); +@@ -3184,11 +3184,10 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, + return dst; + } + +-static int ip6_dst_gc(struct dst_ops *ops) ++static void ip6_dst_gc(struct dst_ops *ops) + { + struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); + int 
rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; +- int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; + int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; + int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; + unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; +@@ -3196,11 +3195,10 @@ static int ip6_dst_gc(struct dst_ops *ops) + int entries; + + entries = dst_entries_get_fast(ops); +- if (entries > rt_max_size) ++ if (entries > ops->gc_thresh) + entries = dst_entries_get_slow(ops); + +- if (time_after(rt_last_gc + rt_min_interval, jiffies) && +- entries <= rt_max_size) ++ if (time_after(rt_last_gc + rt_min_interval, jiffies)) + goto out; + + fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); +@@ -3210,7 +3208,6 @@ static int ip6_dst_gc(struct dst_ops *ops) + out: + val = atomic_read(&net->ipv6.ip6_rt_gc_expire); + atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); +- return entries > rt_max_size; + } + + static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, +@@ -6363,7 +6360,7 @@ static int __net_init ip6_route_net_init(struct net *net) + #endif + + net->ipv6.sysctl.flush_delay = 0; +- net->ipv6.sysctl.ip6_rt_max_size = 4096; ++ net->ipv6.sysctl.ip6_rt_max_size = INT_MAX; + net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; + net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; + net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; +-- +2.43.0 + diff --git a/kernel-rt/debian/patches/series b/kernel-rt/debian/patches/series index 4f26939f..02efae83 100644 --- a/kernel-rt/debian/patches/series +++ b/kernel-rt/debian/patches/series @@ -77,3 +77,4 @@ 0081-perf-core-Always-set-cpuctx-cgrp-when-enable-cgroup-.patch 0082-Add-the-pci-reboot-quirk-in-DMI-table-for-Dell-Power.patch 0083-sched-rt-Fix-bad-task-migration-for-rt-tasks.patch +0084-ipv6-remove-max_size-check-inline-with-ipv4.patch diff --git a/kernel-std/debian/patches/0076-ipv6-remove-max_size-check-inline-with-ipv4.patch 
b/kernel-std/debian/patches/0076-ipv6-remove-max_size-check-inline-with-ipv4.patch new file mode 100644 index 00000000..30d24266 --- /dev/null +++ b/kernel-std/debian/patches/0076-ipv6-remove-max_size-check-inline-with-ipv4.patch @@ -0,0 +1,207 @@ +From a616a8c8e5e479cc01a752f93a9887ed51bb150e Mon Sep 17 00:00:00 2001 +From: Jon Maxwell +Date: Thu, 12 Jan 2023 12:25:32 +1100 +Subject: [PATCH] ipv6: remove max_size check inline with ipv4 + +In ip6_dst_gc() replace: + + if (entries > gc_thresh) + +With: + + if (entries > ops->gc_thresh) + +Sending Ipv6 packets in a loop via a raw socket triggers an issue where a +route is cloned by ip6_rt_cache_alloc() for each packet sent. This quickly +consumes the Ipv6 max_size threshold which defaults to 4096 resulting in +these warnings: + +[1] 99.187805] dst_alloc: 7728 callbacks suppressed +[2] Route cache is full: consider increasing sysctl net.ipv6.route.max_size. +. +. +[300] Route cache is full: consider increasing sysctl net.ipv6.route.max_size. + +When this happens the packet is dropped and sendto() gets a network is +unreachable error: + +remaining pkt 200557 errno 101 +remaining pkt 196462 errno 101 +. +. +remaining pkt 126821 errno 101 + +Implement David Aherns suggestion to remove max_size check seeing that Ipv6 +has a GC to manage memory usage. Ipv4 already does not check max_size. + +Here are some memory comparisons for Ipv4 vs Ipv6 with the patch: + +Test by running 5 instances of a program that sends UDP packets to a raw +socket 5000000 times. Compare Ipv4 and Ipv6 performance with a similar +program. 
+ +Ipv4: + +Before test: + +MemFree: 29427108 kB +Slab: 237612 kB + +ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 2881 3990 192 42 2 : tunables 0 0 0 + +During test: + +MemFree: 29417608 kB +Slab: 247712 kB + +ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 44394 44394 192 42 2 : tunables 0 0 0 + +After test: + +MemFree: 29422308 kB +Slab: 238104 kB + +ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 + +Ipv6 with patch: + +Errno 101 errors are not observed anymore with the patch. + +Before test: + +MemFree: 29422308 kB +Slab: 238104 kB + +ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 + +During Test: + +MemFree: 29431516 kB +Slab: 240940 kB + +ip6_dst_cache 11980 12064 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 + +After Test: + +MemFree: 29441816 kB +Slab: 238132 kB + +ip6_dst_cache 1902 2432 256 32 2 : tunables 0 0 0 +xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 +ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 + +Tested-by: Andrea Mayer +Signed-off-by: Jon Maxwell +Reviewed-by: David Ahern +Link: https://lore.kernel.org/r/20230112012532.311021-1-jmaxwell37@gmail.com +Signed-off-by: Jakub Kicinski +(cherry picked from commit af6d10345ca76670c1b7c37799f0d5576ccef277) +Signed-off-by: Jiping Ma +--- + include/net/dst_ops.h | 2 +- + net/core/dst.c | 8 ++------ + net/ipv6/route.c | 13 +++++-------- + 3 files changed, 8 insertions(+), 15 deletions(-) + +diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h +index 88ff7bb2bb9b..632086b2f644 100644 +--- a/include/net/dst_ops.h ++++ b/include/net/dst_ops.h +@@ -16,7 +16,7 @@ struct dst_ops { + unsigned short family; + 
unsigned int gc_thresh; + +- int (*gc)(struct dst_ops *ops); ++ void (*gc)(struct dst_ops *ops); + struct dst_entry * (*check)(struct dst_entry *, __u32 cookie); + unsigned int (*default_advmss)(const struct dst_entry *); + unsigned int (*mtu)(const struct dst_entry *); +diff --git a/net/core/dst.c b/net/core/dst.c +index fb3bcba87744..453ec8aafc4a 100644 +--- a/net/core/dst.c ++++ b/net/core/dst.c +@@ -83,12 +83,8 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, + + if (ops->gc && + !(flags & DST_NOCOUNT) && +- dst_entries_get_fast(ops) > ops->gc_thresh) { +- if (ops->gc(ops)) { +- pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); +- return NULL; +- } +- } ++ dst_entries_get_fast(ops) > ops->gc_thresh) ++ ops->gc(ops); + + dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); + if (!dst) +diff --git a/net/ipv6/route.c b/net/ipv6/route.c +index a6d5c99f65a3..b23e42efb3df 100644 +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -89,7 +89,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *); + static void ip6_dst_destroy(struct dst_entry *); + static void ip6_dst_ifdown(struct dst_entry *, + struct net_device *dev, int how); +-static int ip6_dst_gc(struct dst_ops *ops); ++static void ip6_dst_gc(struct dst_ops *ops); + + static int ip6_pkt_discard(struct sk_buff *skb); + static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); +@@ -3184,11 +3184,10 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, + return dst; + } + +-static int ip6_dst_gc(struct dst_ops *ops) ++static void ip6_dst_gc(struct dst_ops *ops) + { + struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); + int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; +- int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; + int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; + int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; + unsigned long rt_last_gc = 
net->ipv6.ip6_rt_last_gc; +@@ -3196,11 +3195,10 @@ static int ip6_dst_gc(struct dst_ops *ops) + int entries; + + entries = dst_entries_get_fast(ops); +- if (entries > rt_max_size) ++ if (entries > ops->gc_thresh) + entries = dst_entries_get_slow(ops); + +- if (time_after(rt_last_gc + rt_min_interval, jiffies) && +- entries <= rt_max_size) ++ if (time_after(rt_last_gc + rt_min_interval, jiffies)) + goto out; + + fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); +@@ -3210,7 +3208,6 @@ static int ip6_dst_gc(struct dst_ops *ops) + out: + val = atomic_read(&net->ipv6.ip6_rt_gc_expire); + atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); +- return entries > rt_max_size; + } + + static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, +@@ -6363,7 +6360,7 @@ static int __net_init ip6_route_net_init(struct net *net) + #endif + + net->ipv6.sysctl.flush_delay = 0; +- net->ipv6.sysctl.ip6_rt_max_size = 4096; ++ net->ipv6.sysctl.ip6_rt_max_size = INT_MAX; + net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; + net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; + net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; +-- +2.43.0 + diff --git a/kernel-std/debian/patches/series b/kernel-std/debian/patches/series index 04ea5b10..53702abd 100644 --- a/kernel-std/debian/patches/series +++ b/kernel-std/debian/patches/series @@ -70,4 +70,4 @@ 0073-perf-core-Fix-perf_cgroup_switch.patch 0074-perf-core-Always-set-cpuctx-cgrp-when-enable-cgroup-.patch 0075-Add-the-pci-reboot-quirk-in-DMI-table-for-Dell-Power.patch - +0076-ipv6-remove-max_size-check-inline-with-ipv4.patch