pwru
pwru copied to clipboard
Collect MTU from skb->_skb_refdst
According to the kernel source code, the kernel takes the MTU from the dst entry referenced by skb->_skb_refdst. Make pwru collect the MTU from there using the same logic.
This requires casting skb->_skb_refdst (masked with SKB_DST_PTRMASK, as skb_dst() does) to a struct dst_entry *, then reading dst_metric_raw(dst, RTAX_MTU) and, if that is zero, falling back to dst->dev->mtu.
With this patch, pwru can output the MTU that the kernel actually uses.
Case 1: Cilium could reduce the route MTU to 1423 inside a pod. Until this patch, pwru could not detect this because it only checked the link MTU.
Case 2: Xfrm could reduce the route MTU to 1446 in ip_forward(). pwru must inspect the route MTU from skb->_skb_refdst to observe this; this patch makes that possible.
// https://elixir.bootlin.com/linux/v6.5/source/net/ipv4/ip_forward.c#L86
// net/ipv4/ip_forward.c
int ip_forward(struct sk_buff *skb)
{
[...]
rt = skb_rtable(skb);
[...]
mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
if (ip_exceeds_mtu(skb, mtu)) {
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
SKB_DR_SET(reason, PKT_TOO_BIG);
goto drop;
}
[...]
}
// include/linux/skbuff.h
static inline struct rtable *skb_rtable(const struct sk_buff *skb)
{
return (struct rtable *)skb_dst(skb);
}
// include/linux/skbuff.h
static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
{
[...]
return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK);
}
// include/net/ip.h
static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
bool forwarding)
{
[...]
mtu = dst_metric_raw(dst, RTAX_MTU);
if (mtu)
goto out;
mtu = READ_ONCE(dst->dev->mtu);
[...]
}
// include/net/dst.h
#define DST_METRICS_FLAGS 0x3UL
#define __DST_METRICS_PTR(Y) \
((u32 *)((Y) & ~DST_METRICS_FLAGS))
#define DST_METRICS_PTR(X) __DST_METRICS_PTR((X)->_metrics)