source.openwrt.melmac.net
source.openwrt.melmac.net copied to clipboard
[pbr] issue: PBR intermittently ineffective after a few hours and all traffic routed over VPN
Describe the bug
I have a router configured to connect three LANs (lan, lan2, and guest) to two OpenVPN clients (vpn0 and vpn1). I use pbr to whitelist some domains and subdomains to be routed over my WAN or my second VPN. I recently upgraded pbr (from a version about a year old) and, after domain resolution stopped working, changed my resolver_set option from dnsmasq.ipset to dnsmasq.nftset. Since then, pbr seems to stop having an effect after a short time (1-2 hours), and only rebooting the router (or sometimes /etc/init.d/network restart) helps. Restarting pbr does not. It's unclear to me why or when this happens.
Your configs
/etc/config/dhcp
config dnsmasq
option domainneeded '1'
option boguspriv '1'
option filterwin2k '0'
option localise_queries '1'
option rebind_protection '1'
option rebind_localhost '1'
option local '/lan/'
option domain 'xxx'
option expandhosts '1'
option nonegcache '0'
option cachesize '1000'
option authoritative '1'
option readethers '1'
option leasefile '/tmp/dhcp.leases'
option resolvfile '/tmp/resolv.conf.d/resolv.conf.auto'
option nonwildcard '1'
option localservice '1'
option ednspacket_max '1232'
option filter_aaaa '0'
option filter_a '0'
option sequential_ip '1'
list server '9.9.9.9'
list server '149.112.112.112'
option confdir '/tmp/dnsmasq.d'
config dhcp 'lan'
option interface 'lan'
option start '258'
option limit '253'
option leasetime '12h'
option dhcpv4 'server'
config dhcp 'wan'
option interface 'wan'
option ignore '1'
config odhcpd 'odhcpd'
option maindhcp '0'
option leasefile '/tmp/hosts/odhcpd'
option leasetrigger '/usr/sbin/odhcpd-update'
option loglevel '4'
config dhcp 'vpn1'
option interface 'vpn1'
option start '258'
option limit '253'
option leasetime '12h'
config dhcp 'guest'
option interface 'guest'
option start '258'
option limit '253'
option leasetime '12h'
option force '1'
list dhcp_option '6,9.9.9.9'
list dhcp_option '6,149.112.112.112'
/etc/config/network
config interface 'loopback'
option device 'lo'
option proto 'static'
option ipaddr '127.0.0.1'
option netmask '255.0.0.0'
config globals 'globals'
config device
option name 'br-lan'
option type 'bridge'
list ports 'lan1'
list ports 'lan2'
list ports 'lan3'
list ports 'lan4'
config interface 'lan'
option device 'br-lan.1'
option proto 'static'
option ipaddr 'xxx.xxx.xxx.xxx'
option netmask '255.255.254.0'
option ipv6 '0'
option delegate '0'
config device
option name 'wan'
option macaddr 'xx:xx:xx:xx:xx:xx'
config interface 'wan'
option device 'wan'
option proto 'dhcp'
option hostname '*'
option ipv6 '0'
option delegate '0'
option peerdns '0'
config interface 'wan6'
option device 'wan'
option proto 'none'
option auto '0'
option delegate '0'
option peerdns '0'
config interface 'lan2'
option type 'bridge'
option proto 'static'
option ipv6 '0'
option delegate '0'
option ipaddr 'xxx.xxx.xxx.xxx'
option netmask '255.255.254.0'
config interface 'guest'
option proto 'static'
option ipv6 '0'
option delegate '0'
option ipaddr 'xxx.xxx.xxx.xxx'
option netmask '255.255.254.0'
option device 'br-lan.2'
config bridge-vlan
option device 'br-lan'
option vlan '1'
list ports 'lan2'
list ports 'lan3'
list ports 'lan4'
config bridge-vlan
option device 'br-lan'
option vlan '2'
list ports 'lan1'
config interface 'vpn1'
option proto 'none'
option device 'ovpnc1'
config interface 'vpn0'
option proto 'none'
option device 'ovpnc0'
/etc/config/firewall
config defaults
option syn_flood '1'
option input 'REJECT'
option output 'ACCEPT'
option forward 'REJECT'
config zone
option name 'lan'
list network 'lan'
option input 'ACCEPT'
option output 'ACCEPT'
option forward 'REJECT'
config zone
option name 'wan'
list network 'wan'
list network 'wan6'
list network 'vpn0'
list network 'vpn1'
option input 'REJECT'
option output 'ACCEPT'
option forward 'REJECT'
option masq '1'
option mtu_fix '1'
config forwarding
option src 'lan'
option dest 'wan'
config rule
option name 'Allow-DHCP-Renew'
option src 'wan'
option proto 'udp'
option dest_port '68'
option target 'ACCEPT'
option family 'ipv4'
config rule
option name 'Allow-Ping'
option src 'wan'
option proto 'icmp'
option icmp_type 'echo-request'
option family 'ipv4'
option target 'ACCEPT'
config rule
option name 'Allow-IGMP'
option src 'wan'
option proto 'igmp'
option family 'ipv4'
option target 'ACCEPT'
config rule
option name 'Allow-DHCPv6'
option src 'wan'
option proto 'udp'
option dest_port '546'
option family 'ipv6'
option target 'REJECT'
config rule
option name 'Allow-MLD'
option src 'wan'
option proto 'icmp'
option src_ip 'fe80::/10'
list icmp_type '130/0'
list icmp_type '131/0'
list icmp_type '132/0'
list icmp_type '143/0'
option family 'ipv6'
option target 'REJECT'
config rule
option name 'Allow-ICMPv6-Input'
option src 'wan'
option proto 'icmp'
list icmp_type 'echo-request'
list icmp_type 'echo-reply'
list icmp_type 'destination-unreachable'
list icmp_type 'packet-too-big'
list icmp_type 'time-exceeded'
list icmp_type 'bad-header'
list icmp_type 'unknown-header-type'
list icmp_type 'router-solicitation'
list icmp_type 'neighbour-solicitation'
list icmp_type 'router-advertisement'
list icmp_type 'neighbour-advertisement'
option limit '1000/sec'
option family 'ipv6'
option target 'REJECT'
config rule
option name 'Allow-ICMPv6-Forward'
option src 'wan'
option dest '*'
option proto 'icmp'
list icmp_type 'echo-request'
list icmp_type 'echo-reply'
list icmp_type 'destination-unreachable'
list icmp_type 'packet-too-big'
list icmp_type 'time-exceeded'
list icmp_type 'bad-header'
list icmp_type 'unknown-header-type'
option limit '1000/sec'
option family 'ipv6'
option target 'REJECT'
config rule
option name 'Allow-IPSec-ESP'
option src 'wan'
option dest 'lan'
option proto 'esp'
option target 'ACCEPT'
config rule
option name 'Allow-ISAKMP'
option src 'wan'
option dest 'lan'
option dest_port '500'
option proto 'udp'
option target 'ACCEPT'
config zone
option name 'lan2'
option network 'lan2'
option input 'ACCEPT'
option output 'ACCEPT'
option forward 'REJECT'
config zone
option name 'guest'
option network 'guest'
option input 'REJECT'
option output 'ACCEPT'
option forward 'REJECT'
config rule
option name 'Allow-Guest-DHCP'
option src 'guest'
option dest_port '67'
option proto 'udp'
option family 'ipv4'
option target 'ACCEPT'
config zone
option name 'vpn0'
option network 'vpn0'
option input 'REJECT'
option output 'ACCEPT'
option forward 'REJECT'
option masq '1'
option mtu_fix '1'
config zone
option name 'vpn1'
option network 'vpn1'
option input 'REJECT'
option output 'ACCEPT'
option forward 'REJECT'
option masq '1'
option mtu_fix '1'
config forwarding
option src 'lan2'
option dest 'wan'
config include 'pbr'
option fw4_compatible '1'
option type 'script'
option path '/usr/share/pbr/pbr.firewall.include'
/etc/config/pbr
config pbr 'config'
option enabled '1'
option verbosity '2'
option strict_enforcement '1'
option resolver_set 'dnsmasq.nftset'
option ipv6_enabled '0'
list ignored_interface 'vpnserver'
list ignored_interface 'wgserver'
option boot_timeout '30'
option rule_create_option 'add'
option secure_reload '1'
option procd_reload_delay '6'
option webui_show_ignore_target '0'
list webui_supported_protocol 'all'
list webui_supported_protocol 'tcp'
list webui_supported_protocol 'udp'
list webui_supported_protocol 'tcp udp'
list webui_supported_protocol 'icmp'
list supported_interface 'vpn0'
list supported_interface 'vpn1'
config include
option path '/usr/share/pbr/pbr.user.aws'
option enabled '0'
config include
option path '/usr/share/pbr/pbr.user.netflix'
option enabled '0'
config policy
option name 'Plex/Emby Local Server'
option interface 'wan'
option src_port '8096 8920 32400'
option enabled '0'
config policy
option name 'Plex/Emby Remote Servers'
option interface 'wan'
option dest_addr 'plex.tv my.plexapp.com emby.media app.emby.media tv.emby.media'
option enabled '0'
config policy
option name 'espn-wan'
option dest_addr 'espn.api.edge.bamgrid.com d2f2ekwwtg17a.cloudfront.net'
option interface 'wan'
config policy
option name 'espn-vpn'
option dest_addr 'espn.com'
option interface 'vpn1'
config policy
option name 'hulu-wan'
option dest_addr 'live-mediashield-hulustream-com.akamaized.net hulu.com live-f.hulustream.com' # auth.hulu.com play.hulu.com
option interface 'wan'
To Reproduce/Expected Behavior
After a reboot, everything works as expected:
[host-connected-to-lan]% resolvectl flush-caches
[host-connected-to-lan]% traceroute -m 2 espn.api.edge.bamgrid.com
traceroute to espn.api.edge.bamgrid.com (108.156.83.56), 30 hops max, 60 byte packets
1 openwrt-router.xxx (<router-ip>) 0.992 ms 1.261 ms 2.156 ms
2 <isp-hop-ip> (<isp-hop-ip>) 10.601 ms 18.178 ms 18.156 ms
[host-connected-to-lan]% traceroute -m 2 hulu.com
traceroute to hulu.com (173.222.162.171), 30 hops max, 60 byte packets
1 openwrt-router.xxx (<router-ip>) 1.363 ms 1.526 ms 1.688 ms
2 <isp-hop-ip> (<isp-hop-ip>) 11.791 ms 11.769 ms 11.750 ms
[host-connected-to-lan]% traceroute -m 2 espn.com
traceroute to espn.com (65.8.248.30), 30 hops max, 60 byte packets
1 openwrt-router.xxx (<router-ip>) 1.482 ms 1.672 ms 2.165 ms
2 <vpn1-gateway-ip> (<vpn1-gateway-ip>) 43.936 ms 44.107 ms 44.434 ms
[host-connected-to-lan]% traceroute -m 2 wikipedia.org
traceroute to wikipedia.org (208.80.154.224), 30 hops max, 60 byte packets
1 openwrt-router.xxx (<router-ip>) 3.500 ms 4.307 ms 4.568 ms
2 <vpn0-gateway-ip> (<vpn0-gateway-ip>) 46.380 ms 52.960 ms 53.128 ms
After some period of time (usually 1-2 hours) without intervention, things deteriorate, and all traffic is routed through vpn0 and /etc/init.d/pbr restart does not fix it:
[host-connected-to-lan]% resolvectl flush-caches
[host-connected-to-lan]% traceroute espn.api.edge.bamgrid.com
traceroute to espn.api.edge.bamgrid.com (108.156.83.56), 30 hops max, 60 byte packets
1 openwrt-router.xxx (<router-ip>) 2.651 ms 2.874 ms 3.136 ms
2 <vpn0-gateway-ip> (<vpn0-gateway-ip>) 46.546 ms 49.302 ms 49.431 ms
3 ...
[host-connected-to-lan]% traceroute -m 2 hulu.com
traceroute to hulu.com (173.222.162.171), 30 hops max, 60 byte packets
1 openwrt-router.xxx (<router-ip>) 2.780 ms 2.942 ms 3.211 ms
2 <vpn0-gateway-ip> (<vpn0-gateway-ip>) 47.044 ms 47.253 ms 47.517 ms
[host-connected-to-lan]% traceroute -m 2 espn.com
traceroute to espn.com (65.8.248.30), 30 hops max, 60 byte packets
1 openwrt-router.xxx (<router-ip>) 2.702 ms 4.836 ms 5.085 ms
2 <vpn0-gateway-ip> (<vpn0-gateway-ip>) 50.497 ms 50.617 ms 50.765 ms
[host-connected-to-lan]% traceroute -m 2 wikipedia.org
traceroute to wikipedia.org (208.80.154.224), 30 hops max, 60 byte packets
1 openwrt-router.xxx (<router-ip>) 3.402 ms 3.644 ms 3.882 ms
2 <vpn0-gateway-ip> (<vpn0-gateway-ip>) 48.074 ms 48.293 ms 48.519 ms
[host-connected-to-lan]% ssh <router-ip> /etc/init.d/pbr restart
...
[host-connected-to-lan]% sleep 300 ; resolvectl flush-caches
[host-connected-to-lan]% traceroute ... # everything *still* routed through vpn0
I can't find anything of note in the logs after I notice this happening.
Policy Routing run-time information
- Output of /etc/init.d/pbr reload with verbosity set to 2:
Activating traffic killswitch [✓]
Setting up routing for 'wan/<isp-gateway-ip>' [✓]
Setting up routing for 'vpn0/ovpnc0/<vpn0-gateway-ip>' [✓]
Setting up routing for 'vpn1/ovpnc1/<vpn1-gateway-ip>' [✓]
Routing 'espn-wan' via wan [✓]
Routing 'espn-vpn' via wan [✓]
Routing 'hulu-wan' via wan [✓]
Deactivating traffic killswitch [✓]
pbr 1.1.1-7 monitoring interfaces: wan vpn0 vpn1
pbr 1.1.1-7 (nft) started with gateways:
wan/<isp-gateway-ip>
vpn0/ovpnc0/<vpn0-gateway-ip> [✓]
vpn1/ovpnc1/<vpn1-gateway-ip>
- Output of /etc/init.d/pbr status:
============================================================
pbr - environment
pbr 1.1.1-7 running on OpenWrt 23.05.0. WAN (IPv4): wan/wan/<isp-gateway-ip>.
============================================================
Dnsmasq version 2.89 Copyright (c) 2000-2022 Simon Kelley
Compile time options: IPv6 GNU-getopt no-DBus UBus no-i18n no-IDN DHCP DHCPv6 no-Lua TFTP conntrack no-ipset nftset auth cryptohash DNSSEC no-ID loop-detect inotify dumpfile
============================================================
pbr chains - policies
chain pbr_forward { # handle 58
}
chain pbr_input { # handle 59
}
chain pbr_output { # handle 60
}
chain pbr_prerouting { # handle 61
ip daddr @pbr_wan_4_dst_ip_cfg0b6ff5 goto pbr_mark_0x010000 comment "espn-wan" # handle 1576
ip daddr @pbr_vpn1_4_dst_ip_cfg0c6ff5 goto pbr_mark_0x030000 comment "espn-vpn" # handle 1578
ip daddr @pbr_wan_4_dst_ip_cfg0d6ff5 goto pbr_mark_0x010000 comment "hulu-wan" # handle 1580
}
chain pbr_postrouting { # handle 62
}
============================================================
pbr chains - marking
chain pbr_mark_0x010000 { # handle 1554
counter packets 6 bytes 360 meta mark set meta mark & 0xff01ffff | 0x00010000 # handle 1555
return # handle 1556
}
chain pbr_mark_0x020000 { # handle 1557
counter packets 0 bytes 0 meta mark set meta mark & 0xff02ffff | 0x00020000 # handle 1558
return # handle 1559
}
chain pbr_mark_0x030000 { # handle 1560
counter packets 52 bytes 3120 meta mark set meta mark & 0xff03ffff | 0x00030000 # handle 1561
return # handle 1562
}
============================================================
pbr nft sets
set pbr_wan_4_dst_ip_cfg0b6ff5 { # handle 1575
type ipv4_addr
flags interval
counter
auto-merge
comment "espn-wan"
elements = { 108.156.83.55-108.156.83.56 counter packets 0 bytes 0, 108.156.83.90 counter packets 6 bytes 360,
108.156.83.123 counter packets 0 bytes 0 }
}
set pbr_vpn1_4_dst_ip_cfg0c6ff5 { # handle 1577
type ipv4_addr
flags interval
counter
auto-merge
comment "espn-vpn"
elements = { 65.8.248.4 counter packets 52 bytes 3120, 65.8.248.18 counter packets 0 bytes 0,
65.8.248.25 counter packets 0 bytes 0, 65.8.248.30 counter packets 0 bytes 0 }
}
set pbr_wan_4_dst_ip_cfg0d6ff5 { # handle 1579
type ipv4_addr
flags interval
counter
auto-merge
comment "hulu-wan"
}
============================================================
dnsmasq sets
nftset=/espn.api.edge.bamgrid.com/4#inet#fw4#pbr_wan_4_dst_ip_cfg0b6ff5 # espn-wan
nftset=/d2f2ekwwtg17a.cloudfront.net/4#inet#fw4#pbr_wan_4_dst_ip_cfg0b6ff5 # espn-wan
nftset=/espn.com/4#inet#fw4#pbr_vpn1_4_dst_ip_cfg0c6ff5 # espn-vpn
nftset=/live-mediashield-hulustream-com.akamaized.net/4#inet#fw4#pbr_wan_4_dst_ip_cfg0d6ff5 # hulu-wan
nftset=/hulu.com/4#inet#fw4#pbr_wan_4_dst_ip_cfg0d6ff5 # hulu-wan
nftset=/live-f.hulustream.com/4#inet#fw4#pbr_wan_4_dst_ip_cfg0d6ff5 # hulu-wan
============================================================
IPv4 table 256 route: default via <isp-gateway-ip> dev wan
IPv4 table 256 rule(s):
30000: from all fwmark 0x10000/0xff0000 lookup pbr_wan
IPv4 table 257 route: default via <vpn0-gateway-ip> dev ovpnc0
IPv4 table 257 rule(s):
30001: from all fwmark 0x20000/0xff0000 lookup pbr_vpn0
IPv4 table 258 route: default via <vpn1-gateway-ip> dev ovpnc1
IPv4 table 258 rule(s):
30002: from all fwmark 0x30000/0xff0000 lookup pbr_vpn1
I made some tweaks to my firewall and pbr, and now things appear to have stabilized, so I'm going to close this to avoid unproductively consuming your bandwidth. I'll reopen if I see it again.
Thanks for being proactive on this. What's the reason for the espn-vpn policy? My hunch is that's where the problem is. Maybe the espn.api.edge.bamgrid.com matches some higher level espn.com domain which eventually gets added to the set and the routing switches to VPN.
Also, if you're already on the full nft version you may want to switch to a newer pbr from my repo which supports (and uses by default) the fw4-compatible nft file.
Reopening because this is still an issue for me.
> Thanks for being proactive on this. What's the reason for the espn-vpn policy? My hunch is that's where the problem is. Maybe the espn.api.edge.bamgrid.com matches some higher level espn.com domain which eventually gets added to the set and the routing switches to VPN.
I'm not sure I understand this. Are you saying that there is overlap of IPs among espn.api.edge.bamgrid.com (which maps to d2f2ekwwtg17a.cloudfront.net) and something in the espn.com domain? How would that explain that all traffic (including that going to hulu.com) is routed via vpn0?
> Also, if you're already on the full nft version you may want to switch to a newer pbr from my repo which supports (and uses by default) the fw4-compatible nft file.
I tried following your advice and now appear to be in a worse spot. More specifically, I added your repo per these instructions. Then I did opkg update ; opkg upgrade pbr luci-app-pbr. This resulted in errors (note the Installing fw4 nft file line below), and now pbr appears not to work at all (either nothing is routed, or all traffic is routed via vpn0, regardless of whether pbr is active).
# opkg upgrade luci-app-pbr pbr
Upgrading luci-app-pbr on root from 1.1.1-7 to 1.1.3-25...
Downloading https://repo.openwrt.melmac.net/luci-app-pbr_1.1.3-25_all.ipk
Upgrading pbr on root from 1.1.1-7 to 1.1.3-25...
Downloading https://repo.openwrt.melmac.net/pbr_1.1.3-25_all.ipk
Stopping pbr service...
Activating traffic killswitch [✓]
Removing routing for 'wan/<isp-gateway-ip>' [✓]
Removing routing for 'vpn0/ovpnc0/<vpn0-gateway-ip>' [✓]
Removing routing for 'vpn1/ovpnc1/<vpn1-gateway-ip>' [✓]
Deactivating traffic killswitch [✓]
pbr 1.1.1-7 (nft) stopped [✓]
OK
Removing rc.d symlink for pbr... OK
Command failed: Not found
Removing obsolete file /etc/hotplug.d/iface/70-pbr.
Removing obsolete file /usr/share/pbr/pbr.firewall.include.
Configuring pbr.
Installing rc.d symlink for pbr... OK
Setting up routing for 'wan/<isp-gateway-ip>' [✓]
Setting up routing for 'vpn0/ovpnc0/<vpn0-gateway-ip>' [✓]
Setting up routing for 'vpn1/ovpnc1/<vpn1-gateway-ip>' [✓]
Routing 'espn-wan' via wan [✓]
Routing 'espn-vpn' via wan [✓]
Routing 'hulu-wan' via wan [✓]
Installing fw4 nft file [✗]
Restarting dnsmasq [✓]
pbr 1.1.3-25 monitoring interfaces: wan vpn0 vpn1
pbr 1.1.3-25 (nft mode) started with gateways:
wan/<isp-gateway-ip>
vpn0/ovpnc0/<vpn0-gateway-ip> [✓]
vpn1/ovpnc1/<vpn1-gateway-ip>
ERROR: Failed to install fw4 nft file '/var/run/pbr.nft'!
Configuring luci-app-pbr.
Collected errors:
* resolve_conffiles: Existing conffile /etc/config/pbr is different from the conffile in the new package. The new conffile will be placed at /etc/config/pbr-opkg.
The only reason I've seen nft file fail to install is the IP overlap which nft doesn't handle gracefully. You can make the new version work same as before by setting nft_file_support option to 0.
You can get additional information on the failure by keeping pbr in nft_file_support mode and running: nft -c -f /var/run/pbr.nft.
Okay, I think these came from me fiddling with stuff after my original observation of this issue. I filed #194 as a result of that investigation. I've now removed/corrected my entries to comply with the work-around described in that issue, so no more nft error. After upgrading and rebooting my router, routing appears to work as desired, until I do /etc/init.d/pbr reload. From that point on, all traffic is routed via vpn0 indefinitely and pbr does not appear to affect any routing whatsoever. I've also tried ...
/etc/init.d/pbr stop
# wait for completion
/etc/init.d/openvpn restart
# manually wait for "Initialization Sequence Completed" in logs and verify all traffic now routed to `vpn0`
/etc/init.d/pbr start
# wait for completion
... and ...
/etc/init.d/pbr stop
# wait for completion
/etc/init.d/network restart
# wait for completion; now all traffic is routed through `wan`
/etc/init.d/openvpn restart
# manually wait for "Initialization Sequence Completed" in logs and verify all traffic now routed to `vpn0`
/etc/init.d/pbr start
# wait for completion
... all with the same effect. Once in that state, the only cure seems to be rebooting the router.
As an aside, after rebooting, and routing has been reestablished, doing /etc/init.d/openvpn restart without touching /etc/init.d/pbr at least initially appears to retain policy-based routing behavior.
/etc/init.d/pbr stop
# wait for completion
/etc/init.d/network restart
# wait for completion; now all traffic is routed through `wan`
/etc/init.d/openvpn restart
# manually wait for "Initialization Sequence Completed" in logs and verify all traffic now routed to `vpn0`
/etc/init.d/pbr start
# wait for completion
What's the output of the start command you ran last and the service pbr status at this point?
@mtompkins this issue is about pbr not working a few hours after start; you're describing a problem that occurs immediately after reboot. Those are not the same issue. If you want me to have a look at what's happening on your system, then:
- Use the latest pbr from my repo
- open a new issue
- include information requested in the Getting Help section of the README
@stangri I was not clear in my most recent post that there is also a failure after start. I was trying to provide some information in case there is commonality between the previous poster's problem and my own, since, as I said, I have the same experience over time.
Thank you for offering to look at my issue, but my only intent was to try and help you root cause this one. I'll not contribute further here.