drivers/at86rf215: not returning RX/IDLE after a while of operation with RPL/UDP
Description
When using the AT86RF215 radio module driver for communication, the driver does not reliably return to the RX/IDLE state after some time in operation.
Steps to reproduce the issue
- Set up three sensor nodes, each with an AT86RF215 radio module.
(Tested with Nucleo-F411RE + ATREB215-XPRO).
- Configure them to use RPL and UDP in a simple linear topology
(Node A → Node B → Node C [sink]).
- To ensure this, apply an L2 whitelist so that each node can only reach its direct neighbor.
- On the sink node, run a simple UDP server application.
- On the source node, periodically send UDP packets (e.g., every 1500 ms) to the global IPv6 address of the sink node.
- In my tests, I used 512-byte packets so that fragmentation occurs.
- Let the setup run for several minutes to a few hours.
- Observe that the radio state of the intermediate node (A or B) eventually gets stuck in a TX/RX "sending-ACK" state (hardware/software state).
- Once this happens, the TX packet queue fills up, packets are dropped, and the RPL route breaks down.
- On the console, the following debug message appears:
gnrc_netif: can't queue packet for sending, drop it
Test application
main.c
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#include "thread.h"
#include "ztimer.h"
#include "net/gnrc.h"
#include "net/gnrc/netif.h"
#include "net/gnrc/netapi.h"
#include "net/gnrc/ipv6.h"
#include "net/gnrc/rpl.h"
#include "net/gnrc/udp.h"
#include "net/l2filter.h"
#include "at86rf215.h"
#include "at86rf215_netdev.h"
#include "at86rf215_internal.h"
#include "periph/pm.h"
/* UDP port used by the sink's server and as destination port by the senders.
 * A value supplied by the build system takes precedence over this default. */
#if !defined(UDP_PORT)
#  define UDP_PORT 22346
#endif
/**
 * Application message exchanged between the nodes.
 *
 * The leading dummy array pads the message to 504 + 4 + 4 = 512 bytes.
 */
typedef struct {
    uint8_t  data[504]; /* dummy payload bytes */
    uint32_t id;        /* identifier of the sending node */
    uint32_t seq_no;    /* sequence number of the message */
} message_t;
/* Stack memory handed to the radio watchdog thread. */
static char _radio_watchdog_stack[THREAD_STACKSIZE_LARGE];
/* PID slot for the radio watchdog thread, initialised to "undefined". */
kernel_pid_t radio_watchdog_thread_pid = KERNEL_PID_UNDEF;
void *_radio_watchdog(void *args)
{
netif_t *netif = (netif_t *) args;
gnrc_netif_t *gnrc_netif = (gnrc_netif_t *) netif;
netdev_ieee802154_t *netdev_ieee802154 = container_of(gnrc_netif->dev, netdev_ieee802154_t, netdev);
at86rf215_t *dev = container_of(netdev_ieee802154, at86rf215_t, netdev);
ztimer_sleep(ZTIMER_MSEC, 1000); // Wait for normal operation
while (1)
{
uint8_t hw_state = at86rf215_get_rf_state(dev);
uint8_t sw_state = dev->state;
if (hw_state != RF_STATE_RX || sw_state != AT86RF215_STATE_IDLE)
{
bool recovered = false;
for (int i = 0; i < 1000; i++) // 1000 checks at 1ms intervals
{
ztimer_sleep(ZTIMER_USEC, 1000);
hw_state = at86rf215_get_rf_state(dev);
sw_state = dev->state;
if (hw_state == RF_STATE_RX && sw_state == AT86RF215_STATE_IDLE)
{
recovered = true;
break;
}
}
if (!recovered) {
printf("Violency detected: HW:%s, SW:%s\n", at86rf215_hw_state2a(hw_state), at86rf215_sw_state2a((at86rf215_state_t) sw_state));
}
}
ztimer_sleep(ZTIMER_MSEC, 1000);
}
/* Should never be reached */
return NULL;
}
#if SINK_NODE == 1
/* Netreg entry that subscribes the sink thread to UDP traffic; the target PID
 * and the port (demux context) are filled in by sink(). */
static gnrc_netreg_entry_t udp_server = GNRC_NETREG_ENTRY_INIT_PID(GNRC_NETREG_DEMUX_CTX_ALL, KERNEL_PID_UNDEF);

/**
 * Sink behaviour: configure the global address, become RPL root and print
 * the ID and sequence number of every message received on UDP_PORT.
 *
 * @param netif           interface the global address is added to
 * @param ipv6_glob_addr  global IPv6 address of the sink, also used as the
 *                        address passed to the RPL root initialisation
 *
 * Returns only when the initial setup fails; otherwise it loops forever.
 */
void sink(netif_t *netif, ipv6_addr_t ipv6_glob_addr)
{
    msg_t msg;
    /* static: the queue must stay valid even on the early-return error paths */
    static msg_t _sink_msg_queue[8];

    msg_init_queue(_sink_msg_queue, 8);

    /* Initialize RPL root */
    uint16_t flags = GNRC_NETIF_IPV6_ADDRS_FLAGS_STATE_VALID;
    uint8_t prefix_len = 64;
    flags |= (prefix_len << 8U); /* prefix length is carried in the upper byte */

    /* Add address to the interface */
    if (netif_set_opt(netif, NETOPT_IPV6_ADDR, flags, (void *) &ipv6_glob_addr, sizeof(ipv6_addr_t)) < 0) {
        printf("ERROR: Could not add global IPv6 address to the interface!\n");
        return;
    }
    if (gnrc_rpl_root_init(1, &ipv6_glob_addr, false, false) == NULL) {
        printf("ERROR: Could not initialize RPL root!\n");
        return;
    }

    /* Initiate udp server to receive and process data from source nodes */
    udp_server.target.pid = thread_getpid();
    udp_server.demux_ctx = UDP_PORT;
    gnrc_netreg_register(GNRC_NETTYPE_UDP, &udp_server);

    while (true) {
        /* Wait for message from other sensor nodes */
        msg_receive(&msg);

        switch (msg.type) {
        case GNRC_NETAPI_MSG_TYPE_RCV: {
            gnrc_pktsnip_t *pkt = (gnrc_pktsnip_t *) msg.content.ptr;

            /* Never read message fields beyond the end of a short datagram. */
            if (pkt->size < sizeof(message_t)) {
                printf("ERROR: Received datagram too short (%u bytes)\n", (unsigned) pkt->size);
            }
            else {
                const message_t *data = (const message_t *) pkt->data;
                /* id and seq_no are uint32_t, so use the matching format macros */
                printf("RX MSG: ID=%" PRIu32 ", SeqNo=%" PRIu32 "\n", data->id, data->seq_no);
            }
            gnrc_pktbuf_release(pkt);
            break;
        }
        default:
            break;
        }
    }
}
#else
/**
 * Build a UDP/IPv6 packet around the given bytes and hand it to the UDP layer.
 *
 * @param addr       destination IPv6 address
 * @param port       destination port; the source port is set to port - 1
 * @param data       payload bytes
 * @param data_size  number of payload bytes
 *
 * Every failure is reported on stdout and the snips allocated so far are
 * released again.
 */
void udp_send(const ipv6_addr_t *addr, const int port, const uint8_t *data, size_t data_size)
{
    /* Allocate payload */
    gnrc_pktsnip_t *payload_snip = gnrc_pktbuf_add(NULL, data, data_size, GNRC_NETTYPE_UNDEF);
    if (!payload_snip) {
        printf("ERROR: Could not allocate payload pktsnip!\n");
        return;
    }

    /* Allocate UDP header, set source and destination port */
    gnrc_pktsnip_t *udp_snip = gnrc_udp_hdr_build(payload_snip, port - 1, port);
    if (!udp_snip) {
        printf("ERROR: Could not allocate udp_hdr pktsnip!\n");
        gnrc_pktbuf_release(payload_snip);
        return;
    }

    /* Allocate IPv6 header */
    gnrc_pktsnip_t *ip_snip = gnrc_ipv6_hdr_build(udp_snip, NULL, addr);
    if (!ip_snip) {
        printf("ERROR: Could not allocate ipv6_hdr pktsnip!\n");
        gnrc_pktbuf_release(udp_snip);
        return;
    }

    /* Send packet; a result of zero is treated as a failed dispatch */
    int dispatched = gnrc_netapi_dispatch_send(GNRC_NETTYPE_UDP, GNRC_NETREG_DEMUX_CTX_ALL, ip_snip);
    if (dispatched == 0) {
        printf("ERROR: Dispatching udp packet failed!\n");
        gnrc_pktbuf_release(ip_snip);
    }
}
/**
 * Source behaviour: send one message_t to the sink every 1500 ms, forever.
 *
 * @param ipv6_glob_addr  global IPv6 address of the sink node
 */
void source_behavior(ipv6_addr_t ipv6_glob_addr)
{
    /* Initialise the whole message: besides setting the ID this zeroes the
     * dummy array and seq_no, so no indeterminate stack bytes are sent. */
    message_t data = { .id = NODE_ID };

    while (true) {
        ztimer_sleep(ZTIMER_MSEC, 1500);
        udp_send(&ipv6_glob_addr, UDP_PORT, (uint8_t *) &data, sizeof(message_t));
        data.seq_no++;
    }
}
#endif
/* 8-byte L2 addresses of the three nodes; main() compares the local address
 * against them to decide which neighbours go into the L2 filter. */
uint8_t node_01_l2addr[8] = {
    0x66, 0xC6, 0x71, 0x46, 0xF7, 0xD6, 0xEA, 0x36
};
uint8_t node_02_l2addr[8] = {
    0x6A, 0xF8, 0xC0, 0xBD, 0xAC, 0x2A, 0x02, 0x4C
};
uint8_t node_03_l2addr[8] = {
    0xAE, 0xCA, 0x6E, 0xF7, 0xF0, 0xF5, 0x67, 0xFA
};
int main(void)
{
/* Get the network interface */
netif_t *netif = netif_iter(NULL);
/* Set up a L2 filter */
if (memcmp(((gnrc_netif_t *)netif)->l2addr, node_03_l2addr, sizeof(node_03_l2addr)) == 0)
{
l2filter_add(((gnrc_netif_t *)netif)->dev->filter, node_02_l2addr, sizeof(node_02_l2addr));
}
else if (memcmp(((gnrc_netif_t *)netif)->l2addr, node_02_l2addr, sizeof(node_02_l2addr)) == 0)
{
l2filter_add(((gnrc_netif_t *)netif)->dev->filter, node_03_l2addr, sizeof(node_03_l2addr));
l2filter_add(((gnrc_netif_t *)netif)->dev->filter, node_01_l2addr, sizeof(node_01_l2addr));
}
else if (memcmp(((gnrc_netif_t *)netif)->l2addr, node_01_l2addr, sizeof(node_01_l2addr)) == 0)
{
l2filter_add(((gnrc_netif_t *)netif)->dev->filter, node_02_l2addr, sizeof(node_02_l2addr));
}
/* Parse global ipv6 addr */
ipv6_addr_t ipv6_glob_addr;
const char* str_addr = "2001:db8::1";
ipv6_addr_from_str(&ipv6_glob_addr, str_addr);
/* Initialize RPL */
gnrc_rpl_init(netif_get_id(netif));
/* Create watchdog thread that observes the radio module, we are using */
thread_create(_radio_watchdog_stack, sizeof(_radio_watchdog_stack), 10,
THREAD_CREATE_STACKTEST,
_radio_watchdog, (void *) netif, "radio_watchdog");
/* Switch to specific behavior routine */
#if SINK_NODE == 1
sink(netif, ipv6_glob_addr);
#else
source_behavior(ipv6_glob_addr);
#endif
/* Should never be reached */
return 0;
}
Makefile
APPLICATION = at86rf215-driver-not-returning-idle
BOARD ?= native
# This has to be the absolute path to the RIOT base directory:
RIOTBASE ?= $(CURDIR)/../..
USEMODULE += netdev_default
USEMODULE += auto_init_gnrc_netif
USEMODULE += gnrc_icmpv6_error
USEMODULE += gnrc_ipv6_router_default
USEMODULE += gnrc_rpl
USEMODULE += gnrc_udp
USEMODULE += at86rf215
USEMODULE += at86rf215_24ghz
USEMODULE += l2filter_whitelist
USEMODULE += ztimer
USEMODULE += ztimer_msec
# main.c also sleeps on ZTIMER_USEC, so request that clock explicitly
USEMODULE += ztimer_usec
# Default node ID, so that -DNODE_ID never expands to an empty value when
# NODEID is not given on the command line
NODEID ?= 0
CFLAGS += -DNODE_ID=$(NODEID)
# 1 builds the sink (RPL root + UDP server), anything else the source role
SINK_NODE ?= 0
CFLAGS += -DSINK_NODE=$(SINK_NODE)
# Comment this out to disable code in RIOT that does safety checking
# which is not needed in a production environment but helps in the
# development process:
DEVELHELP ?= 1
# Change this to 0 show compiler invocation lines by default:
QUIET ?= 1
EXTERNAL_BOARD_DIRS += $(RIOTBASE)/tests/build_system/external_board_dirs/esp-ci-boards
include $(RIOTBASE)/Makefile.include
# Set a custom channel if needed
include $(RIOTMAKE)/default-radio-settings.inc.mk
This is the code I used for reproducing the issue. You can use it to test this yourself: just put your boards' L2 (MAC) addresses into the L2 filter (the `node_0x_l2addr` arrays in main.c) and build the application with the following flags:
- `make flash BOARD=yourboard NODEID=randomID SINK_NODE=1` for the sink node.
- `make flash BOARD=yourboard NODEID=randomID SINK_NODE=0` for all other (source and intermediate) nodes.
Expected results
The radio should always return to the RX/IDLE state (hardware/software state), even under heavy load.
Actual results
After a period of operation, the radio module remains stuck in the TX/RX "sending-ACK" state (hardware/software state).
As a result, the TX packet queue overflows, packets are dropped, and communication is disrupted.
Versions
- Operating System: Ubuntu 20.04.6 LTS (Focal Fossa)
- Compiler: arm-none-eabi-gcc (15:9-2019-q4-0ubuntu1) 9.2.1 20191025 (release) [ARM/arm-9-branch revision 277599]
- RIOT OS: e49b7fd3a4452f238658f208def901b2c6d77e22
After hours of debugging with a logic analyzer I identified the root cause.
The MAC (netif) thread handles both (a) outgoing send requests (sent by upper layers) and (b) IRQ-completion work from the radio ISR; both are delivered to the same thread via the same IPC queue. When the upper layer wants to send, it posts an IPC message to the MAC thread to initiate transmission. When the radio receives a frame, a small ISR posts an IPC message to the MAC thread so the driver can perform the more complex IRQ handling.
A race occurs when a send IPC message arrives immediately before a frame-end-IRQ IPC message. The MAC thread then processes the two messages in an ordering that confuses the radio state machine: the driver ends up in a TX/RX "sending-ACK" state and never returns to RX/IDLE. Once the radio is stuck, the TX queue fills and packets are dropped, which breaks forwarding and the RPL route.
Logic Analyzer Snip
The IRQ line shows two interrupt events:
- Automatic Gain Control (AGC) Hold interrupt
- Frame End interrupt together with AGC Release.
Before the Frame End IPC message is processed by the MAC thread, a send IPC message arrives in its IPC queue.
This is visible because RX_DONE (driver’s RX_COMPLETE callback) occurs after TX_PREP. TX_PREP is triggered when _send() is called: the frame is loaded into the TX buffer and transmission is prepared. This proves _send() was executed long before the Frame End interrupt was handled, since TX buffer loading takes several hundred microseconds. After the EDC (Energy Detection CCA) starts, the MAC thread now processes the Frame End IPC message and the RX frame handling begins.
Afterward the ACK should be transmitted, but instead, the radio has already been forced into RX state for the EDC due to the premature _send().
Result: the ACK is never sent and the radio state machine deadlocks -> stuck in TX/RX-sending-ACK.
At the moment, I’m not entirely sure how to fully resolve this issue. Fundamentally, we need to guarantee that no transmission is initiated while the radio is in the process of receiving a frame (although loading a frame into the TX buffer beforehand is fine). One possible approach would be to use the AGC Hold interrupt, which triggers immediately after the first preamble bits are detected. At that point, we could update the internal radio state from IDLE to something like RX_INCOMING/STARTED. This way, the _send() function would recognize that the radio is no longer idle and block any concurrent transmission attempts.
That said, this approach only reduces the likelihood of hitting the race condition; it doesn't eliminate it entirely. Still, I think it would be the best fix, since the AGC Hold interrupt is the earliest available indication that a frame reception is under way. Any suggestions?
I haven't looked at the code, just your description. A weirdness about 802.15.4 is that it sends L2 ACKs, so receiving needs to also be able to send. It seems like that is part of the challenge?
Could it be that the send IPC message is doing too much? That all it should do is wake up the MAC thread, which ought to examine a send Q, and then schedule itself.
When an ACK is requested, the radio automatically switches to the TX state and schedules the ACK transmission. This occurs 192 µs after frame reception has completed. The problem arises because a premature data transmission, triggered via the TXPREP command, preempts this automatic state change.
The underlying issue is that the driver’s _send() function decides whether transmission can begin in at86rf215_tx_exec() solely based on the radio state, without considering that the radio may already be in the process of receiving a frame.
/*
 * Arm the transmission of the frame whose length is stored in
 * dev->tx_frame_len.
 *
 * The frame length is written to the TXFLL register, both retry counters are
 * reloaded from their configured maxima and the TX-pending flag is set. TXPREP
 * is commanded right away only when the software state is IDLE; in any other
 * state the function merely logs a debug message.
 *
 * dev: device descriptor to operate on
 * Returns 0 in all cases.
 */
int at86rf215_tx_exec(at86rf215_t *dev)
{
/* write frame length */
at86rf215_reg_write16(dev, dev->BBC->RG_TXFLL, dev->tx_frame_len);
/* start with the full budget of retries and CSMA retries */
dev->retries = dev->retries_max;
dev->csma_retries = dev->csma_retries_max;
dev->flags |= AT86RF215_OPT_TX_PENDING;
/* CCA is marked pending only when CSMA is enabled and CCATX is not */
if ((dev->flags & AT86RF215_OPT_CSMA) && !(dev->flags & AT86RF215_OPT_CCATX)) {
dev->flags |= AT86RF215_OPT_CCA_PENDING;
}
/* NOTE(review): only the software state is consulted here; per the
 * surrounding discussion it can still read IDLE while a frame is arriving */
if (dev->state == AT86RF215_STATE_IDLE) { // <-- Suggested fix: Check if the AGCH bit is set in the device flags
at86rf215_rf_cmd(dev, CMD_RF_TXPREP);
} else {
/* presumably the pending flags cause the TX to start later — not visible here */
DEBUG("[at86rf215] will TX after %s\n", at86rf215_sw_state2a(dev->state));
}
return 0;
}
Currently, if the radio state is reported as AT86RF215_STATE_IDLE, the driver immediately initiates transmission. However, the radio remains in the AT86RF215_STATE_IDLE state during the early phase of frame reception, so the driver mistakenly assumes it is safe to transmit. This leads to the collision between the automatic ACK handling and the manually initiated transmission.
The fix I currently have in mind is to add a check for the AGCH bit in the device flags. The AGCH interrupt can be enabled; it indicates that the radio is receiving valid start symbols of the preamble. With this check, the transmission is delayed until the radio has finished receiving the current frame, effectively narrowing the window in which the race condition can occur. However, this does not entirely solve the problem, as there can still be cases in which the radio raises the AGCH interrupt just after a transmission has been started...
Thank you for debugging this! I now wonder if it would be easier converting the driver to the new radio HAL / subMAC so we can use the common 802.15.4 softMAC. 🤔
I think a refactoring of the driver is needed anyway. Currently, the CCATX feature is exposed only as an optional setting. During the refactoring or conversion process, I suggest enabling CCATX by default, since it significantly reduces the interrupt load during transmissions. (If I remember correctly, from 3 to 1 interrupt(s) at each start of a message transmission.)
However, the race condition will still occur and I'm "racking" my brain trying to figure out how to solve this.