piolib: data transfers slower than expected
In my work for Adafruit, I've implemented a PIO program for driving HUB75-style displays.
The data transfer to the PIO peripheral is slower than anticipated, topping out at about 10MB/s. That's too bad, as we ideally would like to run at several times that speed. (naively, I'd expected that like on rp2, we could keep it fed with data every cycle even at the highest PIO frequencies)
Here is a simple reproducer that requires no hardware -- it just does an out x, 32 every cycle, consuming a FIFO entry each time. So e.g., at 1MHz it should consume 4MB/s, at 5MHz it should consume 20MB/s, etc. However, the transfer speed tops out at around 10MB/s:
$ for frequency in 1e6 2e6 5e6 10e6 20e6 200e6; do for xfersize in 65532; do ./build/examples/bench $frequency $xfersize ; done; done 2>/dev/null
{"frequency": 1e+06, "xfer_size": 65532, "rate": 3.99201e+06}
{"frequency": 2e+06, "xfer_size": 65532, "rate": 7.96719e+06}
{"frequency": 5e+06, "xfer_size": 65532, "rate": 1.07482e+07}
{"frequency": 1e+07, "xfer_size": 65532, "rate": 1.07461e+07}
{"frequency": 2e+07, "xfer_size": 65532, "rate": 1.07484e+07}
{"frequency": 2e+08, "xfer_size": 65532, "rate": 1.0746e+07}
Notice how the top rate is about 1e7 (i.e., 10MB/s), and does not continue increasing as the clock rate increases.
Firmware & kernel:
$ uname -a
Linux m5 6.6.70-v8+ #1 SMP PREEMPT Fri Jan 10 13:53:47 UTC 2025 aarch64 GNU/Linux
$ vcgencmd version
2025/01/08 17:52:48
Copyright (c) 2012 Broadcom
version 97facbf4 (release) (embedded)
My test program
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "piolib.h"
#include "ws2812.pio.h"
// Wrap range of the benchmark program, relative to its load offset. The
// program is a single instruction, so it wraps from instruction 0 back to 0.
#define bench_wrap_target 0
#define bench_wrap 0
// Instruction words of the benchmark program, one 16-bit word per instruction.
static const uint16_t bench_program_instructions[] = {
// .wrap_target
0x6020, // out x, 32
// .wrap
};
// Descriptor handed to pio_add_program(); .length is the number of entries
// in bench_program_instructions.
// NOTE(review): .origin = -1 presumably means "no fixed load address" --
// confirm against piolib.
static const struct pio_program bench_program = {
.instructions = bench_program_instructions,
.length = 1,
.origin = -1,
};
/*
 * Build the baseline state-machine configuration for the benchmark program.
 *
 * Starts from the piolib default configuration and moves the wrap range to
 * where the program was actually loaded.
 *
 * offset: instruction-memory offset the program was loaded at.
 * Returns the configuration by value.
 */
static inline pio_sm_config bench_program_get_default_config(uint offset) {
    const uint wrap_first = offset + bench_wrap_target;
    const uint wrap_last = offset + bench_wrap;
    pio_sm_config cfg = pio_get_default_sm_config();

    sm_config_set_wrap(&cfg, wrap_first, wrap_last);
    // NOTE(review): the benchmark instruction carries no side-set value; this
    // 1-bit side-set setting looks like a leftover from the ws2812 example --
    // confirm whether it is intended.
    sm_config_set_sideset(&cfg, 1, false, false);
    return cfg;
}
/*
 * Configure and start state machine `sm` of `pio` on the benchmark program
 * loaded at `offset`, clocked as close to `freq` Hz as the divider allows.
 *
 * The divider is clk_sys / freq, clamped to [1, 65535] and truncated to a
 * whole part plus a fraction in 1/256 steps. Autopull is enabled with a
 * 32-bit threshold and the FIFO join mode is PIO_FIFO_JOIN_TX.
 *
 * Returns the frequency in Hz implied by the truncated divider.
 */
static inline float bench_program_init(PIO pio, int sm, int offset, float freq) {
    pio_sm_config cfg = bench_program_get_default_config(offset);
    sm_config_set_out_shift(&cfg, false, true, 32);
    sm_config_set_fifo_join(&cfg, PIO_FIFO_JOIN_TX);

    float divider = clock_get_hz(clk_sys) / freq;
    if (divider < 1) {
        divider = 1;
    } else if (divider > 65535) {
        divider = 65535;
    }

    // Split into the integer part and the fractional part in 1/256 units.
    const int whole = (int)divider;
    const int frac256 = (int)((divider - whole) * 256);
    sm_config_set_clkdiv_int_frac(&cfg, whole, frac256);

    pio_sm_init(pio, sm, offset, &cfg);
    pio_sm_set_enabled(pio, sm, true);

    // Report the rate the hardware will really run at after truncation.
    return clock_get_hz(clk_sys) / (whole + frac256 / 256.);
}
/*
 * Return the current CLOCK_MONOTONIC time in seconds.
 *
 * Only the difference between two return values is meaningful. The benchmark
 * cannot produce a valid rate without a working clock, so a clock_gettime()
 * failure is reported and terminates the program instead of returning a
 * value computed from an indeterminate timespec.
 */
double monotonic(void) {
    struct timespec now = {0};

    if (clock_gettime(CLOCK_MONOTONIC, &now) != 0) {
        perror("clock_gettime");
        exit(EXIT_FAILURE);
    }
    return now.tv_sec + now.tv_nsec * 1e-9;
}
/*
 * Source buffer streamed to the state machine. Its contents are never
 * written; only sizeof(databuf) matters, as the size of each transfer request.
 */
long databuf[1048576];

/*
 * Benchmark entry point: measure how fast data can be pushed to a state
 * machine that consumes one FIFO word per PIO clock.
 *
 * Usage: bench [frequency_hz [xfer_size]]
 *   frequency_hz  requested PIO clock (default 10e6)
 *   xfer_size     buffer size passed to pio_sm_config_xfer() (default 256)
 *
 * Transfers databuf repeatedly for about one second, then prints a human
 * readable summary on stderr and one JSON line on stdout.
 * Returns 0 on success, 1 if configuring or performing a transfer fails.
 */
int main(int argc, const char **argv)
{
    float frequency = argc > 1 ? atof(argv[1]) : 10e6;
    size_t xfer_size = argc > 2 ? atoi(argv[2]) : 256;

    PIO pio = pio0;
    int sm = pio_claim_unused_sm(pio, true);

    int err = pio_sm_config_xfer(pio, sm, PIO_DIR_TO_SM, xfer_size, 1);
    if (err < 0) {
        fprintf(stderr, "pio_sm_config_xfer failed (%d)\n", err);
        return 1;
    }

    uint offset = pio_add_program(pio, &bench_program);
    fprintf(stderr, "Loaded program at %u, using sm %d\n", offset, sm);

    float actual_frequency = bench_program_init(pio, sm, offset, frequency);
    fprintf(stderr, "Actual frequency %fMHz\n", actual_frequency/1e6);
    pio_sm_clear_fifos(pio, sm);

    double t0 = monotonic();
    size_t xfer = 0;
    do {
        // A failed transfer must not be counted, or the reported rate would
        // be inflated by bytes that never reached the FIFO.
        err = pio_sm_xfer_data(pio, sm, PIO_DIR_TO_SM, sizeof(databuf), databuf);
        if (err < 0) {
            fprintf(stderr, "pio_sm_xfer_data failed (%d)\n", err);
            return 1;
        }
        xfer += sizeof(databuf);
    } while (monotonic() - t0 < 1);
    double t1 = monotonic();

    double dt = t1 - t0;
    double rate = xfer / dt; // bytes per second
    fprintf(stderr, "%zu bytes in %.1fms (%.1fMiB/s)\n",
            xfer, dt*1e3, rate / 1048576);
    // xfer_size is a size_t, so it is printed with %zu.
    printf("{\"frequency\": %g, \"xfer_size\": %zu, \"rate\": %g}\n",
           actual_frequency, xfer_size, rate);
    return 0;
}
PS is there a more appropriate repo to report this issue in?
PS is there a more appropriate repo to report this issue in?
No, this is fine.
I modified the state machine to run OUT PINS, 32, having configured a pair of GPIOs for PIO, and filled the buffer with alternating words of 0x55555555 and 0xaaaaaaaa. Putting a logic analyser on the pins then shows the ticks of the SM.
The result is not at all what I expected: I see the steady transmission of a buffer full of data, with 40us gaps in between the buffers. The limiting factor seems to be the clock speed, which has been capped at ~2.6MHz - the exact value varies with the exact clock frequency, the clock limping slightly due to a non-integer clock divisor.
I'm encouraged by this - fixing some clock weirdness feels more tractable than making something quicker. More next week.
I'm not confident it's a clocking problem.
I can think of two ways to test it. One would involve a program that turns off autopull and uses pull noblock, something like:
; Preload X with 0: per the explanation that follows this snippet, X is what a
; non-blocking pull copies into the OSR when no data is available.
set x, 0
.wrap_target
; Autopull is off in this variant, so fetch the next word explicitly and
; without stalling.
pull noblock
; Drive the word onto the pins: real data appears as 0x55.. / 0xaa.., an
; empty FIFO appears as 0.
out pins, 32
.wrap
In this case, you'll see 0x55.. or 0xaa.. in the case where data was available, and then 0 whenever data was not available, because (if I understand the docs right) x is copied back to osr in this case.
The other is what I quickly implemented: still using out x, ## but varying the count from 1 to 32. I ran this test at the max 200MHz clock but saw a data rate of around 10MB/s regardless of the PIO clock. If it was actually the PIO clock that was capped at 2.5MHz, the smaller out numbers would have lower throughput.
my new test program
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "piolib.h"
#include "ws2812.pio.h"
// Wrap range relative to the load offset; the one-instruction program wraps
// onto itself.
#define bench_wrap_target 0
#define bench_wrap 0
// Instruction words of the benchmark program. Deliberately not const: main()
// overwrites entry 0 with an OUT instruction using the requested bit count
// before the program is loaded.
static uint16_t bench_program_instructions[] = {
// .wrap_target
0x6020, // out x, 32
// .wrap
};
// Descriptor handed to pio_add_program(); .length is the number of entries
// in bench_program_instructions.
// NOTE(review): .origin = -1 presumably means "no fixed load address" --
// confirm against piolib.
static const struct pio_program bench_program = {
.instructions = bench_program_instructions,
.length = 1,
.origin = -1,
};
/*
 * Return the piolib default state-machine configuration with the wrap range
 * relocated to the program's load address.
 *
 * offset: instruction-memory offset the program was loaded at.
 */
static inline pio_sm_config bench_program_get_default_config(uint offset) {
    pio_sm_config sm_cfg = pio_get_default_sm_config();
    const uint first = offset + bench_wrap_target;
    const uint last = offset + bench_wrap;

    sm_config_set_wrap(&sm_cfg, first, last);
    // NOTE(review): the benchmark instruction carries no side-set value; the
    // 1-bit side-set looks like a leftover from the ws2812 example -- confirm.
    sm_config_set_sideset(&sm_cfg, 1, false, false);
    return sm_cfg;
}
/*
 * Set up state machine `sm` for the benchmark program at `offset` and start
 * it, running as close to `freq` Hz as the clock divider permits.
 *
 * The divider clk_sys / freq is clamped to [1, 65535] and truncated to an
 * integer part plus a fraction in 1/256 steps. Autopull is on with a 32-bit
 * threshold; the FIFO join mode is PIO_FIFO_JOIN_TX.
 *
 * Returns the frequency in Hz that the truncated divider actually yields.
 */
static inline float bench_program_init(PIO pio, int sm, int offset, float freq) {
    pio_sm_config sm_cfg = bench_program_get_default_config(offset);
    sm_config_set_out_shift(&sm_cfg, false, true, 32);
    sm_config_set_fifo_join(&sm_cfg, PIO_FIFO_JOIN_TX);

    float ratio = clock_get_hz(clk_sys) / freq;
    if (ratio < 1) {
        ratio = 1;
    } else if (ratio > 65535) {
        ratio = 65535;
    }

    const int int_part = (int)ratio;
    const int frac_part = (int)((ratio - int_part) * 256); // in 1/256 units
    sm_config_set_clkdiv_int_frac(&sm_cfg, int_part, frac_part);

    pio_sm_init(pio, sm, offset, &sm_cfg);
    pio_sm_set_enabled(pio, sm, true);

    return clock_get_hz(clk_sys) / (int_part + frac_part / 256.);
}
/*
 * Return the current CLOCK_MONOTONIC time in seconds.
 *
 * Only the difference between two return values is meaningful. The benchmark
 * cannot produce a valid rate without a working clock, so a clock_gettime()
 * failure is reported and terminates the program instead of returning a
 * value computed from an indeterminate timespec.
 */
double monotonic(void) {
    struct timespec now = {0};

    if (clock_gettime(CLOCK_MONOTONIC, &now) != 0) {
        perror("clock_gettime");
        exit(EXIT_FAILURE);
    }
    return now.tv_sec + now.tv_nsec * 1e-9;
}
/*
 * Source buffer streamed to the state machine. Its contents are never
 * written; only sizeof(databuf) matters, as the size of each transfer request.
 */
long databuf[1048576];

/*
 * Benchmark entry point: measure the transfer rate to a state machine that
 * executes "out x, <out_count>" on every PIO clock.
 *
 * Usage: bench [frequency_hz [out_count]]
 *   frequency_hz  requested PIO clock (default 10e6)
 *   out_count     bits shifted out per instruction, 1..32 (default 32)
 *
 * Transfers databuf repeatedly for about three seconds, then prints a human
 * readable summary on stderr and one JSON line on stdout.
 * Returns 0 on success, 1 on a bad argument or a failed transfer.
 */
int main(int argc, const char **argv)
{
    float frequency = argc > 1 ? atof(argv[1]) : 10e6;
    int out_count = argc > 2 ? atoi(argv[2]) : 32;
    size_t xfer_size = 65532;

    // Reject bit counts that an OUT instruction cannot express rather than
    // silently benchmarking a different instruction.
    if (out_count < 1 || out_count > 32) {
        fprintf(stderr, "out_count must be between 1 and 32 (got %d)\n", out_count);
        return 1;
    }

    PIO pio = pio0;
    int sm = pio_claim_unused_sm(pio, true);

    int err = pio_sm_config_xfer(pio, sm, PIO_DIR_TO_SM, xfer_size, 1);
    if (err < 0) {
        fprintf(stderr, "pio_sm_config_xfer failed (%d)\n", err);
        return 1;
    }

    // Patch the single program instruction before it is loaded.
    bench_program_instructions[0] = pio_encode_out(pio_x, out_count);
    uint offset = pio_add_program(pio, &bench_program);
    fprintf(stderr, "Loaded program at %u, using sm %d\n", offset, sm);

    float actual_frequency = bench_program_init(pio, sm, offset, frequency);
    fprintf(stderr, "Actual frequency %fMHz\n", actual_frequency/1e6);
    pio_sm_clear_fifos(pio, sm);

    double t0 = monotonic();
    size_t xfer = 0;
    do {
        // A failed transfer must not be counted, or the reported rate would
        // be inflated by bytes that never reached the FIFO.
        err = pio_sm_xfer_data(pio, sm, PIO_DIR_TO_SM, sizeof(databuf), databuf);
        if (err < 0) {
            fprintf(stderr, "pio_sm_xfer_data failed (%d)\n", err);
            return 1;
        }
        xfer += sizeof(databuf);
    } while (monotonic() - t0 < 3);
    double t1 = monotonic();

    double dt = t1 - t0;
    double rate = xfer / dt; // bytes per second
    fprintf(stderr, "%zu bytes in %.1fms (%.1fMiB/s)\n",
            xfer, dt*1e3, rate / 1048576);
    printf("{\"frequency\": %g, \"out_count\": %d, \"rate\": %g}\n",
           actual_frequency, out_count, rate);
    return 0;
}
$ for b in 1 2 4 8 16 32; do ./build/examples/bench 200000000 ${b}; done 2>/dev/null
{"frequency": 2e+08, "out_count": 1, "rate": 1.07461e+07}
{"frequency": 2e+08, "out_count": 2, "rate": 1.0746e+07}
{"frequency": 2e+08, "out_count": 4, "rate": 1.0746e+07}
{"frequency": 2e+08, "out_count": 8, "rate": 1.07461e+07}
{"frequency": 2e+08, "out_count": 16, "rate": 1.07461e+07}
{"frequency": 2e+08, "out_count": 32, "rate": 1.07461e+07}
The "pull noblock" test, scoped. Because of my existing probe setup, I have pi5 pins 5 & 6 on scope channels 4 & 6.
PIO clock configured to 10MHz:
PIO clock configured to 100MHz:
note how the pulses get shorter (pio is clocking at expected rate) but stay the same distance apart (data is arriving in the PIO FIFO slower than expected)
The pull noblock test
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "piolib.h"
#include "ws2812.pio.h"
// Wrap range relative to the load offset: the steady-state loop is
// instructions 1..2; instruction 0 runs once before the loop is entered.
#define bench_wrap_target 1
#define bench_wrap 2
// Instruction words of the "pull noblock" test program.
static const uint16_t bench_program_instructions[] = {
0xe020, // set x,0
// .wrap_target
0x8080, // pull noblock
0x6000, // out pins, 32
// .wrap
};
// Descriptor handed to pio_add_program(); .length is the number of entries
// in bench_program_instructions.
// NOTE(review): .origin = -1 presumably means "no fixed load address" --
// confirm against piolib.
static const struct pio_program bench_program = {
.instructions = bench_program_instructions,
.length = 3,
.origin = -1,
};
/*
 * Return the piolib default state-machine configuration with the wrap range
 * (instructions 1..2 of the program) relocated to the program's load address.
 *
 * offset: instruction-memory offset the program was loaded at.
 */
static inline pio_sm_config bench_program_get_default_config(uint offset) {
    const uint loop_start = offset + bench_wrap_target;
    const uint loop_end = offset + bench_wrap;
    pio_sm_config cfg = pio_get_default_sm_config();

    sm_config_set_wrap(&cfg, loop_start, loop_end);
    // NOTE(review): none of the program's instructions carries a side-set
    // value; the 1-bit side-set looks like a leftover from the ws2812
    // example -- confirm.
    sm_config_set_sideset(&cfg, 1, false, false);
    return cfg;
}
/*
 * Configure and start state machine `sm` of `pio` on the "pull noblock" test
 * program loaded at `offset`, clocked as close to `freq` Hz as the divider
 * allows, with GPIOs `gpio_base` and `gpio_base + 1` handed to the PIO as
 * outputs.
 *
 * The pins are muxed to the PIO and their directions set BEFORE the state
 * machine is initialised and enabled: pio_sm_set_consecutive_pindirs() works
 * by temporarily reconfiguring the state machine and executing SET
 * instructions on it, which the pico-sdk API (mirrored by piolib) says must
 * not be done on an enabled state machine.
 *
 * The divider clk_sys / freq is clamped to [1, 65535] and truncated to an
 * integer part plus a fraction in 1/256 steps. Autopull is disabled because
 * the program pulls explicitly.
 *
 * Returns the frequency in Hz implied by the truncated divider.
 */
static inline float bench_program_init(PIO pio, int sm, int offset, float freq, int gpio_base) {
    // Pin setup first, while the state machine is still stopped.
    pio_gpio_init(pio, gpio_base);
    pio_gpio_init(pio, gpio_base+1);
    pio_sm_set_consecutive_pindirs(pio, sm, gpio_base, 2, true);

    pio_sm_config c = bench_program_get_default_config(offset);
    sm_config_set_out_shift(&c, false, false /* auto pull */, 32);
    // OUT covers 32 pins from pin 0, so bit N of each word maps to pin N.
    sm_config_set_out_pins(&c, 0, 32);
    sm_config_set_fifo_join(&c, PIO_FIFO_JOIN_TX);

    float div = clock_get_hz(clk_sys) / freq;
    if (div < 1) {
        div = 1;
    }
    if (div > 65535) {
        div = 65535;
    }
    int div_int = (int)div;
    int div_frac = (int)((div - div_int) * 256); // in 1/256 units
    sm_config_set_clkdiv_int_frac(&c, div_int, div_frac);

    pio_sm_init(pio, sm, offset, &c);
    pio_sm_set_enabled(pio, sm, true);

    return clock_get_hz(clk_sys) / (div_int + div_frac / 256.);
}
/*
 * Return the current CLOCK_MONOTONIC time in seconds.
 *
 * Only the difference between two return values is meaningful. The benchmark
 * cannot produce a valid rate without a working clock, so a clock_gettime()
 * failure is reported and terminates the program instead of returning a
 * value computed from an indeterminate timespec.
 */
double monotonic(void) {
    struct timespec now = {0};

    if (clock_gettime(CLOCK_MONOTONIC, &now) != 0) {
        perror("clock_gettime");
        exit(EXIT_FAILURE);
    }
    return now.tv_sec + now.tv_nsec * 1e-9;
}
/*
 * Pattern buffer streamed to the state machine. Elements are 32 bits wide so
 * that consecutive 32-bit FIFO words really alternate between the two
 * patterns. With `long` (64-bit on aarch64) every pattern word was followed
 * by an all-zero word, which is indistinguishable from the zero the program
 * outputs when the FIFO is empty.
 */
uint32_t databuf[1048576];

/*
 * "pull noblock" test entry point: stream an alternating 0x55555555 /
 * 0xaaaaaaaa pattern to the state machine so that FIFO underruns show up on
 * the pins as zeros.
 *
 * Usage: bench [frequency_hz]
 *   frequency_hz  requested PIO clock (default 10e6)
 *
 * Transfers databuf repeatedly for about three seconds, then prints a human
 * readable summary on stderr and one JSON line on stdout.
 * Returns 0 on success, 1 if configuring or performing a transfer fails.
 */
int main(int argc, const char **argv)
{
    float frequency = argc > 1 ? atof(argv[1]) : 10e6;
    size_t xfer_size = 65532;

    PIO pio = pio0;
    int sm = pio_claim_unused_sm(pio, true);

    int err = pio_sm_config_xfer(pio, sm, PIO_DIR_TO_SM, xfer_size, 1);
    if (err < 0) {
        fprintf(stderr, "pio_sm_config_xfer failed (%d)\n", err);
        return 1;
    }

    uint offset = pio_add_program(pio, &bench_program);
    fprintf(stderr, "Loaded program at %u, using sm %d\n", offset, sm);

    float actual_frequency = bench_program_init(pio, sm, offset, frequency, /* base pin */ 5);
    fprintf(stderr, "Actual frequency %fMHz\n", actual_frequency/1e6);
    pio_sm_clear_fifos(pio, sm);

    // Odd-indexed words get 0x55555555, even-indexed words 0xaaaaaaaa.
    for (size_t i = 0; i < sizeof(databuf) / sizeof(databuf[0]); i++) {
        databuf[i] = (i % 2) ? 0x55555555u : 0xaaaaaaaau;
    }

    double t0 = monotonic();
    size_t xfer = 0;
    do {
        // A failed transfer must not be counted, or the reported rate would
        // be inflated by bytes that never reached the FIFO.
        err = pio_sm_xfer_data(pio, sm, PIO_DIR_TO_SM, sizeof(databuf), databuf);
        if (err < 0) {
            fprintf(stderr, "pio_sm_xfer_data failed (%d)\n", err);
            return 1;
        }
        xfer += sizeof(databuf);
    } while (monotonic() - t0 < 3);
    double t1 = monotonic();

    double dt = t1 - t0;
    double rate = xfer / dt; // bytes per second
    fprintf(stderr, "%zu bytes in %.1fms (%.1fMiB/s)\n",
            xfer, dt*1e3, rate / 1048576);
    printf("{\"frequency\": %g, \"rate\": %g}\n",
           actual_frequency, rate);
    return 0;
}
Just a quick update:
The DMA controller in RP1 has a slow path to system memory, but we can improve things a bit by playing with FIFO thresholds and increasing the burst size. With a burst size of 16 I can get throughputs of 50MB/s in either direction, but at the cost of ugly warning messages in the kernel log. We're talking about how to avoid that.
To get even higher throughputs would require moving the DMA buffers to shared SRAM, and probably configuring the DMA controller for cyclic usage.
Thanks for working on this! An added factor of 2 or 4 would be amazing already for our LED matrix use case. I look forward to more updates!
ping @ladyada, if they're able to improve things here we might be able to improve our LED matrix dot clock by up to about 5x
Higher throughput would be amazing. Thanks for mentioning that it might be possible.
Is there anything to try in the current version? I would be happy to experiment a bit with piolib to verify that the pi5 is viable as a pio interface.
We are currently stuck on a project which needs to read a lot of parallel data due to data being lost from the slow DMA transfer. The expected data rate is about 12-15MB/s and currently it stops being reliable at under 8MB/s while around 6MB/s seems to get all the data but with possible timing issues.
Additionally when generating a clock from the rp1 significantly more jitter was observed compared to an rp2350.
The PIO code works perfectly fine on an RP2350, but that can't be used directly because of its slow USB interface. The data has to reach the Pi 5 directly in some way, and running the PIO code on the Pi 5 itself would simplify the system significantly.
Hi! I've been away for a few months, and wanted to check in about this issue. Is there any branch or kernel patch I can test to see if it improves things? For our use case we could probably tolerate some kernel log messages if it enables higher throughput.
I recently hit this problem (trying to feed a DAC at ~26MB/s but only getting about 10). Someone else mentioned it on the forums too: https://forums.raspberrypi.com/viewtopic.php?p=2330537
I have a kernel patch set/PR that may help #6994. It allows the PIO driver to make use of longer DMA bursts. Frustratingly I'm not seeing the same levels of performance as in my early tests, but it did boost throughput to ~27MB/s.
If you are feeling brave (and have backed up any important data, and your system does not require initramfs for booting) you can install a trial build by running sudo rpi-update pulls/6994.
I passed this along and I think someone at Adafruit will be trying it in the next week on our LED matrix software.
ya @blitzcitydiy will try this when back!
@pelwell Did a rpi-update and risked it and while it does seem that DMA throughput is increased significantly and takes less time to transfer, the data i receive seems pretty glitched as if the PIO timings have changed significantly or samples are lost and i can not get it to function properly anymore in my application.
Did anything in the current version also possibly affect the fifo stall thresholds/capacity or timing otherwise or require different buffer handling? The program fills and stalls the fifo on start and reads 8b wide data synced with a clock pulse. Might be unrelated to this specific PR in case other pio related features were changed as well requiring further modifications.
For reference the last version fully working is hash 5cb275145ac4c5fef977ec61083ea10eb67b401b (6.12.41 merge).
The DMA burst sizes have changed, but nothing else should have. Do you have a test program I can try, to make sure we're starting from the same place?
I found a change to the FIFO threshold used by DMA in the rp1-pio driver. https://github.com/raspberrypi/linux/pull/7037 is a reversion of that change. In about 40 minutes time you should be able to install a trial build using sudo rpi-update pulls/7037.
Unfortunately this does not fix the issue. The program is supposed to read 8 bit parallel data synchronized with a 5MHz clock signal at both edges from an image sensor and runs at 20x 5MHz = 100MHz currently. The buffer must be filled and waiting after a reset to ensure proper alignment of samples to a start pin pulse. The clock is generated by another pio sm. I am receiving data but it does not seem correct. Either it is repeating old samples or padding the data.
This is the program i am testing with for reference:
; Capture program: pre-fills the RX FIFO, emits a start pulse aligned to the
; clock seen on input pin 8, then samples 8 data bits on each clock edge.
; One side-set bit is carried by every instruction.
.side_set 1
set x 31 side 0 ; Number of delay samples
; Preamble: runs 32 times (X counts 31 down to 0). Each pass shifts X into
; the ISR and does a blocking push, so the state machine stalls here once the
; RX FIFO is full and resumes as soon as the host starts draining it.
loopstart:
in x 32 side 0
push block side 1 ; Fill buffer
jmp x-- loopstart side 0
;push block side 0 ; Wait here when buffer full
; Align to the clock on input pin 8, then drive the SET pins to 9 and back
; to 0 to form the start pulse.
wait 0 pin 8 side 0 [0] ; Sync with clock and set start pulse
wait 1 pin 8 side 1 [5]
set pins 9 side 1 [0]
wait 0 pin 8 side 0 [5]
set pins 0 side 0 [0]
; Steady state: take one 8-bit sample after the rising edge and one after the
; falling edge of pin 8, each after a 3-cycle delay.
.wrap_target
wait 1 pin 8 side 0 [3]
in pins 8 side 1
;push iffull block side 1
wait 0 pin 8 side 1 [3]
in pins 8 side 0 ; Requires faster PIO DMA for 16b
; NOTE(review): "noblock" means a full RX FIFO does not stall the state
; machine here; per the RP2040 datasheet the word is then dropped, which
; would look like skipped samples on the host side -- confirm.
push iffull noblock side 0
.wrap
I have tested with and without autopush and different frequencies. Makes no difference in the new firmware.
Maybe not correctly blocking the fifo when empty/full?
The behaviour is that the first half of the buffer is a repeating pattern of invalid data potentially when it should be blocked and waiting and in the second "half" it seems to have some valid data but misaligned (Data between actual samples? skipping samples?). It kind of looks like when i set the shift threshold to 16 instead of 32 in the working firmware which would pad every sample with invalid data.
Also possible that this approach for syncing does not work anymore and there is a better solution now. It was required due to the long and random delay between starting the state machine and the dma transfer actually accepting data.
Can you post somewhere (GitHub gist or branch, paste in etc.) the full application program (or a minimal version of it)?
i ran jeff's test program from here:
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "piolib.h"
#include "ws2812.pio.h"
// Wrap range relative to the load offset; the one-instruction program wraps
// onto itself.
#define bench_wrap_target 0
#define bench_wrap 0
// Instruction words of the benchmark program. Deliberately not const: main()
// overwrites entry 0 with an OUT instruction using the requested bit count
// before the program is loaded.
static uint16_t bench_program_instructions[] = {
// .wrap_target
0x6020, // out x, 32
// .wrap
};
// Descriptor handed to pio_add_program(); .length is the number of entries
// in bench_program_instructions.
// NOTE(review): .origin = -1 presumably means "no fixed load address" --
// confirm against piolib.
static const struct pio_program bench_program = {
.instructions = bench_program_instructions,
.length = 1,
.origin = -1,
};
/*
 * Produce the starting state-machine configuration for the benchmark:
 * piolib defaults plus a wrap range adjusted for the program's load address.
 *
 * offset: instruction-memory offset the program was loaded at.
 */
static inline pio_sm_config bench_program_get_default_config(uint offset) {
    pio_sm_config config = pio_get_default_sm_config();
    const uint wrap_to = offset + bench_wrap_target;
    const uint wrap_at = offset + bench_wrap;

    sm_config_set_wrap(&config, wrap_to, wrap_at);
    // NOTE(review): the benchmark instruction carries no side-set value; the
    // 1-bit side-set looks like a leftover from the ws2812 example -- confirm.
    sm_config_set_sideset(&config, 1, false, false);
    return config;
}
/*
 * Initialise state machine `sm` for the benchmark program at `offset` and
 * enable it, running as close to `freq` Hz as the clock divider permits.
 *
 * The divider clk_sys / freq is clamped to [1, 65535] and truncated to an
 * integer part plus a fraction in 1/256 steps. Autopull is on with a 32-bit
 * threshold; the FIFO join mode is PIO_FIFO_JOIN_TX.
 *
 * Returns the frequency in Hz that the truncated divider actually yields.
 */
static inline float bench_program_init(PIO pio, int sm, int offset, float freq) {
    pio_sm_config config = bench_program_get_default_config(offset);
    sm_config_set_out_shift(&config, false, true, 32);
    sm_config_set_fifo_join(&config, PIO_FIFO_JOIN_TX);

    float clkdiv = clock_get_hz(clk_sys) / freq;
    if (clkdiv < 1) {
        clkdiv = 1;
    } else if (clkdiv > 65535) {
        clkdiv = 65535;
    }

    const int clkdiv_int = (int)clkdiv;
    const int clkdiv_frac = (int)((clkdiv - clkdiv_int) * 256); // 1/256 units
    sm_config_set_clkdiv_int_frac(&config, clkdiv_int, clkdiv_frac);

    pio_sm_init(pio, sm, offset, &config);
    pio_sm_set_enabled(pio, sm, true);

    return clock_get_hz(clk_sys) / (clkdiv_int + clkdiv_frac / 256.);
}
/*
 * Return the current CLOCK_MONOTONIC time in seconds.
 *
 * Only the difference between two return values is meaningful. The benchmark
 * cannot produce a valid rate without a working clock, so a clock_gettime()
 * failure is reported and terminates the program instead of returning a
 * value computed from an indeterminate timespec.
 */
double monotonic(void) {
    struct timespec now = {0};

    if (clock_gettime(CLOCK_MONOTONIC, &now) != 0) {
        perror("clock_gettime");
        exit(EXIT_FAILURE);
    }
    return now.tv_sec + now.tv_nsec * 1e-9;
}
/*
 * Source buffer streamed to the state machine. Its contents are never
 * written; only sizeof(databuf) matters, as the size of each transfer request.
 */
long databuf[1048576];

/*
 * Benchmark entry point: measure the transfer rate to a state machine that
 * executes "out x, <out_count>" on every PIO clock.
 *
 * Usage: test [frequency_hz [out_count]]
 *   frequency_hz  requested PIO clock (default 10e6)
 *   out_count     bits shifted out per instruction, 1..32 (default 32)
 *
 * Transfers databuf repeatedly for about three seconds, then prints a human
 * readable summary on stderr and one JSON line on stdout.
 * Returns 0 on success, 1 on a bad argument or a failed transfer.
 */
int main(int argc, const char **argv)
{
    float frequency = argc > 1 ? atof(argv[1]) : 10e6;
    int out_count = argc > 2 ? atoi(argv[2]) : 32;
    size_t xfer_size = 65532;

    // Reject bit counts that an OUT instruction cannot express rather than
    // silently benchmarking a different instruction.
    if (out_count < 1 || out_count > 32) {
        fprintf(stderr, "out_count must be between 1 and 32 (got %d)\n", out_count);
        return 1;
    }

    PIO pio = pio0;
    int sm = pio_claim_unused_sm(pio, true);

    int err = pio_sm_config_xfer(pio, sm, PIO_DIR_TO_SM, xfer_size, 1);
    if (err < 0) {
        fprintf(stderr, "pio_sm_config_xfer failed (%d)\n", err);
        return 1;
    }

    // Patch the single program instruction before it is loaded.
    bench_program_instructions[0] = pio_encode_out(pio_x, out_count);
    uint offset = pio_add_program(pio, &bench_program);
    fprintf(stderr, "Loaded program at %u, using sm %d\n", offset, sm);

    float actual_frequency = bench_program_init(pio, sm, offset, frequency);
    fprintf(stderr, "Actual frequency %fMHz\n", actual_frequency/1e6);
    pio_sm_clear_fifos(pio, sm);

    double t0 = monotonic();
    size_t xfer = 0;
    do {
        // A failed transfer must not be counted, or the reported rate would
        // be inflated by bytes that never reached the FIFO.
        err = pio_sm_xfer_data(pio, sm, PIO_DIR_TO_SM, sizeof(databuf), databuf);
        if (err < 0) {
            fprintf(stderr, "pio_sm_xfer_data failed (%d)\n", err);
            return 1;
        }
        xfer += sizeof(databuf);
    } while (monotonic() - t0 < 3);
    double t1 = monotonic();

    double dt = t1 - t0;
    double rate = xfer / dt; // bytes per second
    fprintf(stderr, "%zu bytes in %.1fms (%.1fMiB/s)\n",
            xfer, dt*1e3, rate / 1048576);
    printf("{\"frequency\": %g, \"out_count\": %d, \"rate\": %g}\n",
           actual_frequency, out_count, rate);
    return 0;
}
and got these results with pulls/7037:
$ for b in 1 2 4 8 16 32; do ./test 200000000 ${b}; done 2>/dev/null
{"frequency": 2e+08, "out_count": 1, "rate": 2.46578e+07}
{"frequency": 2e+08, "out_count": 2, "rate": 2.76976e+07}
{"frequency": 2e+08, "out_count": 4, "rate": 2.77303e+07}
{"frequency": 2e+08, "out_count": 8, "rate": 2.77106e+07}
{"frequency": 2e+08, "out_count": 16, "rate": 2.76961e+07}
{"frequency": 2e+08, "out_count": 32, "rate": 2.77063e+07}
i also ran the rainbow_spiral_active3 example and i get around 144 fps
And how do you feel about that?
I think we're at a good spot, because comparing my results to Jeff's there's over a 2x performance improvement (on average about 10 MB/s vs 27 MB/s).
I am using the rp1-pio driver in kernel space for my radio, the Radioberry.
- Running with a burst of 8 is not working for both directions, so I set the burst to 2.
- I also reverted the commit because the TX FIFO was not working for me.
Showing the CPU utilization using the IO:
https://youtu.be/v34ROMQqOaA
including some additional info: [email protected]
Keep up the good work!