Add DWC2 cache maintenance routines for STM32
Describe the PR
- Support DMA + DCache ON
- Refactored buffer alignment macro to take into account cache line size
Now #define CFG_TUD_DWC2_DMA_ENABLE 1 is enough.
It's prefer to declare a non-cached region with MPU instead of rely on cache invalidate+clean, benchmark on STM32H7S3 and i.MX RT1170 shows frequent cache invalidate+clean really hurts performance.
** Need rebase after #2960
Benchmark code
extern uint32_t SystemCoreClock;
void SWD_Init(void)
{
//UNLOCK FUNNEL
*(volatile uint32_t*)(0x5C004FB0) = 0xC5ACCE55; // SWTF_LAR
*(volatile uint32_t*)(0x5C003FB0) = 0xC5ACCE55; // SWO_LAR
//SWO current output divisor register
//This divisor value (0x000000C7) corresponds to 400Mhz
//To change it, you can use the following rule
// value = (CPU Freq/sw speed )-1
blink_interval_ms= *(volatile uint32_t*)(0x5C003010);
*(volatile uint32_t*)(0x5C003010) = ((SystemCoreClock / 12000000) - 1); // SWO_CODR
//SWO selected pin protocol register
*(volatile uint32_t*)(0x5C0030F0) = 0x00000002; // SWO_SPPR
//Enable ITM input of SWO trace funnel
*(volatile uint32_t*)(0x5C004000) |= 0x00000001; // SWFT_CTRL
}
static void MPU_AdjustRegionAddressSize(uint32_t Address, uint32_t Size, MPU_Region_InitTypeDef* pInit);
static void MPU_Config(void)
{
MPU_Region_InitTypeDef MPU_InitStruct = {0};
uint32_t index = MPU_REGION_NUMBER0;
uint32_t address;
uint32_t size;
/* Disable the MPU */
HAL_MPU_Disable();
/* Initialize the background region */
MPU_InitStruct.Enable = MPU_REGION_ENABLE;
MPU_InitStruct.Number = index;
MPU_InitStruct.BaseAddress = 0x0;
MPU_InitStruct.Size = MPU_REGION_SIZE_4GB;
MPU_InitStruct.SubRegionDisable = 0x87;
MPU_InitStruct.TypeExtField = MPU_TEX_LEVEL0;
MPU_InitStruct.AccessPermission = MPU_REGION_NO_ACCESS;
MPU_InitStruct.DisableExec = MPU_INSTRUCTION_ACCESS_DISABLE;
MPU_InitStruct.IsShareable = MPU_ACCESS_SHAREABLE;
MPU_InitStruct.IsCacheable = MPU_ACCESS_CACHEABLE;
MPU_InitStruct.IsBufferable = MPU_ACCESS_BUFFERABLE;
HAL_MPU_ConfigRegion(&MPU_InitStruct);
index++;
/* Initialize the non cacheable region */
#if defined ( __ICCARM__ )
/* get the region attribute form the icf file */
extern uint32_t NONCACHEABLEBUFFER_start;
extern uint32_t NONCACHEABLEBUFFER_size;
address = (uint32_t)&NONCACHEABLEBUFFER_start;
size = (uint32_t)&NONCACHEABLEBUFFER_size;
#elif defined (__CC_ARM) || defined(__ARMCC_VERSION)
extern uint32_t Image$$RW_NONCACHEABLEBUFFER$$Base;
extern uint32_t Image$$RW_NONCACHEABLEBUFFER$$Length;
extern uint32_t Image$$RW_NONCACHEABLEBUFFER$$ZI$$Length;
address = (uint32_t)&Image$$RW_NONCACHEABLEBUFFER$$Base;
size = (uint32_t)&Image$$RW_NONCACHEABLEBUFFER$$Length + (uint32_t)&Image$$RW_NONCACHEABLEBUFFER$$ZI$$Length;
#elif defined ( __GNUC__ )
extern int __NONCACHEABLEBUFFER_BEGIN;
extern int __NONCACHEABLEBUFFER_END;
address = (uint32_t)&__NONCACHEABLEBUFFER_BEGIN;
size = (uint32_t)&__NONCACHEABLEBUFFER_END - (uint32_t)&__NONCACHEABLEBUFFER_BEGIN;
#else
#error "Compiler toolchain is unsupported"
#endif
if (size != 0)
{
/* Configure the MPU attributes as Normal Non Cacheable */
MPU_InitStruct.Enable = MPU_REGION_ENABLE;
MPU_InitStruct.AccessPermission = MPU_REGION_FULL_ACCESS;
MPU_InitStruct.IsBufferable = MPU_ACCESS_NOT_BUFFERABLE;
MPU_InitStruct.IsCacheable = MPU_ACCESS_NOT_CACHEABLE;
MPU_InitStruct.IsShareable = MPU_ACCESS_NOT_SHAREABLE;
MPU_InitStruct.Number = index;
MPU_InitStruct.TypeExtField = MPU_TEX_LEVEL1;
MPU_InitStruct.SubRegionDisable = 0x00;
MPU_InitStruct.DisableExec = MPU_INSTRUCTION_ACCESS_DISABLE;
MPU_AdjustRegionAddressSize(address, size, &MPU_InitStruct);
HAL_MPU_ConfigRegion(&MPU_InitStruct);
index++;
}
/* Initialize the region corresponding to the execution area
(external or internal flash or external or internal RAM
depending on scatter file definition) */
#if defined ( __ICCARM__ )
extern uint32_t __ICFEDIT_region_ROM_start__;
extern uint32_t __ICFEDIT_region_ROM_end__;
address = (uint32_t)&__ICFEDIT_region_ROM_start__;
size = (uint32_t)&__ICFEDIT_region_ROM_end__ - (uint32_t)&__ICFEDIT_region_ROM_start__ + 1;
#elif defined (__CC_ARM) || defined(__ARMCC_VERSION)
extern uint32_t Image$$ER_ROM$$Base;
extern uint32_t Image$$ER_ROM$$Limit;
address = (uint32_t)&Image$$ER_ROM$$Base;
size = (uint32_t)&Image$$ER_ROM$$Limit-(uint32_t)&Image$$ER_ROM$$Base;
#elif defined ( __GNUC__ )
extern uint32_t __FLASH_BEGIN;
extern uint32_t __FLASH_SIZE;
address = (uint32_t)&__FLASH_BEGIN;
size = (uint32_t)&__FLASH_SIZE;
#else
#error "Compiler toolchain is unsupported"
#endif
MPU_InitStruct.Enable = MPU_REGION_ENABLE;
MPU_InitStruct.Number = index;
MPU_InitStruct.SubRegionDisable = 0u;
MPU_InitStruct.TypeExtField = MPU_TEX_LEVEL1;
MPU_InitStruct.AccessPermission = MPU_REGION_FULL_ACCESS;
MPU_InitStruct.DisableExec = MPU_INSTRUCTION_ACCESS_ENABLE;
MPU_InitStruct.IsShareable = MPU_ACCESS_SHAREABLE;
MPU_InitStruct.IsCacheable = MPU_ACCESS_CACHEABLE;
MPU_InitStruct.IsBufferable = MPU_ACCESS_BUFFERABLE;
MPU_AdjustRegionAddressSize(address, size, &MPU_InitStruct);
HAL_MPU_ConfigRegion(&MPU_InitStruct);
index++;
/* Reset unused MPU regions */
for(; index < __MPU_REGIONCOUNT ; index++)
{
/* All unused regions disabled */
MPU_InitStruct.Enable = MPU_REGION_DISABLE;
MPU_InitStruct.Number = index;
HAL_MPU_ConfigRegion(&MPU_InitStruct);
}
/* Enable the MPU */
HAL_MPU_Enable(MPU_PRIVILEGED_DEFAULT);
}
/**
* @brief This function adjusts the MPU region Address and Size within an MPU configuration.
* @param Address memory address
* @param Size memory size
* @param pInit pointer to an MPU initialization structure
* @retval None
*/
static void MPU_AdjustRegionAddressSize(uint32_t Address, uint32_t Size, MPU_Region_InitTypeDef* pInit)
{
/* Compute the MPU region size */
pInit->Size = ((31 - __CLZ(Size)) - 1);
if (Size > (1 << (pInit->Size + 1)))
{
pInit->Size++;
}
uint32_t Modulo = Address % (1 << (pInit->Size - 1));
if (0 != Modulo)
{
/* Align address with MPU region size considering there is no need to increase the size */
pInit->BaseAddress = Address - Modulo;
}
else
{
pInit->BaseAddress = Address;
}
}
#define TEST_SIZE 1024
__attribute__((section("dtcm_data")))
__attribute__((aligned(32)))
uint8_t buffer1[TEST_SIZE];
__attribute__((section("dtcm_data")))
__attribute__((aligned(32)))
uint8_t buffer2[TEST_SIZE];
__attribute__((section("noncacheable")))
__attribute__((aligned(32)))
uint8_t buffer_ncache[TEST_SIZE];
__attribute__((aligned(32)))
uint8_t buffer_cached[TEST_SIZE];
unsigned int test_loop(void* dst, const void* src, int size, bool flush, bool invalidate)
{
volatile unsigned int *DWT_CYCCNT = (uint32_t *)0xE0001004; //address of the register
volatile unsigned int *DWT_CONTROL = (uint32_t *)0xE0001000; //address of the register
volatile unsigned int *SCB_DEMCR = (uint32_t *)0xE000EDFC; //address of the register
*SCB_DEMCR = *SCB_DEMCR | 0x01000000;
*DWT_CYCCNT = 0;
*DWT_CONTROL |= 1;
if(invalidate)
SCB_InvalidateDCache_by_Addr((uint32_t*)src, size);
memcpy(dst, src, size);
if (flush)
SCB_CleanDCache_by_Addr((uint32_t*)dst, size);
*DWT_CONTROL &= ~1;
return *DWT_CYCCNT;
}
int main(void) {
MPU_Config();
board_init();
SWD_Init();
unsigned int cycle;
printf("\r\nmemcpy benchmark \r\n");
for(int i = 0; i < TEST_SIZE; i++)
{
buffer1[i] = (uint8_t)i;
}
printf("DTCM - DTCM\r\n");
for(int i = 0; i < 8; i++)
{
cycle = test_loop(buffer2, buffer1, TEST_SIZE, false, false);
printf("Loop:%d Cycle:%d\r\n", i, cycle);
}
printf("Throughput:%d MB/s\r\n", TEST_SIZE * (SystemCoreClock / cycle) / 1048576);
printf("DTCM - NonCache\r\n");
for(int i = 0; i < 8; i++)
{
cycle = test_loop(buffer_ncache, buffer1, TEST_SIZE, false, false);
printf("Loop:%d Cycle:%d\r\n", i, cycle);
}
printf("Throughput:%d MB/s\r\n", TEST_SIZE * (SystemCoreClock / cycle) / 1048576);
printf("DTCM - Cache\r\n");
for(int i = 0; i < 8; i++)
{
cycle = test_loop(buffer_cached, buffer1, TEST_SIZE, false, false);
printf("Loop:%d Cycle:%d\r\n", i, cycle);
}
printf("Throughput:%d MB/s\r\n", TEST_SIZE * (SystemCoreClock / cycle) / 1048576);
printf("DTCM - Cache+Flush\r\n");
for(int i = 0; i < 8; i++)
{
cycle = test_loop(buffer_cached, buffer1, TEST_SIZE, true, false);
printf("Loop:%d Cycle:%d\r\n", i, cycle);
}
printf("Throughput:%d MB/s\r\n", TEST_SIZE * (SystemCoreClock / cycle) / 1048576);
printf("NonCache - DTCM\r\n");
for(int i = 0; i < 8; i++)
{
cycle = test_loop(buffer1, buffer_ncache, TEST_SIZE, false, false);
printf("Loop:%d Cycle:%d\r\n", i, cycle);
}
printf("Throughput:%d MB/s\r\n", TEST_SIZE * (SystemCoreClock / cycle) / 1048576);
printf("Cache - DTCM\r\n");
for(int i = 0; i < 8; i++)
{
cycle = test_loop(buffer1, buffer_cached, TEST_SIZE, false, false);
printf("Loop:%d Cycle:%d\r\n", i, cycle);
}
printf("Throughput:%d MB/s\r\n", TEST_SIZE * (SystemCoreClock / cycle) / 1048576);
printf("Cache+Invalidate - DTCM\r\n");
for(int i = 0; i < 8; i++)
{
cycle = test_loop(buffer1, buffer_cached, TEST_SIZE, false, true);
printf("Loop:%d Cycle:%d\r\n", i, cycle);
}
printf("Throughput:%d MB/s\r\n", TEST_SIZE * (SystemCoreClock / cycle) / 1048576);
while (1)
{
}
}
Looks like my HIL instance has license issue, I think we can add the env locally.
Works great for me, both for CDC and UVC. Thanks !
In addition to #define CFG_TUD_DWC2_DMA_ENABLE 1 I also had to add #define __CORTEX_M 7
Works great for me, both for CDC and UVC. Thanks !
In addition to
#define CFG_TUD_DWC2_DMA_ENABLE 1I also had to add#define __CORTEX_M 7
Thanks for your test.
It's little strange that you need to define __CORTEX_M, normally stm32h7xx.h includes devcie header eg. stm32h747xx.h which includes core_cm7.h and __CORTEX_M is defined inside. Did I miss something ?
Thanks @HiFiPhile for great Pr as usual. Though I am off for TET (Lunar New Year) and won't be able to review this in 2 weeks. Happy New Year 🎉
Works great for me, both for CDC and UVC. Thanks ! In addition to
#define CFG_TUD_DWC2_DMA_ENABLE 1I also had to add#define __CORTEX_M 7Thanks for your test. It's little strange that you need to define
__CORTEX_M, normallystm32h7xx.hincludes devcie header eg.stm32h747xx.hwhich includescore_cm7.hand__CORTEX_Mis defined inside. Did I miss something ?
I don't think so. I'm using a custom RTOS which relies on its own set of headers, that's why.
Thanks @HiFiPhile for great Pr as usual. Though I am off for TET (Lunar New Year) and won't be able to review this in 2 weeks. Happy New Year 🎉
Happy new year also 🎊
@hathach I just rebased on master and did some cleanup.
CFG_TUD_MEM_ALIGN introduce a breaking change, but without this I don't know how to extract numeric value.
@hathach I just rebased on master and did some cleanup.
CFG_TUD_MEM_ALIGNintroduce a breaking change, but without this I don't know how to extract numeric value.
How about adding the CFG_TUD_MEM_ALIGN_BYTE/SIZE instead, CFG_TUD_MEM_ALIGN if not defined will be defined using that. That way people will have time to migrate (or just leave it if they don't need it). Btw, why you need the numeric value (haven't do review yet).
How about adding the CFG_TUD_MEM_ALIGN_BYTE/SIZE instead, CFG_TUD_MEM_ALIGN if not defined will be defined using that.
Yeah it's a good idea. I need to align the buffer correctly if user need a higher alignment than cache line size.
Update: no more need to change CFG_TUD_MEM_ALIGN
I've updated BSP:
- Keep DCache disabled for H7 & F7 for RTT compatibility
- Add MPU_Config to H7RS and use noncacheable_buffer for RTT buffer