tinyusb icon indicating copy to clipboard operation
tinyusb copied to clipboard

Add DWC2 cache maintenance routines for STM32

Open HiFiPhile opened this issue 11 months ago • 6 comments

Describe the PR

  • Support DMA + DCache ON
  • Refactored buffer alignment macro to take into account cache line size

Now #define CFG_TUD_DWC2_DMA_ENABLE 1 is enough.

It's prefer to declare a non-cached region with MPU instead of rely on cache invalidate+clean, benchmark on STM32H7S3 and i.MX RT1170 shows frequent cache invalidate+clean really hurts performance.

** Need rebase after #2960

Benchmark code
extern uint32_t SystemCoreClock;
void SWD_Init(void)
{
  //UNLOCK FUNNEL
  *(volatile uint32_t*)(0x5C004FB0) = 0xC5ACCE55; // SWTF_LAR
  *(volatile uint32_t*)(0x5C003FB0) = 0xC5ACCE55; // SWO_LAR
 
  //SWO current output divisor register
  //This divisor value (0x000000C7) corresponds to 400Mhz
  //To change it, you can use the following rule
  // value = (CPU Freq/sw speed )-1
  blink_interval_ms= *(volatile uint32_t*)(0x5C003010);
   *(volatile uint32_t*)(0x5C003010) = ((SystemCoreClock / 12000000) - 1); // SWO_CODR
 
  //SWO selected pin protocol register
   *(volatile uint32_t*)(0x5C0030F0) = 0x00000002; // SWO_SPPR
 
  //Enable ITM input of SWO trace funnel
   *(volatile uint32_t*)(0x5C004000) |= 0x00000001; // SWFT_CTRL
 
}

static void MPU_AdjustRegionAddressSize(uint32_t Address, uint32_t Size, MPU_Region_InitTypeDef* pInit);
static void MPU_Config(void)
{
  MPU_Region_InitTypeDef MPU_InitStruct = {0};
  uint32_t index = MPU_REGION_NUMBER0;
  uint32_t address;
  uint32_t size;

  /* Disable the MPU */
  HAL_MPU_Disable();

  /* Initialize the background region */
  MPU_InitStruct.Enable = MPU_REGION_ENABLE;
  MPU_InitStruct.Number = index;
  MPU_InitStruct.BaseAddress = 0x0;
  MPU_InitStruct.Size = MPU_REGION_SIZE_4GB;
  MPU_InitStruct.SubRegionDisable = 0x87;
  MPU_InitStruct.TypeExtField = MPU_TEX_LEVEL0;
  MPU_InitStruct.AccessPermission = MPU_REGION_NO_ACCESS;
  MPU_InitStruct.DisableExec = MPU_INSTRUCTION_ACCESS_DISABLE;
  MPU_InitStruct.IsShareable = MPU_ACCESS_SHAREABLE;
  MPU_InitStruct.IsCacheable = MPU_ACCESS_CACHEABLE;
  MPU_InitStruct.IsBufferable = MPU_ACCESS_BUFFERABLE;
  HAL_MPU_ConfigRegion(&MPU_InitStruct);
  index++;

  /* Initialize the non cacheable region */
#if defined ( __ICCARM__ )
  /* get the region attribute form the icf file */
  extern uint32_t NONCACHEABLEBUFFER_start;
  extern uint32_t NONCACHEABLEBUFFER_size;

  address = (uint32_t)&NONCACHEABLEBUFFER_start;
  size = (uint32_t)&NONCACHEABLEBUFFER_size;

#elif defined (__CC_ARM) || defined(__ARMCC_VERSION)
  extern uint32_t Image$$RW_NONCACHEABLEBUFFER$$Base;
  extern uint32_t Image$$RW_NONCACHEABLEBUFFER$$Length;
  extern uint32_t Image$$RW_NONCACHEABLEBUFFER$$ZI$$Length;

  address = (uint32_t)&Image$$RW_NONCACHEABLEBUFFER$$Base;
  size  = (uint32_t)&Image$$RW_NONCACHEABLEBUFFER$$Length + (uint32_t)&Image$$RW_NONCACHEABLEBUFFER$$ZI$$Length;
#elif defined ( __GNUC__ )
  extern int __NONCACHEABLEBUFFER_BEGIN;
  extern int __NONCACHEABLEBUFFER_END;

  address = (uint32_t)&__NONCACHEABLEBUFFER_BEGIN;
  size  = (uint32_t)&__NONCACHEABLEBUFFER_END - (uint32_t)&__NONCACHEABLEBUFFER_BEGIN;
#else
#error "Compiler toolchain is unsupported"
#endif

  if (size != 0)
  {
    /* Configure the MPU attributes as Normal Non Cacheable */
    MPU_InitStruct.Enable = MPU_REGION_ENABLE;
    MPU_InitStruct.AccessPermission = MPU_REGION_FULL_ACCESS;
    MPU_InitStruct.IsBufferable = MPU_ACCESS_NOT_BUFFERABLE;
    MPU_InitStruct.IsCacheable = MPU_ACCESS_NOT_CACHEABLE;
    MPU_InitStruct.IsShareable = MPU_ACCESS_NOT_SHAREABLE;
    MPU_InitStruct.Number = index;
    MPU_InitStruct.TypeExtField = MPU_TEX_LEVEL1;
    MPU_InitStruct.SubRegionDisable = 0x00;
    MPU_InitStruct.DisableExec = MPU_INSTRUCTION_ACCESS_DISABLE;
    MPU_AdjustRegionAddressSize(address, size, &MPU_InitStruct);
    HAL_MPU_ConfigRegion(&MPU_InitStruct);
    index++;
  }

  /* Initialize the region corresponding to the execution area
     (external or internal flash or external or internal RAM
     depending on scatter file definition) */
#if defined ( __ICCARM__ )
  extern uint32_t __ICFEDIT_region_ROM_start__;
  extern uint32_t __ICFEDIT_region_ROM_end__;
  address = (uint32_t)&__ICFEDIT_region_ROM_start__;
  size = (uint32_t)&__ICFEDIT_region_ROM_end__ - (uint32_t)&__ICFEDIT_region_ROM_start__ + 1;
#elif defined (__CC_ARM) || defined(__ARMCC_VERSION)
  extern uint32_t Image$$ER_ROM$$Base;
  extern uint32_t Image$$ER_ROM$$Limit;
  address = (uint32_t)&Image$$ER_ROM$$Base;
  size    = (uint32_t)&Image$$ER_ROM$$Limit-(uint32_t)&Image$$ER_ROM$$Base;
#elif defined ( __GNUC__ )
  extern uint32_t __FLASH_BEGIN;
  extern uint32_t __FLASH_SIZE;
  address = (uint32_t)&__FLASH_BEGIN;
  size  = (uint32_t)&__FLASH_SIZE;
#else
#error "Compiler toolchain is unsupported"
#endif

  MPU_InitStruct.Enable = MPU_REGION_ENABLE;
  MPU_InitStruct.Number = index;
  MPU_InitStruct.SubRegionDisable = 0u;
  MPU_InitStruct.TypeExtField = MPU_TEX_LEVEL1;
  MPU_InitStruct.AccessPermission = MPU_REGION_FULL_ACCESS;
  MPU_InitStruct.DisableExec = MPU_INSTRUCTION_ACCESS_ENABLE;
  MPU_InitStruct.IsShareable = MPU_ACCESS_SHAREABLE;
  MPU_InitStruct.IsCacheable = MPU_ACCESS_CACHEABLE;
  MPU_InitStruct.IsBufferable = MPU_ACCESS_BUFFERABLE;
  MPU_AdjustRegionAddressSize(address, size, &MPU_InitStruct);
  HAL_MPU_ConfigRegion(&MPU_InitStruct);
  index++;

  /* Reset unused MPU regions */
  for(; index < __MPU_REGIONCOUNT ; index++)
  {
    /* All unused regions disabled */
    MPU_InitStruct.Enable = MPU_REGION_DISABLE;
    MPU_InitStruct.Number = index;
    HAL_MPU_ConfigRegion(&MPU_InitStruct);
  }

  /* Enable the MPU */
  HAL_MPU_Enable(MPU_PRIVILEGED_DEFAULT);
}

/**
  * @brief This function adjusts the MPU region Address and Size within an MPU configuration.
  * @param Address memory address
  * @param Size memory size
  * @param pInit pointer to an MPU initialization structure
  * @retval None
  */
static void MPU_AdjustRegionAddressSize(uint32_t Address, uint32_t Size, MPU_Region_InitTypeDef* pInit)
{
  /* Compute the MPU region size */
  pInit->Size = ((31 - __CLZ(Size)) - 1);
  if (Size > (1 << (pInit->Size + 1)))
  {
    pInit->Size++;
  }
  uint32_t Modulo = Address % (1 << (pInit->Size - 1));
  if (0 != Modulo)
  {
    /* Align address with MPU region size considering there is no need to increase the size */
    pInit->BaseAddress = Address - Modulo;
  }
  else
  {
    pInit->BaseAddress = Address;
  }
}

#define TEST_SIZE 1024

__attribute__((section("dtcm_data")))
__attribute__((aligned(32)))
uint8_t buffer1[TEST_SIZE];

__attribute__((section("dtcm_data")))
__attribute__((aligned(32)))
uint8_t buffer2[TEST_SIZE];

__attribute__((section("noncacheable")))
__attribute__((aligned(32)))
uint8_t buffer_ncache[TEST_SIZE];

__attribute__((aligned(32)))
uint8_t buffer_cached[TEST_SIZE];

unsigned int test_loop(void* dst, const void* src, int size, bool flush, bool invalidate)
{
    volatile unsigned int *DWT_CYCCNT = (uint32_t *)0xE0001004; //address of the register
    volatile unsigned int *DWT_CONTROL = (uint32_t *)0xE0001000; //address of the register
    volatile unsigned int *SCB_DEMCR = (uint32_t *)0xE000EDFC; //address of the register
    
    *SCB_DEMCR = *SCB_DEMCR | 0x01000000;
    *DWT_CYCCNT = 0;
    *DWT_CONTROL |=  1;
    
    if(invalidate)
        SCB_InvalidateDCache_by_Addr((uint32_t*)src, size);
    
    
    memcpy(dst, src, size);
    

    if (flush)
        SCB_CleanDCache_by_Addr((uint32_t*)dst, size);
    
    *DWT_CONTROL &= ~1;
    return *DWT_CYCCNT;
}

int main(void) {
  MPU_Config();
  board_init();
  SWD_Init();
    
  unsigned int cycle;
    printf("\r\nmemcpy benchmark \r\n");

    for(int i = 0; i < TEST_SIZE; i++)
    {
        buffer1[i] = (uint8_t)i;
    }
    
    printf("DTCM - DTCM\r\n");
    for(int i = 0; i < 8; i++)
    {
        cycle = test_loop(buffer2, buffer1, TEST_SIZE, false, false);
        printf("Loop:%d   Cycle:%d\r\n", i, cycle);
    }
    printf("Throughput:%d MB/s\r\n", TEST_SIZE * (SystemCoreClock / cycle) / 1048576);
    printf("DTCM - NonCache\r\n");
    for(int i = 0; i < 8; i++)
    {
        cycle = test_loop(buffer_ncache, buffer1, TEST_SIZE, false, false);
        printf("Loop:%d   Cycle:%d\r\n", i, cycle);
    }
    printf("Throughput:%d MB/s\r\n", TEST_SIZE * (SystemCoreClock / cycle) / 1048576);
    printf("DTCM - Cache\r\n");
    for(int i = 0; i < 8; i++)
    {
        cycle = test_loop(buffer_cached, buffer1, TEST_SIZE, false, false);
        printf("Loop:%d   Cycle:%d\r\n", i, cycle);
    }
    printf("Throughput:%d MB/s\r\n", TEST_SIZE * (SystemCoreClock / cycle) / 1048576);
    printf("DTCM - Cache+Flush\r\n");
    for(int i = 0; i < 8; i++)
    {
        cycle = test_loop(buffer_cached, buffer1, TEST_SIZE, true, false);
        printf("Loop:%d   Cycle:%d\r\n", i, cycle);
    }
    printf("Throughput:%d MB/s\r\n", TEST_SIZE * (SystemCoreClock / cycle) / 1048576);
    
    printf("NonCache - DTCM\r\n");
    for(int i = 0; i < 8; i++)
    {
        cycle = test_loop(buffer1, buffer_ncache, TEST_SIZE, false, false);
        printf("Loop:%d   Cycle:%d\r\n", i, cycle);
    }
    printf("Throughput:%d MB/s\r\n", TEST_SIZE * (SystemCoreClock / cycle) / 1048576);
    printf("Cache - DTCM\r\n");
    for(int i = 0; i < 8; i++)
    {
        cycle = test_loop(buffer1, buffer_cached, TEST_SIZE, false, false);
        printf("Loop:%d   Cycle:%d\r\n", i, cycle);
    }
    printf("Throughput:%d MB/s\r\n", TEST_SIZE * (SystemCoreClock / cycle) / 1048576);
    printf("Cache+Invalidate - DTCM\r\n");
    for(int i = 0; i < 8; i++)
    {
        cycle = test_loop(buffer1, buffer_cached, TEST_SIZE, false, true);
        printf("Loop:%d   Cycle:%d\r\n", i, cycle);
    }
    printf("Throughput:%d MB/s\r\n", TEST_SIZE * (SystemCoreClock / cycle) / 1048576);
    
    while (1)
    {
    }
  
}

HiFiPhile avatar Jan 25 '25 12:01 HiFiPhile

Looks like my HIL instance has license issue, I think we can add the env locally.

HiFiPhile avatar Jan 27 '25 08:01 HiFiPhile

Works great for me, both for CDC and UVC. Thanks !

In addition to #define CFG_TUD_DWC2_DMA_ENABLE 1 I also had to add #define __CORTEX_M 7

pstadelmann avatar Jan 27 '25 15:01 pstadelmann

Works great for me, both for CDC and UVC. Thanks !

In addition to #define CFG_TUD_DWC2_DMA_ENABLE 1 I also had to add #define __CORTEX_M 7

Thanks for your test. It's little strange that you need to define __CORTEX_M, normally stm32h7xx.h includes devcie header eg. stm32h747xx.h which includes core_cm7.h and __CORTEX_M is defined inside. Did I miss something ?

HiFiPhile avatar Jan 28 '25 10:01 HiFiPhile

Thanks @HiFiPhile for great Pr as usual. Though I am off for TET (Lunar New Year) and won't be able to review this in 2 weeks. Happy New Year 🎉

hathach avatar Jan 28 '25 12:01 hathach

Works great for me, both for CDC and UVC. Thanks ! In addition to #define CFG_TUD_DWC2_DMA_ENABLE 1 I also had to add #define __CORTEX_M 7

Thanks for your test. It's little strange that you need to define __CORTEX_M, normally stm32h7xx.h includes devcie header eg. stm32h747xx.h which includes core_cm7.h and __CORTEX_M is defined inside. Did I miss something ?

I don't think so. I'm using a custom RTOS which relies on its own set of headers, that's why.

pstadelmann avatar Jan 29 '25 09:01 pstadelmann

Thanks @HiFiPhile for great Pr as usual. Though I am off for TET (Lunar New Year) and won't be able to review this in 2 weeks. Happy New Year 🎉

Happy new year also 🎊

HiFiPhile avatar Jan 29 '25 10:01 HiFiPhile

@hathach I just rebased on master and did some cleanup. CFG_TUD_MEM_ALIGN introduce a breaking change, but without this I don't know how to extract numeric value.

HiFiPhile avatar Jun 13 '25 14:06 HiFiPhile

@hathach I just rebased on master and did some cleanup. CFG_TUD_MEM_ALIGN introduce a breaking change, but without this I don't know how to extract numeric value.

How about adding the CFG_TUD_MEM_ALIGN_BYTE/SIZE instead, CFG_TUD_MEM_ALIGN if not defined will be defined using that. That way people will have time to migrate (or just leave it if they don't need it). Btw, why you need the numeric value (haven't do review yet).

hathach avatar Jun 13 '25 15:06 hathach

How about adding the CFG_TUD_MEM_ALIGN_BYTE/SIZE instead, CFG_TUD_MEM_ALIGN if not defined will be defined using that.

Yeah it's a good idea. I need to align the buffer correctly if user need a higher alignment than cache line size.

Update: no more need to change CFG_TUD_MEM_ALIGN

HiFiPhile avatar Jun 13 '25 16:06 HiFiPhile

I've updated BSP:

  • Keep DCache disabled for H7 & F7 for RTT compatibility
  • Add MPU_Config to H7RS and use noncacheable_buffer for RTT buffer

HiFiPhile avatar Jul 04 '25 10:07 HiFiPhile