简介:
在之前的帖子(RA8 PMU 模块功能寄存器功能说明)中,我们已经介绍了 PMU 相关的寄存器。在此基础上,我们继续使用 CMSIS 提供的接口,通过 PMU 模块评估 CPU 运行的性能。M85 的 CMSIS 软件包已经提供了 PMU 对应的接口函数(如下),我们可以使用这些接口来评估 CPU 的性能。
PMU 使用的配置流程如下:
CMSIS 定义的 PMU 监测的event 如下:
我们参照上述流程图,在待评估性能的代码前配置 PMU 并清零对应的 cycle counter 和 event counter,代码执行后读取 cycle counter 和 event counter 的数值,从而实现代码执行的性能监测。按照该思路,我们实现 void pmu_monitor_start(void) 函数来使能 PMU,并配置监测 ARM_PMU_INST_RETIRED/ARM_PMU_MEM_ACCESS/ARM_PMU_L1D_CACHE_REFILL/ARM_PMU_EXC_RETURN/ARM_PMU_EXC_TAKEN 事件以及 CPU cycle 等性能参数,添加代码如下。
void pmu_monitor_start(void) { memset((void*)pmu_event_counter,0,sizeof(pmu_event_counter)); /* Disable the PMU */ ARM_PMU_Disable(); /* Disable PMU Cycle Counter */ ARM_PMU_CNTR_Disable(PMU_CNTENCLR_CCNTR_ENABLE_Msk); /* Disable PMU Cycle Counter IRQ */ ARM_PMU_Set_CNTR_IRQ_Disable(PMU_INTENCLR_CYCCNT_ENABLE_Msk); /* Reset PMU Cycle Counter */ ARM_PMU_CYCCNT_Reset(); /* Clear overflow status */ ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CYCCNT_STATUS_Msk); /* Reset PMU Event Counters */ ARM_PMU_EVCNTR_ALL_Reset(); /* configure event counter */ /* 32 bit counter for instruction architecturally executed */ ARM_PMU_Set_EVTYPER(0,ARM_PMU_INST_RETIRED); ARM_PMU_Set_EVTYPER(1,ARM_PMU_CHAIN); /* clear counter 0/1 overflow flag */ ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT0_STATUS_Msk | PMU_OVSCLR_CNT1_STATUS_Msk); /* enable counter 1 interrupt */ ARM_PMU_Set_CNTR_IRQ_Enable(PMU_INTENSET_CNT1_ENABLE_Msk); /* enable counter 0/1 */ ARM_PMU_CNTR_Enable(PMU_CNTENSET_CNT0_ENABLE_Msk | PMU_CNTENSET_CNT1_ENABLE_Msk); /* 32bit counter for all Data memory Accesses */ ARM_PMU_Set_EVTYPER(2,ARM_PMU_MEM_ACCESS); ARM_PMU_Set_EVTYPER(3,ARM_PMU_CHAIN); /* clear counter 2/3 overflow flag */ ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT2_STATUS_Msk | PMU_OVSCLR_CNT3_STATUS_Msk); /* enable counter 3 interrupt */ ARM_PMU_Set_CNTR_IRQ_Enable(PMU_INTENSET_CNT3_ENABLE_Msk); /* enable counter 2/3 */ ARM_PMU_CNTR_Enable(PMU_CNTENSET_CNT2_ENABLE_Msk | PMU_CNTENSET_CNT3_ENABLE_Msk); /* 16bit counter for data cache refill */ ARM_PMU_Set_EVTYPER(4,ARM_PMU_L1D_CACHE_REFILL); /* clear counter 4 overflow flag */ ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT4_STATUS_Msk); /* enable counter 4 interrupt */ ARM_PMU_Set_CNTR_IRQ_Enable(PMU_INTENSET_CNT4_ENABLE_Msk); /* enable counter 4 */ ARM_PMU_CNTR_Enable(PMU_CNTENSET_CNT4_ENABLE_Msk); /* 16bit counter for Instruction architecturally executed, condition code check pass, exception return */ ARM_PMU_Set_EVTYPER(5,ARM_PMU_EXC_RETURN); /* clear counter 5 overflow flag */ 
ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT5_STATUS_Msk); /* enable counter 5 interrupt */ ARM_PMU_Set_CNTR_IRQ_Enable(PMU_INTENSET_CNT5_ENABLE_Msk); /* enable counter 5 */ ARM_PMU_CNTR_Enable(PMU_CNTENSET_CNT5_ENABLE_Msk); /* 16bit counter for Exception taken */ ARM_PMU_Set_EVTYPER(6,ARM_PMU_EXC_TAKEN); /* clear counter 6 overflow flag */ ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT6_STATUS_Msk); /* enable counter 5 interrupt */ ARM_PMU_Set_CNTR_IRQ_Enable(PMU_INTENSET_CNT6_ENABLE_Msk); /* enable counter 5 */ ARM_PMU_CNTR_Enable(PMU_CNTENSET_CNT6_ENABLE_Msk); /* */ /* TRCENA, bit [24] Trace enable. Global enable for all DWT, PMU, and ITM features. UMON_EN, bit [21] Unprivileged monitor enable. DebugMonitor pend enable when the PE is in an unprivileged mode. SDME, bit [20] Secure DebugMonitor enable. Indicates whether the DebugMonitor targets the Secure or the Non-secure state and whether debug events are allowed in Secure state. MON_EN, bit [16] Monitor enable. Enable the DebugMonitor exception. */ DCB->DEMCR |= DCB_DEMCR_UMON_EN_Msk | DCB_DEMCR_SDME_Msk | DCB_DEMCR_TRCENA_Msk | DCB_DEMCR_MON_EN_Msk; /* enable PMU Cycle Counter interrupt */ ARM_PMU_Set_CNTR_IRQ_Enable(PMU_INTENSET_CCYCNT_ENABLE_Msk); ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk); /* Enable the PMU */ ARM_PMU_Enable(); /* force to disable DWT */ DWT->CTRL = 0; }
因为 PMU event 的 overflow 事件会触发 DebugMon_Handler 中断,我们添加中断处理函数来处理 PMU 的 overflow 事件:对溢出的计数进行累加统计,并清除对应的中断标志位。
static int64_t pmu_event_counter[__PMU_NUM_EVENTCNT + 1] = {0}; static void pmu_monitor_overflow(void) { if (!(SCB->DFSR & SCB_DFSR_PMU_Msk)) { return ; } if(ARM_PMU_Get_CNTR_OVS() & PMU_OVSSET_CYCCNT_STATUS_Msk) { /* clear overflow event */ ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CYCCNT_STATUS_Msk); pmu_event_counter[__PMU_NUM_EVENTCNT] += (int64_t)0x100000000; } if(ARM_PMU_Get_CNTR_OVS() & PMU_OVSSET_CNT1_STATUS_Pos) { /* clear overflow event */ ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT1_STATUS_Pos); pmu_event_counter[1] += (int64_t)0x100000000; } if(ARM_PMU_Get_CNTR_OVS() & PMU_OVSSET_CNT3_STATUS_Pos) { /* clear overflow event */ ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT3_STATUS_Pos); pmu_event_counter[3] += (int64_t)0x100000000; } if(ARM_PMU_Get_CNTR_OVS() & PMU_OVSSET_CNT4_STATUS_Pos) { /* clear overflow event */ ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT4_STATUS_Pos); pmu_event_counter[4] += (int64_t)0x100000; } if(ARM_PMU_Get_CNTR_OVS() & PMU_OVSSET_CNT5_STATUS_Pos) { /* clear overflow event */ ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT5_STATUS_Pos); pmu_event_counter[5] += (int64_t)0x100000; } if(ARM_PMU_Get_CNTR_OVS() & PMU_OVSSET_CNT6_STATUS_Pos) { /* clear overflow event */ ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT6_STATUS_Pos); pmu_event_counter[6] += (int64_t)0x100000; } } __USED void DebugMon_Handler(void) { pmu_monitor_overflow(); }
我们在待评估代码后添加 stop 接口,停止对应的事件监测,并计算对应的统计结果打印输出。添加 void pmu_monitor_stop(void) 函数,对应代码如下:
void pmu_monitor_stop(void) { /* Disable the PMU */ ARM_PMU_Disable(); /* Disable ISR */ ARM_PMU_Set_CNTR_IRQ_Disable(PMU_INTENCLR_CYCCNT_ENABLE_Msk); ARM_PMU_Set_CNTR_IRQ_Disable(PMU_INTENCLR_CNT0_ENABLE_Msk); ARM_PMU_Set_CNTR_IRQ_Disable(PMU_INTENCLR_CNT1_ENABLE_Msk); ARM_PMU_Set_CNTR_IRQ_Disable(PMU_INTENCLR_CNT2_ENABLE_Msk); ARM_PMU_Set_CNTR_IRQ_Disable(PMU_INTENCLR_CNT3_ENABLE_Msk); ARM_PMU_Set_CNTR_IRQ_Disable(PMU_INTENCLR_CNT4_ENABLE_Msk); ARM_PMU_Set_CNTR_IRQ_Disable(PMU_INTENCLR_CNT5_ENABLE_Msk); ARM_PMU_Set_CNTR_IRQ_Disable(PMU_INTENCLR_CNT6_ENABLE_Msk); ARM_PMU_Set_CNTR_IRQ_Disable(PMU_INTENCLR_CNT7_ENABLE_Msk); /* Disable PMU Cycle Counter */ ARM_PMU_CNTR_Disable(PMU_CNTENCLR_CCNTR_ENABLE_Msk); ARM_PMU_CNTR_Disable(PMU_CNTENCLR_CNT0_ENABLE_Msk); ARM_PMU_CNTR_Disable(PMU_CNTENCLR_CNT1_ENABLE_Msk); ARM_PMU_CNTR_Disable(PMU_CNTENCLR_CNT2_ENABLE_Msk); ARM_PMU_CNTR_Disable(PMU_CNTENCLR_CNT3_ENABLE_Msk); ARM_PMU_CNTR_Disable(PMU_CNTENCLR_CNT4_ENABLE_Msk); ARM_PMU_CNTR_Disable(PMU_CNTENCLR_CNT5_ENABLE_Msk); ARM_PMU_CNTR_Disable(PMU_CNTENCLR_CNT6_ENABLE_Msk); ARM_PMU_CNTR_Disable(PMU_CNTENCLR_CNT7_ENABLE_Msk); pmu_event_counter[__PMU_NUM_EVENTCNT] += ARM_PMU_Get_CCNTR(); pmu_event_counter[1] += ARM_PMU_Get_EVCNTR(0); pmu_event_counter[1] += ARM_PMU_Get_EVCNTR(1)<<16; pmu_event_counter[3] += ARM_PMU_Get_EVCNTR(2); pmu_event_counter[3] += ARM_PMU_Get_EVCNTR(3)<<16; pmu_event_counter[4] += ARM_PMU_Get_EVCNTR(4); pmu_event_counter[5] += ARM_PMU_Get_EVCNTR(5); pmu_event_counter[6] += ARM_PMU_Get_EVCNTR(6); printf("cycle %lld \r\n",pmu_event_counter[__PMU_NUM_EVENTCNT]); printf("INST_RETIRED %lld \r\n",pmu_event_counter[1]); printf("MEM_ACCESS %lld \r\n",pmu_event_counter[3]); printf("CACHE_REFILL %lld \r\n",pmu_event_counter[4]); printf("EXC_RETURN %lld \r\n",pmu_event_counter[5]); printf("EXC_TAKEN %lld \r\n",pmu_event_counter[6]); }
我们已经添加了 start/stop 接口,在两者中间添加待评估的代码即可评估该段代码的性能参数。按照此方法添加如下测试代码。
#if (PERFC_LOW_LEVEL_TYPE_PMU != PERFC_LOW_LEVEL_TYPE)
#include "drv_pmu.h"
#include "littleshell.h"
#include "FreeRTOS.h"
#include "task.h"

/**
 * Shell command "pmutest": runs two PMU measurement windows back to back.
 *
 * Window 1 brackets vTaskDelay(1000); window 2 brackets a memset() over a
 * 100-element int buffer. Each window's counters are printed by
 * pmu_monitor_stop().
 *
 * @param argc  shell argument count (not used)
 * @param argv  shell argument vector (not used)
 * @return always 1
 */
unsigned int pmutest(char argc,char ** argv)
{
    /* Target buffer for the second measurement window. */
    static volatile int buff[100];

    (void)argc;
    (void)argv;

    /* Window 1: cost of a 1000-tick task delay */
    pmu_monitor_start();
    vTaskDelay(1000);
    pmu_monitor_stop();

    /* Window 2: cost of clearing the buffer */
    pmu_monitor_start();
    memset((void *)buff,0,sizeof(buff));
    pmu_monitor_stop();

    return 1;
}
LTSH_FUNCTION_EXPORT(pmutest,"test pmu");
#endif /* end of PERFC_LOW_LEVEL_TYPE_PMU != PERFC_LOW_LEVEL_TYPE */
上述测试代码分别评估 vTaskDelay(1000) 和 memset((void *)buff,0,sizeof(buff)) 两段代码的性能参数,运行结果如下。
至此我们已经完成了使用 PMU 对代码运行期间性能参数的测试评估。从测试结果看,我们的 CPU 运行频率为 480MHz,vTaskDelay(1000) 占用的 cycle 数和理论值 480000000 基本一致。