简介:
在之前的帖子(RA8 PMU 模块功能寄存器功能说明)中,我们已经介绍了 PMU 相关的寄存器。本文在此基础上继续使用 CMSIS 提供的接口,通过 PMU 模块评估 CPU 的运行性能。M85 的 CMSIS 软件包已经提供了 PMU 的接口函数,对应的接口函数如下,我们可以使用这些接口来评估 CPU 的性能。
PMU 使用的配置流程如下:

CMSIS 定义的 PMU 监测的event 如下:

我们参照上述流程图,在待评估性能的代码前配置 PMU 并清零对应的 cycle counter 和 event counter,代码执行后读取 cycle counter 和 event counter 的数值,从而实现代码执行的性能监测。按照该思路我们实现 void pmu_monitor_start(void) 函数,使能 PMU 并配置监测 CPU_CYCLE/ARM_PMU_INST_RETIRED/ARM_PMU_MEM_ACCESS/ARM_PMU_L1D_CACHE_REFILL/ARM_PMU_EXC_RETURN/ARM_PMU_EXC_TAKEN 性能参数,添加代码如下。
void pmu_monitor_start(void)
{
memset((void*)pmu_event_counter,0,sizeof(pmu_event_counter));
/* Disable the PMU */
ARM_PMU_Disable();
/* Disable PMU Cycle Counter */
ARM_PMU_CNTR_Disable(PMU_CNTENCLR_CCNTR_ENABLE_Msk);
/* Disable PMU Cycle Counter IRQ */
ARM_PMU_Set_CNTR_IRQ_Disable(PMU_INTENCLR_CYCCNT_ENABLE_Msk);
/* Reset PMU Cycle Counter */
ARM_PMU_CYCCNT_Reset();
/* Clear overflow status */
ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CYCCNT_STATUS_Msk);
/* Reset PMU Event Counters */
ARM_PMU_EVCNTR_ALL_Reset();
/* configure event counter */
/* 32 bit counter for instruction architecturally executed */
ARM_PMU_Set_EVTYPER(0,ARM_PMU_INST_RETIRED);
ARM_PMU_Set_EVTYPER(1,ARM_PMU_CHAIN);
/* clear counter 0/1 overflow flag */
ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT0_STATUS_Msk
| PMU_OVSCLR_CNT1_STATUS_Msk);
/* enable counter 1 interrupt */
ARM_PMU_Set_CNTR_IRQ_Enable(PMU_INTENSET_CNT1_ENABLE_Msk);
/* enable counter 0/1 */
ARM_PMU_CNTR_Enable(PMU_CNTENSET_CNT0_ENABLE_Msk
| PMU_CNTENSET_CNT1_ENABLE_Msk);
/* 32bit counter for all Data memory Accesses */
ARM_PMU_Set_EVTYPER(2,ARM_PMU_MEM_ACCESS);
ARM_PMU_Set_EVTYPER(3,ARM_PMU_CHAIN);
/* clear counter 2/3 overflow flag */
ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT2_STATUS_Msk
| PMU_OVSCLR_CNT3_STATUS_Msk);
/* enable counter 3 interrupt */
ARM_PMU_Set_CNTR_IRQ_Enable(PMU_INTENSET_CNT3_ENABLE_Msk);
/* enable counter 2/3 */
ARM_PMU_CNTR_Enable(PMU_CNTENSET_CNT2_ENABLE_Msk
| PMU_CNTENSET_CNT3_ENABLE_Msk);
/* 16bit counter for data cache refill */
ARM_PMU_Set_EVTYPER(4,ARM_PMU_L1D_CACHE_REFILL);
/* clear counter 4 overflow flag */
ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT4_STATUS_Msk);
/* enable counter 4 interrupt */
ARM_PMU_Set_CNTR_IRQ_Enable(PMU_INTENSET_CNT4_ENABLE_Msk);
/* enable counter 4 */
ARM_PMU_CNTR_Enable(PMU_CNTENSET_CNT4_ENABLE_Msk);
/* 16bit counter for Instruction architecturally executed, condition code
check pass, exception return */
ARM_PMU_Set_EVTYPER(5,ARM_PMU_EXC_RETURN);
/* clear counter 5 overflow flag */
ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT5_STATUS_Msk);
/* enable counter 5 interrupt */
ARM_PMU_Set_CNTR_IRQ_Enable(PMU_INTENSET_CNT5_ENABLE_Msk);
/* enable counter 5 */
ARM_PMU_CNTR_Enable(PMU_CNTENSET_CNT5_ENABLE_Msk);
/* 16bit counter for Exception taken */
ARM_PMU_Set_EVTYPER(6,ARM_PMU_EXC_TAKEN);
/* clear counter 6 overflow flag */
ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT6_STATUS_Msk);
/* enable counter 5 interrupt */
ARM_PMU_Set_CNTR_IRQ_Enable(PMU_INTENSET_CNT6_ENABLE_Msk);
/* enable counter 5 */
ARM_PMU_CNTR_Enable(PMU_CNTENSET_CNT6_ENABLE_Msk);
/* */
/*
TRCENA, bit [24]
Trace enable. Global enable for all DWT, PMU, and ITM features.
UMON_EN, bit [21]
Unprivileged monitor enable. DebugMonitor pend enable when the PE is in an unprivileged mode.
SDME, bit [20]
Secure DebugMonitor enable. Indicates whether the DebugMonitor targets the Secure or the Non-secure
state and whether debug events are allowed in Secure state.
MON_EN, bit [16]
Monitor enable. Enable the DebugMonitor exception.
*/
DCB->DEMCR |= DCB_DEMCR_UMON_EN_Msk |
DCB_DEMCR_SDME_Msk |
DCB_DEMCR_TRCENA_Msk |
DCB_DEMCR_MON_EN_Msk;
/* enable PMU Cycle Counter interrupt */
ARM_PMU_Set_CNTR_IRQ_Enable(PMU_INTENSET_CCYCNT_ENABLE_Msk);
ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk);
/* Enable the PMU */
ARM_PMU_Enable();
/* force to disable DWT */
DWT->CTRL = 0;
}因为PMU event 的overflow 事件会触发DebugMon_Handler 中断我们添加中断函数处理PMU 的overflow 事件,对统计计数进行统计并清除对应的中断标志位。
/*
 * 64-bit software totals for the monitored PMU counters.
 * Slot __PMU_NUM_EVENTCNT holds the cycle count; slots 1, 3, 4, 5 and 6 hold
 * the INST_RETIRED, MEM_ACCESS, CACHE_REFILL, EXC_RETURN and EXC_TAKEN
 * totals.  The overflow handler adds one counter span per overflow and
 * pmu_monitor_stop() adds the final hardware counter values.
 */
static int64_t pmu_event_counter[__PMU_NUM_EVENTCNT + 1] = {0};
static void pmu_monitor_overflow(void)
{
if (!(SCB->DFSR & SCB_DFSR_PMU_Msk)) {
return ;
}
if(ARM_PMU_Get_CNTR_OVS() & PMU_OVSSET_CYCCNT_STATUS_Msk)
{
/* clear overflow event */
ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CYCCNT_STATUS_Msk);
pmu_event_counter[__PMU_NUM_EVENTCNT] += (int64_t)0x100000000;
}
if(ARM_PMU_Get_CNTR_OVS() & PMU_OVSSET_CNT1_STATUS_Pos)
{
/* clear overflow event */
ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT1_STATUS_Pos);
pmu_event_counter[1] += (int64_t)0x100000000;
}
if(ARM_PMU_Get_CNTR_OVS() & PMU_OVSSET_CNT3_STATUS_Pos)
{
/* clear overflow event */
ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT3_STATUS_Pos);
pmu_event_counter[3] += (int64_t)0x100000000;
}
if(ARM_PMU_Get_CNTR_OVS() & PMU_OVSSET_CNT4_STATUS_Pos)
{
/* clear overflow event */
ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT4_STATUS_Pos);
pmu_event_counter[4] += (int64_t)0x100000;
}
if(ARM_PMU_Get_CNTR_OVS() & PMU_OVSSET_CNT5_STATUS_Pos)
{
/* clear overflow event */
ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT5_STATUS_Pos);
pmu_event_counter[5] += (int64_t)0x100000;
}
if(ARM_PMU_Get_CNTR_OVS() & PMU_OVSSET_CNT6_STATUS_Pos)
{
/* clear overflow event */
ARM_PMU_Set_CNTR_OVS(PMU_OVSCLR_CNT6_STATUS_Pos);
pmu_event_counter[6] += (int64_t)0x100000;
}
}
/*
 * DebugMonitor exception entry point.
 * Forwards to pmu_monitor_overflow(), which accounts for any PMU counter
 * overflow and clears the corresponding status flags.
 */
__USED void DebugMon_Handler(void)
{
    pmu_monitor_overflow();
}
我们在待评估代码后添加stop 接口,停止对应的事件监测,并计算对应的统计结果打印输出添加void pmu_monitor_stop(void) 函数,对应代码如下:
/**
 * Stop the PMU measurement and print the collected totals.
 *
 * Disables the PMU, masks the overflow interrupts and disables the cycle
 * counter and event counters 0..7, then adds the remaining hardware counter
 * values to the software totals in pmu_event_counter and prints one line per
 * monitored quantity.
 */
void pmu_monitor_stop(void)
{
    /* overflow interrupt disable bits, in the order they are written */
    static const uint32_t irq_off[] = {
        PMU_INTENCLR_CYCCNT_ENABLE_Msk,
        PMU_INTENCLR_CNT0_ENABLE_Msk,
        PMU_INTENCLR_CNT1_ENABLE_Msk,
        PMU_INTENCLR_CNT2_ENABLE_Msk,
        PMU_INTENCLR_CNT3_ENABLE_Msk,
        PMU_INTENCLR_CNT4_ENABLE_Msk,
        PMU_INTENCLR_CNT5_ENABLE_Msk,
        PMU_INTENCLR_CNT6_ENABLE_Msk,
        PMU_INTENCLR_CNT7_ENABLE_Msk,
    };
    /* counter disable bits, in the order they are written */
    static const uint32_t cnt_off[] = {
        PMU_CNTENCLR_CCNTR_ENABLE_Msk,
        PMU_CNTENCLR_CNT0_ENABLE_Msk,
        PMU_CNTENCLR_CNT1_ENABLE_Msk,
        PMU_CNTENCLR_CNT2_ENABLE_Msk,
        PMU_CNTENCLR_CNT3_ENABLE_Msk,
        PMU_CNTENCLR_CNT4_ENABLE_Msk,
        PMU_CNTENCLR_CNT5_ENABLE_Msk,
        PMU_CNTENCLR_CNT6_ENABLE_Msk,
        PMU_CNTENCLR_CNT7_ENABLE_Msk,
    };
    unsigned int idx;

    /* Disable the PMU */
    ARM_PMU_Disable();

    /* mask the overflow interrupts */
    for (idx = 0; idx < sizeof(irq_off) / sizeof(irq_off[0]); idx++) {
        ARM_PMU_Set_CNTR_IRQ_Disable(irq_off[idx]);
    }

    /* disable the cycle counter and the event counters */
    for (idx = 0; idx < sizeof(cnt_off) / sizeof(cnt_off[0]); idx++) {
        ARM_PMU_CNTR_Disable(cnt_off[idx]);
    }

    /* add what is left in the hardware counters to the software totals */
    pmu_event_counter[__PMU_NUM_EVENTCNT] += ARM_PMU_Get_CCNTR();

    /* chained pair 0 + 1: counter 0 is the low half, counter 1 the high half */
    pmu_event_counter[1] += ARM_PMU_Get_EVCNTR(0);
    pmu_event_counter[1] += ARM_PMU_Get_EVCNTR(1) << 16;

    /* chained pair 2 + 3: counter 2 is the low half, counter 3 the high half */
    pmu_event_counter[3] += ARM_PMU_Get_EVCNTR(2);
    pmu_event_counter[3] += ARM_PMU_Get_EVCNTR(3) << 16;

    /* stand-alone counters */
    pmu_event_counter[4] += ARM_PMU_Get_EVCNTR(4);
    pmu_event_counter[5] += ARM_PMU_Get_EVCNTR(5);
    pmu_event_counter[6] += ARM_PMU_Get_EVCNTR(6);

    /* report */
    printf("cycle %lld \r\n", pmu_event_counter[__PMU_NUM_EVENTCNT]);
    printf("INST_RETIRED %lld \r\n", pmu_event_counter[1]);
    printf("MEM_ACCESS %lld \r\n", pmu_event_counter[3]);
    printf("CACHE_REFILL %lld \r\n", pmu_event_counter[4]);
    printf("EXC_RETURN %lld \r\n", pmu_event_counter[5]);
    printf("EXC_TAKEN %lld \r\n", pmu_event_counter[6]);
}
我们已经添加了start/stop 接口在中间添加测试评估代码即可评估该段代码性能参数。按照此方法添加如下测试代码。
#if (PERFC_LOW_LEVEL_TYPE_PMU != PERFC_LOW_LEVEL_TYPE)
#include "drv_pmu.h"
#include "littleshell.h"
#include "FreeRTOS.h"
#include "task.h"
/**
 * Shell command "pmutest": profiles two sample workloads with the PMU.
 *
 * Each workload is bracketed by pmu_monitor_start() / pmu_monitor_stop(),
 * so the counters are printed once per workload.  The command-line
 * arguments are not used.
 *
 * @return always 1
 */
unsigned int pmutest(char argc,char ** argv)
{
    /* target buffer of the memset workload */
    static volatile int buff[100];

    (void)argc;
    (void)argv;

    /* workload 1: a task delay of 1000 ticks */
    pmu_monitor_start();
    vTaskDelay(1000);
    pmu_monitor_stop();

    /* workload 2: zero-filling the buffer */
    pmu_monitor_start();
    memset((void *)buff, 0, sizeof(buff));
    pmu_monitor_stop();

    return 1;
}
LTSH_FUNCTION_EXPORT(pmutest,"test pmu");
#endif /* end of PERFC_LOW_LEVEL_TYPE_PMU != PERFC_LOW_LEVEL_TYPE */
上述测试代码分别评估 vTaskDelay(1000) 和 memset((void *)buff,0,sizeof(buff)) 这两段代码的性能参数,运行结果如下。

至此我们已经完成了使用 PMU 对代码运行期间性能参数的测试评估。从测试结果看,我们的 CPU 运行频率为 480 MHz,vTaskDelay(1000)(按 1 kHz 的系统节拍约为 1 秒)占用的 cycle counter 与理论值 480000000 基本一致。
我要赚赏金
