我有一个函数,实现 YUV 转 RGB。我觉得它速度很慢,而 CPU 是 ARM926EJ,所以想把循环体改用汇编实现,请问 C 和汇编如何混合编程?有例子吗?另外,我希望在函数开始时把一些表加载到片上内存或者 cache 里,请问该怎么做?我怎么知道片上内存是否已被别的变量占用?如果已被占用,直接拷贝过去的话,程序就会出错吧?C 函数如下。
/*
 * Convert a planar YUV 4:2:0 image to packed 24-bit pixels using lookup tables.
 *
 * Output layout:
 *   - 3 bytes per pixel, stored in memory as B, G, R.
 *   - Rows are written bottom-up: source row 0 lands in the LAST row of pRGB.
 *
 * Parameters:
 *   pY, pU, pV  - source planes. Luma rows are ImgWidth samples apart; chroma
 *                 rows are ImgWidth/2 samples apart, one chroma sample per
 *                 2x2 luma block.
 *   ImgWidth    - image width in pixels.
 *   ImgHeight   - image height in pixels.
 *   pRGB        - destination buffer, ImgWidth * ImgHeight * 3 bytes.
 *   pAllTables  - packed lookup tables, in this order:
 *                   int16 RGB_Y[256], B_U[256], G_U[256], G_V[256], R_V[256],
 *                 immediately followed by a uint8 clip table that is indexed
 *                 with a bias of +2048.
 *                 NOTE(review): the buffer is read through an int16 pointer,
 *                 so it must be suitably aligned for int16 - confirm at the
 *                 call site. The clip table must cover every index the other
 *                 tables can produce; its size is not visible here.
 *
 * Only complete 2x2 blocks are converted: with an odd ImgWidth the right-most
 * column, and with an odd ImgHeight the last source row (top output row), are
 * left untouched. Nothing is written when ImgWidth < 2 or ImgHeight < 2.
 *
 * SCALEBITS_OUT is defined elsewhere in the project. The code right-shifts
 * possibly negative intermediate values and relies on that being an
 * arithmetic shift.
 */
void YUV420RGB888(uint8 *pY, uint8 *pU, uint8 *pV, uint32 ImgWidth, uint32 ImgHeight, uint8 *pRGB, uint8 *pAllTables)
{
    const int16 *RGB_Y_tab = (const int16 *)pAllTables;
    const int16 *B_U_tab = RGB_Y_tab + 256;
    const int16 *G_U_tab = B_U_tab + 256;
    const int16 *G_V_tab = G_U_tab + 256;
    const int16 *R_V_tab = G_V_tab + 256;
    const uint8 *RGB_Clip = (const uint8 *)(R_V_tab + 256);
    /* Pre-biased clip pointer so the inner loop can index it directly. */
    const uint8 *clip = RGB_Clip + 2048;

    const uint32 ImgWidth2 = ImgWidth >> 1;   /* 2x2 blocks per row pair   */
    const uint32 row_pairs = ImgHeight >> 1;  /* row pairs to convert      */
    const uint32 dst_stride = 3 * ImgWidth;   /* bytes per output row      */
    uint32 x, pair;

    /* Nothing to convert; also avoids forming pointers in front of pRGB. */
    if (ImgWidth2 == 0 || row_pairs == 0) {
        return;
    }

    for (pair = 0; pair < row_pairs; pair++) {
        /*
         * Row pointers are derived from the row index rather than from how
         * far the inner loop advanced, so an odd ImgWidth cannot make the
         * rows drift (which previously sheared the image and eventually
         * wrote in front of pRGB).
         */
        const uint8 *src1 = pY;
        const uint8 *src2 = pY + ImgWidth;
        uint8 *dst1 = pRGB + dst_stride * (ImgHeight - 1 - 2 * pair);
        uint8 *dst2 = dst1 - dst_stride;

        /* One 2x2 block per iteration: four luma samples share one U/V pair. */
        for (x = 0; x < ImgWidth2; x++) {
            const int32 u = pU[x];
            const int32 v = pV[x];
            const int32 b_u = B_U_tab[u];
            const int32 g_uv = G_U_tab[u] + G_V_tab[v];
            const int32 r_v = R_V_tab[v];
            int32 rgb_y;

            /* upper-left pixel */
            rgb_y = RGB_Y_tab[src1[0]];
            dst1[0] = clip[(rgb_y + b_u) >> SCALEBITS_OUT];
            dst1[1] = clip[(rgb_y - g_uv) >> SCALEBITS_OUT];
            dst1[2] = clip[(rgb_y + r_v) >> SCALEBITS_OUT];

            /* upper-right pixel */
            rgb_y = RGB_Y_tab[src1[1]];
            dst1[3] = clip[(rgb_y + b_u) >> SCALEBITS_OUT];
            dst1[4] = clip[(rgb_y - g_uv) >> SCALEBITS_OUT];
            dst1[5] = clip[(rgb_y + r_v) >> SCALEBITS_OUT];

            /* lower-left pixel */
            rgb_y = RGB_Y_tab[src2[0]];
            dst2[0] = clip[(rgb_y + b_u) >> SCALEBITS_OUT];
            dst2[1] = clip[(rgb_y - g_uv) >> SCALEBITS_OUT];
            dst2[2] = clip[(rgb_y + r_v) >> SCALEBITS_OUT];

            /* lower-right pixel */
            rgb_y = RGB_Y_tab[src2[1]];
            dst2[3] = clip[(rgb_y + b_u) >> SCALEBITS_OUT];
            dst2[4] = clip[(rgb_y - g_uv) >> SCALEBITS_OUT];
            dst2[5] = clip[(rgb_y + r_v) >> SCALEBITS_OUT];

            src1 += 2;
            src2 += 2;
            dst1 += 6;
            dst2 += 6;
        }

        /* Advance the sources by two luma rows and one chroma row. */
        pY += 2 * ImgWidth;
        pU += ImgWidth2;
        pV += ImgWidth2;
    }
}