/* * file test\src\main.c * brief conformity and performance tests for FIR algorithms library * author hugo descoubes * date mixing 09/2012 and 09/2021 */ #include #include /* arrays allocations (bytes) : * xk_sp (DDR) |------------------------- 256Kb or 4Mb --------------------------| * xk_sp_l2 |------------- 128Kb + 256b - 4 -------------| overlap * xk_sp_l1d |--- 8Kb + 256b - 4 ---| overlap * a_sp_l1d |- 256b -| * yk_sp_l1d |----- 8Kb -----| * yk_sp_l2 |------------- 128Kb -------------| * yk_sp (DDR) |------------------- (256Kb or 4Mb) - 256b + 4 -------------------| */ float32_t xk_sp[XK_LENGTH]; /* input vector */ float32_t yk_sp_ti[YK_LENGTH]; /* output vector for TI reference algorithm */ float32_t yk_sp[YK_LENGTH]; /* output vector for algorithm to optimize */ /* input block in .l2sram, L2_ARRAY_LENGTH samples plus A_LENGTH - 1 extra samples */ float32_t xk_sp_l2[L2_ARRAY_LENGTH + A_LENGTH - 1]; /* output block in .l2sram */ float32_t yk_sp_l2[L2_ARRAY_LENGTH]; /* input block in .l1dsram, L1D_ARRAY_LENGTH samples plus A_LENGTH - 1 extra samples */ float32_t xk_sp_l1d[L1D_ARRAY_LENGTH + A_LENGTH - 1]; /* output block in .l1dsram */ float32_t yk_sp_l1d[L1D_ARRAY_LENGTH]; /* A_LENGTH-element array in .l1dsram - presumably the FIR coefficients, confirm against callers */ float32_t a_sp_l1d[A_LENGTH]; /* memory segmentation */ #pragma DATA_SECTION(xk_sp,".ddrsdram"); #pragma DATA_SECTION(yk_sp_ti,".ddrsdram"); #pragma DATA_SECTION(yk_sp,".ddrsdram"); #pragma DATA_SECTION(xk_sp_l2,".l2sram"); #pragma DATA_SECTION(yk_sp_l2,".l2sram"); #pragma DATA_SECTION(xk_sp_l1d,".l1dsram"); #pragma DATA_SECTION(yk_sp_l1d,".l1dsram"); #pragma DATA_SECTION(a_sp_l1d,".l1dsram"); /* arrays alignments - CPU data path length 64bits - NOTE(review): a_sp is aligned below but is not declared in the visible part of this file; confirm it is defined elsewhere */ #pragma DATA_ALIGN(xk_sp, 8); #pragma DATA_ALIGN(a_sp, 8); #pragma DATA_ALIGN(yk_sp_ti, 8); #pragma DATA_ALIGN(yk_sp, 8); #pragma DATA_ALIGN(xk_sp_l2, 8); #pragma DATA_ALIGN(yk_sp_l2, 8); #pragma DATA_ALIGN(xk_sp_l1d, 8); #pragma DATA_ALIGN(yk_sp_l1d, 8); #pragma DATA_ALIGN(a_sp_l1d, 8); void main(void) { Boolean validity; TestSystem_obj conformity; TestPerf_obj benchmark; int i; firtest_init(&conformity, &benchmark, xk_sp, XK_LENGTH); printf( "\nFIR algorithms benchmarking on TI TMS320C6678 DSP architecture" "\ntesting conditions :" "\n--> input vector size : %d samples" "\n--> 
floating MAC's per cycle max : 8" "\n--> repetitions for average calculation : %d" "\n--> tolerated error : less than %3.1f%%\n", XK_LENGTH, benchmark.perf_rep, conformity.error_margin); /* FIR algorithm from TI optimized DSPLIB for C6600 DSP */ firtest_perf (&benchmark, UMA_L2CACHE_L1DCACHE, yk_sp_ti, &DSPF_sp_fir_gen); printf( "\n--> FIR algorithm from TI optimized DSPLIB for C6600 DSP - DSPF_sp_fir_gen, dsplib_c66x_3_1_0_0" "\nmemory model : full cacheability" "\nperformance : %d cycles, %3.3f ms, %1.2f MAC's per cycle\n" , benchmark.perf_nbcycles , benchmark.perf_usertime_ms , benchmark.perf_macs); fflush(stdout); /* FIR algorithm in canonical c */ #if ( TEST_FIR_SP != 0 ) printf( "\n--> FIR algorithm in canonical c " "\nmemory model : full cacheability"); for (i=0; i FIR algorithm in canonical C6600 asm" "\nmemory model : full cacheability"); for (i=0; i FIR algorithm in C6600 asm vliw" "\nmemory model : full cacheability"); for (i=0; i FIR algorithm in C6600 asm pipelining software" "\nmemory model : full cacheability"); for (i=0; i FIR algorithm in C6600 vectorial asm radix 4" "\nmemory model : full cacheability"); for (i=0; i FIR algorithm in c canonical unrolling radix 4" "\nmemory model : full cacheability"); for (i=0; i FIR algorithm in c vectorized intrinsics radix 4" "\nmemory model : full cacheability"); for (i=0; i FIR algorithm in c vectorized intrinsics radix 4" "\nmemory model : L2SRAM / L1D Cache"); for (i=0; i FIR algorithm in c vectorized intrinsics radix 4" "\nmemory model : L2SRAM / L1D SRAM"); for (i=0; i FIR algorithm in c vectorized intrinsics radix 4" "\nmemory model : L2SRAM / L1D SRAM - IDMA"); for (i=0; i