/**
* @file test/src/firtest_perf.c
* @brief performance test for the FIR algorithms library
* @author hugo descoubes
*/


#include <firtest.h>


/**
 * Measure the average execution time of a FIR routine for one memory model.
 *
 * The measurement is repeated benchmark->perf_rep times. For each repetition
 * the cache sizes matching memoryModel are programmed, the time stamp counter
 * (TSC) is read before and after the processing, and the elapsed count is
 * accumulated. Depending on the memory model the timed region contains either
 * a single call on the whole input array, or a chunked processing where the
 * input is copied piece by piece into the xk_sp_l2 (and xk_sp_l1d) work
 * buffers and the results are copied back to output.
 *
 * Results written to benchmark:
 *   - perf_nbcycles    : accumulated TSC count divided by perf_rep
 *   - perf_usertime_ms : perf_nbcycles * CPU_CLOCK_MS
 *   - perf_macs        : FIR_NB_MACS / perf_nbcycles (0 when nothing was measured)
 *
 * If perf_rep is 0, or if memoryModel matches none of the compiled-in
 * branches, nothing is measured and the results are reported as 0 instead of
 * dividing by zero.
 *
 * @param benchmark   performance descriptor; perf_rep is read, the perf_*
 *                    result fields are written
 * @param memoryModel memory model selector (UMA_* constant)
 * @param output      destination array for the filtered samples
 * @param fir_fct     FIR routine under test, called as
 *                    fir_fct(input, coefficients, output, A_LENGTH, nb_outputs)
 */
void firtest_perf(	TestPerf_obj *benchmark,
					uint8_t memoryModel,
					float * restrict output,
					void (*fir_fct) (	const float * restrict,
										const float * restrict,
										float * restrict,
										int,
										int) ) {

	/* volatile kept on the time stamps so that the TSC reads stay around the measured code */
	volatile CSL_Uint64 start=0, stop=0, duration=0;
	volatile int32_t i, j, k;

	/* no repetition requested: nothing can be measured and the averaging
	 * division at the end of the function would divide by zero */
	if ( benchmark->perf_rep == 0 ) {
		benchmark->perf_nbcycles = 0;
		benchmark->perf_usertime_ms = 0.0f;
		benchmark->perf_macs = 0.0f;
		return;
	}

	/* fix number of repetitions */
	for (k=0; k < benchmark->perf_rep; k++) {

		if ( memoryModel == UMA_L2CACHE_L1DCACHE ) {
			/* caches levels initializations */
			CACHE_setL2Size(CACHE_256KCACHE);
			CACHE_setL1DSize(CACHE_L1_32KCACHE);
			CACHE_setL1PSize(CACHE_L1_32KCACHE);

			/* read current TSC value */
			start = CSL_tscRead ();

			/* call fir algorithm for performance test - whole array in one call */
			(*fir_fct) (xk_sp, a_sp, output, A_LENGTH, YK_LENGTH);

			/* processing time calculation */
			stop = CSL_tscRead ();
			duration += stop-start;
		}

#if ( TEST_FIR_L2SRAM_L1DCACHE != 0 )
		else if ( memoryModel == UMA_L2SRAM_L1DCACHE ) {
			/* caches levels initializations */
			CACHE_setL2Size(CACHE_32KCACHE);
			CACHE_setL1DSize(CACHE_L1_32KCACHE);
			CACHE_setL1PSize(CACHE_L1_32KCACHE);

			/* read current TSC value - memory copies are part of the measure */
			start = CSL_tscRead ();

			/* process input array by chunks of L2_ARRAY_LENGTH samples */
			for(i=0; i<DDR_ARRAY_LENGTH; i+=L2_ARRAY_LENGTH){

				/* memory copy of the input chunk to the L2 work buffer :
				 * every chunk but the last one also takes the A_LENGTH-1
				 * following samples needed by the last outputs of the chunk */
				if( i < (DDR_ARRAY_LENGTH - L2_ARRAY_LENGTH) ){
					memcpy(xk_sp_l2, xk_sp + i, (L2_ARRAY_LENGTH + A_LENGTH - 1)*sizeof(float32_t));
				}else{
					memcpy(xk_sp_l2, xk_sp + i, L2_ARRAY_LENGTH*sizeof(float32_t));
				}

				/* call fir algorithm for performance test */
				(*fir_fct) (xk_sp_l2, a_sp, yk_sp_l2, A_LENGTH, L2_ARRAY_LENGTH);

				/* memory copy of the results from the L2 work buffer to output :
				 * only L2_ARRAY_LENGTH-A_LENGTH+1 results are kept for the last chunk */
				if( i < (YK_LENGTH - L2_ARRAY_LENGTH) ){
					memcpy(output + i, yk_sp_l2, L2_ARRAY_LENGTH*sizeof(float32_t));
				}else{
					memcpy(output + i, yk_sp_l2, (L2_ARRAY_LENGTH - A_LENGTH + 1)*sizeof(float32_t));
				}
			}

			/* processing time calculation */
			stop = CSL_tscRead ();
			duration += stop-start;

		}
#endif

#if ( TEST_FIR_L2SRAM_L1DSRAM != 0 )
		else if ( memoryModel == UMA_L2SRAM_L1DSRAM ) {
			/* caches levels initializations */
			CACHE_setL2Size(CACHE_32KCACHE);
			CACHE_setL1DSize(CACHE_L1_4KCACHE);
			CACHE_setL1PSize(CACHE_L1_32KCACHE);

			/* prepare coefficients in L1D SRAM - done outside of the measure */
			memcpy(a_sp_l1d, a_sp, A_LENGTH*sizeof(float32_t));

			/* read current TSC value - memory copies are part of the measure */
			start = CSL_tscRead ();

			/* process input array by chunks of L2_ARRAY_LENGTH samples */
			for(i=0; i<DDR_ARRAY_LENGTH; i+=L2_ARRAY_LENGTH){

				/* memory copy of the input chunk to the L2 work buffer :
				 * every chunk but the last one also takes the A_LENGTH-1
				 * following samples needed by the last outputs of the chunk */
				if( i < (DDR_ARRAY_LENGTH - L2_ARRAY_LENGTH) ){
					memcpy(xk_sp_l2, xk_sp + i, (L2_ARRAY_LENGTH + A_LENGTH - 1)*sizeof(float32_t));
				}else{
					memcpy(xk_sp_l2, xk_sp + i, L2_ARRAY_LENGTH*sizeof(float32_t));
				}

				/* process L2 chunk by sub-chunks of L1D_ARRAY_LENGTH samples */
				for(j=0; j<L2_ARRAY_LENGTH; j+=L1D_ARRAY_LENGTH ){

					/* memory copy from L2 to L1D */
					memcpy(xk_sp_l1d, xk_sp_l2 + j, (L1D_ARRAY_LENGTH + A_LENGTH - 1)*sizeof(float32_t));

					/* call fir algorithm for performance test */
					(*fir_fct) (xk_sp_l1d, a_sp_l1d, yk_sp_l1d, A_LENGTH, L1D_ARRAY_LENGTH);

					/* memory copy from L1D to L2 - coherency of output L2 array*/
					memcpy(yk_sp_l2 + j, yk_sp_l1d, L1D_ARRAY_LENGTH*sizeof(float32_t));
				}

				/* memory copy of the results from the L2 work buffer to output :
				 * only L2_ARRAY_LENGTH-A_LENGTH+1 results are kept for the last chunk */
				if( i < (YK_LENGTH - L2_ARRAY_LENGTH) ){
					memcpy(output + i, yk_sp_l2, L2_ARRAY_LENGTH*sizeof(float32_t));
				}else{
					memcpy(output + i, yk_sp_l2, (L2_ARRAY_LENGTH - A_LENGTH + 1)*sizeof(float32_t));
				}
			}

			/* processing time calculation */
			stop = CSL_tscRead ();
			duration += stop-start;
		}
#endif

#if ( TEST_FIR_L2SRAM_L1DIDMA != 0 )
		/* NOTE(review): this branch tests the same UMA_L2SRAM_L1DSRAM value as the
		 * TEST_FIR_L2SRAM_L1DSRAM branch, so it can never be reached when both
		 * compile flags are enabled - confirm whether a dedicated IDMA memory
		 * model constant was intended here */
		else if ( memoryModel == UMA_L2SRAM_L1DSRAM ) {
			/* caches levels initializations */
			CACHE_setL2Size(CACHE_32KCACHE);
			CACHE_setL1DSize(CACHE_L1_4KCACHE);
			CACHE_setL1PSize(CACHE_L1_32KCACHE);

			/* prepare coefficients in L1D SRAM - done outside of the measure */
			memcpy(a_sp_l1d, a_sp, A_LENGTH*sizeof(float32_t));

			/* read current TSC value - memory copies are part of the measure */
			start = CSL_tscRead ();

			/* process input array by chunks of L2_ARRAY_LENGTH samples */
			for(i=0; i<DDR_ARRAY_LENGTH; i+=L2_ARRAY_LENGTH){

				/* memory copy of the input chunk to the L2 work buffer :
				 * every chunk but the last one also takes the A_LENGTH-1
				 * following samples needed by the last outputs of the chunk */
				if( i < (DDR_ARRAY_LENGTH - L2_ARRAY_LENGTH) ){
					memcpy(xk_sp_l2, xk_sp + i, (L2_ARRAY_LENGTH + A_LENGTH - 1)*sizeof(float32_t));
				}else{
					memcpy(xk_sp_l2, xk_sp + i, L2_ARRAY_LENGTH*sizeof(float32_t));
				}

				/* process L2 chunk by sub-chunks of L1D_ARRAY_LENGTH samples */
				for(j=0; j<L2_ARRAY_LENGTH; j+=L1D_ARRAY_LENGTH ){

					/* memory copy from L2 to L1D */
					idmacpy(xk_sp_l1d, xk_sp_l2 + j, (L1D_ARRAY_LENGTH + A_LENGTH - 1)*sizeof(float32_t));

					/* call fir algorithm for performance test - use the
					 * coefficients copy prepared in a_sp_l1d above, like
					 * the memcpy based L1D SRAM branch does */
					(*fir_fct) (xk_sp_l1d, a_sp_l1d, yk_sp_l1d, A_LENGTH, L1D_ARRAY_LENGTH);

					/* memory copy from L1D to L2 - coherency of output L2 array*/
					idmacpy(yk_sp_l2 + j, yk_sp_l1d, L1D_ARRAY_LENGTH*sizeof(float32_t));
				}

				/* memory copy of the results from the L2 work buffer to output :
				 * only L2_ARRAY_LENGTH-A_LENGTH+1 results are kept for the last chunk */
				if( i < (YK_LENGTH - L2_ARRAY_LENGTH) ){
					memcpy(output + i, yk_sp_l2, L2_ARRAY_LENGTH*sizeof(float32_t));
				}else{
					memcpy(output + i, yk_sp_l2, (L2_ARRAY_LENGTH - A_LENGTH + 1)*sizeof(float32_t));
				}
			}

			/* processing time calculation */
			stop = CSL_tscRead ();
			duration += stop-start;
		}
#endif

	} /* end for - repetitions */

	/* average over the repetitions */
	duration /= benchmark->perf_rep;
	benchmark->perf_nbcycles = duration;
	benchmark->perf_usertime_ms = (float32_t) duration * CPU_CLOCK_MS;

	/* duration stays 0 when no branch matched memoryModel : report 0
	 * instead of dividing by zero */
	if ( duration != 0 ) {
		benchmark->perf_macs = ((float32_t) FIR_NB_MACS) / ((float32_t) duration);
	}else{
		benchmark->perf_macs = 0.0f;
	}
}

