; file firlib\src\fir_sp_asm_r14.s
; brief FIR filtering function in C6600 assembler without optimization
; (single precision IEEE754)
; Advice for developing an assembly procedure:
; - step 1 : procedure call and return
; - step 2 : check loop exit conditions and iteration counts
; - step 3 : check data loading and storing with complex index generation
; - step 4 : algorithm implementation
; - step 5 : during test and validation, check boundary values
; author 
; date

	.global fir_sp_asm_r4


save_context	.macro	rsp
				; Save the core working registers on the stack, addressing the
				; stack through the scratch register 'rsp' instead of B15 itself.
				; cf. SPRU187V Optimizing Compiler
				; Chapter 7.3 Register Conventions, table 7.2
				;
				; Seven doublewords are written, starting at the address held in
				; B15 on entry and moving towards lower addresses, in this order:
				;   B15:B14, B13:B12, B11:B10, A15:A14, A13:A12, A11:A10, ILC:RILC
				;
				; On exit:
				;   rsp = one doubleword below the ILC:RILC slot
				;   B15 = ILC and B14 = RILC (their entry values are only on the stack)
				;
				; NOTE(review): B15 is never decremented, so the saved area sits
				; below the caller's stack pointer - presumably nothing else may
				; push on the stack while this context is live; confirm.
				MV				B15,rsp 	; rsp = stack pointer value on entry
				STDW			B15:B14,*rsp--[1]
				STDW			B13:B12,*rsp--[1]
				STDW			B11:B10,*rsp--[1]
				STDW			A15:A14,*rsp--[1]
				STDW			A13:A12,*rsp--[1]
				STDW			A11:A10,*rsp--[1]
				; ILC and RILC are control registers: copy them into B15:B14
				; (already saved above) so that they can be stored as one pair
				MVC				ILC,B15
				MVC				RILC,B14
				STDW			B15:B14,*rsp--[1]				
				; rsp must stay untouched by the calling procedure:
				; restore_context walks back up the stack from its current value
				.endm

restore_context	.macro	rsp
				; Restore the core working registers saved by save_context.
				; rsp must still hold the value save_context left in it
				; (one doubleword below the ILC:RILC slot).
				;
				; The pairs are reloaded in the reverse order of saving:
				;   ILC:RILC, A11:A10, A13:A12, A15:A14, B11:B10, B13:B12, B15:B14
				; B15:B14 are used as temporaries for ILC:RILC first, then receive
				; their own saved values with the last load.
				LDDW			*++rsp[1],B15:B14	; B15 = saved ILC, B14 = saved RILC
				NOP				4			; LDDW has 4 delay slots : B15:B14 are
											; not valid before this wait completes
				MVC				B14,RILC
				MVC				B15,ILC
				LDDW			*++rsp[1],A11:A10
				LDDW			*++rsp[1],A13:A12
				LDDW			*++rsp[1],A15:A14
				LDDW			*++rsp[1],B11:B10
				LDDW			*++rsp[1],B13:B12
				LDDW			*++rsp[1],B15:B14
				MV				rsp,B15 	; rsp is back at the entry stack pointer
				NOP				3			; remaining delay slots of the last LDDW
				.endm
				
	
; prototype :				parameters registers :
;
; void fir_sp_asm_r4 (	const float * restrict xk, 	-> A4
;						const float * restrict a,	-> B4
;						float * restrict yk, 		-> A6
;						int na,						-> B6
;						int nyk);					-> A8
;
; computes, for i = 0 .. nyk-1 :
;	yk[i] = a[0]*xk[i] + a[1]*xk[i+1] + ... + a[na-1]*xk[i+na-1]
; Four taps are processed per inner loop pass, in four partial sums
; that are added together once the inner loop is finished.
;
; requirements :
; - na is a strictly positive multiple of 4 : the inner counter steps
;   by 4 and the loop only stops when it reaches exactly 0
; - a is 8-byte aligned (read with LDDW); xk is read with LDNDW
; - xk holds at least nyk+na-1 samples
; - nothing is computed or written when nyk <= 0
;
; registers :
; A0 = remaining outputs			A1 = remaining taps (also guard flag)
; A15 = current xk read pointer		B5 = current a read pointer
; A25:A24, B25:B24 = the four partial sums
; A3 = stack cursor owned by save_context / restore_context
fir_sp_asm_r4:
				; save core registers context
				save_context	A3

				; leave at once when there is no output to produce
				CMPLT			0,A8,A1		; A1 = (0 < nyk)
		[!A1]	B				fir_sp_asm_r4_end
				NOP				5

				;int i, j;
				MV				A8,A0		; A0 = nyk outputs still to compute

L1:				; input array loop
				;for (i=0; i<nyk; i++) {
				;	tmp = 0;
				ZERO			A25:A24
				ZERO			B25:B24
				MV				B6,A1		; A1 = na taps still to process
				MV				A4,A15		; A15 = &xk[i]
				MV				B4,B5		; B5 = &a[0] (B15 is the stack pointer,
											; so it is not used as a data pointer)

L2:				; FIR filter algorithm - dot product
				;for (j=0; j<na; j+=4){
				; a0 = a[j];
				; a1 = a[j+1];
				; a2 = a[j+2];
				; a3 = a[j+3];
				LDDW	.D2		*B5++,B17:B16 	; [a1 a0]
				LDDW	.D2		*B5++,B19:B18 	; [a3 a2]
				NOP				4				; load delay slots
				; rq : B19:B18:B17:B16 = [a3 a2 a1 a0]

				; xk0 = xk[j+i];
				; xk1 = xk[j+i+1];
				; xk2 = xk[j+i+2];
				; xk3 = xk[j+i+3];
				LDNDW	.D1		*A15++,A17:A16 	; [xk1 xk0]
				LDNDW	.D1		*A15++,A19:A18 	; [xk3 xk2]
				NOP				4				; load delay slots

				; ... a0*xk0;
				; ... a1*xk1;
				; ... a2*xk2;
				; ... a3*xk3;
				DMPYSP	.M1x	A17:A16,B17:B16,A21:A20 ; [a1.xk1 a0.xk0]
	||			DMPYSP	.M2x	A19:A18,B19:B18,B21:B20 ; [a3.xk3 a2.xk2]
				NOP				3				; multiply delay slots

				; acc0 += ...
				; acc1 += ...
				; acc2 += ...
				; acc3 += ...
				DADDSP	.L1		A21:A20,A25:A24,A25:A24	 ; [0+a1.xk1+... 0+a0.xk0+...]
	||			DADDSP	.L2		B21:B20,B25:B24,B25:B24	 ; [0+a3.xk3+... 0+a2.xk2+...]
				NOP				2				; add delay slots

		[A1] 	SUB				A1,4,A1			; four taps done
		[A1] 	B				L2				; loop while taps remain
				NOP				5
				; end of FIR filter algorithm - dot product

				; fold the four partial sums into a single value
				DADDSP			A25:A24, B25:B24, A27:A26 ; [a1.xk1+a3.xk3+... a0.xk0+a2.xk2+...]
				NOP				2
				FADDSP			A27,A26,A5 ; [a0.xk0+a1.xk1+a2.xk2+a3.xk3+...]
				NOP				2
				STW				A5,*A6++	;yk(i) = tmp;
				ADD				A4,4,A4  	;i+j (float -> 4b)

				; one output done : loop only while outputs remain, so that
				; exactly nyk values are written to yk
				SUB				A0,1,A0
		[A0]	B				L1
				NOP				5
				; end of input array loop

fir_sp_asm_r4_end:
				; restore core registers context and leave procedure
				restore_context	A3

				B				B3
				NOP				5
				; end of fir_sp_asm procedure