; file firlib\src\fir_sp_asm_r14.s
; brief FIR filtering function in C6600 assembler without optimization
; (single precision IEEE754)
; Advice for developing an assembler procedure:
; - step 1 : procedure call and return
; - step 2 : check loop exit conditions and number of iterations
; - step 3 : check data loading and storage with complex index generation
; - step 4 : algorithm implementation
; - step 5 : during test and validation, check boundary values
; author 
; date

	.global fir_sp_asm_r14

; save_context rsp
;   Pushes B15:B14, B13:B12, B11:B10, A15:A14, A13:A12, A11:A10 and then
;   ILC:RILC as seven doublewords, starting at the address held in B15 on
;   entry and moving downwards through the register given as `rsp`.
;   After the macro:
;     - rsp points one doubleword below the last stored pair,
;     - B15 holds a copy of ILC and B14 a copy of RILC (their original
;       values are only available from the saved area),
;     - B15 itself has never been decremented.
;   restore_context pops the pairs in exactly the reverse order, so the
;   order of the stores below must not be changed.
;   NOTE(review): because B15 is overwritten and the saved area is not
;   reserved by moving SP, this presumably relies on no interrupt or call
;   using the stack until restore_context has run - confirm with the caller.
save_context	.macro	rsp
				; save core working registers context on the top of stack
				; cf. SPRU187V Optimizing Compiler
				; Chapter 7.3 Register Conventions, table 7.2
				MV				B15,rsp 	; rsp = current Stack Pointer (B15)
				STDW			B15:B14,*rsp--[1]	; store at *rsp, then rsp moves down one doubleword
				STDW			B13:B12,*rsp--[1]
				STDW			B11:B10,*rsp--[1]
				STDW			A15:A14,*rsp--[1]
				STDW			A13:A12,*rsp--[1]
				STDW			A11:A10,*rsp--[1]
				MVC				ILC,B15		; B15 no longer holds the stack pointer from here on
				MVC				RILC,B14
				STDW			B15:B14,*rsp--[1]				
				; do not use rsp register in current ASM procedure 
				.endm

; restore_context rsp
;   Pops, in the reverse order of save_context, the seven doublewords that
;   save_context stored through `rsp`: ILC:RILC first, then A11:A10,
;   A13:A12, A15:A14, B11:B10, B13:B12 and finally B15:B14.
;   `rsp` must still hold the value left by save_context.
;   On exit every load has completed, rsp is back at the address it was
;   given at the start of save_context and B15 holds that same address.
restore_context	.macro	rsp
				; restore core working registers context from the top of stack
				LDDW			*++rsp[1],B15:B14	; B15:B14 <= saved ILC:RILC
				NOP				4					; LDDW has 4 delay slots: B15:B14 are
													; not valid before this point, so the
													; MVCs below must not be issued earlier
				MVC				B14,RILC
				MVC				B15,ILC
				LDDW			*++rsp[1],A11:A10
				LDDW			*++rsp[1],A13:A12
				LDDW			*++rsp[1],A15:A14
				LDDW			*++rsp[1],B11:B10
				LDDW			*++rsp[1],B13:B12
				LDDW			*++rsp[1],B15:B14
				MV				rsp,B15 	; restore Stack Pointer (same value the last LDDW delivers)
				NOP				3			; MV + NOP 3 = 4 delay slots of the last LDDW
				.endm	

; prototype :				parameters registers :
;
; void fir_sp_asm_r14 (	const float * restrict xk, 	-> A4
;						const float * restrict a,	-> B4
;						float * restrict yk, 		-> A6
;						int na,						-> B6
;						int nyk);					-> A8

; ---------------------------------------------------------------------------
; fir_sp_asm_r14 - single precision FIR filter, hand scheduled, no pipelining
;
;	for (i = 0; i < nyk; i++) {
;		tmp = 0;
;		for (j = 0; j < na; j++)
;			tmp += a[j] * xk[i+j];
;		yk[i] = tmp;
;	}
;
; In      : A4 = xk, B4 = a, A6 = yk, B6 = na, A8 = nyk
; Out     : yk[0 .. nyk-1] written; nothing is written when nyk <= 0,
;           zeros are written when na <= 0
; Memory  : a is read with LDDW and must therefore be 8-byte aligned;
;           xk is read with LDNDW (non-aligned doubleword load);
;           xk[0 .. nyk+na-2] and a[0 .. na-1] are read
; Taps    : processed four at a time into four partial sums, the remaining
;           na % 4 taps are added one by one afterwards
; Scratch : A1, A2, A4, A6, A16-A21, B0-B2, B5, B7, B16-B19, B22-B23
;           A10-A15 and B14/B15 are used as well but are saved and restored
;           by save_context / restore_context; A3 is reserved for them
; NOTE(review): between save_context and restore_context B15 (stack pointer)
; and B14 hold data, so an interrupt taken in that window would presumably
; run with an invalid stack pointer - confirm interrupts are masked by the
; caller.
; ---------------------------------------------------------------------------
fir_sp_asm_r14:
				; save core registers context
				save_context	A3

				; init
				MV				A8, B0		; B0  <=> outputs still to compute
				CMPLT			0, B0, B1	; B1  = (nyk > 0)
		[!B1]	B				fir_sp_asm_r14_exit	; nothing to do, but the context must be restored
				NOP				5

				CMPLT			0, B6, B2	; B2  = (na > 0)
				SHR				B6, 2, B5	; B5  <=> na / 4 : groups of 4 taps
				AND				3, B6, B7	; B7  <=> na % 4 : remaining taps
		[!B2]	MVK				0, B5		; na <= 0 : no tap at all
		[!B2]	MVK				0, B7

fir_sp_asm_l1:	; input array loop, entered only with B0 > 0
				;for (i=0; i<nyk; i++) {
				;	tmp = 0;
				ZERO			A10			; A10 <=> acc0
				ZERO			A11			; A11 <=> acc1
				ZERO			A12			; A12 <=> acc2
				ZERO			A13			; A13 <=> acc3
				MV				A4, A19		; A4  <=> &xk[i]   ; A19 <=> &xk[i+j]
				MV				B4, B19		; B4  <=> a        ; B19 <=> &a[j]
				MV				B5, A1		; A1  <=> groups of 4 taps left
		[!A1]	B				fir_sp_asm_r14_l2_end	; fewer than 4 taps : skip the unrolled loop
				NOP				5

fir_sp_asm_l2:	; FIR filter algorithm - dot product, 4 taps per iteration
				;for (j=0; j<na; j++){
				;	tmp += a[j]*xk[i+j];
				LDNDW			*A19++, A15:A14				; A15:A14 <=> xk[i+j+1]:xk[i+j]
				LDDW			*B19++, B15:B14				; B15:B14 <=> a[j+1]:a[j]
				NOP				4
				LDNDW			*A19++, A17:A16				; A17:A16 <=> xk[i+j+3]:xk[i+j+2]
				LDDW			*B19++, B17:B16				; B17:B16 <=> a[j+3]:a[j+2]
				NOP				4
				; cross-path operand written as src2 in both multiplies
				DMPYSP			A15:A14, B15:B14, A21:A20	; A21:A20 <=> xk[i+j+1] * a[j+1]   : xk[i+j  ] * a[j]
		||		DMPYSP			B17:B16, A17:A16, B23:B22	; B23:B22 <=> a[j+3] * xk[i+j+3]   : a[j+2] * xk[i+j+2]
				NOP				3
				DADDSP			A21:A20, A11:A10, A11:A10	; acc1 += ...   :   acc0 += ...
		||		DADDSP			A13:A12, B23:B22, A13:A12	; acc3 += ...   :   acc2 += ...
				NOP				2
				SUB				A1, 1, A1					; one group of 4 taps done
		[A1] 	B				fir_sp_asm_l2
				NOP				5
				; end of FIR filter algorithm - dot product

fir_sp_asm_r14_l2_end:
				DADDSP			A13:A12, A11:A10, A11:A10	; acc1 = acc3 + acc1   :   acc0 = acc2 + acc0
				NOP				2
				FADDSP			A11, A10, A10				; acc_tmp = acc0 + acc1
				NOP				2

				MV				B7, A2						; A2  <=> remaining taps (0..3)
		[!A2]	B				fir_sp_asm_r14_l3_end
				NOP				5

fir_sp_asm_r14_l3:	; remaining na % 4 taps, one per iteration
				LDW				*A19++, A16					; A16 <=> xk[i+j]
				LDW				*B19++, B16					; B16 <=> a[j]
				NOP				4
				MPYSP			A16, B16, A20				; A20 <=> xk[i+j] * a[j]
				NOP				3
				FADDSP			A20, A10, A10				; acc_tmp += ...
				NOP				2
				SUB				A2, 1, A2
		[A2]	B				fir_sp_asm_r14_l3
				NOP				5

fir_sp_asm_r14_l3_end:
				STW				A10, *A6++					; yk[i] = acc_tmp;
				ADD				A4, 4, A4  					; &xk[i+1] (float -> 4 bytes)
				SUB				B0, 1, B0
		[B0] 	B				fir_sp_asm_l1
				NOP				5
				; end of input array loop

fir_sp_asm_r14_exit:
				; restore core registers context and leave procedure
				restore_context	A3

				B				B3
				NOP				5
				; end of fir_sp_asm_r14 procedure