Index: linux/arch/arm/fastfpe/entry.S =================================================================== --- pre/linux/arch/arm/fastfpe/entry.S (revision 26) +++ post/linux/arch/arm/fastfpe/entry.S (working copy) @@ -9,6 +9,8 @@ */ +#include + /*---------------------------------------------------------------------------*/ .data @@ -79,19 +81,16 @@ .globl fastfpe_next fastfpe_next: ldr r5,[r13,#60] -next_after_cond: +next_after_cond_false: __x1: ldrt r4,[r5],#4 ldr r0,=fp_cond @ check condition of next instruction - ldr r1,[r13,#64] @ psr containing flags mov r2,r4,lsr#28 - mov r1,r1,lsr#28 - ldr r0,[r0,r2,lsl#2] - mov r0,r0,lsr r1 - tst r0,#1 - beq next_after_cond @ must not necessarily have been an - @ FP instruction ! + cmp r2,#0xe @ "always" condition code + bne next_check_cond + +next_check_copro: and r1,r4,#0x0f000000 @ Test for copro instruction cmp r1,#0x0c000000 rsbgts r0,r1,#0x0e000000 @ cmpgt #0x0e000000,r1 @@ -102,8 +101,20 @@ movge pc,r9 @ copro = 0 or >=3, return str r5,[r13,#60] @ save updated pc - b next_enter + cmp r1,#1<<8 @ which copro ? + beq copro_1 + b copro_2 +next_check_cond: + ldr r1,[r13,#64] @ psr containing flags + ldr r0,[r0,r2,lsl#2] + mov r1,r1,lsr#28 + mov r0,r0,lsr r1 + tst r0,#1 + bne next_check_copro + b next_after_cond_false @ must not necessarily have been an + @ FP instruction ! + /*---------------------------------------------------------------------------*/ undefined: @@ -130,24 +141,23 @@ and r0,r4,#0x00007000 @ r0=fp register number << 12 add r0,r10,r0,lsr#8 @ r0=address of fp register - mov r1,#0 - tst r4,#0x00008000 - orrne r1,r1,#1 @ T0 + + and r1,r4,#0x00008000 @ T0 tst r4,#0x00400000 - orrne r1,r1,#2 @ T1 + orrne r1,r1,#0x00010000 @ T1 tst r4,#0x00100000 - orrne r1,r1,#4 @ L/S + orrne r1,r1,#0x00020000 @ L/S - add pc,pc,r1,lsl#2 - mov r0,r0 - b CPDT_store_single @ these functions get - b CPDT_store_double @ r0=address of fp register - b CPDT_store_extended @ r6=address of data - b undefined @ CPDT_store_decimal - b CPDT_load_single - b CPDT_load_double - b CPDT_load_extended - b undefined @ CPDT_load_decimal + ldr pc,[pc,r1,lsr#13] + .word 0 + .word CPDT_store_single @ these functions get + .word CPDT_store_double @ r0=address of fp register + .word CPDT_store_extended @ r6=address of data + .word undefined @ CPDT_store_decimal + .word CPDT_load_single + .word CPDT_load_double + .word CPDT_load_extended + .word undefined @ CPDT_load_decimal /*---------------------------------------------------------------------------*/ @@ -194,45 +204,51 @@ add r2,r10,r2,lsl#4 @ r2=address of Fm CPDO_constback: + ldr r3,=round_table + and r5,r4,#0x000000e0 + and r6,r4,#0x00080000 + orr r5,r5,r6,lsr#11 @ r5=containing rounding mode/precision + ldr r14,[r3,r5,lsr#3] @ r14=address of rounding function and r3,r4,#0x00f00000 tst r4,#0x00008000 - orrne r3,r3,#0x01000000 - - add pc,pc,r3,lsr#18 - mov r0,r0 - b CPDO_adf - b CPDO_muf - b CPDO_suf - b CPDO_rsf - b CPDO_dvf - b CPDO_rdf - b undefined - b undefined - b undefined @ CPDO_rmf - b CPDO_muf - b CPDO_dvf - b CPDO_rdf - b undefined - b undefined - b undefined - b undefined - b CPDO_mvf - b CPDO_mnf - b CPDO_abs - b CPDO_rnd - b CPDO_sqt - b undefined - b undefined - b undefined - b undefined - b undefined - b undefined - b undefined - b undefined - b undefined - b CPDO_rnd - b fastfpe_next + orrne r3,r3,#0x01000000 @ r3=operation code + ldr pc,[pc,r3,lsr#18] + .word 0 +CPDO_table: + .word CPDO_adf + .word CPDO_muf + .word CPDO_suf + .word CPDO_rsf + .word CPDO_dvf + .word CPDO_rdf + .word undefined + .word 
undefined + .word CPDO_rmf + .word CPDO_muf + .word CPDO_dvf + .word CPDO_rdf + .word undefined + .word undefined + .word undefined + .word undefined + .word CPDO_mvf + .word CPDO_mnf + .word CPDO_abs + .word CPDO_rnd + .word CPDO_sqt + .word undefined + .word undefined + .word undefined + .word undefined + .word undefined + .word undefined + .word undefined + .word undefined + .word undefined + .word CPDO_rnd + .word fastfpe_next + CPDO_const: ldr r2,=fp_const and r3,r4,#0x00000007 @@ -253,30 +269,58 @@ CPRT_constback: and r3,r4,#0x00f00000 - add pc,pc,r3,lsr#18 - mov r0,r0 - b CPRT_flt - b CPRT_fix - b CPRT_wfs - b CPRT_rfs - b undefined - b undefined - b undefined - b undefined - b undefined - b CPRT_cmf - b undefined - b CPRT_cnf - b undefined - b CPRT_cmf - b undefined - b CPRT_cnf + ldr pc,[pc,r3,lsr#18] + .word 0 + .word CPRT_flt + .word CPRT_fix + .word CPRT_wfs + .word CPRT_rfs + .word undefined + .word undefined + .word undefined + .word undefined + .word undefined + .word CPRT_cmf + .word undefined + .word CPRT_cnf + .word undefined + .word CPRT_cmf + .word undefined + .word CPRT_cnf CPRT_const: ldr r2,=fp_const and r3,r4,#0x00000007 add r2,r2,r3,lsl#4 b CPRT_constback + +/*---------------------------------------------------------------------------*/ + + @ Test if long multiply instructions are available + + .globl fastfpe_test +fastfpe_test: + .globl elf_hwcap + ldr r0,=elf_hwcap + ldr r0,[r0] + tst r0,#HWCAP_FAST_MULT + bne fastfpe_has_long_multiply + mov r0,#0 + mov pc,r14 + +fastfpe_has_long_multiply: + adr r0,CPDO_table + ldr r1,=CPDO_muf_M + str r1,[r0,#1*4] @ muf + str r1,[r0,#9*4] @ fml + ldr r1,=CPDO_dvf_M + str r1,[r0,#4*4] @ dvf + str r1,[r0,#10*4] @ fdv + ldr r1,=CPDO_rdf_M + str r1,[r0,#5*4] @ rdf + str r1,[r0,#11*4] @ frd + mov r0,#1 + mov pc,r14 /*---------------------------------------------------------------------------*/ Index: linux/arch/arm/fastfpe/CPDO.S =================================================================== --- pre/linux/arch/arm/fastfpe/CPDO.S (revision 26) +++ post/linux/arch/arm/fastfpe/CPDO.S (working copy) @@ -3,103 +3,679 @@ for the sign in bit 31, the second and third are for the mantissa (unsigned integer, high 32 bit first) and the fourth is the exponent (signed integer). The mantissa is always normalized. - + If the exponent is 0x80000000, that is the most negative value, the number represented is 0 and both mantissa words are also 0. - + If the exponent is 0x7fffffff, that is the biggest positive value, the number -represented is infinity if the high 32 mantissa bit are also 0, otherwise it is -a NaN. The low 32 mantissa bit are 0 if the number represented is infinity. - +represented is infinity if the mantissa is 0, otherwise it is a NaN. + Decimal and packed decimal numbers are not supported yet. The parameters to these functions are r0=destination pointer, r1 and r2 -source pointers. r4 is the instruction. They may use r0-r8 and r14. They return -to fastfpe_next, except CPDO_rnf_core which expects the return address in r14. +source pointers. r4 is the instruction. They may use r0-r8, r11. They return +to r14, which contains the address of a rounding function. The rounding +function expects r0=address, r1-r4=sign, mantissa high, mantissa low, +exponent, r5=additional lower mantissa bits. + +CPDO_rnf_core expects the return address in r14. 
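+
+As a quick illustration of this register layout, a C sketch follows (the
+struct and field names, and reading the normalized mantissa MSB as the 2^0
+bit, are assumptions made for this example only; they do not appear in the
+emulator itself):
+
+    struct fp_internal {
+        unsigned int sign;     // bit 31 = sign, all other bits zero
+        unsigned int mant_hi;  // mantissa, high 32 bits (MSB set when normalized)
+        unsigned int mant_lo;  // mantissa, low 32 bits
+        int exponent;          // signed; see the special values above
+    };
+
+    // 1.0, assuming the normalized MSB stands for 2^0:
+    static const struct fp_internal one  = { 0x00000000, 0x80000000, 0x00000000, 0 };
+    // zero and +infinity as described above:
+    static const struct fp_internal zero = { 0, 0, 0, (int)0x80000000 };
+    static const struct fp_internal inf  = { 0, 0, 0, 0x7fffffff };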
*/ /*---------------------------------------------------------------------------*/ .globl CPDO_adf CPDO_adf: - ldmia r1,{r1,r3,r5,r7} - ldmia r2,{r2,r4,r6,r8} + ldmia r2,{r6,r7,r8,r11} + ldmia r1,{r1,r2,r3,r4} - cmp r7,#0x7fffffff - cmpne r8,#0x7fffffff + cmp r11,#0x7fffffff + cmpne r11,#0x80000000 + cmpne r4,#0x7fffffff + cmpne r4,#0x80000000 beq CPDO_adf_extra - - cmp r1,r2 + + cmp r1,r6 bne CPDO_suf_s CPDO_adf_s: - subs r2,r7,r8 - bge CPDO_adf_2nd - - mov r7,r8 - rsb r2,r2,#0 - cmp r2,#32 - ble CPDO_adf_1st2 + subs r6,r4,r11 + bmi CPDO_adf_normalize1st - sub r2,r2,#32 - cmp r2,#32 - movgt r2,#32 - mov r5,r3,lsr r2 - mov r3,#0 +CPDO_adf_normalize2nd: + cmp r6,#32 + ble CPDO_adf_normalize2nd_1 + cmp r6,#64 + bgt CPDO_adf_normalize2nd_3 + +CPDO_adf_normalize2nd_2: + sub r6,r6,#32 + rsb r11,r6,#32 + mov r5,r8,lsr r6 + add r5,r5,r7,lsl r11 + movs r11,r8,lsl r11 + orrne r5,r5,#1 + mov r8,r7,lsr r6 + mov r7,#0 b CPDO_adf_add -CPDO_adf_1st2: - rsb r8,r2,#32 - mov r5,r5,lsr r2 - orr r5,r5,r3,lsl r8 - mov r3,r3,lsr r2 @ 1. op normalized +CPDO_adf_normalize2nd_1: + rsb r11,r6,#32 + mov r5,r8,lsl r11 + mov r8,r8,lsr r6 + add r8,r8,r7,lsl r11 + mov r7,r7,lsr r6 b CPDO_adf_add -CPDO_adf_2nd: - cmp r2,#32 - ble CPDO_adf_2nd2 +CPDO_adf_normalize2nd_3: + mov r5,#0x40000000 + mov pc,r14 - sub r2,r2,#32 - cmp r2,#32 - movgt r2,#32 - mov r6,r4,lsr r2 - mov r4,#0 +CPDO_adf_normalize1st: + mov r4,r11 + rsb r6,r6,#0 + cmp r6,#32 + ble CPDO_adf_normalize1st_1 + cmp r6,#64 + bgt CPDO_adf_normalize1st_3 + +CPDO_adf_normalize1st_2: + sub r6,r6,#32 + rsb r11,r6,#32 + mov r5,r3,lsr r6 + add r5,r5,r2,lsl r11 + movs r11,r3,lsl r11 + orrne r5,r5,#1 + mov r3,r2,lsr r6 + mov r2,#0 b CPDO_adf_add -CPDO_adf_2nd2: - rsb r8,r2,#32 - mov r6,r6,lsr r2 - orr r6,r6,r4,lsl r8 - mov r4,r4,lsr r2 @ 2. op normalized +CPDO_adf_normalize1st_1: + rsb r11,r6,#32 + mov r5,r3,lsl r11 + mov r3,r3,lsr r6 + add r3,r3,r2,lsl r11 + mov r2,r2,lsr r6 + b CPDO_adf_add +CPDO_adf_normalize1st_3: + mov r5,#0x40000000 + mov r2,r7 + mov r3,r8 + mov pc,r14 + CPDO_adf_add: - adds r5,r5,r6 - adcs r3,r3,r4 @ do addition - bcc CPDO_adf_end + adds r3,r3,r8 + adcs r2,r2,r7 + bcc CPDO_adf_add_no_overflow - add r7,r7,#1 + movs r2,r2,rrx movs r3,r3,rrx - mov r5,r5,rrx @ correct for overflow + movs r5,r5,rrx + orrcs r5,r5,#1 + add r4,r4,#1 -CPDO_adf_end: - cmp r7,#0x20000000 - bge CPDO_inf +CPDO_adf_add_no_overflow: + mov pc,r14 - stmia r0,{r1,r3,r5,r7} +CPDO_adf_extra: + cmp r4,#0x7fffffff + beq CPDO_adf_1st_infnan + cmp r11,#0x7fffffff + beq CPDO_adf_2nd_infnan + cmp r11,#0x80000000 + beq CPDO_adf_2nd_0 + +CPDO_adf_1st_0: + mov r1,r6 + mov r2,r7 + mov r3,r8 + mov r4,r11 + mov r5,#0 + mov pc,r14 + +CPDO_adf_2nd_0: + cmp r4,#0x80000000 + beq CPDO_adf_both_0 + mov r5,#0 + mov pc,r14 + +CPDO_adf_both_0: + cmp r1,r6 + beq CPDO_adf_both_0_equal_sign + and r5,r5,#0x00000060 + cmp r5,#0x00000040 // rounding mode M? + moveq r1,#0x80000000 + movne r1,#0 +CPDO_adf_both_0_equal_sign: + stmia r0,{r1,r2,r3,r4} b fastfpe_next +@ mov pc,r14 -CPDO_adf_extra: - cmp r7,#0x7fffffff @ was it the 1st ? - bne CPDO_infnan_2 @ no it was the 2nd - cmp r8,#0x7fffffff @ if 1st, 2nd too ? 
- bne CPDO_infnan_1 @ no only 1st - cmp r3,#0 - cmpeq r4,#0 - bne CPDO_nan_12 - b CPDO_inf +CPDO_adf_1st_infnan: + cmp r11,#0x7fffffff + beq CPDO_adf_both_infnan +CPDO_adf_1st_infnan_entry: + orrs r5,r3,r2,lsl#1 // ignore MSB + moveq pc,r14 // Inf + tst r2,#0x40000000 + movne pc,r14 // QNaN +CPDO_adf_generate_qnan: + mov r1,#0x80000000 + mov r2,#0x7fffffff + mov r3,#0xffffffff + mov r4,#0x7fffffff + ldr r5,[r10,#128] + orr r5,r5,#1 // set invalid operation flag + str r5,[r10,#128] + mov pc,r14 +CPDO_adf_2nd_infnan: + mov r1,r6 + mov r2,r7 + mov r3,r8 + mov r4,r11 + b CPDO_adf_1st_infnan_entry + +CPDO_adf_both_infnan: + orrs r5,r3,r2,lsl#1 // ignore MSB + beq CPDO_adf_1st_inf + orrs r5,r8,r7,lsl#1 // ignore MSB + beq CPDO_adf_2nd_inf + tst r2,#0x40000000 + tstne r7,#0x40000000 + beq CPDO_adf_generate_qnan // at least one is SNaN + orrs r5,r3,r2,lsl#1 // ignore MSB, FIXME! what is going on here? + moveq r1,r6 // if first is not NaN + moveq r2,r7 // give second as result + moveq r3,r8 + mov pc,r14 + +CPDO_adf_1st_inf: + orrs r5,r8,r7,lsl#1 // ignore MSB + beq CPDO_adf_both_inf + tst r7,#0x40000000 + beq CPDO_adf_generate_qnan + mov r1,r6 //if 2nd no SNaN return 2nd + mov r2,r7 + mov r3,r8 + mov pc,r14 + +CPDO_adf_2nd_inf: + tst r2,#0x40000000 + beq CPDO_adf_generate_qnan + mov pc,r14 // if 1st no SNaN just return it + +CPDO_adf_both_inf: + cmp r1,r6 + bne CPDO_adf_generate_qnan // signs of both inf are different + mov pc,r14 + +/*--------------------------------------------------------------------------*/ + + .globl CPDO_suf +CPDO_suf: + ldmia r2,{r6,r7,r8,r11} + ldmia r1,{r1,r2,r3,r4} + +CPDO_suf_l: + cmp r11,#0x7fffffff + cmpne r11,#0x80000000 + cmpne r4,#0x7fffffff + cmpne r4,#0x80000000 + beq CPDO_suf_extra + + cmp r1,r6 + bne CPDO_adf_s + +CPDO_suf_s: + subs r6,r4,r11 + blt CPDO_suf_normalize1st + bgt CPDO_suf_normalize2nd + cmp r2,r7 + cmpeq r3,r8 + beq CPDO_suf_zero + mov r5,#0 + bcs CPDO_suf_sub_1stbigger + eor r1,r1,#0x80000000 + b CPDO_suf_sub_2ndbigger + +CPDO_suf_normalize2nd: + cmp r6,#32 + ble CPDO_suf_normalize2nd_1 + cmp r6,#64 + bgt CPDO_suf_normalize2nd_3 + +CPDO_suf_normalize2nd_2: + sub r6,r6,#32 + rsb r11,r6,#32 + mov r5,r8,lsr r6 + add r5,r5,r7,lsl r11 + movs r11,r8,lsl r11 + orrne r5,r5,#1 + mov r8,r7,lsr r6 + mov r7,#0 + b CPDO_suf_sub_1stbigger + +CPDO_suf_normalize2nd_1: + rsb r11,r6,#32 + mov r5,r8,lsl r11 + mov r8,r8,lsr r6 + add r8,r8,r7,lsl r11 + mov r7,r7,lsr r6 + b CPDO_suf_sub_1stbigger + +CPDO_suf_normalize2nd_3: + sub r6,r6,#64 + cmp r6,#32 + bge CPDO_suf_normalize2nd_4 + rsb r11,r6,#32 + mov r5,r7,lsr r6 + orrs r11,r8,r7,lsl r11 + orrne r5,r5,#1 + mov r7,#0 + mov r8,#0 + b CPDO_suf_sub_1stbigger + +CPDO_suf_normalize2nd_4: + mov r5,#1 + mov r7,#0 + mov r8,#0 + b CPDO_suf_sub_1stbigger + +CPDO_suf_normalize1st: + eor r1,r1,#0x80000000 + mov r4,r11 + rsb r6,r6,#0 + cmp r6,#32 + ble CPDO_suf_normalize1st_1 + cmp r6,#64 + bgt CPDO_suf_normalize1st_3 + +CPDO_suf_normalize1st_2: + sub r6,r6,#32 + rsb r11,r6,#32 + mov r5,r3,lsr r6 + add r5,r5,r2,lsl r11 + movs r11,r3,lsl r11 + orrne r5,r5,#1 + mov r3,r2,lsr r6 + mov r2,#0 + b CPDO_suf_sub_2ndbigger + +CPDO_suf_normalize1st_1: + rsb r11,r6,#32 + mov r5,r3,lsl r11 + mov r3,r3,lsr r6 + add r3,r3,r2,lsl r11 + mov r2,r2,lsr r6 + b CPDO_suf_sub_2ndbigger + +CPDO_suf_normalize1st_3: + sub r6,r6,#64 + cmp r6,#32 + bge CPDO_suf_normalize1st_4 + rsb r11,r6,#32 + mov r5,r2,lsr r6 + orrs r11,r3,r2,lsl r11 + orrne r5,r5,#1 + mov r2,#0 + mov r3,#0 + b CPDO_suf_sub_2ndbigger + +CPDO_suf_normalize1st_4: + mov r5,#1 + mov r2,#0 + 
mov r3,#0 + b CPDO_suf_sub_2ndbigger + +CPDO_suf_sub_1stbigger: + rsbs r5,r5,#0 + sbcs r3,r3,r8 + sbcs r2,r2,r7 + movmi pc,r14 + b CPDO_suf_norm + +CPDO_suf_sub_2ndbigger: + rsbs r5,r5,#0 + sbcs r3,r8,r3 + sbcs r2,r7,r2 + movmi pc,r14 + +CPDO_suf_norm: + teq r2,#0 // normalize 32 bit + bne CPDO_suf_norm16 + teq r3,#0 // normalize 64 bit + bne CPDO_suf_norm32 + mov r2,r5 + mov r3,#0 + mov r5,#0 + sub r4,r4,#64 + mov pc,r14 +CPDO_suf_norm32: + mov r2,r3 + mov r3,r5 + mov r5,#0 + sub r4,r4,#32 +CPDO_suf_norm16: + cmp r2,#0x00010000 // normalize 16 bit + bcs CPDO_suf_norm8 + mov r2,r2,lsl#16 + orr r2,r2,r3,lsr#16 + mov r3,r3,lsl#16 + orr r3,r3,r5,lsr#16 + mov r5,r5,lsl#16 + sub r4,r4,#16 +CPDO_suf_norm8: + cmp r2,#0x01000000 // normalize 8 bit + bcs CPDO_suf_norm4 + mov r2,r2,lsl#8 + orr r2,r2,r3,lsr#24 + mov r3,r3,lsl#8 + orr r3,r3,r5,lsr#24 + mov r5,r5,lsl#8 + sub r4,r4,#8 +CPDO_suf_norm4: + cmp r2,#0x10000000 // normalize 4 bit + bcs CPDO_suf_norm2 + mov r2,r2,lsl#4 + orr r2,r2,r3,lsr#28 + mov r3,r3,lsl#4 + orr r3,r3,r5,lsr#28 + mov r5,r5,lsl#4 + sub r4,r4,#4 +CPDO_suf_norm2: + cmp r2,#0x40000000 // normalize 2 bit + bcs CPDO_suf_norm1 + mov r2,r2,lsl#2 + orr r2,r2,r3,lsr#30 + mov r3,r3,lsl#2 + orr r3,r3,r5,lsr#30 + mov r5,r5,lsl#2 + sub r4,r4,#2 +CPDO_suf_norm1: + cmp r2,#0x80000000 // normalize 1 bit + bcs CPDO_suf_norme + mov r2,r2,lsl#1 + orr r2,r2,r3,lsr#31 + mov r3,r3,lsl#1 + orr r3,r3,r5,lsr#31 + mov r5,r5,lsl#1 + sub r4,r4,#1 +CPDO_suf_norme: + mov pc,r14 + +CPDO_suf_zero: + and r5,r5,#0x00000060 + cmp r5,#0x00000040 // rounding mode M? + moveq r1,#0x80000000 + movne r1,#0 + mov r2,#0 + mov r3,#0 + mov r4,#0x80000000 + stmia r0,{r1,r2,r3,r4} + b fastfpe_next +@ mov pc,r14 + +CPDO_suf_extra: // nearly the same as with adf + cmp r11,#0x7fffffff // the only thing we need to do is + bne CPDO_suf_extra_sign // to invert the second sign if + orrnes r5,r8,r7,lsl#1 // it is not a NaN, ignore MSB + bne CPDO_adf_extra +CPDO_suf_extra_sign: + eor r6,r6,#0x80000000 + b CPDO_adf_extra + /*---------------------------------------------------------------------------*/ + .globl CPDO_rsf +CPDO_rsf: + ldmia r1,{r6,r7,r8,r11} + ldmia r2,{r1,r2,r3,r4} + b CPDO_suf_l + +/*---------------------------------------------------------------------------*/ + + .globl CPDO_muf +CPDO_muf: + ldmia r2,{r6,r7,r8,r11} + ldmia r1,{r1,r2,r3,r4} + + cmp r11,#0x7fffffff + cmpne r4,#0x7fffffff + beq CPDO_muf_extra + eor r1,r1,r6 // sign + cmp r11,#0x80000000 + cmpne r4,#0x80000000 + beq CPDO_muf_zero + + add r4,r4,r11 // exponent + +#define x32 r2 +#define x10 r3 +#define y32 r7 +#define y10 r8 +#define z3 r0 +#define z2 r1 +#define z1 r4 +#define z0 r6 +#define v1 r9 +#define v0 r11 +#define tmp r5 + + stmdb r13!,{r0,r1,r4,r9} + + mov z3,x32,lsr#16 + bic z2,x32,z3,lsl#16 + movs v1,y32,lsr#16 + bic v0,y32,v1,lsl#16 + + mul tmp,z3,v0 + mul z3,v1,z3 + mulne v1,z2,v1 + mul z2,v0,z2 + adds z2,z2,tmp,lsl#16 + adc z3,z3,tmp,lsr#16 + adds z2,z2,v1,lsl#16 + adc z3,z3,v1,lsr#16 + + mov z1,x10,lsr#16 + bic z0,x10,z1,lsl#16 + movs v1,y10,lsr#16 + bic v0,y10,v1,lsl#16 + + mul tmp,z1,v0 + mul z1,v1,z1 + mulne v1,z0,v1 + mul z0,v0,z0 + adds z0,z0,tmp,lsl#16 + adc z1,z1,tmp,lsr#16 + adds z0,z0,v1,lsl#16 + adc z1,z1,v1,lsr#16 + + adds z2,z2,z1 // z3 is max. 
0xfffffffe + adc z3,z3,#0 // so this trick is possible + adds z1,z2,z0 // to save one addition + adcs z2,z2,z3 + adc z3,z3,#0 + + subs x10,x32,x10 + mov v0,#0 + mov v1,v0,rrx + + sublo v0,y32,y10 + subnes y10,y10,y32 + + orreq v1,v1,#1<<31 + eorcs v1,v1,#1<<31 + subcc v0,v0,x10 + + movs x32,x10,lsr#16 + bic x10,x10,x32,lsl#16 + mov y32,y10,lsr#16 + bic y10,y10,y32,lsl#16 + + mul tmp,x10,y10 + mla v0,x32,y32,v0 + mulne x32,y10,x32 + adds tmp,tmp,x32,lsl#16 + adc v0,v0,x32,lsr#16 + mul y32,x10,y32 + adds tmp,tmp,y32,lsl#16 + adc v0,v0,y32,lsr#16 + adds r5,z1,tmp + adcs r3,z2,v0 + adc r2,z3,v1,asr#31 + + teq z0,#0 + orrne r5,r5,#1 // z0 must not be lost for rounding + cmp r2,#0 + +#undef x32 r2 +#undef x10 r3 +#undef y32 r7 +#undef y10 r8 +#undef z3 r0 +#undef z2 r1 +#undef z1 r4 +#undef z0 r6 +#undef v1 r9 +#undef v0 r11 +#undef tmp r5 + + ldmia r13!,{r0,r1,r4,r9} + + bpl CPDO_muf_norm + add r4,r4,#1 + mov pc,r14 + +CPDO_muf_norm: + adds r5,r5,r5 + adcs r3,r3,r3 + adc r2,r2,r2 + mov pc,r14 + +CPDO_muf_extra: + cmp r4,#0x7fffffff + beq CPDO_muf_1st_infnan +CPDO_muf_2nd_infnan: + orrs r5,r8,r7,lsl#1 // ignore MSB + bne CPDO_muf_2nd_nan + cmp r4,#0x80000000 + beq CPDO_muf_generate_qnan + mov r2,r7 // copy MSB + mov r3,#0 + mov r4,#0x7fffffff + eor r1,r1,r6 + stmia r0,{r1,r2,r3,r4} + b fastfpe_next +@ mov pc,r14 + +CPDO_muf_1st_infnan: + cmp r11,#0x7fffffff + beq CPDO_muf_both_infnan + orrs r5,r3,r2,lsl#1 // ignore MSB + bne CPDO_muf_1st_nan + cmp r11,#0x80000000 + beq CPDO_muf_generate_qnan +// mov r4,#0x7fffffff + eor r1,r1,r6 + stmia r0,{r1,r2,r3,r4} + b fastfpe_next +@ mov pc,r14 + +CPDO_muf_both_infnan: + orrs r5,r3,r2,lsl#1 // ignore MSB + beq CPDO_muf_both_infnan_1st_inf + orrs r5,r8,r7,lsl#1 // ignore MSB + beq CPDO_muf_both_infnan_2nd_inf + tst r2,#0x40000000 + tstne r7,#0x40000000 + beq CPDO_muf_generate_qnan + mov pc,r14 + +CPDO_muf_both_infnan_1st_inf: + orrs r5,r8,r7,lsl#1 // ignore MSB + beq CPDO_muf_both_inf + b CPDO_muf_2nd_nan + +CPDO_muf_both_infnan_2nd_inf: + b CPDO_muf_1st_nan + +CPDO_muf_both_inf: + eor r1,r1,r6 + orr r2,r2,r7 // copy both MSB + stmia r0,{r1,r2,r3,r4} + b fastfpe_next +@ mov pc,r14 + +CPDO_muf_zero: + mov r2,#0 + mov r3,#0 + mov r4,#0x80000000 + stmia r0,{r1,r2,r3,r4} + b fastfpe_next +@ mov pc,r14 + +CPDO_muf_1st_nan: + tst r2,#0x40000000 + beq CPDO_muf_generate_qnan + mov pc,r14 + +CPDO_muf_2nd_nan: + tst r7,#0x40000000 + beq CPDO_muf_generate_qnan + mov r1,r6 + mov r2,r7 + mov r3,r8 + mov r4,r11 + mov pc,r14 + +CPDO_muf_generate_qnan: + mov r1,#0x80000000 + mov r2,#0x7fffffff + mov r3,#0xffffffff + mov r4,#0x7fffffff + ldr r5,[r10,#128] + orr r5,r5,#1 + str r5,[r10,#128] + mov pc,r14 + +/*---------------------------------------------------------------------------*/ + + .globl CPDO_muf_M +CPDO_muf_M: + ldmia r2,{r6,r7,r8,r11} + ldmia r1,{r1,r2,r3,r4} + + cmp r11,#0x7fffffff + cmpne r4,#0x7fffffff + beq CPDO_muf_extra + eor r1,r1,r6 // sign + cmp r11,#0x80000000 + cmpne r4,#0x80000000 + beq CPDO_muf_zero + + add r4,r4,r11 // exponent + umull r12,r11,r2,r7 + umull r2,r6,r8,r2 + umull r8,r5,r3,r8 + adds r5,r5,r2 + adcs r12,r12,r6 + adc r11,r11,#0 + umull r7,r6,r3,r7 + adds r5,r5,r7 + adcs r3,r12,r6 + adc r2,r11,#0 + teq r8,#0 + orrne r5,r5,#1 // r8 must not be lost for rounding + cmp r2,#0 + + bpl CPDO_muf_norm + add r4,r4,#1 + mov pc,r14 + +/*---------------------------------------------------------------------------*/ + CPDO_infnan_1: stmia r0,{r1,r3,r5,r7} b fastfpe_next @@ -115,7 +691,7 @@ CPDO_nan: mov r2,#0x40000000 @ create non signalling NaN b 
CPDO_inf_1 - + CPDO_inf: mov r2,#0 CPDO_inf_1: @@ -124,7 +700,7 @@ CPDO_store_1234: stmia r0,{r1,r2,r3,r4} b fastfpe_next - + CPDO_zero: mov r1,#0 CPDO_zero_1: @@ -134,309 +710,690 @@ stmia r0,{r1,r2,r3,r4} b fastfpe_next +CPDO_muf_end: + cmp r8,#0x20000000 + bge CPDO_inf + cmp r8,#0xe0000000 + ble CPDO_zero_1 + stmia r0,{r1,r2,r7,r8} + b fastfpe_next + /*---------------------------------------------------------------------------*/ - .globl CPDO_suf -CPDO_suf: - ldmia r1,{r1,r3,r5,r7} - ldmia r2,{r2,r4,r6,r8} + .globl CPDO_dvf +CPDO_dvf: + ldmia r2,{r6,r7,r8,r11} + ldmia r1,{r1,r2,r3,r4} -CPDO_suf_l: - cmp r7,#0x7fffffff - cmpne r8,#0x7fffffff - beq CPDO_suf_extra +CPDO_dvf_l: + cmp r11,#0x7fffffff + cmpne r4,#0x7fffffff + beq CPDO_dvf_infnan + eor r1,r1,r6 + cmp r11,#0x80000000 + cmpne r4,#0x80000000 + beq CPDO_dvf_zero - cmp r1,r2 - bne CPDO_adf_s + sub r4,r4,r11 -CPDO_suf_s: - subs r2,r7,r8 @ determine greater number - bgt CPDO_suf_2nd @ first number is greater - blt CPDO_suf_1st @ second number is greater - cmp r3,r4 @ also mantissa is important - cmpeq r5,r6 - bhi CPDO_suf_2nd @ first number is greater - beq CPDO_zero +#define x4 r11 +#define x3 r7 +#define x2 r12 +#define x1 r8 +#define y2 r14 +#define y1 r9 +#define z3 r4 +#define z2 r5 +#define z1 r6 +#define tmp r10 -CPDO_suf_1st: - eor r1,r1,#0x80000000 @ second number is greater, invert sign - mov r7,r8 - rsb r2,r2,#0 - cmp r2,#32 - ble CPDO_suf_1st2 + cmp r2,r7 + cmpeq r3,r8 + bcs CPDO_dvf_no_normalize - sub r2,r2,#32 - cmp r2,#32 - movgt r2,#32 - mov r5,r3,lsr r2 - mov r3,#0 - b CPDO_suf_1st_sub + sub r4,r4,#1 + stmdb r13!,{r1,r4,r9,r10,r11,r14} + mov r4,r2,lsr#31 + mov r5,r2,lsl#1 + orr r5,r5,r3,lsr#31 + mov r6,r3,lsl#1 // dividend + b CPDO_dvf_normalize_back -CPDO_suf_1st2: - rsb r8,r2,#32 - mov r5,r5,lsr r2 - orr r5,r5,r3,lsl r8 - mov r3,r3,lsr r2 @ 1. op normalized +CPDO_dvf_no_normalize: + stmdb r13!,{r1,r4,r9,r10,r11,r14} + mov r4,#0 + mov r5,r2 + mov r6,r3 // dividend -CPDO_suf_1st_sub: - subs r5,r6,r5 @ do subtraction - sbc r3,r4,r3 - b CPDO_suf_norm +CPDO_dvf_normalize_back: + mov r1,#0 + sub r10,r1,r7,lsr#1 + mov r11,#0x40000000 -CPDO_suf_2nd: - cmp r2,#32 - ble CPDO_suf_2nd2 + .macro inv_step + adds r11,r10,r11,lsl#1 + subcc r11,r11,r10 + adc r1,r1,r1 + .endm - sub r2,r2,#32 - cmp r2,#32 - movgt r2,#32 - mov r6,r4,lsr r2 - mov r4,#0 - b CPDO_suf_2nd_sub + .rept 17 + inv_step + .endr -CPDO_suf_2nd2: - rsb r8,r2,#32 - mov r6,r6,lsr r2 - orr r6,r6,r4,lsl r8 - mov r4,r4,lsr r2 @ 2. 
op normalized + mov r1,r1,lsl#15 + adds r1,r1,#1<<15 + movcs r1,#0xffffffff // inverse + mov r1,r1,lsr#16 -CPDO_suf_2nd_sub: - subs r5,r5,r6 - sbc r3,r3,r4 @ do subtraction + mov r2,#0 + mov r3,#0 // clear result space -CPDO_suf_norm: - teq r3,#0 @ normalize 32bit - moveq r3,r5 - moveq r5,#0 - subeq r7,r7,#32 - - cmp r3,#0x00010000 @ 16bit - movcc r3,r3,lsl#16 - orrcc r3,r3,r5,lsr#16 - movcc r5,r5,lsl#16 - subcc r7,r7,#16 - - cmp r3,#0x01000000 @ 8bit - movcc r3,r3,lsl#8 - orrcc r3,r3,r5,lsr#24 - movcc r5,r5,lsl#8 - subcc r7,r7,#8 - - cmp r3,#0x10000000 @ 4bit - movcc r3,r3,lsl#4 - orrcc r3,r3,r5,lsr#28 - movcc r5,r5,lsl#4 - subcc r7,r7,#4 - - cmp r3,#0x40000000 @ 2bit - movcc r3,r3,lsl#2 - orrcc r3,r3,r5,lsr#30 - movcc r5,r5,lsl#2 - subcc r7,r7,#2 - - cmp r3,#0x80000000 @ 1bit - movcc r3,r3,lsl#1 - orrcc r3,r3,r5,lsr#31 - movcc r5,r5,lsl#1 - subcc r7,r7,#1 + mov x4,r7,lsr#16 + bic x3,r7,x4,lsl#16 + mov x2,r8,lsr#16 + bic x1,r8,x2,lsl#16 // split divisor for 16x16=32bit mul - cmp r7,#0xe0000000 - ble CPDO_zero_1 +CPDO_dvf_loop_entry: + mov r4,r4,lsl#16 + orrs r4,r4,r5,lsr#16 + mov r5,r5,lsl#16 + orr r5,r5,r6,lsr#16 + mov r6,r6,lsl#16 // shift dividend left by 16 - stmia r0,{r1,r3,r5,r7} - b fastfpe_next + bmi CPDO_dvf_loop_negative + mov r10,r4,lsr#16 + mul r9,r10,r1 + bic r10,r4,r10,lsl#16 + mul r10,r1,r10 + add r9,r9,r10,lsr#16 //estimate 16 bits of result in r9 -CPDO_suf_extra: - cmp r7,#0x7fffffff @ was it the 1st ? - eorne r2,r2,#0x80000000 @ change sign, might have been INF - bne CPDO_infnan_2 @ no it was the 2nd - cmp r8,#0x7fffffff @ if 1st, 2nd too ? - bne CPDO_infnan_1 @ no only 1st - cmp r3,#0 - cmpeq r4,#0 - bne CPDO_nan_12 - b CPDO_nan @ here is difference with adf ! + mov r2,r2,lsl#16 + orr r2,r2,r3,lsr#16 + adds r3,r9,r3,lsl#16 // shift result left by 16 and + adc r2,r2,#0 // add in new result bits -/*---------------------------------------------------------------------------*/ + mov r9,r9,lsl#1 + mov y2,r9,lsr#16 + bic y1,r9,y2,lsl#16 + mul tmp,x1,y1 + subs z1,z1,tmp + mul tmp,x3,y1 + sbcs z2,z2,tmp + mul tmp,x4,y2 + sbc z3,z3,tmp + mul tmp,x2,y2 + subs z2,z2,tmp + sbc z3,z3,#0 + mul tmp,x2,y1 + subs z1,z1,tmp,lsl#16 + sbcs z2,z2,tmp,lsr#16 + sbc z3,z3,#0 + mul tmp,x1,y2 + subs z1,z1,tmp,lsl#16 + sbcs z2,z2,tmp,lsr#16 + sbc z3,z3,#0 + mul tmp,x4,y1 + subs z2,z2,tmp,lsl#16 + sbc z3,z3,tmp,lsr#16 + mul tmp,x3,y2 + subs z2,z2,tmp,lsl#16 + sbc z3,z3,tmp,lsr#16 // subtract divisor * estimated result - .globl CPDO_rsf -CPDO_rsf: - mov r3,r2 - ldmia r1,{r2,r4,r6,r8} - ldmia r3,{r1,r3,r5,r7} - b CPDO_suf_l - -/*---------------------------------------------------------------------------*/ + tst r2,#0xff000000 + beq CPDO_dvf_loop_entry - .globl CPDO_muf -CPDO_muf: - ldmia r1,{r1,r3,r5,r7} - ldmia r2,{r2,r4,r6,r8} + b CPDO_dvf_end_entry - cmp r7,#0x7fffffff - cmpne r8,#0x7fffffff - beq CPDO_muf_extra - - eor r1,r1,r2 - adds r8,r7,r8 - bvs CPDO_zero_1 +CPDO_dvf_loop_negative: + rsb r14,r4,#0 + mov r10,r14,lsr#16 + mul r9,r10,r1 + bic r10,r14,r10,lsl#16 + mul r10,r1,r10 + add r9,r9,r10,lsr#16 // estimate 16 bits of result in r9 - umull r7,r2,r3,r4 - umull r14,r3,r6,r3 - adds r7,r7,r3 @ r2|r7|r14 = r2|r7|#0 + #0|r3|r14 + mov r2,r2,lsl#16 + orr r2,r2,r3,lsr#16 + rsbs r3,r9,r3,lsl#16 // shift result left by 16 and + sbc r2,r2,#0 // add in new result bits + + mov r9,r9,lsl#1 + mov y2,r9,lsr#16 + bic y1,r9,y2,lsl#16 + mul tmp,x1,y1 + adds z1,z1,tmp + mul tmp,x3,y1 + adcs z2,z2,tmp + mul tmp,x4,y2 + adc z3,z3,tmp + mul tmp,x2,y2 + adds z2,z2,tmp + adc z3,z3,#0 + mul tmp,x2,y1 + adds 
z1,z1,tmp,lsl#16 + adcs z2,z2,tmp,lsr#16 + adc z3,z3,#0 + mul tmp,x1,y2 + adds z1,z1,tmp,lsl#16 + adcs z2,z2,tmp,lsr#16 + adc z3,z3,#0 + mul tmp,x4,y1 + adds z2,z2,tmp,lsl#16 + adc z3,z3,tmp,lsr#16 + mul tmp,x3,y2 + adds z2,z2,tmp,lsl#16 + adc z3,z3,tmp,lsr#16 // subtract divisor * estimated result + + tst r2,#0xff000000 + beq CPDO_dvf_loop_entry + +CPDO_dvf_end_entry: + movs r4,r4,asr#1 + movs r5,r5,rrx // remainder was shifted left by 1 + movs r6,r6,rrx // relative to divisor + + orr r7,x3,x4,lsl#16 + orr r8,x1,x2,lsl#16 // put the split divisor together again + + cmp r4,#0 + blt CPDO_dvf_end_negative + cmpeq r5,r7 + cmpeq r6,r8 + bcc CPDO_dvf_end + +CPDO_dvf_end_positive: + adds r3,r3,#1 adc r2,r2,#0 - umull r4,r3,r5,r4 - adds r14,r14,r4 @ r2|r7|r14 += #0|r3|r4 - adcs r7,r7,r3 - adc r2,r2,#0 - umull r4,r3,r5,r6 - adds r14,r14,r3 @ r2|r7|r14 += #0|#0|r3 - adcs r7,r7,#0 - adcs r2,r2,#0 - bpl CPDO_muf_norm - - add r8,r8,#1 - b CPDO_muf_end - -CPDO_muf_norm: - adds r14,r14,r14 - adcs r7,r7,r7 - adcs r2,r2,r2 + subs r6,r6,r8 + sbcs r5,r5,r7 + sbcs r4,r4,#0 + bne CPDO_dvf_end_positive -CPDO_muf_end: - cmp r8,#0x20000000 - bge CPDO_inf - cmp r8,#0xe0000000 - ble CPDO_zero_1 - stmia r0,{r1,r2,r7,r8} + cmp r5,r7 + cmpeq r6,r8 + bcs CPDO_dvf_end_positive + b CPDO_dvf_end + +CPDO_dvf_end_negative: + subs r3,r3,#1 + sbc r2,r2,#0 + + adds r6,r6,r8 + adcs r5,r5,r7 + adcs r4,r4,#0 + bmi CPDO_dvf_end_negative + +CPDO_dvf_end: + orrs r9,r5,r6 + ldmia r13!,{r1,r4,r9,r10,r11,r14} + moveq pc,r14 + + adds r6,r6,r6 + adcs r5,r5,r5 + movcs r5,#0xc0000000 + movcs pc,r14 + + cmp r5,r7 + cmpeq r6,r8 + movcc r5,#0x40000000 + moveq r5,#0x80000000 + movhi r5,#0xc0000000 + mov pc,r14 + +CPDO_dvf_zero: + cmp r11,#0x80000000 + beq CPDO_dvf_by_zero + + stmia r0,{r1,r2,r3,r4} + b fastfpe_next // 0 already there +@ mov pc,r14 + +CPDO_dvf_by_zero: + cmp r4,#0x80000000 + beq CPDO_dvf_generate_qnan // first 0 too + + mov r2,#0x80000000 // set MSB + mov r3,#0 + mov r4,#0x7fffffff + ldr r5,[r10,#128] + orr r5,r5,#2 // division by zero + str r5,[r10,#128] + stmia r0,{r1,r2,r3,r4} b fastfpe_next +@ mov pc,r14 -CPDO_muf_extra: - cmp r7,#0x7fffffff @ was it the first? - bne CPDO_muf_extra_2nd @ no, so it was the second - cmp r8,#0x7fffffff @ yes, second too? - bne CPDO_muf_extra_1st @ no, only first - orr r3,r3,r4 @ if both inf -> inf, otherwise nan - eor r1,r1,r2 @ sign for the inf case - b CPDO_infnan_1 +CPDO_dvf_infnan: + cmp r4,#0x7fffffff + beq CPDO_dvf_1st_infnan -CPDO_muf_extra_1st: - cmp r3,#0 @ is it a nan? - bne CPDO_infnan_1 - cmp r8,#0x80000000 @ is the second 0? - beq CPDO_nan - eor r1,r1,r2 @ correct sign for inf - b CPDO_inf + orrs r5,r8,r7,lsl#1 // ignore MSB + beq CPDO_dvf_2nd_inf + mov r1,r6 + mov r2,r7 + mov r3,r8 + mov r4,r11 + b CPDO_dvf_1st_or_2nd_nan -CPDO_muf_extra_2nd: - cmp r4,#0 @ is it a nan? - bne CPDO_infnan_2 - cmp r7,#0x80000000 @ is the first 0? - beq CPDO_nan - eor r1,r1,r2 @ correct sign for inf - b CPDO_inf +CPDO_dvf_2nd_inf: + eor r1,r1,r6 + mov r2,#0 + mov r3,#0 + mov r4,#0x80000000 + stmia r0,{r1,r2,r3,r4} + b fastfpe_next // zero created +@ mov pc,r14 +CPDO_dvf_1st_infnan: + cmp r11,#0x7fffffff + beq CPDO_dvf_both_infnan + + orrs r5,r3,r2,lsl#1 // 1st inf? 
ignore MSB + bne CPDO_dvf_1st_or_2nd_nan + + eor r1,r1,r6 // sign for inf + stmia r0,{r1,r2,r3,r4} + b fastfpe_next // inf already there +@ mov pc,r14 + +CPDO_dvf_1st_or_2nd_nan: + tst r2,#0x40000000 + beq CPDO_dvf_generate_qnan + mov pc,r14 // qnan1/2 already/copied there + +CPDO_dvf_both_infnan: + orrs r5,r3,r2,lsl#1 // ignore MSB + beq CPDO_dvf_both_infnan_1st_inf + orrs r5,r8,r7,lsl#1 // ignore MSB + beq CPDO_dvf_both_infnan_2nd_inf + tst r2,#0x40000000 + tstne r7,#0x40000000 + beq CPDO_dvf_generate_qnan + mov pc,r14 + +CPDO_dvf_both_infnan_1st_inf: + tst r7,#0x40000000 // 2nd inf or SNaN ? + beq CPDO_dvf_generate_qnan + mov r1,r6 + mov r2,r7 + mov r3,r8 + mov r4,r11 // copy 2nd QNaN + mov pc,r14 + +CPDO_dvf_both_infnan_2nd_inf: + tst r2,#0x40000000 // 1st SNaN ? + beq CPDO_dvf_generate_qnan + mov pc,r14 + +CPDO_dvf_generate_qnan: + mov r1,#0x80000000 + mov r2,#0x7fffffff + mov r3,#0xffffffff + mov r4,#0x7fffffff + ldr r5,[r10,#128] + orr r5,r5,#1 + str r5,[r10,#128] + mov pc,r14 + /*---------------------------------------------------------------------------*/ - .globl CPDO_dvf -CPDO_dvf: - ldmia r1,{r1,r3,r5,r7} - ldmia r2,{r2,r4,r6,r8} + .globl CPDO_dvf_M +CPDO_dvf_M: + ldmia r2,{r6,r7,r8,r11} + ldmia r1,{r1,r2,r3,r4} -CPDO_dvf_l: - cmp r7,#0x7fffffff - cmpne r8,#0x7fffffff - beq CPDO_dvf_extra - cmp r8,#0x80000000 - beq CPDO_dvf_by0 +CPDO_dvf_M_l: + cmp r11,#0x7fffffff + cmpne r4,#0x7fffffff + beq CPDO_dvf_infnan + eor r1,r1,r6 + cmp r11,#0x80000000 + cmpne r4,#0x80000000 + beq CPDO_dvf_zero - eor r1,r1,r2 - cmp r7,#0x80000000 - beq CPDO_zero_1 - - sub r8,r7,r8 - + sub r4,r4,r11 + + cmp r2,r7 + cmpeq r3,r8 + bcs CPDO_dvf_M_no_normalize + + sub r4,r4,#1 + stmdb r13!,{r1,r4,r9,r10} + mov r4,r2,lsr#31 + mov r5,r2,lsl#1 + orr r5,r5,r3,lsr#31 + mov r6,r3,lsl#1 // dividend + b CPDO_dvf_M_normalize_back + +CPDO_dvf_M_no_normalize: + stmdb r13!,{r1,r4,r9,r10} + mov r4,#0 + mov r5,r2 + mov r6,r3 // dividend + +CPDO_dvf_M_normalize_back: + mov r1,#0 + sub r10,r1,r7,lsr#1 + mov r11,#0x40000000 + + .rept 18 + inv_step + .endr + + mov r1,r1,lsl#14 + adds r1,r1,#1<<15 + movcs r1,#0xffffffff // inverse + mov r2,#0 - mov r7,#1 + mov r3,#0 // clear result space - cmp r3,r4 - cmpeq r5,r6 - bcs CPDO_dvf_loop_ +CPDO_dvf_M_loop_entry: + mov r4,r4,lsl#16 + orrs r4,r4,r5,lsr#16 + mov r5,r5,lsl#16 + orr r5,r5,r6,lsr#16 + mov r6,r6,lsl#16 // shift dividend left by 16 - sub r8,r8,#1 + bmi CPDO_dvf_M_loop_negative + umull r10,r9,r4,r1 // estimate 16 bits of result in r9 -CPDO_dvf_loop: - adds r5,r5,r5 - adcs r3,r3,r3 - bcs CPDO_dvf_anyway -CPDO_dvf_loop_: - subs r5,r5,r6 - sbcs r3,r3,r4 - bcs CPDO_dvf_okay + mov r2,r2,lsl#16 + orr r2,r2,r3,lsr#16 + adds r3,r9,r3,lsl#16 // shift result left by 16 and + adc r2,r2,#0 // add in new result bits - adds r5,r5,r6 - adc r3,r3,r4 - adds r7,r7,r7 - adcs r2,r2,r2 - bcc CPDO_dvf_loop - b CPDO_dvf_end + mov r9,r9,lsl#1 + umull r11,r10,r8,r9 // divisor lo * estimated result + subs r6,r6,r11 + sbcs r5,r5,r10 + sbc r4,r4,#0 -CPDO_dvf_anyway: - adcs r7,r7,r7 - adcs r2,r2,r2 - bcs CPDO_dvf_end - subs r5,r5,r6 - sbc r3,r3,r4 - b CPDO_dvf_loop + umull r11,r10,r7,r9 // divisor hi * estimated result + subs r5,r5,r11 + sbc r4,r4,r10 -CPDO_dvf_okay: - adcs r7,r7,r7 - adcs r2,r2,r2 - bcc CPDO_dvf_loop + tst r2,#0xff000000 + beq CPDO_dvf_M_loop_entry -CPDO_dvf_end: - b CPDO_muf_end + b CPDO_dvf_M_end_entry -CPDO_dvf_by0: - cmp R7,#0x80000000 - beq CPDO_nan @ first also 0 -> nan - eor r1,r1,r2 @ otherwise calculatesign for inf - b CPDO_inf +CPDO_dvf_M_loop_negative: + rsb r11,r4,#0 + 
umull r10,r9,r11,r1 // estimate 16 bits of result in r9 -CPDO_dvf_extra: - cmp r7,#0x7fffffff @ was it the first? - bne CPDO_dvf_extra_2nd @ no, so it was the second - cmp r8,#0x7fffffff @ yes, second too? - bne CPDO_dvf_extra_1st @ no, only first - orrs r3,r3,r4 - beq CPDO_nan @ if both inf -> create nan - b CPDO_nan_12 @ otherwise keep nan + mov r2,r2,lsl#16 + orr r2,r2,r3,lsr#16 + rsbs r3,r9,r3,lsl#16 // shift result left by 16 and + sbc r2,r2,#0 // add in new result bits -CPDO_dvf_extra_1st: - eor r1,r1,r2 @ correct sign for inf - b CPDO_infnan_1 + mov r9,r9,lsl#1 + umull r11,r10,r8,r9 // divisor lo * estimated result + adds r6,r6,r11 + adcs r5,r5,r10 + adc r4,r4,#0 -CPDO_dvf_extra_2nd: - cmp r4,#0 @ is it a nan? - bne CPDO_infnan_2 - eor r1,r1,r2 @ correct sign for zero - b CPDO_zero_1 + umlal r5,r4,r7,r9 // divisor hi * estimated result + tst r2,#0xff000000 + beq CPDO_dvf_M_loop_entry + +CPDO_dvf_M_end_entry: + movs r4,r4,asr#1 + movs r5,r5,rrx // remainder was shifted left by 1 + movs r6,r6,rrx // relative to divisor + + cmp r4,#0 + blt CPDO_dvf_M_end_negative + cmpeq r5,r7 + cmpeq r6,r8 + bcc CPDO_dvf_M_end + +CPDO_dvf_M_end_positive: + adds r3,r3,#1 + adc r2,r2,#0 + + subs r6,r6,r8 + sbcs r5,r5,r7 + sbcs r4,r4,#0 + + cmp r5,r7 + cmpeq r6,r8 + bcs CPDO_dvf_M_end_positive + b CPDO_dvf_M_end + +CPDO_dvf_M_end_negative: + subs r3,r3,#1 + sbc r2,r2,#0 + + adds r6,r6,r8 + adcs r5,r5,r7 + adcs r4,r4,#0 + bmi CPDO_dvf_M_end_negative + +CPDO_dvf_M_end: + orrs r9,r5,r6 + ldmia r13!,{r1,r4,r9,r10} + moveq pc,r14 + + adds r6,r6,r6 + adcs r5,r5,r5 + movcs r5,#0xc0000000 + movcs pc,r14 + + cmp r5,r7 + cmpeq r6,r8 + movcc r5,#0x40000000 + moveq r5,#0x80000000 + movhi r5,#0xc0000000 + mov pc,r14 + /*---------------------------------------------------------------------------*/ .globl CPDO_rdf CPDO_rdf: - mov r3,r2 - ldmia r1,{r2,r4,r6,r8} - ldmia r3,{r1,r3,r5,r7} + ldmia r1,{r6,r7,r8,r11} + ldmia r2,{r1,r2,r3,r4} b CPDO_dvf_l /*---------------------------------------------------------------------------*/ + .globl CPDO_rdf_M +CPDO_rdf_M: + ldmia r1,{r6,r7,r8,r11} + ldmia r2,{r1,r2,r3,r4} + b CPDO_dvf_M_l + +/*---------------------------------------------------------------------------*/ + .globl CPDO_rmf CPDO_rmf: + ldmia r2,{r6,r7,r8,r11} + ldmia r1,{r1,r2,r3,r4} + + cmp r11,#0x7fffffff + cmpne r4,#0x7fffffff + beq CPDO_rmf_infnan + cmp r11,#0x80000000 + cmpne r4,#0x80000000 + beq CPDO_rmf_zero + + cmp r4,r11 + bge CPDO_rmf_loop_entry + b CPDO_rmf_smaller + +CPDO_rmf_loop_0: + mov r5,#0 +CPDO_rmf_loop: + cmp r4,r11 + ble CPDO_rmf_loop_end + + sub r4,r4,#1 + + adds r3,r3,r3 + adcs r2,r2,r2 + bcs CPDO_rmf_loop_anyway + +CPDO_rmf_loop_entry: + cmp r2,r7 + cmpeq r3,r8 + bcc CPDO_rmf_loop_0 + +CPDO_rmf_loop_anyway: + subs r3,r3,r8 + sbc r2,r2,r7 + mov r5,#1 + b CPDO_rmf_loop + +CPDO_rmf_loop_end: + teq r2,#0 + teqeq r3,#0 + beq CPDO_rmf_created_zero + + //eor r1,r1,r6 // only if result not zero + + mov r6,r2,lsr#31 + mov r11,r2,lsl#1 + orr r11,r11,r3,lsr#31 + + cmp r6,#0 + cmpeq r11,r7 + rsbeqs r6,r8,r3,lsl#1 + cmpeq r5,#1 // for nearest-even + bcc CPDO_rmf_norm + + eor r1,r1,#0x80000000 + subs r3,r8,r3 + sbc r2,r7,r2 + +CPDO_rmf_norm: + teq r2,#0 // normalize 32 bit + moveq r2,r3 + moveq r3,#0 + subeq r4,r4,#32 + + cmp r2,#0x00010000 // normalize 16 bit + movcc r2,r2,lsl#16 + orrcc r2,r2,r3,lsr#16 + movcc r3,r3,lsl#16 + subcc r4,r4,#16 + + cmp r2,#0x01000000 // normalize 8 bit + movcc r2,r2,lsl#8 + orrcc r2,r2,r3,lsr#24 + movcc r3,r3,lsl#8 + subcc r4,r4,#8 + + cmp r2,#0x10000000 // normalize 4 bit + 
movcc r2,r2,lsl#4 + orrcc r2,r2,r3,lsr#28 + movcc r3,r3,lsl#4 + subcc r4,r4,#4 + + cmp r2,#0x40000000 // normalize 2 bit + movcc r2,r2,lsl#2 + orrcc r2,r2,r3,lsr#30 + movcc r3,r3,lsl#2 + subcc r4,r4,#2 + + cmp r2,#0x80000000 // normalize 1 bit + movcc r2,r2,lsl#1 + orrcc r2,r2,r3,lsr#31 + movcc r3,r3,lsl#1 + subcc r4,r4,#1 + + mov r5,#0 + mov pc,r14 + +CPDO_rmf_created_zero: + mov r4,#0x80000000 + stmia r0,{r1,r2,r3,r4} b fastfpe_next +@ mov pc,r14 + +CPDO_rmf_smaller: + add r5,r4,#1 + cmp r5,r11 + blt CPDO_rmf_norm + cmp r2,r7 + cmpeq r3,r8 + bls CPDO_rmf_norm + + eor r1,r1,#0x80000000 + adds r8,r8,r8 + adc r7,r7,r7 + subs r3,r8,r3 + sbc r2,r7,r2 + b CPDO_rmf_norm + +CPDO_rmf_zero: + cmp r11,#0x80000000 + beq CPDO_rmf_generate_qnan + stmia r0,{r1,r2,r3,r4} + b fastfpe_next +@ mov pc,r14 + +CPDO_rmf_infnan: + cmp r4,#0x7fffffff + beq CPDO_rmf_1st_infnan + + orrs r5,r8,r7,lsl#1 // ignore MSB + beq CPDO_rmf_2nd_inf + mov r1,r6 + mov r2,r7 + mov r3,r8 + mov r4,r11 + b CPDO_rmf_1st_or_2nd_nan + +CPDO_rmf_2nd_inf: + mov pc,r14 // result = 1st operand + +CPDO_rmf_1st_infnan: + cmp r11,#0x7fffffff + beq CPDO_rmf_both_infnan + + orrs r5,r3,r2,lsl#1 // 1st inf? + bne CPDO_rmf_1st_or_2nd_nan + + b CPDO_rmf_generate_qnan + +CPDO_rmf_1st_or_2nd_nan: + tst r2,#0x40000000 + beq CPDO_rmf_generate_qnan + mov pc,r14 // qnan1/2 already/copied there + +CPDO_rmf_both_infnan: + orrs r5,r3,r2,lsl#1 // ignore MSB + beq CPDO_rmf_both_infnan_1st_inf + orrs r5,r8,r7,lsl#1 // ignore MSB + beq CPDO_rmf_both_infnan_2nd_inf + tst r2,#0x40000000 + tstne r7,#0x40000000 + beq CPDO_rmf_generate_qnan + mov pc,r14 + +CPDO_rmf_both_infnan_1st_inf: + tst r7,#0x40000000 // 2nd inf or SNaN ? + beq CPDO_rmf_generate_qnan + mov r1,r6 + mov r2,r7 + mov r3,r8 + mov r4,r11 // copy 2nd QNaN + mov pc,r14 + +CPDO_rmf_both_infnan_2nd_inf: + tst r2,#0x40000000 // 1st SNaN ? + beq CPDO_rmf_generate_qnan + mov pc,r14 + +CPDO_rmf_generate_qnan: + mov r1,#0x80000000 + mov r2,#0x7fffffff + mov r3,#0xffffffff + mov r4,#0x7fffffff + ldr r5,[r10,#128] + orr r5,r5,#1 + str r5,[r10,#128] + mov pc,r14 /*---------------------------------------------------------------------------*/ @@ -447,8 +1404,8 @@ .globl CPDO_mvf CPDO_mvf: ldmia r2,{r1,r2,r3,r4} - stmia r0,{r1,r2,r3,r4} - b fastfpe_next + mov r5,#0 + mov pc,r14 /*---------------------------------------------------------------------------*/ @@ -456,8 +1413,8 @@ CPDO_mnf: ldmia r2,{r1,r2,r3,r4} eor r1,r1,#0x80000000 - stmia r0,{r1,r2,r3,r4} - b fastfpe_next + mov r5,#0 + mov pc,r14 /*---------------------------------------------------------------------------*/ @@ -481,7 +1438,8 @@ tst r4,r4,lsr#1 @carry=exponent bit 0 bcc CPDO_sqt_exponenteven adds r3,r3,r3 - adcs r2,r2,r2 @carry is needed in loop! 
+ adc r2,r2,r2 + cmp r2,#0x20000000 @set carry for loop CPDO_sqt_exponenteven: mov r4,r4,asr #1 str r4,[r0,#12] @@ -570,113 +1528,178 @@ CPDO_rnd: ldmia r2,{r1,r2,r3,r5} bl CPDO_rnd_core - -CPDO_rnd_store: + ldr r6,[r10,#128] stmia r0,{r1,r2,r3,r5} + orr r6,r6,r4 + str r6,[r10,#128] b fastfpe_next /*---------------------------------------------------------------------------*/ .globl CPDO_rnd_core CPDO_rnd_core: - and r4,r4,#0x00000060 - add pc,pc,r4,lsr#3 + and r6,r4,#0x00000060 + mov r4,#0 // for return of exception flags + cmp r5,#63 + bge CPDO_rnd_big + add pc,pc,r6,lsr#3 mov r0,r0 - b CPDO_rnd_N + b CPDO_rnd_NE b CPDO_rnd_P b CPDO_rnd_M b CPDO_rnd_Z - -CPDO_rnd_N: - cmp r5,#-1 - blt CPDO_rnd_zero - cmp r5,#63 - movge pc,r14 - mov r4,#0x40000000 - cmp r5,#31 - bge CPDO_rnd_N_2 - adds r2,r2,r4,lsr r5 - bcc CPDO_rnd_end - b CPDO_rnd_end_norm +CPDO_rnd_NE: + cmp r5,#0 + blt CPDO_rnd_NE_01 -CPDO_rnd_N_2: -CPDO_rnd_P_2: - sub r6,r5,#32 - adds r3,r3,r4,ror r6 @ror ist needed to handle a -1 correctly + subs r6,r5,#31 + bpl CPDO_rnd_NE_2 + mov r7,#0x40000000 + mov r8,#0x7fffffff + mov r7,r7,lsr r5 + mov r8,r8,lsr r5 + teq r3,#0 + tsteq r2,r8 + orrne r4,r4,#16 // set inexact flag + adds r2,r2,r7 + bcs CPDO_rnd_overflow + teq r3,#0 + tsteq r2,r8 + beq CPDO_rnd_NE_equal + mov r3,#0 + bic r2,r2,r8 + mov pc,r14 + +CPDO_rnd_NE_2: + mov r7,#0x80000000 + mov r8,#0xffffffff + mov r7,r7,lsr r6 + mov r8,r8,lsr r6 + tst r3,r8 + orrne r4,r4,#16 // set inexact flag + adds r3,r3,r7 adcs r2,r2,#0 - bcc CPDO_rnd_end - b CPDO_rnd_end_norm + bcs CPDO_rnd_overflow + tst r3,r8 + beq CPDO_rnd_NE_equal + bic r3,r3,r8 + mov pc,r14 -CPDO_rnd_P: - tst r1,#0x80000000 - bne CPDO_rnd_M_entry -CPDO_rnd_P_entry: - cmp r5,#0 - blt CPDO_rnd_P_small - cmp r5,#63 - movge pc,r14 - mov r4,#0x7fffffff - cmp r5,#32 - bge CPDO_rnd_P_2 +CPDO_rnd_NE_equal: + mov r7,#0x80000000 + subs r6,r5,#32 + bicpl r3,r3,r7,lsr r6 + bicmi r2,r2,r7,lsr r5 + mov pc,r14 - adds r3,r3,#0xffffffff - adcs r2,r2,r4,lsr r5 - bcc CPDO_rnd_end - b CPDO_rnd_end_norm +CPDO_rnd_NE_01: + cmp r5,#-1 + bne CPDO_rnd_0 + cmp r2,#0x80000000 + cmpeq r3,#0 + beq CPDO_rnd_0 -CPDO_rnd_P_small: - cmp r5,#0x80000000 - moveq pc,r14 - b CPDO_rnd_one + mov r2,#0x80000000 + mov r3,#0 + mov r5,#0 + orr r4,r4,#16 // set inexact flag + mov pc,r14 +CPDO_rnd_P: + teq r1,#0 + beq CPDO_rnd_NZ + b CPDO_rnd_Z + CPDO_rnd_M: - tst r1,#0x80000000 - bne CPDO_rnd_P_entry -CPDO_rnd_M_entry: - cmp r5,#0 - blt CPDO_rnd_zero - cmp r5,#63 - movge pc,r14 - - b CPDO_rnd_end + teq r1,#0 + beq CPDO_rnd_Z + b CPDO_rnd_NZ CPDO_rnd_Z: - cmp r5,#0 - blt CPDO_rnd_zero - cmp r5,#63 - movge pc,r14 - b CPDO_rnd_end + cmp r5,#0 // smaller than 1 will be 0 + blt CPDO_rnd_0 -CPDO_rnd_end_norm: - add r5,r5,#1 - movs r2,r2,rrx - mov r3,r3,rrx -CPDO_rnd_end: - rsbs r4,r5,#31 - bmi CPDO_rnd_end_2 - mov r3,#0 - mov r2,r2,lsr r4 - mov r2,r2,lsl r4 + rsbs r6,r5,#31 + bmi CPDO_rnd_Z_2 + cmp r3,#0 + mov r3,#0 + mov r7,r2,lsr r6 + teqeq r2,r7,lsl r6 + mov r2,r7,lsl r6 + orrne r4,r4,#16 // set inexact flag mov pc,r14 -CPDO_rnd_end_2: - rsb r4,r5,#63 - mov r3,r3,lsr r4 - mov r3,r3,lsl r4 +CPDO_rnd_Z_2: + rsb r6,r5,#63 + mov r7,r3,lsr r6 + teq r3,r7,lsl r6 + mov r3,r7,lsl r6 + orrne r4,r4,#16 // set inexact flag mov pc,r14 -CPDO_rnd_one: +CPDO_rnd_0: + cmp r5,#0x80000000 + moveq pc,r14 // already 0 -> ok + + mov r2,#0 + mov r3,#0 + mov r5,#0x80000000 + orr r4,r4,#16 // set inexact flag + mov pc,r14 + +CPDO_rnd_NZ: + cmp r5,#0 // smaller than 1 will be stay 0 or become 1 + blt CPDO_rnd_NZ_01 + + mov r7,#0x7fffffff + subs 
r6,r5,#32 + bpl CPDO_rnd_NZ_2 + mov r7,r7,lsr r5 + teq r3,#0 + tsteq r2,r7 + orrne r4,r4,#16 // set inexact flag + adds r3,r3,#0xffffffff + adcs r2,r2,r7 + bcs CPDO_rnd_overflow + mov r3,#0 + bic r2,r2,r7 + mov pc,r14 + +CPDO_rnd_NZ_2: + mov r7,r7,lsr r6 + tst r3,r7 + orrne r4,r4,#16 // set inexact flag + adds r3,r3,r7 + adcs r2,r2,#0 + bcs CPDO_rnd_overflow + bic r3,r3,r7 + mov pc,r14 + +CPDO_rnd_NZ_01: + cmp r5,#0x80000000 + moveq pc,r14 // already 0 -> ok + mov r2,#0x80000000 mov r3,#0 mov r5,#0 + orr r4,r4,#16 // set inexact flag mov pc,r14 - -CPDO_rnd_zero: - mov r1,#0 - mov r2,#0 + +CPDO_rnd_overflow: + mov r2,#0x80000000 mov r3,#0 - mov r5,#0x80000000 + add r5,r5,#1 mov pc,r14 +CPDO_rnd_big: + cmp r5,#0x7fffffff + movne pc,r14 // just big + orrs r6,r3,r2,lsl#1 // ignore MSB + moveq pc,r14 // infinity + tst r2,#0x40000000 // signalling NaN ? + orreq r4,r4,#1 // set invalid operation flag + orreq r2,r2,#0x40000000 // make quiet NaN + mov pc,r14 + /*---------------------------------------------------------------------------*/ Index: linux/arch/arm/fastfpe/module.c =================================================================== --- pre/linux/arch/arm/fastfpe/module.c (revision 26) +++ post/linux/arch/arm/fastfpe/module.c (working copy) @@ -33,30 +33,16 @@ static void (*orig_fp_enter)(void); /* old kern_fp_enter value */ extern void (*kern_fp_enter)(void); /* current FP handler */ extern void fastfpe_enter(void); /* forward declarations */ +extern int fastfpe_test(void); /* long multiply available ? */ -#ifdef MODULE -/* - * Return 0 if we can be unloaded. This can only happen if - * kern_fp_enter is still pointing at fastfpe_enter - */ -static int fpe_unload(void) -{ - return (kern_fp_enter == fastfpe_enter) ? 0 : 1; -} -#endif - static int __init fpe_init(void) { -#ifdef MODULE - if (!mod_member_present(&__this_module, can_unload)) - return -EINVAL; - __this_module.can_unload = fpe_unload; -#else if (fpe_type[0] && strcmp(fpe_type, "fastfpe")) return 0; -#endif - printk("Fast Floating Point Emulator V0.9 (c) Peter Teichmann.\n"); + printk("Fast Floating Point Emulator V0.94"); + if (fastfpe_test() == 1) printk("M"); + printk(" by Peter Teichmann.\n"); /* Save pointer to the old FP handler and then patch ourselves in */ orig_fp_enter = kern_fp_enter; Index: linux/arch/arm/fastfpe/round.S =================================================================== --- pre/linux/arch/arm/fastfpe/round.S (revision 0) +++ post/linux/arch/arm/fastfpe/round.S (revision 0) @@ -0,0 +1,912 @@ + +/* +Rounds fp register r1-r4, additional mantissa bits in r5 and stores result +at address r0. Returns to fastfpe_next. 
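+
+The table below is indexed by the rounding-mode and precision bits of the
+instruction word, as prepared in entry.S (CPDO_constback). A rough C sketch of
+that index computation, with an invented helper name purely for illustration:
+
+    // entry.S builds r5 = (instr & 0xe0) | ((instr & 0x00080000) >> 11)
+    // and loads the handler from round_table[r5 >> 5] (word index).
+    unsigned int round_table_index(unsigned int instr)
+    {
+        unsigned int r5 = (instr & 0x000000e0) | ((instr & 0x00080000) >> 11);
+        return r5 >> 5;  // 0..15: {single,double,extended,undef} x {ne,p,m,z}
+    }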
+*/ + +/*------------------------------------------------------------------------*/ + + .data + .globl round_table +round_table: + .word round_single_ne + .word round_single_p + .word round_single_m + .word round_single_z + .word round_double_ne + .word round_double_p + .word round_double_m + .word round_double_z + .word round_extended_ne + .word round_extended_p + .word round_extended_m + .word round_extended_z + .word round_undef + .word round_undef + .word round_undef + .word round_undef + +/*------------------------------------------------------------------------*/ + + .text +round_single_ne: + cmp r4,#127 + bgt round_single_nz_ne_overflow + cmp r4,#-126-23-1 + blt round_single_z_ne_underflow + cmp r4,#-126 + blt round_single_ne_denormalized + + adds r6,r2,#0x80 // add 0x80.00000000.00000000 to + bcs round_single_add_ov // mantissa and additional bits + + teq r5,#0 + teqeq r3,#0 + tsteq r2,#0xff // test for inexact + + ldrne r7,[r10,#128] + orrne r7,r7,#16 // set inexact flag + strne r7,[r10,#128] + + teq r5,#0 + teqeq r3,#0 + tsteq r6,#0xff + biceq r6,r6,#0x100 // the even thingy + + mov r3,#0 // remove bits not existing in single + bic r2,r6,#0xff // remove bits not existing in single + stmia r0,{r1-r4} + b fastfpe_next + +round_single_ne_denormalized: + add r7,r4,#150 + mov r6,#0xffffffff + mov r6,r6,lsr r7 + + teq r5,#0 + teqeq r3,#0 + tsteq r2,r6 + ldrne r8,[r10,#128] + orrne r8,r8,#16+8 // set inexact, underflow flag + strne r8,[r10,#128] + + mov r8,#0x80000000 + mov r8,r8,lsr r7 + adds r2,r2,r8 + bcs round_single_ne_denormalized_ov + + teq r5,#0 + teqeq r3,#0 + tsteq r2,r6 + biceq r2,r2,r8,lsl #1 // the even thingy + + mov r3,#0 + bic r2,r2,r6 // removing bits not existing in single + stmia r0,{r1-r4} + b fastfpe_next + +round_single_ne_denormalized_ov: + cmp r4,#-150 + cmpeq r3,#0 + cmpeq r2,#0 + beq round_single_z_ne_underflow // 1.0*2^-150 to zero! + add r4,r4,#1 + cmp r4,#-126 // left denormalized range ? + cmpge r2,#0x80 // yes -> overflow also without denormalisation ? + ldrge r5,[r10,#128] + bicge r5,r5,#8 // yes -> clear underflow flag + strge r5,[r10,#128] + mov r3,#0 + mov r2,#0x80000000 + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_single_p: + teq r1,#0 + beq round_single_nz + b round_single_z + +/*------------------------------------------------------------------------*/ + +round_single_m: + teq r1,#0 + beq round_single_z + b round_single_nz + +/*------------------------------------------------------------------------*/ + +round_single_z: + cmp r4,#127 + bgt round_single_z_overflow + cmp r4,#-126-23 + blt round_single_z_ne_underflow + cmp r4,#-126 + blt round_single_z_denormalized + + teq r5,#0 + teqeq r3,#0 + tsteq r2,#0xff // testing for inexact + ldrne r5,[r10,#128] + orrne r5,r5,#16 // set inexact flag + strne r5,[r10,#128] + + mov r3,#0 + bic r2,r2,#0xff // removing bits not existing in single + stmia r0,{r1-r4} + b fastfpe_next + +round_single_z_overflow: + cmp r4,#0x7fffffff + beq round_single_infnan + + ldrne r5,[r10,#128] + orrne r5,r5,#16+4 // set inexact,overflow flag + strne r5,[r10,#128] + mov r2,#0xffffff00 + mov r3,#0 + mov r4,#127 // biggest non-infinity single + stmia r0,{r1-r4} + b fastfpe_next + +round_single_infnan: + orrs r5,r3,r2,lsl#1 // is it Inf? ignore MSB + beq round_single_infnan_store + tst r2,#0x40000000 // is it a SNaN? 
+ beq round_single_infnan_create_qnan + mov r3,#0 // these bits can not be stored + bic r2,r2,#0xff // in single precision +round_single_infnan_store: + stmia r0,{r1-r4} + b fastfpe_next + +round_single_infnan_create_qnan: + mov r1,#0x80000000 + mov r2,#0xffffff00 + bic r2,r2,#0x80000000 // r2 = 0x7fffff00 + mov r3,#0 + ldr r5,[r10,#128] + orr r5,r5,#1 // set invalid operation flag + str r5,[r10,#128] + stmia r0,{r1-r4} + b fastfpe_next + +round_single_z_ne_underflow: + cmp r4,#0x80000000 + beq round_single_z_zero + ldrne r5,[r10,#128] + orrne r5,r5,#16+8 // set inexact, underflow flag + strne r5,[r10,#128] + mov r2,#0 + mov r3,#0 + mov r4,#0x80000000 // was by ERROR -127 +round_single_z_zero: + stmia r0,{r1-r4} + b fastfpe_next + +round_single_z_denormalized: + mov r6,#0xffffffff + add r7,r4,#150 + + teq r5,#0 + teqeq r3,#0 + tsteq r2,r6,lsr r7 // testing for tinyness + ldrne r5,[r10,#128] + orrne r5,r5,#16+8 // set inexact, undeflow flag + strne r5,[r10,#128] + + mov r3,#0 + bic r2,r2,r6,lsr r7 // removing bits not existing in single + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_single_nz: + cmp r4,#127 + bgt round_single_nz_ne_overflow + cmp r4,#-126-23 + blt round_single_nz_underflow + cmp r4,#-126 + blt round_single_nz_denormalized + + adds r5,r5,#0xffffffff + adcs r3,r3,#0xffffffff // add 0xff.ffffffff.ffffffff to + adcs r2,r2,#0xff // mantissa and additional bits + bcs round_single_add_ov + + cmp r5,#0xffffffff + cmpeq r3,#0xffffffff + andeq r5,r2,#0xff + cmpeq r5,#0xff // test for inexact + + bic r2,r2,#0xff // remove bits not existing in single + +round_single_add_ov_back: + ldrne r5,[r10,#128] + orrne r5,r5,#16 // set inexact flag + strne r5,[r10,#128] + + mov r3,#0 // remove bits not existing in single + stmia r0,{r1-r4} + b fastfpe_next + +round_single_add_ov: + add r4,r4,#1 + cmp r4,#127 + bgt round_single_nz_ne_overflow + movs r2,#0x80000000 // so that inexact flag gets set !!! + b round_single_add_ov_back + +round_single_nz_ne_overflow: + cmp r4,#0x7fffffff + beq round_single_infnan + + ldrne r5,[r10,#128] + orrne r5,r5,#16+4 // set inexact,overflow flag + strne r5,[r10,#128] + mov r2,#0x80000000 // set MSB + mov r3,#0 + mov r4,#0x7fffffff + stmia r0,{r1-r4} + b fastfpe_next + +round_single_nz_underflow: + cmp r4,#0x80000000 + beq round_single_nz_zero + + ldrne r5,[r10,#128] + orrne r5,r5,#16+8 // set inexact, underflow flag + strne r5,[r10,#128] + mov r2,#0x80000000 + mov r3,#0 + mov r4,#-149 // smallest non-zero single +round_single_nz_zero: + stmia r0,{r1-r4} + b fastfpe_next + +round_single_nz_denormalized: + mov r6,#0xffffffff + add r7,r4,#150 + mov r6,r6,lsr r7 + + teq r5,#0 + teqeq r3,#0 + tsteq r2,r6 + ldrne r8,[r10,#128] + orrne r8,r8,#16+8 // set inexact, underflow flag + strne r8,[r10,#128] + + adds r5,r5,#0xffffffff + adcs r3,r3,#0xffffffff + adcs r2,r2,r6 + bcs round_single_nz_denormalized_ov + + mov r3,#0 + bic r2,r2,r6 // removing bits not existing in single + stmia r0,{r1-r4} + b fastfpe_next + +round_single_nz_denormalized_ov: + add r4,r4,#1 + cmp r4,#-126 // left denormalized range ? + cmpge r2,#0x100 // yes -> overflow also without denormalisation ? + ldrge r5,[r10,#128] + bicge r5,r5,#8 // yes -> clear underflow flag + strge r5,[r10,#128] + mov r3,#0 + mov r2,#0x80000000 + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_double_ne: + mov r7,#0xffffffff // to generate e.g. 
0x7ff + + cmp r4,#1024 + bge round_double_nz_ne_overflow + add r6,r4,#1024 + cmp r6,#-1022+1024 + blt round_double_ne_denormalized + + teq r5,#0 + tsteq r3,r7,lsr#32-11 // testing for inexact + ldrne r6,[r10,#128] + orrne r6,r6,#16 // set inexact flag + strne r6,[r10,#128] + + adds r3,r3,#0x400 // add 0x0.00000400.00000000 to + adcs r2,r2,#0 // mantissa and additional bits + bcs round_double_add_ov + + teq r5,#0 + tsteq r3,r7,lsr#32-11 + biceq r3,r3,#0x800 // the even thingy + + bic r3,r3,r7,lsr#32-11 // remove bits not existing in double + + stmia r0,{r1-r4} + b fastfpe_next + +round_double_ne_denormalized: + cmp r6,#-1022-52-1+1024 + blt round_double_z_ne_underflow + + adds r6,r6,#1022+53-32-1024 + + addmi r6,r6,#32 + movmi r6,r7,lsr r6 + + movpl r7,r7,lsr r6 + movpl r6,#0 + + teq r5,#0 + tsteq r3,r7 + tsteq r2,r6 // testing for tinyness + ldrne r8,[r10,#128] + orrne r8,r8,#16+8 // set inexact, undeflow flag + strne r8,[r10,#128] + + bics r8,r6,r6,lsr#1 // generate ...0001000... + movne r11,#0 // from ...0001111... + biceq r11,r7,r7,lsr#1 // 64bit + + adds r3,r3,r11 + adcs r2,r2,r8 + bcs round_double_ne_denormalized_ov + + teq r5,#0 + tsteq r3,r7 + tsteq r2,r6 + bne round_double_ne_denormalized_noeventhingy + adds r11,r11,r11 + adc r8,r8,r8 + bic r3,r3,r11 + bic r2,r2,r8 // the even thingy + +round_double_ne_denormalized_noeventhingy: + bic r3,r3,r7 // removing bits not existing in + bic r2,r2,r6 // denormalized double + stmia r0,{r1-r4} + b fastfpe_next + +round_double_ne_denormalized_ov: + add r6,r4,#1024 + cmp r6,#-1023-52+1024 + cmpeq r3,#0 + cmpeq r2,#0 + beq round_single_z_ne_underflow // 1.0*2^(-1023-52) to zero! + add r4,r4,#1 + cmp r6,#-1022-1+1024 // left denormalized range ? + cmpge r3,#0x400 // yes -> overflow also without denormalisation ? + ldrge r5,[r10,#128] + bicge r5,r5,#8 // yes -> clear underflow flag + strge r5,[r10,#128] + mov r3,#0 + mov r2,#0x80000000 + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_double_p: + teq r1,#0 + beq round_double_nz + b round_double_z + +/*------------------------------------------------------------------------*/ + +round_double_m: + teq r1,#0 + beq round_double_z + b round_double_nz + +/*------------------------------------------------------------------------*/ + +round_double_z: + mov r7,#0xffffffff + + cmp r4,#1024 + bge round_double_z_overflow + add r6,r4,#1024 + cmp r6,#-1022+1024 + blt round_double_z_denormalized + + teq r5,#0 + tsteq r3,r7,lsr#32-11 // testing for inexact + ldrne r5,[r10,#128] + orrne r5,r5,#16 // set inexact flag + strne r5,[r10,#128] + + bic r3,r3,r7,lsr#32-11 // removing bits not existing in double + stmia r0,{r1-r4} + b fastfpe_next + +round_double_z_overflow: + cmp r4,#0x7fffffff + beq round_double_infnan + + ldrne r5,[r10,#128] + orrne r5,r5,#16+4 // set inexact,overflow flag + strne r5,[r10,#128] + mov r2,#0xffffffff + mov r3,r2,lsl#11 // 0xfffff800 + mov r4,#1024 + sub r4,r4,#1 // 1023; biggest non-infinity double + stmia r0,{r1-r4} + b fastfpe_next + +round_double_infnan: + orrs r5,r3,r2,lsl#1 // is it Inf? ignore MSB + beq round_double_infnan_store + tst r2,#0x40000000 // is it a SNaN? 
+ beq round_double_infnan_create_qnan + bic r3,r3,r7,lsr#32-11 // clear bits not in double +round_double_infnan_store: + stmia r0,{r1-r4} + b fastfpe_next + +round_double_infnan_create_qnan: + mov r1,#0x80000000 + mov r2,#0x7fffffff + mov r3,r2,lsl#11 // 0xfffff800 + ldr r5,[r10,#128] + orr r5,r5,#1 // set invalid operation flag + str r5,[r10,#128] + b round_double_infnan_store + +round_double_z_ne_underflow: + cmp r4,#0x80000000 + beq round_double_z_zero + ldr r5,[r10,#128] + orr r5,r5,#16+8 // set inexact, underflow flag + str r5,[r10,#128] + mov r2,#0 + mov r3,#0 + mov r4,#0x80000000 +round_double_z_zero: + stmia r0,{r1-r4} + b fastfpe_next + +round_double_z_denormalized: + cmp r6,#-1022-52+1024 + blt round_double_z_ne_underflow + + adds r6,r6,#1022+53-32-1024 + + addmi r6,r6,#32 + movmi r6,r7,lsr r6 + + movpl r7,r7,lsr r6 + movpl r6,#0 + + teq r5,#0 + tsteq r3,r7 + tsteq r2,r6 // testing for tinyness + ldrne r5,[r10,#128] + orrne r5,r5,#16+8 // set inexact, undeflow flag + strne r5,[r10,#128] + + bic r3,r3,r7 // rmoving bits not existing in + bic r2,r2,r6 // denormalized double + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_double_nz: + mov r7,#0xffffffff // to generate e.g. 0x7ff + + cmp r4,#1024 + bge round_double_nz_ne_overflow + add r6,r4,#1024 + cmp r6,#-1022+1024 + blt round_double_nz_denormalized + + teq r5,#0 + tsteq r3,r7,lsr#32-11 // testing for inexact + ldrne r6,[r10,#128] + orrne r6,r6,#16 // set inexact flag + strne r6,[r10,#128] + + adds r5,r5,#0xffffffff + adcs r3,r3,r7,lsr#32-11 // add 0x0.000007ff.ffffffff to + adcs r2,r2,#0 // mantissa and additional bits + bcs round_double_add_ov + + bic r3,r3,r7,lsr#32-11 // remove bits not existing in double + + stmia r0,{r1-r4} + b fastfpe_next + +round_double_add_ov: + add r4,r4,#1 + cmp r4,#1024 + bge round_double_nz_ne_overflow + +// ldrne r6,[r10,#128] +// orrne r6,r6,#16 // set inexact flag +// strne r6,[r10,#128] + mov r2,#0x80000000 + mov r3,#0 + stmia r0,{r1-r4} + b fastfpe_next + +round_double_nz_ne_overflow: + cmp r4,#0x7fffffff + beq round_double_infnan + + ldrne r5,[r10,#128] + orrne r5,r5,#16+4 // set inexact,overflow flag + strne r5,[r10,#128] + mov r2,#0x80000000 // set MSB + mov r3,#0 + mov r4,#0x7fffffff + stmia r0,{r1-r4} + b fastfpe_next + +round_double_nz_underflow: + cmp r4,#0x80000000 + beq round_double_nz_zero + + ldrne r5,[r10,#128] + orrne r5,r5,#16+8 // set inexact, underflow flag + strne r5,[r10,#128] + mov r2,#0x80000000 + mov r3,#0 + mov r4,#-1074+1024 + sub r4,r4,#1024 // smallest non-zero double +round_double_nz_zero: + stmia r0,{r1-r4} + b fastfpe_next + +round_double_nz_denormalized: + cmp r6,#-1022-52+1024 + blt round_double_nz_underflow + + adds r6,r6,#1022+53-32-1024 + + addmi r6,r6,#32 + movmi r6,r7,lsr r6 + + movpl r7,r7,lsr r6 + movpl r6,#0 + + teq r5,#0 + tsteq r3,r7 + tsteq r2,r6 // testing for tinyness + ldrne r8,[r10,#128] + orrne r8,r8,#16+8 // set inexact, undeflow flag + strne r8,[r10,#128] + + adds r5,r5,#0xffffffff + adcs r3,r3,r7 + adcs r2,r2,r6 + bcs round_double_nz_denormalized_ov + + bic r3,r3,r7 // rmoving bits not existing in + bic r2,r2,r6 // denormalized double + stmia r0,{r1-r4} + b fastfpe_next + +round_double_nz_denormalized_ov: + add r4,r4,#1 + add r6,r4,#1024 + cmp r6,#-1022+1024 // left denormalized range ? + cmpge r3,#0x800 // yes -> overflow also without denormalisation ? 
+ ldrge r5,[r10,#128] + bicge r5,r5,#8 // yes -> clear underflow flag + strge r5,[r10,#128] + mov r3,#0 + mov r2,#0x80000000 + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_extended_ne: + mov r7,#0xffffffff // to generate e.g. 0x7ff + + cmp r4,#16384 + bge round_extended_nz_ne_overflow + add r6,r4,#16384 + cmp r6,#-16382+16384 + blt round_extended_ne_denormalized + + teq r5,#0 // testing for inexact + ldrne r6,[r10,#128] + orrne r6,r6,#16 // set inexact flag + strne r6,[r10,#128] + + adds r5,r5,#0x80000000 // add 0x0.00000400.00000000 to + adcs r3,r3,#0 // mantissa and additional bits + adcs r2,r2,#0 + bcs round_extended_add_ov + + teq r5,#0 + biceq r3,r3,#1 // the even thingy + + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_ne_denormalized: + cmp r6,#-16382-63-1+16384 + blt round_extended_z_ne_underflow + + adds r6,r6,#16382+64-32-16384 + + addmi r6,r6,#32 + movmi r6,r7,lsr r6 + + movpl r7,r7,lsr r6 + movpl r6,#0 + + teq r5,#0 + tsteq r3,r7 + tsteq r2,r6 // testing for tinyness + ldrne r8,[r10,#128] + orrne r8,r8,#16+8 // set inexact, undeflow flag + strne r8,[r10,#128] + + bics r8,r6,r6,lsr#1 // generate ...0001000... + movne r11,#0 // from ...0001111... + biceq r11,r7,r7,lsr#1 // 64bit + + adds r3,r3,r11 + adcs r2,r2,r8 + bcs round_extended_ne_denormalized_ov + + teq r5,#0 + tsteq r3,r7 + tsteq r2,r6 + bne round_extended_ne_denormalized_noeventhingy + adds r11,r11,r11 + adc r8,r8,r8 + bic r3,r3,r11 + bic r2,r2,r8 // the even thingy + +round_extended_ne_denormalized_noeventhingy: + bic r3,r3,r7 // removing bits not existing in + bic r2,r2,r6 // denormalized extended + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_ne_denormalized_ov: + add r6,r4,#16384 + cmp r6,#-16383-63+16384 + cmpeq r5,#0 + cmpeq r3,#0 + cmpeq r2,#0 + beq round_single_z_ne_underflow // 1.0*2^(-16383-63) to zero! + add r4,r4,#1 + cmp r6,#-16382-1+16384 // left denormalized range ? + blt round_extended_ne_still_denormalized + cmp r5,#0x80000000 // FIXME yes -> overflow also without denormalisation ? + ldrcs r5,[r10,#128] + biccs r5,r5,#8 // yes -> clear underflow flag + strcs r5,[r10,#128] +round_extended_ne_still_denormalized: + mov r3,#0 + mov r2,#0x80000000 + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_extended_p: + teq r1,#0 + beq round_extended_nz + b round_extended_z + +/*------------------------------------------------------------------------*/ + +round_extended_m: + teq r1,#0 + beq round_extended_z + b round_extended_nz + +/*------------------------------------------------------------------------*/ + +round_extended_z: + mov r7,#0xffffffff + + cmp r4,#16384 + bge round_extended_z_overflow + add r6,r4,#16384 + cmp r6,#-16382+16384 + blt round_extended_z_denormalized + + teq r5,#0 // testing for inexact + ldrne r5,[r10,#128] + orrne r5,r5,#16 // set inexact flag + strne r5,[r10,#128] + + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_z_overflow: + cmp r4,#0x7fffffff + beq round_extended_infnan + + ldrne r5,[r10,#128] + orrne r5,r5,#16+4 // set inexact,overflow flag + strne r5,[r10,#128] + mov r2,#0xffffffff + mov r3,#0xffffffff + mov r4,#16384 + sub r4,r4,#1 // 16383; biggest non-infinity extended + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_infnan: + orrs r5,r3,r2,lsl#1 // is it Inf? ignore MSB + beq round_extended_infnan_store + tst r2,#0x40000000 // is it a SNaN? 
+ beq round_extended_infnan_create_qnan + bic r3,r3,r7,lsr#32-11 // clear bits not in extended +round_extended_infnan_store: + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_infnan_create_qnan: + mov r1,#0x80000000 + mov r2,#0x7fffffff + mov r3,#0xffffffff + ldr r5,[r10,#128] + orr r5,r5,#1 // set invalid operation flag + str r5,[r10,#128] + b round_extended_infnan_store + +round_extended_z_ne_underflow: + cmp r4,#0x80000000 + beq round_extended_z_zero + ldr r5,[r10,#128] + orr r5,r5,#16+8 // set inexact, underflow flag + str r5,[r10,#128] + mov r2,#0 + mov r3,#0 + mov r4,#0x80000000 +round_extended_z_zero: + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_z_denormalized: + cmp r6,#-16382-63+16384 + blt round_extended_z_ne_underflow + + adds r6,r6,#16382+64-32-16384 + + addmi r6,r6,#32 + movmi r6,r7,lsr r6 + + movpl r7,r7,lsr r6 + movpl r6,#0 + + teq r5,#0 + tsteq r3,r7 + tsteq r2,r6 // testing for tinyness + ldrne r5,[r10,#128] + orrne r5,r5,#16+8 // set inexact, undeflow flag + strne r5,[r10,#128] + + bic r3,r3,r7 // removing bits not existing in + bic r2,r2,r6 // denormalized extended + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_extended_nz: + mov r7,#0xffffffff // to generate e.g. 0x7ff + + cmp r4,#16384 + bge round_extended_nz_ne_overflow + add r6,r4,#16384 + cmp r6,#-16382+16384 + blt round_extended_nz_denormalized + + teq r5,#0 // testing for inexact + ldrne r6,[r10,#128] + orrne r6,r6,#16 // set inexact flag + strne r6,[r10,#128] + + adds r5,r5,#0xffffffff + adcs r3,r3,#0 // add 0x0.0.ffffffff to + adcs r2,r2,#0 // mantissa and additional bits + bcs round_extended_add_ov + + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_add_ov: + add r4,r4,#1 + cmp r4,#16384 + bge round_extended_nz_ne_overflow + +// ldrne r6,[r10,#128] +// orrne r6,r6,#16 // set inexact flag +// strne r6,[r10,#128] + mov r2,#0x80000000 + mov r3,#0 + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_nz_ne_overflow: + cmp r4,#0x7fffffff + beq round_extended_infnan + + ldrne r5,[r10,#128] + orrne r5,r5,#16+4 // set inexact,overflow flag + strne r5,[r10,#128] + mov r2,#0x80000000 // set MSB + mov r3,#0 + mov r4,#0x7fffffff + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_nz_underflow: + cmp r4,#0x80000000 + beq round_extended_nz_zero + + ldrne r5,[r10,#128] + orrne r5,r5,#16+8 // set inexact, underflow flag + strne r5,[r10,#128] + mov r2,#0x80000000 + mov r3,#0 + mov r4,#-16445+16384 + sub r4,r4,#16384 // smallest non-zero extended +round_extended_nz_zero: + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_nz_denormalized: + cmp r6,#-16382-63+16384 + blt round_extended_nz_underflow + + adds r6,r6,#16382+64-32-16384 + + addmi r6,r6,#32 + movmi r6,r7,lsr r6 + + movpl r7,r7,lsr r6 + movpl r6,#0 + + teq r5,#0 + tsteq r3,r7 + tsteq r2,r6 // testing for tinyness + ldrne r8,[r10,#128] + orrne r8,r8,#16+8 // set inexact, undeflow flag + strne r8,[r10,#128] + + adds r5,r5,#0xffffffff + adcs r3,r3,r7 + adcs r2,r2,r6 + bcs round_extended_nz_denormalized_ov + + bic r3,r3,r7 // removing bits not existing in + bic r2,r2,r6 // denormalized extended + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_nz_denormalized_ov: + add r4,r4,#1 + add r6,r4,#16384 + cmp r6,#-16382+16384 // left denormalized range ? + cmpge r3,#1 // yes -> overflow also without denormalisation ? 
+ ldrge r5,[r10,#128] + bicge r5,r5,#8 // yes -> clear underflow flag + strge r5,[r10,#128] + mov r3,#0 + mov r2,#0x80000000 + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_undef: + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ Index: linux/arch/arm/fastfpe/CPRT.S =================================================================== --- pre/linux/arch/arm/fastfpe/CPRT.S (revision 26) +++ post/linux/arch/arm/fastfpe/CPRT.S (working copy) @@ -1,6 +1,5 @@ /* -The FP structure has 4 words reserved for each register, the first is used -just +The FP structure has 4 words reserved for each register, the first is used just for the sign in bit 31, the second and third are for the mantissa (unsigned integer, high 32 bit first) and the fourth is the exponent (signed integer). The mantissa is always normalized. @@ -8,11 +7,8 @@ If the exponent is 0x80000000, that is the most negative value, the number represented is 0 and both mantissa words are also 0. -If the exponent is 0x7fffffff, that is the biggest positive value, the -number -represented is infinity if the high 32 mantissa bit are also 0, otherwise it -is -a NaN. The low 32 mantissa bit are 0 if the number represented is infinity. +If the exponent is 0x7fffffff, that is the biggest positive value, the number +represented is infinity if the mantissa is 0, otherwise it is a NaN. Decimal and packed decimal numbers are not supported yet. */ @@ -24,11 +20,18 @@ CPRT_flt: add r0,r13,r0,lsr#10 ldr r2,[r0] + mov r0,r1 mov r3,#0 cmp r2,#0 beq CPRT_flt_zero + + ldr r6,=round_table + and r5,r4,#0x000000e0 + and r4,r4,#0x00080000 + orr r5,r5,r4,lsr#11 + ldr r6,[r6,r5,lsr#3] // address of rounding function - ands r0,r2,#0x80000000 + ands r1,r2,#0x80000000 rsbne r2,r2,#0 mov r4,#31 @@ -52,13 +55,14 @@ movcc r2,r2,lsl#1 subcc r4,r4,#1 - stmia r1,{r0,r2,r3,r4} - b fastfpe_next + mov r5,#0 + ldr r14,=fastfpe_next + mov pc,r6 CPRT_flt_zero: - mov r0,#0 + mov r1,#0 mov r4,#0x80000000 - stmia r1,{r0,r2,r3,r4} + stmia r0,{r1,r2,r3,r4} b fastfpe_next /*---------------------------------------------------------------------------*/ @@ -68,46 +72,60 @@ ldmia r2,{r1,r2,r3,r5} bl CPDO_rnd_core -CPRT_back: add r0,r13,r0,lsr#10 cmp r5,#0 - blt CPRT_int_zero + blt CPRT_fix_zero cmp r5,#30 - bgt CPRT_overflow - + bgt CPRT_fix_overflow + +CPRT_fix_no_overflow: rsb r5,r5,#31 mov r2,r2,lsr r5 tst r1,#0x80000000 rsbne r2,r2,#0 - +CPRT_fix_zero_back: str r2,[r0] + ldr r1,[r10,#128] + orr r1,r1,r4 // set flags possibly caused by rounding + str r1,[r10,#128] b fastfpe_next -CPRT_int_zero: +CPRT_fix_zero: mov r2,#0 - str r2,[r0] - b fastfpe_next + b CPRT_fix_zero_back -CPRT_overflow: +CPRT_fix_overflow: + cmp r1,#0x80000000 // -2^31 is not exactly an overflow ... 
+ cmpeq r2,#0x80000000 + cmpeq r5,#31 + beq CPRT_fix_no_overflow + mov r2,#0x80000000 tst r1,#0x80000000 subeq r2,r2,#1 str r2,[r0] + + ldr r1,[r10,#128] + orr r1,r1,#1 // set invalid operation flag + str r1,[r10,#128] b fastfpe_next /*---------------------------------------------------------------------------*/ .globl CPRT_wfs CPRT_wfs: + ldr r0,[r13,r0,lsr#10] + str r0,[r10,#128] b fastfpe_next /*---------------------------------------------------------------------------*/ .globl CPRT_rfs CPRT_rfs: - add r0,r13,r0,lsr#10 - mov r1,#0x02000000 @ Software Emulation, not Acorn FPE - str r1,[r0] + ldr r1,[r10,#128] + bic r1,r1,#0xff000000 + orr r1,r1,#0x02000000 @ Software Emulation, not Acorn FPE + str r1,[r13,r0,lsr#10] b fastfpe_next /*---------------------------------------------------------------------------*/ @@ -119,20 +137,29 @@ CPRT_cmf_e: ldr r0,[r13,#16*4] + bic r0,r0,#0xf0000000 cmp r7,#0x7fffffff - bic r0,r0,#0xf0000000 - - cmpeq r3,#0xffffffff - beq CPRT_cmf_unordered + beq CPRT_cmf_nan1 +CPRT_cmf_nixnan1: cmp r8,#0x7fffffff - cmpeq r4,#0xffffffff - beq CPRT_cmf_unordered + beq CPRT_cmf_nan2 +CPRT_cmf_nixnan2: cmp r1,r2 beq CPRT_cmf_equalsign - b CPRT_cmf_sign + b CPRT_cmf_signx +CPRT_cmf_nan1: + orrs r11,r5,r3,lsl#1 // ignore MSB + beq CPRT_cmf_nixnan1 + b CPRT_cmf_unordered + +CPRT_cmf_nan2: + orrs r11,r6,r4,lsl#1 // ignore MSB + beq CPRT_cmf_nixnan2 + b CPRT_cmf_unordered + CPRT_cmf_equalsign: cmp r7,r8 beq CPRT_cmf_equalexponent @@ -146,31 +173,39 @@ bhi CPRT_cmf_sign b CPRT_cmf_signb +CPRT_cmf_signx: + teq r7,#0x80000000 + teqeq r8,#0x80000000 + beq CPRT_cmf_equal CPRT_cmf_sign: - cmp r7,#0x80000000 @ (0.0 == -0.0)? - cmpeq r7,r8 - beq CPRT_cmf_equal tst r1,#0x80000000 - orreq r0,r0,#0x20000000 - orrne r0,r0,#0x80000000 + orreq r0,r0,#0x20000000 // PSR carry + orrne r0,r0,#0x80000000 // PSR negative str r0,[r13,#16*4] b fastfpe_next CPRT_cmf_signb: tst r1,#0x80000000 - orrne r0,r0,#0x20000000 - orreq r0,r0,#0x80000000 + orrne r0,r0,#0x20000000 // PSR carry + orreq r0,r0,#0x80000000 // PSR negative str r0,[r13,#16*4] b fastfpe_next CPRT_cmf_equal: - orr r0,r0,#0x60000000 + orr r0,r0,#0x60000000 // PSR carry, zero str r0,[r13,#16*4] b fastfpe_next CPRT_cmf_unordered: - orr r0,r0,#0x10000000 + ldr r1,[r10,#128] + orr r1,r1,#1 // set invalid operation flag + str r1,[r10,#128] + + tst r0,#1<<12 // FPSR AC bit set ? + orrne r0,r0,#0x20000000 // PSR carry + orr r0,r0,#0x10000000 // PSR overflow str r0,[r13,#16*4] + b fastfpe_next /*---------------------------------------------------------------------------*/ Index: linux/arch/arm/fastfpe/CPDT.S =================================================================== --- pre/linux/arch/arm/fastfpe/CPDT.S (revision 26) +++ post/linux/arch/arm/fastfpe/CPDT.S (working copy) @@ -8,8 +8,7 @@ represented is 0 and both mantissa words are also 0. If the exponent is 0x7fffffff, that is the biggest positive value, the number -represented is infinity if the high 32 mantissa bit are also 0, otherwise it is -a NaN. The low 32 mantissa bit are 0 if the number represented is infinity. +represented is infinity if the mantissa is 0, otherwise it is a NaN. Decimal and packed decimal numbers are not supported yet. 
*/ @@ -27,20 +26,20 @@ beq CPDT_ls_e0 @ exponent = 0; zero/denormalized teq r5,#255 beq CPDT_ls_e255 @ exponent = 255; infinity/NaN - + sub r5,r5,#127 @ r5 = exponent, remove normalized bias - + mov r3,r1,lsl#8 orr r3,r3,#0x80000000 mov r4,#0 @ r3,r4 = mantissa stmia r0,{r2-r5} b fastfpe_next - + CPDT_ls_e0: movs r3,r1,lsl#9 beq CPDT_load_zero - + mov r5,#-127 CPDT_ls_e0_norm: @@ -48,17 +47,19 @@ subeq r5,r5,#1 moveq r3,r3,lsl#1 beq CPDT_ls_e0_norm - + mov r4,#0 stmia r0,{r2-r5} b fastfpe_next - + CPDT_ls_e255: - mov r3,r1,lsl#9 + mov r3,r1,lsl#8 + bics r3,r3,#0x80000000 + orreq r3,r3,#0x80000000 // set MSB for inf mov r4,#0 mov r5,#0x7fffffff stmia r0,{r2-r5} - b fastfpe_next + b fastfpe_next CPDT_load_zero: mov r3,#0 @@ -73,19 +74,19 @@ CPDT_load_double: ldr r1,[r6] ldr r6,[r6,#4] - + and r2,r1,#0x80000000 @ r2 = sign - + mov r5,r1,lsr#20 bics r5,r5,#0x800 beq CPDT_ld_e0 @ exponent = 0; zero/denormalized add r4,r5,#1 teq r4,#2048 beq CPDT_ld_e2047 @ exponent = 2047; infinity/NaN - + add r5,r5,#1 sub r5,r5,#1024 @ r5 = exponent, remove normalized bias - + mov r3,r1,lsl#11 orr r3,r3,#0x80000000 orr r3,r3,r6,lsr #21 @@ -93,7 +94,7 @@ stmia r0,{r2-r5} b fastfpe_next - + CPDT_ld_e0: mov r3,r1,lsl#12 orr r3,r3,r6,lsr#20 @@ -103,27 +104,29 @@ mov r5,#1 sub r5,r5,#1024 - + CPDT_ld_e0_norm: tst r3,#0x80000000 - subeq r5,r5,#1 - moveqs r4,r4,lsl#1 - adceq r3,r3,r3 - beq CPDT_ld_e0_norm - + bne CPDT_ld_e0_norm_end + sub r5,r5,#1 + movs r4,r4,lsl#1 + adc r3,r3,r3 + b CPDT_ld_e0_norm +CPDT_ld_e0_norm_end: stmia r0,{r2-r5} b fastfpe_next CPDT_ld_e2047: - mov r3,r1,lsl#12 - orr r3,r3,r6,lsr#1 - bic r6,r6,#0x80000000 - orr r3,r3,r6 @ to get all fraction bits ! - mov r4,#0 + mov r3,r1,lsl#11 + orr r3,r3,r6,lsr #21 + bic r3,r3,#0x80000000 + mov r4,r6,lsl#11 @ r3,r4 = mantissa + orrs r5,r3,r4 + orreq r3,r3,#0x80000000 // set MSB fo inf mov r5,#0x7fffffff stmia r0,{r2-r5} b fastfpe_next - + /*---------------------------------------------------------------------------*/ .globl CPDT_load_extended @@ -132,19 +135,21 @@ ldr r3,[r6,#4] ldr r4,[r6,#8] - and r2,r1,#0x80000000 - bics r5,r1,#0x80000000 + and r2,r1,#0x8000 + mov r2,r2,lsl#16 + mov r5,r1,lsl#17 + movs r5,r5,lsr#17 beq CPDT_le_e0 add r1,r5,#1 - teq r4,#32768 + teq r1,#32768 beq CPDT_le_e32767 - + add r5,r5,#1 sub r5,r5,#16384 - + stmia r0,{r2-r5} b fastfpe_next - + CPDT_le_e0: teq r3,#0 teqeq r4,#0 @@ -153,12 +158,8 @@ mov r5,#2 sub r5,r5,#16384 b CPDT_ld_e0_norm - + CPDT_le_e32767: - mov r3,r3,lsl#1 - orr r3,r3,r4,lsr#1 - bic r4,r4,#0x80000000 - orr r3,r3,r4 mov r5,#0x7fffffff stmia r0,{r2-r5} b fastfpe_next @@ -181,10 +182,6 @@ cmp r4,#128 bge CPDT_ss_e255 - adds r2,r2,#1<<7 @ round to nearest - bcs CPDT_ss_rnd_ovfl @ very very seldom taken - -CPDT_ss_store: add r4,r4,#127 orr r1,r1,r4,lsl#23 @@ -194,46 +191,39 @@ str r1,[r6] b fastfpe_next -CPDT_ss_rnd_ovfl: - add r4,r4,#1 - cmp r4,#128 - bge CPDT_ss_e255 - - mov r2,#0x80000000 - mov r3,#0 - b CPDT_ss_store - CPDT_ss_e0: cmp r4,#-150 ble CPDT_ss_zero - + add r4,r4,#126 -CPDT_ss_unnormalize: - mov r2,r2,lsr#1 - adds r4,r4,#1 - bne CPDT_ss_unnormalize - + rsb r4,r4,#0 + mov r2,r2,lsr r4 + orr r1,r1,r2,lsr#8 - + CPDT_ss_zero: str r1,[r6] b fastfpe_next CPDT_ss_e255: - cmp r4,#0x7fffffff - bne CPDT_ss_inf - cmp r2,#0 - beq CPDT_ss_inf - - orr r1,r1,#0x00200000 @ for safety so that it is not INF - orr r1,r1,r2,lsr#9 @ get highest bit of mantissa - -CPDT_ss_inf: orr r1,r1,#0x7f000000 orr r1,r1,#0x00800000 + cmp r4,#0x7fffffff + movne r2,#0 + movne r3,#0 + bic r2,r2,#0x80000000 + orrs r4,r3,r2,lsl#24 // 
only bits not stored in single + bne CPDT_ss_nan_special // NaN must not become Inf +CPDT_ss_nan_back: + orr r1,r1,r2,lsr#8 str r1,[r6] b fastfpe_next +CPDT_ss_nan_special: + cmp r2,#1<<8 + movlt r2,#1<<8 + b CPDT_ss_nan_back + /*---------------------------------------------------------------------------*/ .globl CPDT_store_double @@ -246,38 +236,24 @@ cmp r0,#-1023+3 @ cmp with -1023 ble CPDT_sd_e0 - adds r3,r3,#1<<10 @ round to nearest - adcs r2,r2,#0 - bcs CPDT_sd_rnd_ovfl @ very very seldom taken - -CPDT_sd_store: sub r4,r4,#1 add r4,r4,#1024 orr r1,r1,r4,lsl#20 bic r2,r2,#0x80000000 orr r1,r1,r2,lsr#11 - + mov r2,r2,lsl#21 orr r2,r2,r3,lsr#11 - + stmia r6,{r1,r2} b fastfpe_next -CPDT_sd_rnd_ovfl: - add r4,r4,#1 - cmp r4,#1024 - bge CPDT_sd_e2047 - - mov r2,#0x80000000 - mov r3,#0 - b CPDT_sd_store - CPDT_sd_e0: add r0,r4,#1075-1024 cmp r0,#-1024 ble CPDT_sd_zero - + add r4,r4,#1024 sub r4,r4,#2 CPDT_sd_unnormalize: @@ -285,11 +261,11 @@ mov r3,r3,rrx adds r4,r4,#1 bne CPDT_sd_unnormalize - + orr r1,r1,r2,lsr#11 mov r2,r2,lsl#21 orr r2,r2,r3,lsr#11 - + stmia r6,{r1,r2} b fastfpe_next @@ -299,20 +275,27 @@ b fastfpe_next CPDT_sd_e2047: - cmp r4,#0x7fffffff - bne CPDT_sd_inf - cmp r2,#0 - beq CPDT_sd_inf - - orr r1,r1,#0x00040000 @ for safety so that it is not INF - orr r1,r1,r2,lsr#12 @ get highest bit of mantissa - -CPDT_sd_inf: orr r1,r1,#0x7f000000 orr r1,r1,#0x00f00000 + cmp r4,#0x7fffffff + movne r2,#0 + movne r3,#0 + movs r5,r3,lsl#21 // only bits not stored in double ! + bne CPDT_sd_nan_special +CPDT_sd_nan_back: + orr r1,r1,r2,lsr#11 + mov r2,r2,lsl#21 + orr r2,r2,r3,lsr#11 stmia r6,{r1,r2} b fastfpe_next +CPDT_sd_nan_special: + bics r2,r2,#0x80000000 + bne CPDT_sd_nan_back + cmp r3,#1<<11 + movlt r3,#1<<11 + b CPDT_sd_nan_back + /*---------------------------------------------------------------------------*/ .globl CPDT_store_extended @@ -324,10 +307,10 @@ add r0,r4,#63 cmp r0,#-16383+63 ble CPDT_se_e0 - + sub r4,r4,#1 add r4,r4,#16384 - orr r1,r1,r4 + orr r1,r4,r1,lsr#16 stmia r6,{r1-r3} b fastfpe_next @@ -336,7 +319,7 @@ add r0,r4,#16446-16384 cmp r0,#-16384 ble CPDT_se_zero - + add r4,r4,#16384 sub r4,r4,#2 CPDT_se_unnormalize: @@ -345,10 +328,12 @@ adds r4,r4,#1 bne CPDT_se_unnormalize + mov r1,r1,lsr#16 stmia r6,{r1-r3} b fastfpe_next CPDT_se_zero: + mov r1,r1,lsr#16 mov r2,#0 mov r3,#0 stmia r6,{r1-r3} @@ -356,19 +341,14 @@ CPDT_se_e32767: cmp r4,#0x7fffffff - bne CPDT_se_inf - cmp r2,#0 - beq CPDT_se_inf - - mov r2,r2,lsl#1 - orr r2,r2,#0x20000000 - -CPDT_se_inf: + movne r2,#0 + movne r3,#0 + mov r1,r1,lsr#16 orr r1,r1,#0x00007f00 orr r1,r1,#0x000000ff stmia r6,{r1-r3} b fastfpe_next - + /*---------------------------------------------------------------------------*/ .globl CPDT_store_decimal @@ -379,52 +359,38 @@ /*---------------------------------------------------------------------------*/ .globl CPDT_sfm +CPDT_sfm_loop: + add r0,r0,#1<<12 + and r0,r0,#7<<12 CPDT_sfm: - add r2,r10,r0,lsr#8 - ldr r4,[r2,#0] - ldr r3,[r2,#4] + add r7,r10,r0,lsr#8 + ldmia r7,{r2-r5} bic r3,r3,#0x80000000 - orr r3,r3,r4 - str r3,[r6],#4 - ldr r3,[r2,#8] - str r3,[r6],#4 - ldr r3,[r2,#12] - str r3,[r6],#4 + orr r3,r3,r2 + stmia r6!,{r3-r5} - add r0,r0,#1<<12 - and r0,r0,#7<<12 subs r1,r1,#1 - bne CPDT_sfm + bne CPDT_sfm_loop b fastfpe_next /*---------------------------------------------------------------------------*/ .globl CPDT_lfm -CPDT_lfm: - add r2,r10,r0,lsr#8 - ldr r4,[r6],#4 - and r3,r4,#0x80000000 - str r3,[r2,#0] - ldr r3,[r6],#4 - str r3,[r2,#8] - ldr r3,[r6],#4 - str r3,[r2,#12] 
- - cmp r3,#0x80000000 @ does the exp indicate zero? - biceq r4,r4,#0x80000000 @ if so, indicate 'denormalized' - beq CPDT_lfm_storer4 - cmp r3,#0x7fffffff @ does the exp indicate inf or NaN? - biceq r4,r4,#0x80000000 @ if so, indicate 'denormalized' - beq CPDT_lfm_storer4 - orrne r4,r4,#0x80000000 @ otherwise, set normalized bit - -CPDT_lfm_storer4: - str r4,[r2,#4] - +CPDT_lfm_loop: add r0,r0,#1<<12 and r0,r0,#7<<12 +CPDT_lfm: + add r7,r10,r0,lsr#8 + ldmia r6!,{r3-r5} + and r2,r3,#0x80000000 + cmp r5,#0x80000000 // check if the number was 0 + cmpne r5,#0x7fffffff // or inf/NaN + biceq r3,r3,#0x80000000 // yes -> clear mantissa MSB + orrne r3,r3,#0x80000000 // no -> set mantissa MSB + stmia r7,{r2-r5} + subs r1,r1,#1 - bne CPDT_lfm + bne CPDT_lfm_loop b fastfpe_next /*---------------------------------------------------------------------------*/ Index: linux/arch/arm/fastfpe/Makefile =================================================================== --- pre/linux/arch/arm/fastfpe/Makefile (revision 26) +++ post/linux/arch/arm/fastfpe/Makefile (working copy) @@ -11,7 +11,7 @@ obj-n := obj- := -fastfpe-objs := module.o entry.o CPDO.o CPRT.o CPDT.o +fastfpe-objs := module.o round.o CPDT.o CPRT.o CPDO.o entry.o list-multi := fastfpe.o @@ -22,4 +22,4 @@ include $(TOPDIR)/Rules.make fastfpe.o: $(fastfpe-objs) - $(LD) -r -o $@ $(fastfpe-objs) + $(LD) -r -o $@ $(fastfpe-objs)
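
For reference, the C sketch below models the rounding step that the new
round_double_* entry points perform (the round_extended_* paths above follow
the same pattern with a wider mantissa and exponent range). It is an
illustrative model only, not part of the patch: the four-word register layout
and the cumulative flag values written to [r10,#128] (1 = invalid operation,
16+4 = inexact and overflow, 16+8 = inexact and underflow, 16 = inexact) are
read off the comments in the diff above, while the struct, macro and function
names are invented here. Only the normal-range round-to-nearest-even case is
shown; overflow, denormals and inf/NaN take separate paths in the assembly.

    #include <stdint.h>

    /* One emulated register as kept in the fastfpe workspace: four words. */
    struct fastfpe_reg {
            uint32_t sign;          /* only bit 31 is used                      */
            uint32_t mant_hi;       /* normalized mantissa, MSB set unless zero */
            uint32_t mant_lo;
            int32_t  exp;           /* 0x80000000 = zero, 0x7fffffff = inf/NaN  */
    };

    /* Cumulative exception bits as OR-ed into the word at [r10,#128]. */
    #define FPSR_INVALID    (1u << 0)       /* "#1" in the assembly    */
    #define FPSR_OVERFLOW   (1u << 2)       /* "#16+4"                 */
    #define FPSR_UNDERFLOW  (1u << 3)       /* "#16+8"                 */
    #define FPSR_INEXACT    (1u << 4)       /* "#16"                   */

    /*
     * Normal-range part of round_double_ne: keep 53 mantissa bits, round to
     * nearest, ties to even.  "extra" corresponds to r5, the bits already
     * shifted out below mant_lo by the arithmetic routine.
     */
    static void round_double_ne_sketch(struct fastfpe_reg *r, uint32_t extra,
                                       uint32_t *fpsr)
    {
            uint64_t mant = ((uint64_t)r->mant_hi << 32) | r->mant_lo;

            if (extra != 0 || (mant & 0x7ff) != 0)
                    *fpsr |= FPSR_INEXACT;        /* a discarded bit is set    */

            uint64_t sum = mant + 0x400;          /* add half an ULP of a double */
            if (sum < mant) {                     /* carry out of bit 63:        */
                    sum = 1ull << 63;             /* round_double_add_ov path,   */
                    r->exp += 1;                  /* mantissa 1.0, exponent + 1  */
            } else {
                    if (extra == 0 && (sum & 0x7ff) == 0)
                            sum &= ~(uint64_t)0x800;  /* exact tie: force even LSB */
                    sum &= ~(uint64_t)0x7ff;      /* drop bits a double cannot hold */
            }

            r->mant_hi = (uint32_t)(sum >> 32);
            r->mant_lo = (uint32_t)sum;
    }

The other rounding primitives differ only in the increment: round_double_z
adds nothing and just masks the guard bits (truncation), while round_double_nz
adds all-ones below the kept bits so that any discarded bit rounds the
magnitude up. round_double_p and round_double_m merely select one of those two
based on the sign word, which is how all four IEEE rounding modes dispatched
through round_table are built from the same few code paths.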