--- ../linux-sharp-a300/arch/arm/fastfpe/CPDO.S 2002-05-15 21:37:41.000000000 +0900 +++ linux/arch/arm/fastfpe/CPDO.S 2006-05-17 19:20:27.753602360 +0900 @@ -1,224 +1,1399 @@ /* -Inside the emulator the FP numbers are kept with 32 bit accuracy for both -mantissa and exponent. The FP structure has 4 words reserved for each -register, the first is used just for the sign in bit 31, the second is the -mantissa (unsigned integer) and the third is the exponent (signed integer). - -The functions do actually only work properly for normalized values, and if -no overflow occurs. Hopfully most programs are not disturbed by this, and it -will probably be improved in future versions. +The FP structure has 4 words reserved for each register, the first is used just +for the sign in bit 31, the second and third are for the mantissa (unsigned +integer, high 32 bit first) and the fourth is the exponent (signed integer). +The mantissa is always normalized. + +If the exponent is 0x80000000, that is the most negative value, the number +represented is 0 and both mantissa words are also 0. + +If the exponent is 0x7fffffff, that is the biggest positive value, the number +represented is infinity if the mantissa is 0, otherwise it is a NaN. + +Decimal and packed decimal numbers are not supported yet. The parameters to these functions are r0=destination pointer, r1 and r2 -source pointers. r4 is the instruction. They may use r0-r7. The return address -is in r14, except CPDO_rnf_core which expects the return address in r5 to -save memory accesses. +source pointers. r4 is the instruction. They may use r0-r8, r11. They return +to r14, which contains the address of a rounding function. The rounding +function expects r0=address, r1-r4=sign, mantissa high, mantissa low, +exponent, r5=additional lower mantissa bits. + +CPDO_rnf_core expects the return address in r14. 
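+
+As a rough C sketch of the layout just described (for illustration only):
+
+	struct fastfpe_reg {
+		unsigned int sign;	// only bit 31 is used
+		unsigned int mant_hi;	// mantissa bits 63..32, bit 31 set for
+					// normalized (finite, non-zero) values
+		unsigned int mant_lo;	// mantissa bits 31..0
+		int exponent;		// 0x80000000 = zero, 0x7fffffff = inf/NaN
+	};
+
+With this layout 1.0 is {0, 0x80000000, 0, 0}, 0.5 is {0, 0x80000000, 0, -1}
+and 10.0 is {0, 0xa0000000, 0, 3} (compare the fp_const table in entry.S).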
*/ /*---------------------------------------------------------------------------*/ .globl CPDO_adf CPDO_adf: - ldmia r1,{r1,r3,r5} - ldmia r2,{r2,r4,r6} - cmp r1,r2 + ldmia r2,{r6,r7,r8,r11} + ldmia r1,{r1,r2,r3,r4} + + cmp r11,#0x7fffffff + cmpne r11,#0x80000000 + cmpne r4,#0x7fffffff + cmpne r4,#0x80000000 + beq CPDO_adf_extra + + cmp r1,r6 bne CPDO_suf_s CPDO_adf_s: - subs r2,r5,r6 - bge CPDO_adf_1 - - rsb r2,r2,#0 - mov r5,r6 - adds r3,r4,r3,lsr r2 - b CPDO_adf_2 - -CPDO_adf_1: - adds r3,r3,r4,lsr r2 - -CPDO_adf_2: - addcs r5,r5,#1 - movcss r3,r3,rrx - beq CPDO_zero - stmia r0,{r1,r3,r5} - + subs r6,r4,r11 + bmi CPDO_adf_normalize1st + +CPDO_adf_normalize2nd: + cmp r6,#32 + ble CPDO_adf_normalize2nd_1 + cmp r6,#64 + bgt CPDO_adf_normalize2nd_3 + +CPDO_adf_normalize2nd_2: + sub r6,r6,#32 + rsb r11,r6,#32 + mov r5,r8,lsr r6 + add r5,r5,r7,lsl r11 + movs r11,r8,lsl r11 + orrne r5,r5,#1 + mov r8,r7,lsr r6 + mov r7,#0 + b CPDO_adf_add + +CPDO_adf_normalize2nd_1: + rsb r11,r6,#32 + mov r5,r8,lsl r11 + mov r8,r8,lsr r6 + add r8,r8,r7,lsl r11 + mov r7,r7,lsr r6 + b CPDO_adf_add + +CPDO_adf_normalize2nd_3: + mov r5,#0x40000000 mov pc,r14 -/*---------------------------------------------------------------------------*/ +CPDO_adf_normalize1st: + mov r4,r11 + rsb r6,r6,#0 + cmp r6,#32 + ble CPDO_adf_normalize1st_1 + cmp r6,#64 + bgt CPDO_adf_normalize1st_3 + +CPDO_adf_normalize1st_2: + sub r6,r6,#32 + rsb r11,r6,#32 + mov r5,r3,lsr r6 + add r5,r5,r2,lsl r11 + movs r11,r3,lsl r11 + orrne r5,r5,#1 + mov r3,r2,lsr r6 + mov r2,#0 + b CPDO_adf_add + +CPDO_adf_normalize1st_1: + rsb r11,r6,#32 + mov r5,r3,lsl r11 + mov r3,r3,lsr r6 + add r3,r3,r2,lsl r11 + mov r2,r2,lsr r6 + b CPDO_adf_add + +CPDO_adf_normalize1st_3: + mov r5,#0x40000000 + mov r2,r7 + mov r3,r8 + mov pc,r14 + +CPDO_adf_add: + adds r3,r3,r8 + adcs r2,r2,r7 + bcc CPDO_adf_add_no_overflow + + movs r2,r2,rrx + movs r3,r3,rrx + movs r5,r5,rrx + orrcs r5,r5,#1 + add r4,r4,#1 + +CPDO_adf_add_no_overflow: + mov pc,r14 + +CPDO_adf_extra: + cmp r4,#0x7fffffff + beq CPDO_adf_1st_infnan + cmp r11,#0x7fffffff + beq CPDO_adf_2nd_infnan + cmp r11,#0x80000000 + beq CPDO_adf_2nd_0 + +CPDO_adf_1st_0: + mov r1,r6 + mov r2,r7 + mov r3,r8 + mov r4,r11 + mov r5,#0 + mov pc,r14 + +CPDO_adf_2nd_0: + cmp r4,#0x80000000 + beq CPDO_adf_both_0 + mov r5,#0 + mov pc,r14 + +CPDO_adf_both_0: + cmp r1,r6 + beq CPDO_adf_both_0_equal_sign + and r5,r5,#0x00000060 + cmp r5,#0x00000040 // rounding mode M? + moveq r1,#0x80000000 + movne r1,#0 +CPDO_adf_both_0_equal_sign: + stmia r0,{r1,r2,r3,r4} + b fastfpe_next +@ mov pc,r14 + +CPDO_adf_1st_infnan: + cmp r11,#0x7fffffff + beq CPDO_adf_both_infnan +CPDO_adf_1st_infnan_entry: + orrs r5,r3,r2,lsl#1 // ignore MSB + moveq pc,r14 // Inf + tst r2,#0x40000000 + movne pc,r14 // QNaN +CPDO_adf_generate_qnan: + mov r1,#0x80000000 + mov r2,#0x7fffffff + mov r3,#0xffffffff + mov r4,#0x7fffffff + ldr r5,[r10,#128] + orr r5,r5,#1 // set invalid operation flag + str r5,[r10,#128] + mov pc,r14 + +CPDO_adf_2nd_infnan: + mov r1,r6 + mov r2,r7 + mov r3,r8 + mov r4,r11 + b CPDO_adf_1st_infnan_entry + +CPDO_adf_both_infnan: + orrs r5,r3,r2,lsl#1 // ignore MSB + beq CPDO_adf_1st_inf + orrs r5,r8,r7,lsl#1 // ignore MSB + beq CPDO_adf_2nd_inf + tst r2,#0x40000000 + tstne r7,#0x40000000 + beq CPDO_adf_generate_qnan // at least one is SNaN + orrs r5,r3,r2,lsl#1 // ignore MSB, FIXME! what is going on here? 
+ moveq r1,r6 // if first is not NaN + moveq r2,r7 // give second as result + moveq r3,r8 + mov pc,r14 + +CPDO_adf_1st_inf: + orrs r5,r8,r7,lsl#1 // ignore MSB + beq CPDO_adf_both_inf + tst r7,#0x40000000 + beq CPDO_adf_generate_qnan + mov r1,r6 //if 2nd no SNaN return 2nd + mov r2,r7 + mov r3,r8 + mov pc,r14 + +CPDO_adf_2nd_inf: + tst r2,#0x40000000 + beq CPDO_adf_generate_qnan + mov pc,r14 // if 1st no SNaN just return it + +CPDO_adf_both_inf: + cmp r1,r6 + bne CPDO_adf_generate_qnan // signs of both inf are different + mov pc,r14 + +/*--------------------------------------------------------------------------*/ .globl CPDO_suf CPDO_suf: - ldmia r1,{r1,r3,r5} - ldmia r2,{r2,r4,r6} + ldmia r2,{r6,r7,r8,r11} + ldmia r1,{r1,r2,r3,r4} CPDO_suf_l: - cmp r1,r2 + cmp r11,#0x7fffffff + cmpne r11,#0x80000000 + cmpne r4,#0x7fffffff + cmpne r4,#0x80000000 + beq CPDO_suf_extra + + cmp r1,r6 bne CPDO_adf_s -CPDO_suf_s: - subs r2,r5,r6 - bge CPDO_suf_1 - - rsb r2,r2,#0 - mov r5,r6 - rsbs r3,r4,r3,lsr r2 - b CPDO_suf_2 - -CPDO_suf_1: - subs r3,r3,r4,lsr r2 - -CPDO_suf_2: - beq CPDO_zero - - eorcc r1,r1,#0x80000000 - rsbcc r3,r3,#0 - - cmp r3,#0x00010000 - movcc r3,r3,lsl#16 - subcc r5,r5,#16 - - cmp r3,#0x01000000 - movcc r3,r3,lsl#8 - subcc r5,r5,#8 - - cmp r3,#0x10000000 - movcc r3,r3,lsl#4 - subcc r5,r5,#4 - - cmp r3,#0x40000000 - movcc r3,r3,lsl#2 - subcc r5,r5,#2 - - cmp r3,#0x80000000 - movcc r3,r3,lsl#1 - subcc r5,r5,#1 - - stmia r0,{r1,r3,r5} - +CPDO_suf_s: + subs r6,r4,r11 + blt CPDO_suf_normalize1st + bgt CPDO_suf_normalize2nd + cmp r2,r7 + cmpeq r3,r8 + beq CPDO_suf_zero + mov r5,#0 + bcs CPDO_suf_sub_1stbigger + eor r1,r1,#0x80000000 + b CPDO_suf_sub_2ndbigger + +CPDO_suf_normalize2nd: + cmp r6,#32 + ble CPDO_suf_normalize2nd_1 + cmp r6,#64 + bgt CPDO_suf_normalize2nd_3 + +CPDO_suf_normalize2nd_2: + sub r6,r6,#32 + rsb r11,r6,#32 + mov r5,r8,lsr r6 + add r5,r5,r7,lsl r11 + movs r11,r8,lsl r11 + orrne r5,r5,#1 + mov r8,r7,lsr r6 + mov r7,#0 + b CPDO_suf_sub_1stbigger + +CPDO_suf_normalize2nd_1: + rsb r11,r6,#32 + mov r5,r8,lsl r11 + mov r8,r8,lsr r6 + add r8,r8,r7,lsl r11 + mov r7,r7,lsr r6 + b CPDO_suf_sub_1stbigger + +CPDO_suf_normalize2nd_3: + sub r6,r6,#64 + cmp r6,#32 + bge CPDO_suf_normalize2nd_4 + rsb r11,r6,#32 + mov r5,r7,lsr r6 + orrs r11,r8,r7,lsl r11 + orrne r5,r5,#1 + mov r7,#0 + mov r8,#0 + b CPDO_suf_sub_1stbigger + +CPDO_suf_normalize2nd_4: + mov r5,#1 + mov r7,#0 + mov r8,#0 + b CPDO_suf_sub_1stbigger + +CPDO_suf_normalize1st: + eor r1,r1,#0x80000000 + mov r4,r11 + rsb r6,r6,#0 + cmp r6,#32 + ble CPDO_suf_normalize1st_1 + cmp r6,#64 + bgt CPDO_suf_normalize1st_3 + +CPDO_suf_normalize1st_2: + sub r6,r6,#32 + rsb r11,r6,#32 + mov r5,r3,lsr r6 + add r5,r5,r2,lsl r11 + movs r11,r3,lsl r11 + orrne r5,r5,#1 + mov r3,r2,lsr r6 + mov r2,#0 + b CPDO_suf_sub_2ndbigger + +CPDO_suf_normalize1st_1: + rsb r11,r6,#32 + mov r5,r3,lsl r11 + mov r3,r3,lsr r6 + add r3,r3,r2,lsl r11 + mov r2,r2,lsr r6 + b CPDO_suf_sub_2ndbigger + +CPDO_suf_normalize1st_3: + sub r6,r6,#64 + cmp r6,#32 + bge CPDO_suf_normalize1st_4 + rsb r11,r6,#32 + mov r5,r2,lsr r6 + orrs r11,r3,r2,lsl r11 + orrne r5,r5,#1 + mov r2,#0 + mov r3,#0 + b CPDO_suf_sub_2ndbigger + +CPDO_suf_normalize1st_4: + mov r5,#1 + mov r2,#0 + mov r3,#0 + b CPDO_suf_sub_2ndbigger + +CPDO_suf_sub_1stbigger: + rsbs r5,r5,#0 + sbcs r3,r3,r8 + sbcs r2,r2,r7 + movmi pc,r14 + b CPDO_suf_norm + +CPDO_suf_sub_2ndbigger: + rsbs r5,r5,#0 + sbcs r3,r8,r3 + sbcs r2,r7,r2 + movmi pc,r14 + +CPDO_suf_norm: + teq r2,#0 // normalize 32 bit + bne CPDO_suf_norm16 + 
teq r3,#0 // normalize 64 bit + bne CPDO_suf_norm32 + mov r2,r5 + mov r3,#0 + mov r5,#0 + sub r4,r4,#64 + mov pc,r14 +CPDO_suf_norm32: + mov r2,r3 + mov r3,r5 + mov r5,#0 + sub r4,r4,#32 +CPDO_suf_norm16: + cmp r2,#0x00010000 // normalize 16 bit + bcs CPDO_suf_norm8 + mov r2,r2,lsl#16 + orr r2,r2,r3,lsr#16 + mov r3,r3,lsl#16 + orr r3,r3,r5,lsr#16 + mov r5,r5,lsl#16 + sub r4,r4,#16 +CPDO_suf_norm8: + cmp r2,#0x01000000 // normalize 8 bit + bcs CPDO_suf_norm4 + mov r2,r2,lsl#8 + orr r2,r2,r3,lsr#24 + mov r3,r3,lsl#8 + orr r3,r3,r5,lsr#24 + mov r5,r5,lsl#8 + sub r4,r4,#8 +CPDO_suf_norm4: + cmp r2,#0x10000000 // normalize 4 bit + bcs CPDO_suf_norm2 + mov r2,r2,lsl#4 + orr r2,r2,r3,lsr#28 + mov r3,r3,lsl#4 + orr r3,r3,r5,lsr#28 + mov r5,r5,lsl#4 + sub r4,r4,#4 +CPDO_suf_norm2: + cmp r2,#0x40000000 // normalize 2 bit + bcs CPDO_suf_norm1 + mov r2,r2,lsl#2 + orr r2,r2,r3,lsr#30 + mov r3,r3,lsl#2 + orr r3,r3,r5,lsr#30 + mov r5,r5,lsl#2 + sub r4,r4,#2 +CPDO_suf_norm1: + cmp r2,#0x80000000 // normalize 1 bit + bcs CPDO_suf_norme + mov r2,r2,lsl#1 + orr r2,r2,r3,lsr#31 + mov r3,r3,lsl#1 + orr r3,r3,r5,lsr#31 + mov r5,r5,lsl#1 + sub r4,r4,#1 +CPDO_suf_norme: mov pc,r14 +CPDO_suf_zero: + and r5,r5,#0x00000060 + cmp r5,#0x00000040 // rounding mode M? + moveq r1,#0x80000000 + movne r1,#0 + mov r2,#0 + mov r3,#0 + mov r4,#0x80000000 + stmia r0,{r1,r2,r3,r4} + b fastfpe_next +@ mov pc,r14 + +CPDO_suf_extra: // nearly the same as with adf + cmp r11,#0x7fffffff // the only thing we need to do is + bne CPDO_suf_extra_sign // to invert the second sign if + orrnes r5,r8,r7,lsl#1 // it is not a NaN, ignore MSB + bne CPDO_adf_extra +CPDO_suf_extra_sign: + eor r6,r6,#0x80000000 + b CPDO_adf_extra + /*---------------------------------------------------------------------------*/ .globl CPDO_rsf CPDO_rsf: - mov r3,r2 - ldmia r1,{r2,r4,r6} - ldmia r3,{r1,r3,r5} + ldmia r1,{r6,r7,r8,r11} + ldmia r2,{r1,r2,r3,r4} b CPDO_suf_l - + /*---------------------------------------------------------------------------*/ .globl CPDO_muf CPDO_muf: - ldmia r1,{r1,r3,r5} - ldmia r2,{r2,r4,r6} - - eor r1,r1,r2 - add r6,r5,r6 - umulls r2,r5,r4,r3 - beq CPDO_zero + ldmia r2,{r6,r7,r8,r11} + ldmia r1,{r1,r2,r3,r4} + + cmp r11,#0x7fffffff + cmpne r4,#0x7fffffff + beq CPDO_muf_extra + eor r1,r1,r6 // sign + cmp r11,#0x80000000 + cmpne r4,#0x80000000 + beq CPDO_muf_zero + + add r4,r4,r11 // exponent + +#define x32 r2 +#define x10 r3 +#define y32 r7 +#define y10 r8 +#define z3 r0 +#define z2 r1 +#define z1 r4 +#define z0 r6 +#define v1 r9 +#define v0 r11 +#define tmp r5 + + stmdb r13!,{r0,r1,r4,r9} + + mov z3,x32,lsr#16 + bic z2,x32,z3,lsl#16 + movs v1,y32,lsr#16 + bic v0,y32,v1,lsl#16 + + mul tmp,z3,v0 + mul z3,v1,z3 + mulne v1,z2,v1 + mul z2,v0,z2 + adds z2,z2,tmp,lsl#16 + adc z3,z3,tmp,lsr#16 + adds z2,z2,v1,lsl#16 + adc z3,z3,v1,lsr#16 + + mov z1,x10,lsr#16 + bic z0,x10,z1,lsl#16 + movs v1,y10,lsr#16 + bic v0,y10,v1,lsl#16 + + mul tmp,z1,v0 + mul z1,v1,z1 + mulne v1,z0,v1 + mul z0,v0,z0 + adds z0,z0,tmp,lsl#16 + adc z1,z1,tmp,lsr#16 + adds z0,z0,v1,lsl#16 + adc z1,z1,v1,lsr#16 + + adds z2,z2,z1 // z3 is max. 
0xfffffffe + adc z3,z3,#0 // so this trick is possible + adds z1,z2,z0 // to save one addition + adcs z2,z2,z3 + adc z3,z3,#0 + + subs x10,x32,x10 + mov v0,#0 + mov v1,v0,rrx + + sublo v0,y32,y10 + subnes y10,y10,y32 + + orreq v1,v1,#1<<31 + eorcs v1,v1,#1<<31 + subcc v0,v0,x10 + + movs x32,x10,lsr#16 + bic x10,x10,x32,lsl#16 + mov y32,y10,lsr#16 + bic y10,y10,y32,lsl#16 + + mul tmp,x10,y10 + mla v0,x32,y32,v0 + mulne x32,y10,x32 + adds tmp,tmp,x32,lsl#16 + adc v0,v0,x32,lsr#16 + mul y32,x10,y32 + adds tmp,tmp,y32,lsl#16 + adc v0,v0,y32,lsr#16 + adds r5,z1,tmp + adcs r3,z2,v0 + adc r2,z3,v1,asr#31 + + teq z0,#0 + orrne r5,r5,#1 // z0 must not be lost for rounding + cmp r2,#0 + +#undef x32 r2 +#undef x10 r3 +#undef y32 r7 +#undef y10 r8 +#undef z3 r0 +#undef z2 r1 +#undef z1 r4 +#undef z0 r6 +#undef v1 r9 +#undef v0 r11 +#undef tmp r5 + + ldmia r13!,{r0,r1,r4,r9} + bpl CPDO_muf_norm - - add r6,r6,#1 - stmia r0,{r1,r5,r6} + add r4,r4,#1 mov pc,r14 CPDO_muf_norm: - adds r2,r2,r2 - adcs r5,r5,r5 - - stmia r0,{r1,r5,r6} + adds r5,r5,r5 + adcs r3,r3,r3 + adc r2,r2,r2 + mov pc,r14 + +CPDO_muf_extra: + cmp r4,#0x7fffffff + beq CPDO_muf_1st_infnan +CPDO_muf_2nd_infnan: + orrs r5,r8,r7,lsl#1 // ignore MSB + bne CPDO_muf_2nd_nan + cmp r4,#0x80000000 + beq CPDO_muf_generate_qnan + mov r2,r7 // copy MSB + mov r3,#0 + mov r4,#0x7fffffff + eor r1,r1,r6 + stmia r0,{r1,r2,r3,r4} + b fastfpe_next +@ mov pc,r14 + +CPDO_muf_1st_infnan: + cmp r11,#0x7fffffff + beq CPDO_muf_both_infnan + orrs r5,r3,r2,lsl#1 // ignore MSB + bne CPDO_muf_1st_nan + cmp r11,#0x80000000 + beq CPDO_muf_generate_qnan +// mov r4,#0x7fffffff + eor r1,r1,r6 + stmia r0,{r1,r2,r3,r4} + b fastfpe_next +@ mov pc,r14 + +CPDO_muf_both_infnan: + orrs r5,r3,r2,lsl#1 // ignore MSB + beq CPDO_muf_both_infnan_1st_inf + orrs r5,r8,r7,lsl#1 // ignore MSB + beq CPDO_muf_both_infnan_2nd_inf + tst r2,#0x40000000 + tstne r7,#0x40000000 + beq CPDO_muf_generate_qnan + mov pc,r14 + +CPDO_muf_both_infnan_1st_inf: + orrs r5,r8,r7,lsl#1 // ignore MSB + beq CPDO_muf_both_inf + b CPDO_muf_2nd_nan + +CPDO_muf_both_infnan_2nd_inf: + b CPDO_muf_1st_nan + +CPDO_muf_both_inf: + eor r1,r1,r6 + orr r2,r2,r7 // copy both MSB + stmia r0,{r1,r2,r3,r4} + b fastfpe_next +@ mov pc,r14 + +CPDO_muf_zero: + mov r2,#0 + mov r3,#0 + mov r4,#0x80000000 + stmia r0,{r1,r2,r3,r4} + b fastfpe_next +@ mov pc,r14 + +CPDO_muf_1st_nan: + tst r2,#0x40000000 + beq CPDO_muf_generate_qnan + mov pc,r14 + +CPDO_muf_2nd_nan: + tst r7,#0x40000000 + beq CPDO_muf_generate_qnan + mov r1,r6 + mov r2,r7 + mov r3,r8 + mov r4,r11 + mov pc,r14 + +CPDO_muf_generate_qnan: + mov r1,#0x80000000 + mov r2,#0x7fffffff + mov r3,#0xffffffff + mov r4,#0x7fffffff + ldr r5,[r10,#128] + orr r5,r5,#1 + str r5,[r10,#128] + mov pc,r14 + +/*---------------------------------------------------------------------------*/ + + .globl CPDO_muf_M +CPDO_muf_M: + ldmia r2,{r6,r7,r8,r11} + ldmia r1,{r1,r2,r3,r4} + + cmp r11,#0x7fffffff + cmpne r4,#0x7fffffff + beq CPDO_muf_extra + eor r1,r1,r6 // sign + cmp r11,#0x80000000 + cmpne r4,#0x80000000 + beq CPDO_muf_zero + + add r4,r4,r11 // exponent + umull r12,r11,r2,r7 + umull r2,r6,r8,r2 + umull r8,r5,r3,r8 + adds r5,r5,r2 + adcs r12,r12,r6 + adc r11,r11,#0 + umull r7,r6,r3,r7 + adds r5,r5,r7 + adcs r3,r12,r6 + adc r2,r11,#0 + teq r8,#0 + orrne r5,r5,#1 // r8 must not be lost for rounding + cmp r2,#0 + + bpl CPDO_muf_norm + add r4,r4,#1 mov pc,r14 /*---------------------------------------------------------------------------*/ -/* Divison ignores the LSB in both mantissa, but needs 
only ~110 cycles. */ + +CPDO_infnan_1: + stmia r0,{r1,r3,r5,r7} + b fastfpe_next + +CPDO_infnan_2: + stmia r0,{r2,r4,r6,r8} + b fastfpe_next + +CPDO_nan_12: + orr r2,r3,r4 + b CPDO_inf_1 + +CPDO_nan: + mov r2,#0x40000000 @ create non signalling NaN + b CPDO_inf_1 + +CPDO_inf: + mov r2,#0 +CPDO_inf_1: + mov r3,#0 + mov r4,#0x7fffffff +CPDO_store_1234: + stmia r0,{r1,r2,r3,r4} + b fastfpe_next + +CPDO_zero: + mov r1,#0 +CPDO_zero_1: + mov r2,#0 + mov r3,#0 + mov r4,#0x80000000 + stmia r0,{r1,r2,r3,r4} + b fastfpe_next + +CPDO_muf_end: + cmp r8,#0x20000000 + bge CPDO_inf + cmp r8,#0xe0000000 + ble CPDO_zero_1 + stmia r0,{r1,r2,r7,r8} + b fastfpe_next + +/*---------------------------------------------------------------------------*/ .globl CPDO_dvf CPDO_dvf: - ldmia r1,{r1,r3,r5} - ldmia r2,{r2,r4,r6} + ldmia r2,{r6,r7,r8,r11} + ldmia r1,{r1,r2,r3,r4} CPDO_dvf_l: - eor r1,r1,r2 - sub r6,r5,r6 + cmp r11,#0x7fffffff + cmpne r4,#0x7fffffff + beq CPDO_dvf_infnan + eor r1,r1,r6 + cmp r11,#0x80000000 + cmpne r4,#0x80000000 + beq CPDO_dvf_zero + + sub r4,r4,r11 + +#define x4 r11 +#define x3 r7 +#define x2 r12 +#define x1 r8 +#define y2 r14 +#define y1 r9 +#define z3 r4 +#define z2 r5 +#define z1 r6 +#define tmp r10 + + cmp r2,r7 + cmpeq r3,r8 + bcs CPDO_dvf_no_normalize + + sub r4,r4,#1 + stmdb r13!,{r1,r4,r9,r10,r11,r14} + mov r4,r2,lsr#31 + mov r5,r2,lsl#1 + orr r5,r5,r3,lsr#31 + mov r6,r3,lsl#1 // dividend + b CPDO_dvf_normalize_back + +CPDO_dvf_no_normalize: + stmdb r13!,{r1,r4,r9,r10,r11,r14} + mov r4,#0 + mov r5,r2 + mov r6,r3 // dividend - movs r3,r3,lsr#1 - beq CPDO_zero - mov r4,r4,lsr#1 - rsb r4,r4,#0 - - .macro div_step - adcs r3,r4,r3,lsl#1 - subcc r3,r3,r4 - adc r5,r5,r5 +CPDO_dvf_normalize_back: + mov r1,#0 + sub r10,r1,r7,lsr#1 + mov r11,#0x40000000 + + .macro inv_step + adds r11,r10,r11,lsl#1 + subcc r11,r11,r10 + adc r1,r1,r1 .endm - - adds r3,r4,r3 - adc r5,r5,r5 - bcs CPDO_dvf_b - subcc r3,r3,r4 - add r3,r4,r3,lsl#1 - mov r5,#1 - sub r6,r6,#1 - -CPDO_dvf_b: - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - div_step - -CPDO_dvf_e: - stmia r0,{r1,r5,r6} + .rept 17 + inv_step + .endr + + mov r1,r1,lsl#15 + adds r1,r1,#1<<15 + movcs r1,#0xffffffff // inverse + mov r1,r1,lsr#16 + + mov r2,#0 + mov r3,#0 // clear result space + + mov x4,r7,lsr#16 + bic x3,r7,x4,lsl#16 + mov x2,r8,lsr#16 + bic x1,r8,x2,lsl#16 // split divisor for 16x16=32bit mul + +CPDO_dvf_loop_entry: + mov r4,r4,lsl#16 + orrs r4,r4,r5,lsr#16 + mov r5,r5,lsl#16 + orr r5,r5,r6,lsr#16 + mov r6,r6,lsl#16 // shift dividend left by 16 + + bmi CPDO_dvf_loop_negative + mov r10,r4,lsr#16 + mul r9,r10,r1 + bic r10,r4,r10,lsl#16 + mul r10,r1,r10 + add r9,r9,r10,lsr#16 //estimate 16 bits of result in r9 + + mov r2,r2,lsl#16 + orr r2,r2,r3,lsr#16 + adds r3,r9,r3,lsl#16 // shift result left by 16 and + adc r2,r2,#0 // add in new result bits + + mov r9,r9,lsl#1 + mov y2,r9,lsr#16 + bic y1,r9,y2,lsl#16 + mul tmp,x1,y1 + subs z1,z1,tmp + mul tmp,x3,y1 + sbcs z2,z2,tmp + mul tmp,x4,y2 + sbc z3,z3,tmp + mul tmp,x2,y2 + subs z2,z2,tmp + sbc z3,z3,#0 + mul tmp,x2,y1 + subs z1,z1,tmp,lsl#16 + sbcs z2,z2,tmp,lsr#16 + sbc z3,z3,#0 + mul tmp,x1,y2 + subs z1,z1,tmp,lsl#16 + sbcs z2,z2,tmp,lsr#16 + sbc z3,z3,#0 + mul tmp,x4,y1 + subs z2,z2,tmp,lsl#16 + sbc 
z3,z3,tmp,lsr#16 + mul tmp,x3,y2 + subs z2,z2,tmp,lsl#16 + sbc z3,z3,tmp,lsr#16 // subtract divisor * estimated result + + tst r2,#0xff000000 + beq CPDO_dvf_loop_entry + + b CPDO_dvf_end_entry + +CPDO_dvf_loop_negative: + rsb r14,r4,#0 + mov r10,r14,lsr#16 + mul r9,r10,r1 + bic r10,r14,r10,lsl#16 + mul r10,r1,r10 + add r9,r9,r10,lsr#16 // estimate 16 bits of result in r9 + + mov r2,r2,lsl#16 + orr r2,r2,r3,lsr#16 + rsbs r3,r9,r3,lsl#16 // shift result left by 16 and + sbc r2,r2,#0 // add in new result bits + + mov r9,r9,lsl#1 + mov y2,r9,lsr#16 + bic y1,r9,y2,lsl#16 + mul tmp,x1,y1 + adds z1,z1,tmp + mul tmp,x3,y1 + adcs z2,z2,tmp + mul tmp,x4,y2 + adc z3,z3,tmp + mul tmp,x2,y2 + adds z2,z2,tmp + adc z3,z3,#0 + mul tmp,x2,y1 + adds z1,z1,tmp,lsl#16 + adcs z2,z2,tmp,lsr#16 + adc z3,z3,#0 + mul tmp,x1,y2 + adds z1,z1,tmp,lsl#16 + adcs z2,z2,tmp,lsr#16 + adc z3,z3,#0 + mul tmp,x4,y1 + adds z2,z2,tmp,lsl#16 + adc z3,z3,tmp,lsr#16 + mul tmp,x3,y2 + adds z2,z2,tmp,lsl#16 + adc z3,z3,tmp,lsr#16 // subtract divisor * estimated result + + tst r2,#0xff000000 + beq CPDO_dvf_loop_entry + +CPDO_dvf_end_entry: + movs r4,r4,asr#1 + movs r5,r5,rrx // remainder was shifted left by 1 + movs r6,r6,rrx // relative to divisor + + orr r7,x3,x4,lsl#16 + orr r8,x1,x2,lsl#16 // put the split divisor together again + + cmp r4,#0 + blt CPDO_dvf_end_negative + cmpeq r5,r7 + cmpeq r6,r8 + bcc CPDO_dvf_end + +CPDO_dvf_end_positive: + adds r3,r3,#1 + adc r2,r2,#0 + + subs r6,r6,r8 + sbcs r5,r5,r7 + sbcs r4,r4,#0 + bne CPDO_dvf_end_positive + + cmp r5,r7 + cmpeq r6,r8 + bcs CPDO_dvf_end_positive + b CPDO_dvf_end + +CPDO_dvf_end_negative: + subs r3,r3,#1 + sbc r2,r2,#0 + + adds r6,r6,r8 + adcs r5,r5,r7 + adcs r4,r4,#0 + bmi CPDO_dvf_end_negative + +CPDO_dvf_end: + orrs r9,r5,r6 + ldmia r13!,{r1,r4,r9,r10,r11,r14} + moveq pc,r14 + + adds r6,r6,r6 + adcs r5,r5,r5 + movcs r5,#0xc0000000 + movcs pc,r14 + + cmp r5,r7 + cmpeq r6,r8 + movcc r5,#0x40000000 + moveq r5,#0x80000000 + movhi r5,#0xc0000000 mov pc,r14 - -CPDO_zero: + +CPDO_dvf_zero: + cmp r11,#0x80000000 + beq CPDO_dvf_by_zero + + stmia r0,{r1,r2,r3,r4} + b fastfpe_next // 0 already there +@ mov pc,r14 + +CPDO_dvf_by_zero: + cmp r4,#0x80000000 + beq CPDO_dvf_generate_qnan // first 0 too + + mov r2,#0x80000000 // set MSB + mov r3,#0 + mov r4,#0x7fffffff + ldr r5,[r10,#128] + orr r5,r5,#2 // division by zero + str r5,[r10,#128] + stmia r0,{r1,r2,r3,r4} + b fastfpe_next +@ mov pc,r14 + +CPDO_dvf_infnan: + cmp r4,#0x7fffffff + beq CPDO_dvf_1st_infnan + + orrs r5,r8,r7,lsl#1 // ignore MSB + beq CPDO_dvf_2nd_inf + mov r1,r6 + mov r2,r7 + mov r3,r8 + mov r4,r11 + b CPDO_dvf_1st_or_2nd_nan + +CPDO_dvf_2nd_inf: + eor r1,r1,r6 + mov r2,#0 + mov r3,#0 + mov r4,#0x80000000 + stmia r0,{r1,r2,r3,r4} + b fastfpe_next // zero created +@ mov pc,r14 + +CPDO_dvf_1st_infnan: + cmp r11,#0x7fffffff + beq CPDO_dvf_both_infnan + + orrs r5,r3,r2,lsl#1 // 1st inf? ignore MSB + bne CPDO_dvf_1st_or_2nd_nan + + eor r1,r1,r6 // sign for inf + stmia r0,{r1,r2,r3,r4} + b fastfpe_next // inf already there +@ mov pc,r14 + +CPDO_dvf_1st_or_2nd_nan: + tst r2,#0x40000000 + beq CPDO_dvf_generate_qnan + mov pc,r14 // qnan1/2 already/copied there + +CPDO_dvf_both_infnan: + orrs r5,r3,r2,lsl#1 // ignore MSB + beq CPDO_dvf_both_infnan_1st_inf + orrs r5,r8,r7,lsl#1 // ignore MSB + beq CPDO_dvf_both_infnan_2nd_inf + tst r2,#0x40000000 + tstne r7,#0x40000000 + beq CPDO_dvf_generate_qnan + mov pc,r14 + +CPDO_dvf_both_infnan_1st_inf: + tst r7,#0x40000000 // 2nd inf or SNaN ? 
+ beq CPDO_dvf_generate_qnan + mov r1,r6 + mov r2,r7 + mov r3,r8 + mov r4,r11 // copy 2nd QNaN + mov pc,r14 + +CPDO_dvf_both_infnan_2nd_inf: + tst r2,#0x40000000 // 1st SNaN ? + beq CPDO_dvf_generate_qnan + mov pc,r14 + +CPDO_dvf_generate_qnan: + mov r1,#0x80000000 + mov r2,#0x7fffffff + mov r3,#0xffffffff + mov r4,#0x7fffffff + ldr r5,[r10,#128] + orr r5,r5,#1 + str r5,[r10,#128] + mov pc,r14 + +/*---------------------------------------------------------------------------*/ + + .globl CPDO_dvf_M +CPDO_dvf_M: + ldmia r2,{r6,r7,r8,r11} + ldmia r1,{r1,r2,r3,r4} + +CPDO_dvf_M_l: + cmp r11,#0x7fffffff + cmpne r4,#0x7fffffff + beq CPDO_dvf_infnan + eor r1,r1,r6 + cmp r11,#0x80000000 + cmpne r4,#0x80000000 + beq CPDO_dvf_zero + + sub r4,r4,r11 + + cmp r2,r7 + cmpeq r3,r8 + bcs CPDO_dvf_M_no_normalize + + sub r4,r4,#1 + stmdb r13!,{r1,r4,r9,r10} + mov r4,r2,lsr#31 + mov r5,r2,lsl#1 + orr r5,r5,r3,lsr#31 + mov r6,r3,lsl#1 // dividend + b CPDO_dvf_M_normalize_back + +CPDO_dvf_M_no_normalize: + stmdb r13!,{r1,r4,r9,r10} + mov r4,#0 + mov r5,r2 + mov r6,r3 // dividend + +CPDO_dvf_M_normalize_back: mov r1,#0 + sub r10,r1,r7,lsr#1 + mov r11,#0x40000000 + + .rept 18 + inv_step + .endr + + mov r1,r1,lsl#14 + adds r1,r1,#1<<15 + movcs r1,#0xffffffff // inverse + mov r2,#0 - mov r3,#0x80000000 - stmia r0,{r1-r3} + mov r3,#0 // clear result space + +CPDO_dvf_M_loop_entry: + mov r4,r4,lsl#16 + orrs r4,r4,r5,lsr#16 + mov r5,r5,lsl#16 + orr r5,r5,r6,lsr#16 + mov r6,r6,lsl#16 // shift dividend left by 16 + + bmi CPDO_dvf_M_loop_negative + umull r10,r9,r4,r1 // estimate 16 bits of result in r9 + + mov r2,r2,lsl#16 + orr r2,r2,r3,lsr#16 + adds r3,r9,r3,lsl#16 // shift result left by 16 and + adc r2,r2,#0 // add in new result bits + + mov r9,r9,lsl#1 + umull r11,r10,r8,r9 // divisor lo * estimated result + subs r6,r6,r11 + sbcs r5,r5,r10 + sbc r4,r4,#0 + + umull r11,r10,r7,r9 // divisor hi * estimated result + subs r5,r5,r11 + sbc r4,r4,r10 + + tst r2,#0xff000000 + beq CPDO_dvf_M_loop_entry + + b CPDO_dvf_M_end_entry + +CPDO_dvf_M_loop_negative: + rsb r11,r4,#0 + umull r10,r9,r11,r1 // estimate 16 bits of result in r9 + + mov r2,r2,lsl#16 + orr r2,r2,r3,lsr#16 + rsbs r3,r9,r3,lsl#16 // shift result left by 16 and + sbc r2,r2,#0 // add in new result bits + + mov r9,r9,lsl#1 + umull r11,r10,r8,r9 // divisor lo * estimated result + adds r6,r6,r11 + adcs r5,r5,r10 + adc r4,r4,#0 + + umlal r5,r4,r7,r9 // divisor hi * estimated result + + tst r2,#0xff000000 + beq CPDO_dvf_M_loop_entry + +CPDO_dvf_M_end_entry: + movs r4,r4,asr#1 + movs r5,r5,rrx // remainder was shifted left by 1 + movs r6,r6,rrx // relative to divisor + + cmp r4,#0 + blt CPDO_dvf_M_end_negative + cmpeq r5,r7 + cmpeq r6,r8 + bcc CPDO_dvf_M_end + +CPDO_dvf_M_end_positive: + adds r3,r3,#1 + adc r2,r2,#0 + + subs r6,r6,r8 + sbcs r5,r5,r7 + sbcs r4,r4,#0 + + cmp r5,r7 + cmpeq r6,r8 + bcs CPDO_dvf_M_end_positive + b CPDO_dvf_M_end + +CPDO_dvf_M_end_negative: + subs r3,r3,#1 + sbc r2,r2,#0 + + adds r6,r6,r8 + adcs r5,r5,r7 + adcs r4,r4,#0 + bmi CPDO_dvf_M_end_negative + +CPDO_dvf_M_end: + orrs r9,r5,r6 + ldmia r13!,{r1,r4,r9,r10} + moveq pc,r14 + + adds r6,r6,r6 + adcs r5,r5,r5 + movcs r5,#0xc0000000 + movcs pc,r14 + + cmp r5,r7 + cmpeq r6,r8 + movcc r5,#0x40000000 + moveq r5,#0x80000000 + movhi r5,#0xc0000000 mov pc,r14 /*---------------------------------------------------------------------------*/ .globl CPDO_rdf CPDO_rdf: - mov r3,r2 - ldmia r1,{r2,r4,r6} - ldmia r3,{r1,r3,r5} + ldmia r1,{r6,r7,r8,r11} + ldmia r2,{r1,r2,r3,r4} b CPDO_dvf_l 
/*---------------------------------------------------------------------------*/ + .globl CPDO_rdf_M +CPDO_rdf_M: + ldmia r1,{r6,r7,r8,r11} + ldmia r2,{r1,r2,r3,r4} + b CPDO_dvf_M_l + +/*---------------------------------------------------------------------------*/ + .globl CPDO_rmf CPDO_rmf: + ldmia r2,{r6,r7,r8,r11} + ldmia r1,{r1,r2,r3,r4} + + cmp r11,#0x7fffffff + cmpne r4,#0x7fffffff + beq CPDO_rmf_infnan + cmp r11,#0x80000000 + cmpne r4,#0x80000000 + beq CPDO_rmf_zero + + cmp r4,r11 + bge CPDO_rmf_loop_entry + b CPDO_rmf_smaller + +CPDO_rmf_loop_0: + mov r5,#0 +CPDO_rmf_loop: + cmp r4,r11 + ble CPDO_rmf_loop_end + + sub r4,r4,#1 + + adds r3,r3,r3 + adcs r2,r2,r2 + bcs CPDO_rmf_loop_anyway + +CPDO_rmf_loop_entry: + cmp r2,r7 + cmpeq r3,r8 + bcc CPDO_rmf_loop_0 + +CPDO_rmf_loop_anyway: + subs r3,r3,r8 + sbc r2,r2,r7 + mov r5,#1 + b CPDO_rmf_loop + +CPDO_rmf_loop_end: + teq r2,#0 + teqeq r3,#0 + beq CPDO_rmf_created_zero + + //eor r1,r1,r6 // only if result not zero + + mov r6,r2,lsr#31 + mov r11,r2,lsl#1 + orr r11,r11,r3,lsr#31 + + cmp r6,#0 + cmpeq r11,r7 + rsbeqs r6,r8,r3,lsl#1 + cmpeq r5,#1 // for nearest-even + bcc CPDO_rmf_norm + + eor r1,r1,#0x80000000 + subs r3,r8,r3 + sbc r2,r7,r2 + +CPDO_rmf_norm: + teq r2,#0 // normalize 32 bit + moveq r2,r3 + moveq r3,#0 + subeq r4,r4,#32 + + cmp r2,#0x00010000 // normalize 16 bit + movcc r2,r2,lsl#16 + orrcc r2,r2,r3,lsr#16 + movcc r3,r3,lsl#16 + subcc r4,r4,#16 + + cmp r2,#0x01000000 // normalize 8 bit + movcc r2,r2,lsl#8 + orrcc r2,r2,r3,lsr#24 + movcc r3,r3,lsl#8 + subcc r4,r4,#8 + + cmp r2,#0x10000000 // normalize 4 bit + movcc r2,r2,lsl#4 + orrcc r2,r2,r3,lsr#28 + movcc r3,r3,lsl#4 + subcc r4,r4,#4 + + cmp r2,#0x40000000 // normalize 2 bit + movcc r2,r2,lsl#2 + orrcc r2,r2,r3,lsr#30 + movcc r3,r3,lsl#2 + subcc r4,r4,#2 + + cmp r2,#0x80000000 // normalize 1 bit + movcc r2,r2,lsl#1 + orrcc r2,r2,r3,lsr#31 + movcc r3,r3,lsl#1 + subcc r4,r4,#1 + + mov r5,#0 mov pc,r14 + +CPDO_rmf_created_zero: + mov r4,#0x80000000 + stmia r0,{r1,r2,r3,r4} + b fastfpe_next +@ mov pc,r14 + +CPDO_rmf_smaller: + add r5,r4,#1 + cmp r5,r11 + blt CPDO_rmf_norm + cmp r2,r7 + cmpeq r3,r8 + bls CPDO_rmf_norm + + eor r1,r1,#0x80000000 + adds r8,r8,r8 + adc r7,r7,r7 + subs r3,r8,r3 + sbc r2,r7,r2 + b CPDO_rmf_norm + +CPDO_rmf_zero: + cmp r11,#0x80000000 + beq CPDO_rmf_generate_qnan + stmia r0,{r1,r2,r3,r4} + b fastfpe_next +@ mov pc,r14 + +CPDO_rmf_infnan: + cmp r4,#0x7fffffff + beq CPDO_rmf_1st_infnan + + orrs r5,r8,r7,lsl#1 // ignore MSB + beq CPDO_rmf_2nd_inf + mov r1,r6 + mov r2,r7 + mov r3,r8 + mov r4,r11 + b CPDO_rmf_1st_or_2nd_nan + +CPDO_rmf_2nd_inf: + mov pc,r14 // result = 1st operand + +CPDO_rmf_1st_infnan: + cmp r11,#0x7fffffff + beq CPDO_rmf_both_infnan + + orrs r5,r3,r2,lsl#1 // 1st inf? + bne CPDO_rmf_1st_or_2nd_nan + + b CPDO_rmf_generate_qnan + +CPDO_rmf_1st_or_2nd_nan: + tst r2,#0x40000000 + beq CPDO_rmf_generate_qnan + mov pc,r14 // qnan1/2 already/copied there + +CPDO_rmf_both_infnan: + orrs r5,r3,r2,lsl#1 // ignore MSB + beq CPDO_rmf_both_infnan_1st_inf + orrs r5,r8,r7,lsl#1 // ignore MSB + beq CPDO_rmf_both_infnan_2nd_inf + tst r2,#0x40000000 + tstne r7,#0x40000000 + beq CPDO_rmf_generate_qnan + mov pc,r14 + +CPDO_rmf_both_infnan_1st_inf: + tst r7,#0x40000000 // 2nd inf or SNaN ? + beq CPDO_rmf_generate_qnan + mov r1,r6 + mov r2,r7 + mov r3,r8 + mov r4,r11 // copy 2nd QNaN + mov pc,r14 + +CPDO_rmf_both_infnan_2nd_inf: + tst r2,#0x40000000 // 1st SNaN ? 
+ beq CPDO_rmf_generate_qnan + mov pc,r14 + +CPDO_rmf_generate_qnan: + mov r1,#0x80000000 + mov r2,#0x7fffffff + mov r3,#0xffffffff + mov r4,#0x7fffffff + ldr r5,[r10,#128] + orr r5,r5,#1 + str r5,[r10,#128] + mov pc,r14 /*---------------------------------------------------------------------------*/ @@ -228,196 +1403,303 @@ .globl CPDO_mvf CPDO_mvf: - ldmia r2,{r1,r2,r3} - stmia r0,{r1,r2,r3} + ldmia r2,{r1,r2,r3,r4} + mov r5,#0 mov pc,r14 /*---------------------------------------------------------------------------*/ .globl CPDO_mnf CPDO_mnf: - ldmia r2,{r1,r2,r3} + ldmia r2,{r1,r2,r3,r4} eor r1,r1,#0x80000000 - stmia r0,{r1,r2,r3} + mov r5,#0 mov pc,r14 /*---------------------------------------------------------------------------*/ .globl CPDO_abs CPDO_abs: - ldmia r2,{r1,r2,r3} + ldmia r2,{r1,r2,r3,r4} bic r1,r1,#0x80000000 - stmia r0,{r1,r2,r3} - mov pc,r14 + stmia r0,{r1,r2,r3,r4} + b fastfpe_next /*---------------------------------------------------------------------------*/ .globl CPDO_sqt CPDO_sqt: - ldmia r2,{r1,r2,r4} - - and r5,r4,#1 - movs r2,r2,lsl r5 - - mov r3,#0x80000000 + ldmia r2,{r1,r2,r3,r4} + cmp r1,#0 + bne CPDO_nan + cmp r4,#0x7fffffff + beq CPDO_store_1234 + + tst r4,r4,lsr#1 @carry=exponent bit 0 + bcc CPDO_sqt_exponenteven + adds r3,r3,r3 + adc r2,r2,r2 + cmp r2,#0x20000000 @set carry for loop +CPDO_sqt_exponenteven: + mov r4,r4,asr #1 + str r4,[r0,#12] + + mov r4,#0x80000000 + mov r5,#0 sub r2,r2,#0x80000000 - .macro sqrt_step,N - add r5,r3,#(0x40000000>>\N) - cmpcc r2,r5 - addcs r3,r3,#(0x80000000>>\N) - subcs r2,r2,r5 - movs r2,r2,lsl#1 - .endm - - sqrt_step 1 - sqrt_step 2 - sqrt_step 3 - sqrt_step 4 - sqrt_step 5 - sqrt_step 6 - sqrt_step 7 - sqrt_step 8 - sqrt_step 9 - sqrt_step 10 - sqrt_step 11 - sqrt_step 12 - sqrt_step 13 - sqrt_step 14 - sqrt_step 15 - sqrt_step 16 - sqrt_step 17 - sqrt_step 18 - sqrt_step 19 - sqrt_step 20 - sqrt_step 21 - sqrt_step 22 - sqrt_step 23 - sqrt_step 24 - sqrt_step 25 - sqrt_step 26 - sqrt_step 27 - sqrt_step 28 - sqrt_step 29 - sqrt_step 30 - sqrt_step 31 - - mov r4,r4,asr#1 - stmia r0,{r1,r3,r4} - mov pc,r14 + mov r8,#0x40000000 + mov r14,#0x80000000 + + mov r1,#1 + b CPDO_sqt_loop1_first +CPDO_sqt_loop1: + adds r3,r3,r3 + adcs r2,r2,r2 +CPDO_sqt_loop1_first: + add r6,r4,r8,lsr r1 @r7 const = r5 + bcs CPDO_sqt_loop1_1 + cmp r2,r6 + cmpeq r3,r5 @r5 for r7 + bcc CPDO_sqt_loop1_0 +CPDO_sqt_loop1_1: + orr r4,r4,r14,lsr r1 + subs r3,r3,r5 @r5 for r7 + sbc r2,r2,r6 +CPDO_sqt_loop1_0: + add r1,r1,#1 + cmp r1,#30 + ble CPDO_sqt_loop1 + + adds r3,r3,r3 + adcs r2,r2,r2 + bcs CPDO_sqt_between_1 + adds r7,r5,#0x80000000 + adc r6,r4,#0 + cmp r2,r6 + cmpeq r3,r7 + bcc CPDO_sqt_between_0 +CPDO_sqt_between_1: + orr r4,r4,#0x00000001 + subs r3,r3,r5 + sbc r2,r2,r4 + subs r3,r3,#0x80000000 + sbc r2,r2,#0 +CPDO_sqt_between_0: + mov r1,#0 + +CPDO_sqt_loop2: + adds r3,r3,r3 + adcs r2,r2,r2 + bcs CPDO_sqt_loop2_1 + adds r7,r5,r8,lsr r1 + adc r6,r4,#0 + cmp r2,r6 + cmpeq r3,r7 + bcc CPDO_sqt_loop2_0 +CPDO_sqt_loop2_1: + orr r5,r5,r14,lsr r1 + subs r3,r3,r5 + sbc r2,r2,r4 + subs r3,r3,r8,lsr r1 + sbc r2,r2,#0 +CPDO_sqt_loop2_0: + add r1,r1,#1 + cmp r1,#30 + ble CPDO_sqt_loop2 + + adds r3,r3,r3 + adcs r2,r2,r2 + bcs CPDO_sqt_after_1 + cmp r2,r6 + cmpeq r3,r7 + bcc CPDO_sqt_after_0 +CPDO_sqt_after_1: + orr r5,r5,#0x00000001 +CPDO_sqt_after_0: + + mov r1,#0 + stmia r0,{r1,r4,r5} + b fastfpe_next /*---------------------------------------------------------------------------*/ .globl CPDO_rnd CPDO_rnd: - adr r5,CPDO_rnd_store - b CPDO_rnd_core - 
-CPDO_rnd_store: - stmia r0,{r1,r2,r3} - mov pc,r14 + ldmia r2,{r1,r2,r3,r5} + bl CPDO_rnd_core + ldr r6,[r10,#128] + stmia r0,{r1,r2,r3,r5} + orr r6,r6,r4 + str r6,[r10,#128] + b fastfpe_next /*---------------------------------------------------------------------------*/ .globl CPDO_rnd_core CPDO_rnd_core: - ldmia r2,{r1,r2,r3} - and r4,r4,#0x00000060 - add pc,pc,r4,lsr#3 + and r6,r4,#0x00000060 + mov r4,#0 // for return of exception flags + cmp r5,#63 + bge CPDO_rnd_big + add pc,pc,r6,lsr#3 mov r0,r0 - b CPDO_rnd_N + b CPDO_rnd_NE b CPDO_rnd_P b CPDO_rnd_M b CPDO_rnd_Z - -CPDO_rnd_N: - cmp r3,#-1 - blt CPDO_rnd_zero - cmp r3,#31 - bge CPDO_rnd_end - - rsb r4,r3,#30 - mov r2,r2,lsr r4 - add r2,r2,#1 - bic r2,r2,#1 - movs r2,r2,lsl r4 - addcs r3,r3,#1 - movcs r2,r2,rrx - - mov pc,r5 + +CPDO_rnd_NE: + cmp r5,#0 + blt CPDO_rnd_NE_01 + + subs r6,r5,#31 + bpl CPDO_rnd_NE_2 + mov r7,#0x40000000 + mov r8,#0x7fffffff + mov r7,r7,lsr r5 + mov r8,r8,lsr r5 + teq r3,#0 + tsteq r2,r8 + orrne r4,r4,#16 // set inexact flag + adds r2,r2,r7 + bcs CPDO_rnd_overflow + teq r3,#0 + tsteq r2,r8 + beq CPDO_rnd_NE_equal + mov r3,#0 + bic r2,r2,r8 + mov pc,r14 + +CPDO_rnd_NE_2: + mov r7,#0x80000000 + mov r8,#0xffffffff + mov r7,r7,lsr r6 + mov r8,r8,lsr r6 + tst r3,r8 + orrne r4,r4,#16 // set inexact flag + adds r3,r3,r7 + adcs r2,r2,#0 + bcs CPDO_rnd_overflow + tst r3,r8 + beq CPDO_rnd_NE_equal + bic r3,r3,r8 + mov pc,r14 + +CPDO_rnd_NE_equal: + mov r7,#0x80000000 + subs r6,r5,#32 + bicpl r3,r3,r7,lsr r6 + bicmi r2,r2,r7,lsr r5 + mov pc,r14 + +CPDO_rnd_NE_01: + cmp r5,#-1 + bne CPDO_rnd_0 + cmp r2,#0x80000000 + cmpeq r3,#0 + beq CPDO_rnd_0 + + mov r2,#0x80000000 + mov r3,#0 + mov r5,#0 + orr r4,r4,#16 // set inexact flag + mov pc,r14 CPDO_rnd_P: - cmp r3,#0 - blt CPDO_rnd_P_small - cmp r3,#31 - movge pc,r5 - - tst r1,#0x80000000 - bne CPDO_rnd_end - - mov r4,#0x80000000 - sub r4,r4,#1 - adds r2,r2,r4,lsr r3 - addcs r3,r3,#1 - movcs r2,r2,rrx + teq r1,#0 + beq CPDO_rnd_NZ + b CPDO_rnd_Z - b CPDO_rnd_end - -CPDO_rnd_P_small: - cmp r2,#0 - beq CPDO_rnd_zero - tst r1,#0x80000000 - bne CPDO_rnd_zero - b CPDO_rnd_one - - CPDO_rnd_M: - cmp r3,#0 - blt CPDO_rnd_M_small - cmp r3,#31 - movge pc,r5 - - tst r1,#0x80000000 - beq CPDO_rnd_end - - mov r4,#0x80000000 - sub r4,r4,#1 - adds r2,r2,r4,lsr r3 - addcs r3,r3,#1 - movcs r2,r2,rrx - - b CPDO_rnd_end - -CPDO_rnd_M_small: - cmp r2,#0 - beq CPDO_rnd_zero - tst r1,#0x80000000 - beq CPDO_rnd_zero - b CPDO_rnd_one + teq r1,#0 + beq CPDO_rnd_Z + b CPDO_rnd_NZ CPDO_rnd_Z: + cmp r5,#0 // smaller than 1 will be 0 + blt CPDO_rnd_0 + + rsbs r6,r5,#31 + bmi CPDO_rnd_Z_2 cmp r3,#0 - blt CPDO_rnd_zero - cmp r3,#31 - movge pc,r5 - - b CPDO_rnd_end + mov r3,#0 + mov r7,r2,lsr r6 + teqeq r2,r7,lsl r6 + mov r2,r7,lsl r6 + orrne r4,r4,#16 // set inexact flag + mov pc,r14 + +CPDO_rnd_Z_2: + rsb r6,r5,#63 + mov r7,r3,lsr r6 + teq r3,r7,lsl r6 + mov r3,r7,lsl r6 + orrne r4,r4,#16 // set inexact flag + mov pc,r14 + +CPDO_rnd_0: + cmp r5,#0x80000000 + moveq pc,r14 // already 0 -> ok + + mov r2,#0 + mov r3,#0 + mov r5,#0x80000000 + orr r4,r4,#16 // set inexact flag + mov pc,r14 + +CPDO_rnd_NZ: + cmp r5,#0 // smaller than 1 will be stay 0 or become 1 + blt CPDO_rnd_NZ_01 + + mov r7,#0x7fffffff + subs r6,r5,#32 + bpl CPDO_rnd_NZ_2 + mov r7,r7,lsr r5 + teq r3,#0 + tsteq r2,r7 + orrne r4,r4,#16 // set inexact flag + adds r3,r3,#0xffffffff + adcs r2,r2,r7 + bcs CPDO_rnd_overflow + mov r3,#0 + bic r2,r2,r7 + mov pc,r14 + +CPDO_rnd_NZ_2: + mov r7,r7,lsr r6 + tst r3,r7 + orrne r4,r4,#16 // set inexact 
flag + adds r3,r3,r7 + adcs r2,r2,#0 + bcs CPDO_rnd_overflow + bic r3,r3,r7 + mov pc,r14 + +CPDO_rnd_NZ_01: + cmp r5,#0x80000000 + moveq pc,r14 // already 0 -> ok -CPDO_rnd_one: mov r2,#0x80000000 mov r3,#0 - mov pc,r5 - -CPDO_rnd_zero: - mov r1,#0 - mov r2,#0 - mov r3,#0x80000000 - mov pc,r5 + mov r5,#0 + orr r4,r4,#16 // set inexact flag + mov pc,r14 + +CPDO_rnd_overflow: + mov r2,#0x80000000 + mov r3,#0 + add r5,r5,#1 + mov pc,r14 + +CPDO_rnd_big: + cmp r5,#0x7fffffff + movne pc,r14 // just big + orrs r6,r3,r2,lsl#1 // ignore MSB + moveq pc,r14 // infinity + tst r2,#0x40000000 // signalling NaN ? + orreq r4,r4,#1 // set invalid operation flag + orreq r2,r2,#0x40000000 // make quiet NaN + mov pc,r14 -CPDO_rnd_end: - rsb r4,r3,#31 - mov r2,r2,lsr r4 - mov r2,r2,lsl r4 - mov pc,r5 - /*---------------------------------------------------------------------------*/ --- ../linux-sharp-a300/arch/arm/fastfpe/CPDT.S 2002-05-15 21:37:41.000000000 +0900 +++ linux/arch/arm/fastfpe/CPDT.S 2006-05-17 19:20:27.754602208 +0900 @@ -1,229 +1,456 @@ /* -Inside the emulator the FP numbers are kept with 32 bit accuracy for both -mantissa and exponent. The FP structure has 4 words reserved for each -register, the first is used just for the sign in bit 31, the second is the -mantissa (unsigned integer) and the third is the exponent (signed integer). - -The functions do actually only work properly for normalized values, and if -no overflow occurs. Hopfully most programs are not disturbed by this, and it -will probably be improved in future versions. +The FP structure has 4 words reserved for each register, the first is used just +for the sign in bit 31, the second and third are for the mantissa (unsigned +integer, high 32 bit first) and the fourth is the exponent (signed integer). +The mantissa is always normalized. -Decimal and packed decimal numbers are not supported so yet. +If the exponent is 0x80000000, that is the most negative value, the number +represented is 0 and both mantissa words are also 0. + +If the exponent is 0x7fffffff, that is the biggest positive value, the number +represented is infinity if the mantissa is 0, otherwise it is a NaN. + +Decimal and packed decimal numbers are not supported yet. 
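+
+As a worked example of the conversion done by CPDT_load_single below
+(illustration only): the IEEE single 1.0 = 0x3f800000 has sign 0, exponent
+field 127 and an all-zero fraction, so it is stored as sign=0, mantissa
+0x80000000 0x00000000 (the implicit leading 1 ends up in bit 31 of the high
+mantissa word) and exponent 127-127=0. Denormalized singles start from
+exponent -127 and are shifted left, decrementing the exponent, until the
+mantissa MSB is set.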
*/ /*---------------------------------------------------------------------------*/ .globl CPDT_load_single CPDT_load_single: - ldr r1,[r6] +__x1: ldrt r1,[r6] - bics r2,r1,#0x80000000 - beq CPDT_load_zero @ test for 0 + and r2,r1,#0x80000000 @ r2 = sign - mov r2,r1,lsl#8 - orr r2,r2,#0x80000000 @ insert leading 1 - - mov r3,r1,lsr#23 - bic r3,r3,#0x100 - sub r3,r3,#127 @ subtract normalized bias + mov r5,r1,lsr#23 + bics r5,r5,#0x100 + beq CPDT_ls_e0 @ exponent = 0; zero/denormalized + teq r5,#255 + beq CPDT_ls_e255 @ exponent = 255; infinity/NaN + + sub r5,r5,#127 @ r5 = exponent, remove normalized bias + + mov r3,r1,lsl#8 + orr r3,r3,#0x80000000 + mov r4,#0 @ r3,r4 = mantissa - and r1,r1,#0x80000000 @ only sign + stmia r0,{r2-r5} + b fastfpe_next + +CPDT_ls_e0: + movs r3,r1,lsl#9 + beq CPDT_load_zero + + mov r5,#-127 - stmia r0,{r1-r3} - mov pc,r14 +CPDT_ls_e0_norm: + tst r3,#0x80000000 + subeq r5,r5,#1 + moveq r3,r3,lsl#1 + beq CPDT_ls_e0_norm + + mov r4,#0 + stmia r0,{r2-r5} + b fastfpe_next + +CPDT_ls_e255: + mov r3,r1,lsl#8 + bics r3,r3,#0x80000000 + orreq r3,r3,#0x80000000 // set MSB for inf + mov r4,#0 + mov r5,#0x7fffffff + stmia r0,{r2-r5} + b fastfpe_next CPDT_load_zero: - mov r1,#0 - mov r2,#0 - mov r3,#0x80000000 - stmia r0,{r1-r3} - mov pc,r14 + mov r3,#0 + mov r4,#0 + mov r5,#0x80000000 + stmia r0,{r2-r5} + b fastfpe_next /*---------------------------------------------------------------------------*/ .globl CPDT_load_double CPDT_load_double: - ldr r2,[r6,#4] - ldr r1,[r6] - - bics r3,r1,#0x80000000 - cmpeq r2,#0 - beq CPDT_load_zero @ test for 0 +__x2: ldrt r1,[r6],#4 +__x3: ldrt r6,[r6] - mov r2,r2,lsr#21 - orr r2,r2,r1,lsl#11 - orr r2,r2,#0x80000000 @ insert leading 1 - - mov r3,r1,lsr#20 - bic r3,r3,#0x800 - sub r3,r3,#1024 - add r3,r3,#1 @ subtract normalized bias - - and r1,r1,#0x80000000 - - cmp r2,#0 + and r2,r1,#0x80000000 @ r2 = sign + + mov r5,r1,lsr#20 + bics r5,r5,#0x800 + beq CPDT_ld_e0 @ exponent = 0; zero/denormalized + add r4,r5,#1 + teq r4,#2048 + beq CPDT_ld_e2047 @ exponent = 2047; infinity/NaN + + add r5,r5,#1 + sub r5,r5,#1024 @ r5 = exponent, remove normalized bias + + mov r3,r1,lsl#11 + orr r3,r3,#0x80000000 + orr r3,r3,r6,lsr #21 + mov r4,r6,lsl#11 @ r3,r4 = mantissa + + stmia r0,{r2-r5} + b fastfpe_next + +CPDT_ld_e0: + mov r3,r1,lsl#12 + orr r3,r3,r6,lsr#20 + movs r4,r6,lsl#12 + teqeq r3,#0 beq CPDT_load_zero - stmia r0,{r1-r3} - mov pc,r14 - + mov r5,#1 + sub r5,r5,#1024 + +CPDT_ld_e0_norm: + tst r3,#0x80000000 + bne CPDT_ld_e0_norm_end + sub r5,r5,#1 + movs r4,r4,lsl#1 + adc r3,r3,r3 + b CPDT_ld_e0_norm +CPDT_ld_e0_norm_end: + stmia r0,{r2-r5} + b fastfpe_next + +CPDT_ld_e2047: + mov r3,r1,lsl#11 + orr r3,r3,r6,lsr #21 + bic r3,r3,#0x80000000 + mov r4,r6,lsl#11 @ r3,r4 = mantissa + orrs r5,r3,r4 + orreq r3,r3,#0x80000000 // set MSB fo inf + mov r5,#0x7fffffff + stmia r0,{r2-r5} + b fastfpe_next + /*---------------------------------------------------------------------------*/ .globl CPDT_load_extended CPDT_load_extended: - ldr r2,[r6,#4] - ldr r1,[r6] - - cmp r2,#0 - bics r3,r1,#0x80000000 - beq CPDT_load_zero @ test for 0 - - orr r2,r2,#0x80000000 @ insert leading 1 - - bic r3,r1,#0x80000000 - sub r3,r3,#16384 - add r3,r3,#1 @ subtract normalized bias - - and r1,r1,#0x80000000 +__x4: ldrt r1,[r6],#4 +__x5: ldrt r3,[r6],#4 +__x6: ldrt r4,[r6] + + and r2,r1,#0x8000 + mov r2,r2,lsl#16 + mov r5,r1,lsl#17 + movs r5,r5,lsr#17 + beq CPDT_le_e0 + add r1,r5,#1 + teq r1,#32768 + beq CPDT_le_e32767 + + add r5,r5,#1 + sub r5,r5,#16384 + + stmia r0,{r2-r5} + b 
fastfpe_next + +CPDT_le_e0: + teq r3,#0 + teqeq r4,#0 + beq CPDT_load_zero - cmp r2,#0 - beq CPDT_load_zero - - stmia r0,{r1-r3} - mov pc,r14 + mov r5,#2 + sub r5,r5,#16384 + b CPDT_ld_e0_norm + +CPDT_le_e32767: + mov r5,#0x7fffffff + stmia r0,{r2-r5} + b fastfpe_next /*---------------------------------------------------------------------------*/ .globl CPDT_load_decimal CPDT_load_decimal: - mov pc,r14 + b fastfpe_next /*---------------------------------------------------------------------------*/ .globl CPDT_store_single CPDT_store_single: - ldmia r0,{r1-r3} + ldmia r0,{r1-r4} - cmp r2,#0 - beq CPDT_store_single_zero + cmp r4,#-127 + ble CPDT_ss_e0 + cmp r4,#128 + bge CPDT_ss_e255 - adds r3,r3,#127 - ble CPDT_store_single_zero - - bic r3,r3,#0x100 - orr r1,r1,r3,lsl#23 + add r4,r4,#127 + orr r1,r1,r4,lsl#23 bic r2,r2,#0x80000000 orr r1,r1,r2,lsr#8 - str r1,[r6] - mov pc,r14 +__x7: strt r1,[r6] + b fastfpe_next + +CPDT_ss_e0: + cmp r4,#-150 + ble CPDT_ss_zero + + add r4,r4,#126 + rsb r4,r4,#0 + mov r2,r2,lsr r4 + + orr r1,r1,r2,lsr#8 + +CPDT_ss_zero: +__x8: strt r1,[r6] + b fastfpe_next + +CPDT_ss_e255: + orr r1,r1,#0x7f000000 + orr r1,r1,#0x00800000 + cmp r4,#0x7fffffff + movne r2,#0 + movne r3,#0 + bic r2,r2,#0x80000000 + orrs r4,r3,r2,lsl#24 // only bits not stored in single + bne CPDT_ss_nan_special // NaN must not become Inf +CPDT_ss_nan_back: + orr r1,r1,r2,lsr#8 +__x9: strt r1,[r6] + b fastfpe_next -CPDT_store_single_zero: - mov r1,#0 - str r1,[r6] - mov pc,r14 +CPDT_ss_nan_special: + cmp r2,#1<<8 + movlt r2,#1<<8 + b CPDT_ss_nan_back /*---------------------------------------------------------------------------*/ .globl CPDT_store_double CPDT_store_double: - ldmia r0,{r1-r3} - - cmp r2,#0 - beq CPDT_store_double_zero - - adds r3,r3,#1024 - ble CPDT_store_double_zero + ldmia r0,{r1-r4} - sub r3,r3,#1 - bic r3,r3,#0x800 - orr r1,r1,r3,lsl#20 + cmp r4,#1024 @ this check has to be first, or + bge CPDT_sd_e2047 @ overflow can occur on second ! + add r0,r4,#3 + cmp r0,#-1023+3 @ cmp with -1023 + ble CPDT_sd_e0 + + sub r4,r4,#1 + add r4,r4,#1024 + orr r1,r1,r4,lsl#20 bic r2,r2,#0x80000000 orr r1,r1,r2,lsr#11 - + mov r2,r2,lsl#21 + orr r2,r2,r3,lsr#11 + +__x10: strt r1,[r6],#4 +__x11: strt r2,[r6] + b fastfpe_next + +CPDT_sd_e0: + add r0,r4,#1075-1024 + cmp r0,#-1024 + ble CPDT_sd_zero + + add r4,r4,#1024 + sub r4,r4,#2 +CPDT_sd_unnormalize: + movs r2,r2,lsr#1 + mov r3,r3,rrx + adds r4,r4,#1 + bne CPDT_sd_unnormalize + + orr r1,r1,r2,lsr#11 + mov r2,r2,lsl#21 + orr r2,r2,r3,lsr#11 + +__x12: strt r1,[r6],#4 +__x13: strt r2,[r6] + b fastfpe_next - stmia r6,{r1-r2} - mov pc,r14 - -CPDT_store_double_zero: - mov r1,#0 +CPDT_sd_zero: mov r2,#0 - stmia r6,{r1-r2} - mov pc,r14 +__x14: strt r1,[r6],#4 +__x15: strt r2,[r6] + b fastfpe_next + +CPDT_sd_e2047: + orr r1,r1,#0x7f000000 + orr r1,r1,#0x00f00000 + cmp r4,#0x7fffffff + movne r2,#0 + movne r3,#0 + movs r5,r3,lsl#21 // only bits not stored in double ! 
+ bne CPDT_sd_nan_special +CPDT_sd_nan_back: + orr r1,r1,r2,lsr#11 + mov r2,r2,lsl#21 + orr r2,r2,r3,lsr#11 +__x16: strt r1,[r6],#4 +__x17: strt r2,[r6] + b fastfpe_next + +CPDT_sd_nan_special: + bics r2,r2,#0x80000000 + bne CPDT_sd_nan_back + cmp r3,#1<<11 + movlt r3,#1<<11 + b CPDT_sd_nan_back /*---------------------------------------------------------------------------*/ .globl CPDT_store_extended CPDT_store_extended: - ldmia r0,{r1-r3} + ldmia r0,{r1-r4} - cmp r2,#0 - beq CPDT_store_extended_zero - - adds r3,r3,#16384 - ble CPDT_store_extended_zero + cmp r4,#16384 @ this check has to be first, or + bge CPDT_se_e32767 @ overflow can occur with second ! + add r0,r4,#63 + cmp r0,#-16383+63 + ble CPDT_se_e0 + + sub r4,r4,#1 + add r4,r4,#16384 + orr r1,r4,r1,lsr#16 + +__x18: strt r1,[r6],#4 +__x19: strt r2,[r6],#4 +__x20: strt r3,[r6] + b fastfpe_next + +CPDT_se_e0: + add r0,r4,#16446-16384 + cmp r0,#-16384 + ble CPDT_se_zero + + add r4,r4,#16384 + sub r4,r4,#2 +CPDT_se_unnormalize: + movs r2,r2,lsr#1 + mov r3,r3,rrx + adds r4,r4,#1 + bne CPDT_se_unnormalize + + mov r1,r1,lsr#16 +__x21: strt r1,[r6],#4 +__x22: strt r2,[r6],#4 +__x23: strt r3,[r6] + b fastfpe_next - sub r3,r3,#1 - mov r3,r3,lsl#17 - orr r1,r1,r3,lsr#17 - - mov r3,#0 - - stmia r6,{r1-r3} - mov pc,r14 - -CPDT_store_extended_zero: - mov r1,#0 +CPDT_se_zero: + mov r1,r1,lsr#16 mov r2,#0 mov r3,#0 - stmia r6,{r1-r3} - mov pc,r14 - +__x24: strt r1,[r6],#4 +__x25: strt r2,[r6],#4 +__x26: strt r3,[r6] + b fastfpe_next + +CPDT_se_e32767: + cmp r4,#0x7fffffff + movne r2,#0 + movne r3,#0 + mov r1,r1,lsr#16 + orr r1,r1,#0x00007f00 + orr r1,r1,#0x000000ff +__x27: strt r1,[r6],#4 +__x28: strt r2,[r6],#4 +__x29: strt r3,[r6] + b fastfpe_next + /*---------------------------------------------------------------------------*/ .globl CPDT_store_decimal CPDT_store_decimal: - mov pc,r14 + b fastfpe_next /*---------------------------------------------------------------------------*/ .globl CPDT_sfm -CPDT_sfm: - add r2,r10,r0,lsr#8 - ldr r3,[r2],#4 - str r3,[r6],#4 - ldr r3,[r2],#4 - str r3,[r6],#4 - ldr r3,[r2],#4 - str r3,[r6],#4 - +CPDT_sfm_loop: add r0,r0,#1<<12 and r0,r0,#7<<12 +CPDT_sfm: + add r7,r10,r0,lsr#8 + ldmia r7,{r2-r5} + bic r3,r3,#0x80000000 + orr r3,r3,r2 +__x30: strt r3,[r6],#4 +__x31: strt r4,[r6],#4 +__x32: strt r5,[r6],#4 + subs r1,r1,#1 - bne CPDT_sfm - mov pc,r14 + bne CPDT_sfm_loop + b fastfpe_next /*---------------------------------------------------------------------------*/ .globl CPDT_lfm -CPDT_lfm: - add r2,r10,r0,lsr#8 - ldr r3,[r6],#4 - str r3,[r2],#4 - ldr r3,[r6],#4 - str r3,[r2],#4 - ldr r3,[r6],#4 - str r3,[r2],#4 - +CPDT_lfm_loop: add r0,r0,#1<<12 and r0,r0,#7<<12 +CPDT_lfm: + add r7,r10,r0,lsr#8 +__x33: ldrt r3,[r6],#4 +__x34: ldrt r4,[r6],#4 +__x35: ldrt r5,[r6],#4 + and r2,r3,#0x80000000 + cmp r5,#0x80000000 // check if the number was 0 + cmpne r5,#0x7fffffff // or inf/NaN + biceq r3,r3,#0x80000000 // yes -> clear mantissa MSB + orrne r3,r3,#0x80000000 // no -> set mantissa MSB + stmia r7,{r2-r5} + subs r1,r1,#1 - bne CPDT_lfm - mov pc,r14 + bne CPDT_lfm_loop + b fastfpe_next /*---------------------------------------------------------------------------*/ + + .section .fixup,"ax" + .align +__f1: + mov pc,r9 + .previous + .section __ex_table,"a" + .align 3 + .long __x1,__f1 + .long __x2,__f1 + .long __x3,__f1 + .long __x4,__f1 + .long __x5,__f1 + .long __x6,__f1 + .long __x7,__f1 + .long __x8,__f1 + .long __x9,__f1 + .long __x10,__f1 + .long __x11,__f1 + .long __x12,__f1 + .long __x13,__f1 + .long __x14,__f1 + 
.long __x15,__f1 + .long __x16,__f1 + .long __x17,__f1 + .long __x18,__f1 + .long __x19,__f1 + .long __x20,__f1 + .long __x21,__f1 + .long __x22,__f1 + .long __x23,__f1 + .long __x24,__f1 + .long __x25,__f1 + .long __x26,__f1 + .long __x27,__f1 + .long __x28,__f1 + .long __x29,__f1 + .long __x30,__f1 + .long __x31,__f1 + .long __x32,__f1 + .long __x33,__f1 + .long __x34,__f1 + .long __x35,__f1 + .previous --- ../linux-sharp-a300/arch/arm/fastfpe/CPRT.S 2002-05-15 21:37:41.000000000 +0900 +++ linux/arch/arm/fastfpe/CPRT.S 2006-05-17 19:20:27.754602208 +0900 @@ -1,148 +1,219 @@ +/* +The FP structure has 4 words reserved for each register, the first is used just +for the sign in bit 31, the second and third are for the mantissa (unsigned +integer, high 32 bit first) and the fourth is the exponent (signed integer). +The mantissa is always normalized. + +If the exponent is 0x80000000, that is the most negative value, the number +represented is 0 and both mantissa words are also 0. + +If the exponent is 0x7fffffff, that is the biggest positive value, the number +represented is infinity if the mantissa is 0, otherwise it is a NaN. + +Decimal and packed decimal numbers are not supported yet. +*/ + /*---------------------------------------------------------------------------*/ .text .globl CPRT_flt CPRT_flt: add r0,r13,r0,lsr#10 - ldr r3,[r0] - cmp r3,#0 - beq CPRT_zero + ldr r2,[r0] + mov r0,r1 + mov r3,#0 + cmp r2,#0 + beq CPRT_flt_zero + + ldr r6,=round_table + and r5,r4,#0x000000e0 + and r4,r4,#0x00080000 + orr r5,r5,r4,lsr#11 + ldr r6,[r6,r5,lsr#3] // address of rounding function - ands r2,r3,#0x80000000 - rsbne r3,r3,#0 + ands r1,r2,#0x80000000 + rsbne r2,r2,#0 mov r4,#31 - cmp r3,#0x00010000 - movcc r3,r3,lsl#16 + cmp r2,#0x00010000 + movcc r2,r2,lsl#16 subcc r4,r4,#16 - cmp r3,#0x01000000 - movcc r3,r3,lsl#8 + cmp r2,#0x01000000 + movcc r2,r2,lsl#8 subcc r4,r4,#8 - cmp r3,#0x10000000 - movcc r3,r3,lsl#4 + cmp r2,#0x10000000 + movcc r2,r2,lsl#4 subcc r4,r4,#4 - cmp r3,#0x40000000 - movcc r3,r3,lsl#2 + cmp r2,#0x40000000 + movcc r2,r2,lsl#2 subcc r4,r4,#2 - cmp r3,#0x80000000 - movcc r3,r3,lsl#1 + cmp r2,#0x80000000 + movcc r2,r2,lsl#1 subcc r4,r4,#1 - stmia r1,{r2,r3,r4} - mov pc,r14 + mov r5,#0 + ldr r14,=fastfpe_next + mov pc,r6 -CPRT_zero: - mov r2,#0 - mov r3,#0 +CPRT_flt_zero: + mov r1,#0 mov r4,#0x80000000 - stmia r1,{r2,r3,r4} - mov pc,r14 + stmia r0,{r1,r2,r3,r4} + b fastfpe_next /*---------------------------------------------------------------------------*/ .globl CPRT_fix CPRT_fix: - adr r5,CPRT_back - b CPDO_rnd_core + ldmia r2,{r1,r2,r3,r5} + bl CPDO_rnd_core -CPRT_back: add r0,r13,r0,lsr#10 - cmp r3,#0 - blt CPRT_int_zero - cmp r3,#30 - bgt CPRT_overflow - - rsb r3,r3,#31 - mov r2,r2,lsr r3 + cmp r5,#0 + blt CPRT_fix_zero + cmp r5,#30 + bgt CPRT_fix_overflow + +CPRT_fix_no_overflow: + rsb r5,r5,#31 + mov r2,r2,lsr r5 tst r1,#0x80000000 rsbne r2,r2,#0 - +CPRT_fix_zero_back: str r2,[r0] - mov pc,r14 + ldr r1,[r10,#128] + orr r1,r1,r4 // set flags possibly caused by rounding + str r1,[r10,#128] + b fastfpe_next -CPRT_int_zero: +CPRT_fix_zero: mov r2,#0 - str r2,[r0] - mov pc,r14 + b CPRT_fix_zero_back + +CPRT_fix_overflow: + cmp r1,#0x80000000 // -2^31 is not exactly an overflow ... 
+ cmpeq r2,#0x80000000 + cmpeq r5,#31 + beq CPRT_fix_no_overflow -CPRT_overflow: mov r2,#0x80000000 tst r1,#0x80000000 subeq r2,r2,#1 str r2,[r0] - mov pc,r14 + + ldr r1,[r10,#128] + orr r1,r1,#1 // set invalid operation flag + str r1,[r10,#128] + b fastfpe_next /*---------------------------------------------------------------------------*/ .globl CPRT_wfs CPRT_wfs: - mov pc,r14 + ldr r0,[r13,r0,lsr#10] + str r0,[r10,#128] + b fastfpe_next /*---------------------------------------------------------------------------*/ .globl CPRT_rfs CPRT_rfs: - add r0,r13,r0,lsr#10 - mov r1,#0x02000000 @ Software Emulation, not Acorn FPE - str r1,[r0] - mov pc,r14 + ldr r1,[r10,#128] + bic r1,r1,#0xff000000 + orr r1,r1,#0x02000000 @ Software Emulation, not Acorn FPE + str r1,[r13,r0,lsr#10] + b fastfpe_next /*---------------------------------------------------------------------------*/ .globl CPRT_cmf CPRT_cmf: - ldmia r1,{r1,r3,r5} - ldmia r2,{r2,r4,r6} + ldmia r1,{r1,r3,r5,r7} + ldmia r2,{r2,r4,r6,r8} CPRT_cmf_e: ldr r0,[r13,#16*4] bic r0,r0,#0xf0000000 + cmp r7,#0x7fffffff + beq CPRT_cmf_nan1 +CPRT_cmf_nixnan1: + cmp r8,#0x7fffffff + beq CPRT_cmf_nan2 +CPRT_cmf_nixnan2: + cmp r1,r2 beq CPRT_cmf_equalsign - b CPRT_cmf_sign + b CPRT_cmf_signx + +CPRT_cmf_nan1: + orrs r11,r5,r3,lsl#1 // ignore MSB + beq CPRT_cmf_nixnan1 + b CPRT_cmf_unordered + +CPRT_cmf_nan2: + orrs r11,r6,r4,lsl#1 // ignore MSB + beq CPRT_cmf_nixnan2 + b CPRT_cmf_unordered CPRT_cmf_equalsign: - cmp r5,r6 + cmp r7,r8 beq CPRT_cmf_equalexponent bgt CPRT_cmf_sign b CPRT_cmf_signb CPRT_cmf_equalexponent: cmp r3,r4 + cmpeq r5,r6 beq CPRT_cmf_equal - bgt CPRT_cmf_sign + bhi CPRT_cmf_sign b CPRT_cmf_signb +CPRT_cmf_signx: + teq r7,#0x80000000 + teqeq r8,#0x80000000 + beq CPRT_cmf_equal CPRT_cmf_sign: tst r1,#0x80000000 - orreq r0,r0,#0x20000000 - orrne r0,r0,#0x80000000 + orreq r0,r0,#0x20000000 // PSR carry + orrne r0,r0,#0x80000000 // PSR negative str r0,[r13,#16*4] - mov pc,r14 + b fastfpe_next CPRT_cmf_signb: tst r1,#0x80000000 - orrne r0,r0,#0x20000000 - orreq r0,r0,#0x80000000 + orrne r0,r0,#0x20000000 // PSR carry + orreq r0,r0,#0x80000000 // PSR negative str r0,[r13,#16*4] - mov pc,r14 + b fastfpe_next CPRT_cmf_equal: - orr r0,r0,#0x60000000 + orr r0,r0,#0x60000000 // PSR carry, zero str r0,[r13,#16*4] - mov pc,r14 + b fastfpe_next + +CPRT_cmf_unordered: + ldr r1,[r10,#128] + orr r1,r1,#1 // set invalid operation flag + str r1,[r10,#128] + + tst r0,#1<<12 // FPSR AC bit set ? 
+ orrne r0,r0,#0x20000000 // PSR carry + orr r0,r0,#0x10000000 // PSR overflow + str r0,[r13,#16*4] + + b fastfpe_next /*---------------------------------------------------------------------------*/ .globl CPRT_cnf CPRT_cnf: - ldmia r1,{r1,r3,r5} - ldmia r2,{r2,r4,r6} + ldmia r1,{r1,r3,r5,r7} + ldmia r2,{r2,r4,r6,r8} eor r2,r2,#0x80000000 b CPRT_cmf_e --- ../linux-sharp-a300/arch/arm/fastfpe/Makefile 2002-05-15 21:37:41.000000000 +0900 +++ linux/arch/arm/fastfpe/Makefile 2006-05-17 19:20:27.755602056 +0900 @@ -11,13 +11,15 @@ obj-n := obj- := -fastfpe-objs := module.o entry.o CPDO.o CPRT.o CPDT.o +fastfpe-objs := module.o round.o CPDT.o CPRT.o CPDO.o entry.o list-multi := fastfpe.o obj-$(CONFIG_FPE_FASTFPE) += fastfpe.o +USE_STANDARD_AS_RULE := true + include $(TOPDIR)/Rules.make fastfpe.o: $(fastfpe-objs) - $(LD) -r -o $@ $(fastfpe-objs) + $(LD) -r -o $@ $(fastfpe-objs) --- ../linux-sharp-a300/arch/arm/kernel/entry-common.S 2002-05-15 21:37:38.000000000 +0900 +++ linux/arch/arm/kernel/entry-common.S 2006-05-17 19:20:39.592802528 +0900 @@ -22,12 +22,10 @@ * Our do_softirq out of line code. See include/asm-arm/softirq.h for * the calling assembly. */ - .section ".text.lock","ax" ENTRY(__do_softirq) stmfd sp!, {r0 - r3, ip, lr} bl do_softirq ldmfd sp!, {r0 - r3, ip, pc} - .previous .align 5 /* @@ -132,7 +130,7 @@ ldr ip, [ip] mcr p15, 0, ip, c1, c0 @ update control register #endif - enable_irqs ip + enable_irq ip str r4, [sp, #-S_OFF]! @ push fifth arg --- ../linux-sharp-a300/arch/arm/kernel/entry-header.S 2002-05-15 21:37:38.000000000 +0900 +++ linux/arch/arm/kernel/entry-header.S 2006-05-17 19:20:42.576348960 +0900 @@ -13,7 +13,7 @@ #endif .macro zero_fp -#ifndef CONFIG_NO_FRAME_POINTER +#ifdef CONFIG_FRAME_POINTER mov fp, #0 #endif .endm @@ -108,7 +108,7 @@ .macro mask_pc, rd, rm .endm - .macro enable_irqs, temp + .macro enable_irq, temp mov \temp, #MODE_SVC msr cpsr_c, \temp .endm @@ -164,7 +164,7 @@ bic \rd, \rm, #PCMASK .endm - .macro enable_irqs, temp + .macro enable_irq, temp teqp pc, #0x00000003 .endm --- ../linux-sharp-a300/arch/arm/fastfpe/entry.S 2002-05-15 21:37:41.000000000 +0900 +++ linux/arch/arm/fastfpe/entry.S 2006-05-17 19:20:27.755602056 +0900 @@ -9,18 +9,20 @@ */ +#include + /*---------------------------------------------------------------------------*/ .data fp_const: - .word 0, 0x00000000, 0x80000000, 0 @ 0 - .word 0, 0x80000000, 0, 0 @ 1 - .word 0, 0x80000000, 1, 0 @ 2 - .word 0, 0xc0000000, 1, 0 @ 3 - .word 0, 0x80000000, 2, 0 @ 4 - .word 0, 0xa0000000, 2, 0 @ 5 - .word 0, 0x80000000, -1, 0 @ 0.5 - .word 0, 0xa0000000, 3, 0 @ 10 + .word 0, 0x00000000, 0, 0x80000000 @ 0 + .word 0, 0x80000000, 0, 0 @ 1 + .word 0, 0x80000000, 0, 1 @ 2 + .word 0, 0xc0000000, 0, 1 @ 3 + .word 0, 0x80000000, 0, 2 @ 4 + .word 0, 0xa0000000, 0, 2 @ 5 + .word 0, 0x80000000, 0, -1 @ 0.5 + .word 0, 0xa0000000, 0, 3 @ 10 fp_undef: .word 0 fp_cond: @@ -76,21 +78,19 @@ /*---------------------------------------------------------------------------*/ -finish: + .globl fastfpe_next +fastfpe_next: ldr r5,[r13,#60] -next_after_cond: +next_after_cond_false: __x1: ldrt r4,[r5],#4 ldr r0,=fp_cond @ check condition of next instruction - ldr r1,[r13,#64] @ psr containing flags mov r2,r4,lsr#28 - mov r1,r1,lsr#28 - ldr r0,[r0,r2,lsl#2] - mov r0,r0,lsr r1 - tst r0,#1 - beq next_after_cond @ must not necessarily have been an - @ FP instruction ! 
+ cmp r2,#0xe @ "always" condition code + bne next_check_cond + +next_check_copro: and r1,r4,#0x0f000000 @ Test for copro instruction cmp r1,#0x0c000000 rsbgts r0,r1,#0x0e000000 @ cmpgt #0x0e000000,r1 @@ -101,7 +101,19 @@ movge pc,r9 @ copro = 0 or >=3, return str r5,[r13,#60] @ save updated pc - b next_enter + cmp r1,#1<<8 @ which copro ? + beq copro_1 + b copro_2 + +next_check_cond: + ldr r1,[r13,#64] @ psr containing flags + ldr r0,[r0,r2,lsl#2] + mov r1,r1,lsr#28 + mov r0,r0,lsr r1 + tst r0,#1 + bne next_check_copro + b next_after_cond_false @ must not necessarily have been an + @ FP instruction ! /*---------------------------------------------------------------------------*/ @@ -123,36 +135,29 @@ subeq r7,r6,r7,lsl#2 @ r6=base address +/- offset tst r4,#0x01000000 @ preindexing ? movne r6,r7 + tst r4,#0x00200000 @ write back ? + cmpne r5,#0x000f0000 @ base register = pc ? + strne r7,[r13,r5,lsr#14] and r0,r4,#0x00007000 @ r0=fp register number << 12 add r0,r10,r0,lsr#8 @ r0=address of fp register - mov r1,#0 - tst r4,#0x00008000 - orrne r1,r1,#1 @ T0 + + and r1,r4,#0x00008000 @ T0 tst r4,#0x00400000 - orrne r1,r1,#2 @ T1 + orrne r1,r1,#0x00010000 @ T1 tst r4,#0x00100000 - orrne r1,r1,#4 @ L/S + orrne r1,r1,#0x00020000 @ L/S - adr r14,CPDT_1_writeback @ for being able to "call" something - add pc,pc,r1,lsl#2 - mov r0,r0 - b CPDT_store_single @ these functions get - b CPDT_store_double @ r0=address of fp register - b CPDT_store_extended @ r6=address of data - b undefined @ CPDT_store_decimal @ and may modify r0-r3 - b CPDT_load_single - b CPDT_load_double - b CPDT_load_extended - b undefined @ CPDT_load_decimal - -CPDT_1_writeback: - tst r4,#0x00200000 @ write back ? - cmpne r5,#0x000f0000 @ base register = pc ? - beq finish - - str r7,[r13,r5,lsr#14] - b finish + ldr pc,[pc,r1,lsr#13] + .word 0 + .word CPDT_store_single @ these functions get + .word CPDT_store_double @ r0=address of fp register + .word CPDT_store_extended @ r6=address of data + .word undefined @ CPDT_store_decimal + .word CPDT_load_single + .word CPDT_load_double + .word CPDT_load_extended + .word undefined @ CPDT_load_decimal /*---------------------------------------------------------------------------*/ @@ -168,27 +173,22 @@ subeq r7,r6,r7,lsl#2 @ r7=base address +/- offset tst r4,#0x01000000 @ preindexing ? movne r6,r7 + tst r4,#0x00200000 @ write back ? + cmpne r5,#0x000f0000 @ base register = pc ? + strne r7,[r13,r5,lsr#14] and r0,r4,#0x00007000 @ r0=fp register number << 12 and r1,r4,#0x00008000 mov r1,r1,lsr#15 @ N0 and r2,r4,#0x00400000 - orrs r1,r1,r2,lsr#21 @ N1 + orr r1,r1,r2,lsr#21 @ N1 + cmp r1,#0 addeq r1,r1,#4 @ r1=register count - adr r14,CPDT_M_writeback @ for being able to "call" something tst r4,#0x00100000 @ load/store beq CPDT_sfm b CPDT_lfm -CPDT_M_writeback: - tst r4,#0x00200000 @ write back ? - cmpne r5,#0x000f0000 @ base register = pc ? 
- beq finish - - str r7,[r13,r5,lsr#14] - b finish - /*---------------------------------------------------------------------------*/ CPDO_CPRT_enter: @@ -205,45 +205,50 @@ add r2,r10,r2,lsl#4 @ r2=address of Fm CPDO_constback: + ldr r3,=round_table + and r5,r4,#0x000000e0 + and r6,r4,#0x00080000 + orr r5,r5,r6,lsr#11 @ r5=containing rounding mode/precision + ldr r14,[r3,r5,lsr#3] @ r14=address of rounding function and r3,r4,#0x00f00000 tst r4,#0x00008000 - orrne r3,r3,#0x01000000 - - adr r14,finish @ call return address - add pc,pc,r3,lsr#18 - mov r0,r0 - b CPDO_adf - b CPDO_muf - b CPDO_suf - b CPDO_rsf - b CPDO_dvf - b CPDO_rdf - b undefined - b undefined - b undefined @ CPDO_rmf - b CPDO_muf - b CPDO_dvf - b CPDO_rdf - b undefined - b undefined - b undefined - b undefined - b CPDO_mvf - b CPDO_mnf - b CPDO_abs - b CPDO_rnd - b CPDO_sqt - b undefined - b undefined - b undefined - b undefined - b undefined - b undefined - b undefined - b undefined - b undefined - b CPDO_rnd - b finish + orrne r3,r3,#0x01000000 @ r3=operation code + + ldr pc,[pc,r3,lsr#18] + .word 0 +CPDO_table: + .word CPDO_adf + .word CPDO_muf + .word CPDO_suf + .word CPDO_rsf + .word CPDO_dvf + .word CPDO_rdf + .word undefined + .word undefined + .word CPDO_rmf + .word CPDO_muf + .word CPDO_dvf + .word CPDO_rdf + .word undefined + .word undefined + .word undefined + .word undefined + .word CPDO_mvf + .word CPDO_mnf + .word CPDO_abs + .word CPDO_rnd + .word CPDO_sqt + .word undefined + .word undefined + .word undefined + .word undefined + .word undefined + .word undefined + .word undefined + .word undefined + .word undefined + .word CPDO_rnd + .word fastfpe_next CPDO_const: ldr r2,=fp_const @@ -265,31 +270,58 @@ CPRT_constback: and r3,r4,#0x00f00000 - adr r14,finish @ call return address - add pc,pc,r3,lsr#18 - mov r0,r0 - b CPRT_flt - b CPRT_fix - b CPRT_wfs - b CPRT_rfs - b undefined - b undefined - b undefined - b undefined - b undefined - b CPRT_cmf - b undefined - b CPRT_cnf - b undefined - b CPRT_cmf - b undefined - b CPRT_cnf + ldr pc,[pc,r3,lsr#18] + .word 0 + .word CPRT_flt + .word CPRT_fix + .word CPRT_wfs + .word CPRT_rfs + .word undefined + .word undefined + .word undefined + .word undefined + .word undefined + .word CPRT_cmf + .word undefined + .word CPRT_cnf + .word undefined + .word CPRT_cmf + .word undefined + .word CPRT_cnf CPRT_const: ldr r2,=fp_const and r3,r4,#0x00000007 add r2,r2,r3,lsl#4 b CPRT_constback + +/*---------------------------------------------------------------------------*/ + + @ Test if long multiply instructions are available + + .globl fastfpe_test +fastfpe_test: + .globl elf_hwcap + ldr r0,=elf_hwcap + ldr r0,[r0] + tst r0,#HWCAP_FAST_MULT + bne fastfpe_has_long_multiply + mov r0,#0 + mov pc,r14 + +fastfpe_has_long_multiply: + adr r0,CPDO_table + ldr r1,=CPDO_muf_M + str r1,[r0,#1*4] @ muf + str r1,[r0,#9*4] @ fml + ldr r1,=CPDO_dvf_M + str r1,[r0,#4*4] @ dvf + str r1,[r0,#10*4] @ fdv + ldr r1,=CPDO_rdf_M + str r1,[r0,#5*4] @ rdf + str r1,[r0,#11*4] @ frd + mov r0,#1 + mov pc,r14 /*---------------------------------------------------------------------------*/ @@ -306,4 +338,3 @@ .previous /*---------------------------------------------------------------------------*/ - --- ../linux-sharp-a300/arch/arm/fastfpe/module.c 2002-05-15 21:37:41.000000000 +0900 +++ linux/arch/arm/fastfpe/module.c 2006-05-17 19:20:27.756601904 +0900 @@ -1,6 +1,6 @@ /* Fast Floating Point Emulator - (c) Peter Teichmann + (c) Peter Teichmann This program is free software; you can redistribute it and/or modify it 
under the terms of the GNU General Public License as published by @@ -33,30 +33,16 @@ static void (*orig_fp_enter)(void); /* old kern_fp_enter value */ extern void (*kern_fp_enter)(void); /* current FP handler */ extern void fastfpe_enter(void); /* forward declarations */ - -#ifdef MODULE -/* - * Return 0 if we can be unloaded. This can only happen if - * kern_fp_enter is still pointing at fastfpe_enter - */ -static int fpe_unload(void) -{ - return (kern_fp_enter == fastfpe_enter) ? 0 : 1; -} -#endif +extern int fastfpe_test(void); /* long multiply available ? */ static int __init fpe_init(void) { -#ifdef MODULE - if (!mod_member_present(&__this_module, can_unload)) - return -EINVAL; - __this_module.can_unload = fpe_unload; -#else if (fpe_type[0] && strcmp(fpe_type, "fastfpe")) return 0; -#endif - printk("Fast Floating Point Emulator V0.0 (c) Peter Teichmann.\n"); + printk("Fast Floating Point Emulator V0.94"); + if (fastfpe_test() == 1) printk("M"); + printk(" by Peter Teichmann.\n"); /* Save pointer to the old FP handler and then patch ourselves in */ orig_fp_enter = kern_fp_enter; @@ -74,5 +60,5 @@ module_init(fpe_init); module_exit(fpe_exit); -MODULE_AUTHOR("Peter Teichmann "); -MODULE_DESCRIPTION("Fast floating point emulator"); +MODULE_AUTHOR("Peter Teichmann "); +MODULE_DESCRIPTION("Fast floating point emulator with full precision"); --- ../linux-sharp-a300/arch/arm/fastfpe/round.S 1970-01-01 09:00:00.000000000 +0900 +++ linux/arch/arm/fastfpe/round.S 2006-05-17 19:20:27.757601752 +0900 @@ -0,0 +1,912 @@ + +/* +Rounds fp register r1-r4, additional mantissa bits in r5 and stores result +at address r0. Returns to fastfpe_next. +*/ + +/*------------------------------------------------------------------------*/ + + .data + .globl round_table +round_table: + .word round_single_ne + .word round_single_p + .word round_single_m + .word round_single_z + .word round_double_ne + .word round_double_p + .word round_double_m + .word round_double_z + .word round_extended_ne + .word round_extended_p + .word round_extended_m + .word round_extended_z + .word round_undef + .word round_undef + .word round_undef + .word round_undef + +/*------------------------------------------------------------------------*/ + + .text +round_single_ne: + cmp r4,#127 + bgt round_single_nz_ne_overflow + cmp r4,#-126-23-1 + blt round_single_z_ne_underflow + cmp r4,#-126 + blt round_single_ne_denormalized + + adds r6,r2,#0x80 // add 0x80.00000000.00000000 to + bcs round_single_add_ov // mantissa and additional bits + + teq r5,#0 + teqeq r3,#0 + tsteq r2,#0xff // test for inexact + + ldrne r7,[r10,#128] + orrne r7,r7,#16 // set inexact flag + strne r7,[r10,#128] + + teq r5,#0 + teqeq r3,#0 + tsteq r6,#0xff + biceq r6,r6,#0x100 // the even thingy + + mov r3,#0 // remove bits not existing in single + bic r2,r6,#0xff // remove bits not existing in single + stmia r0,{r1-r4} + b fastfpe_next + +round_single_ne_denormalized: + add r7,r4,#150 + mov r6,#0xffffffff + mov r6,r6,lsr r7 + + teq r5,#0 + teqeq r3,#0 + tsteq r2,r6 + ldrne r8,[r10,#128] + orrne r8,r8,#16+8 // set inexact, underflow flag + strne r8,[r10,#128] + + mov r8,#0x80000000 + mov r8,r8,lsr r7 + adds r2,r2,r8 + bcs round_single_ne_denormalized_ov + + teq r5,#0 + teqeq r3,#0 + tsteq r2,r6 + biceq r2,r2,r8,lsl #1 // the even thingy + + mov r3,#0 + bic r2,r2,r6 // removing bits not existing in single + stmia r0,{r1-r4} + b fastfpe_next + +round_single_ne_denormalized_ov: + cmp r4,#-150 + cmpeq r3,#0 + cmpeq r2,#0 + beq round_single_z_ne_underflow // 1.0*2^-150 to zero! 
+ add r4,r4,#1 + cmp r4,#-126 // left denormalized range ? + cmpge r2,#0x80 // yes -> overflow also without denormalisation ? + ldrge r5,[r10,#128] + bicge r5,r5,#8 // yes -> clear underflow flag + strge r5,[r10,#128] + mov r3,#0 + mov r2,#0x80000000 + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_single_p: + teq r1,#0 + beq round_single_nz + b round_single_z + +/*------------------------------------------------------------------------*/ + +round_single_m: + teq r1,#0 + beq round_single_z + b round_single_nz + +/*------------------------------------------------------------------------*/ + +round_single_z: + cmp r4,#127 + bgt round_single_z_overflow + cmp r4,#-126-23 + blt round_single_z_ne_underflow + cmp r4,#-126 + blt round_single_z_denormalized + + teq r5,#0 + teqeq r3,#0 + tsteq r2,#0xff // testing for inexact + ldrne r5,[r10,#128] + orrne r5,r5,#16 // set inexact flag + strne r5,[r10,#128] + + mov r3,#0 + bic r2,r2,#0xff // removing bits not existing in single + stmia r0,{r1-r4} + b fastfpe_next + +round_single_z_overflow: + cmp r4,#0x7fffffff + beq round_single_infnan + + ldrne r5,[r10,#128] + orrne r5,r5,#16+4 // set inexact,overflow flag + strne r5,[r10,#128] + mov r2,#0xffffff00 + mov r3,#0 + mov r4,#127 // biggest non-infinity single + stmia r0,{r1-r4} + b fastfpe_next + +round_single_infnan: + orrs r5,r3,r2,lsl#1 // is it Inf? ignore MSB + beq round_single_infnan_store + tst r2,#0x40000000 // is it a SNaN? + beq round_single_infnan_create_qnan + mov r3,#0 // these bits can not be stored + bic r2,r2,#0xff // in single precision +round_single_infnan_store: + stmia r0,{r1-r4} + b fastfpe_next + +round_single_infnan_create_qnan: + mov r1,#0x80000000 + mov r2,#0xffffff00 + bic r2,r2,#0x80000000 // r2 = 0x7fffff00 + mov r3,#0 + ldr r5,[r10,#128] + orr r5,r5,#1 // set invalid operation flag + str r5,[r10,#128] + stmia r0,{r1-r4} + b fastfpe_next + +round_single_z_ne_underflow: + cmp r4,#0x80000000 + beq round_single_z_zero + ldrne r5,[r10,#128] + orrne r5,r5,#16+8 // set inexact, underflow flag + strne r5,[r10,#128] + mov r2,#0 + mov r3,#0 + mov r4,#0x80000000 // was by ERROR -127 +round_single_z_zero: + stmia r0,{r1-r4} + b fastfpe_next + +round_single_z_denormalized: + mov r6,#0xffffffff + add r7,r4,#150 + + teq r5,#0 + teqeq r3,#0 + tsteq r2,r6,lsr r7 // testing for tinyness + ldrne r5,[r10,#128] + orrne r5,r5,#16+8 // set inexact, undeflow flag + strne r5,[r10,#128] + + mov r3,#0 + bic r2,r2,r6,lsr r7 // removing bits not existing in single + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_single_nz: + cmp r4,#127 + bgt round_single_nz_ne_overflow + cmp r4,#-126-23 + blt round_single_nz_underflow + cmp r4,#-126 + blt round_single_nz_denormalized + + adds r5,r5,#0xffffffff + adcs r3,r3,#0xffffffff // add 0xff.ffffffff.ffffffff to + adcs r2,r2,#0xff // mantissa and additional bits + bcs round_single_add_ov + + cmp r5,#0xffffffff + cmpeq r3,#0xffffffff + andeq r5,r2,#0xff + cmpeq r5,#0xff // test for inexact + + bic r2,r2,#0xff // remove bits not existing in single + +round_single_add_ov_back: + ldrne r5,[r10,#128] + orrne r5,r5,#16 // set inexact flag + strne r5,[r10,#128] + + mov r3,#0 // remove bits not existing in single + stmia r0,{r1-r4} + b fastfpe_next + +round_single_add_ov: + add r4,r4,#1 + cmp r4,#127 + bgt round_single_nz_ne_overflow + movs r2,#0x80000000 // so that inexact flag gets set !!! 
+ b round_single_add_ov_back + +round_single_nz_ne_overflow: + cmp r4,#0x7fffffff + beq round_single_infnan + + ldrne r5,[r10,#128] + orrne r5,r5,#16+4 // set inexact,overflow flag + strne r5,[r10,#128] + mov r2,#0x80000000 // set MSB + mov r3,#0 + mov r4,#0x7fffffff + stmia r0,{r1-r4} + b fastfpe_next + +round_single_nz_underflow: + cmp r4,#0x80000000 + beq round_single_nz_zero + + ldrne r5,[r10,#128] + orrne r5,r5,#16+8 // set inexact, underflow flag + strne r5,[r10,#128] + mov r2,#0x80000000 + mov r3,#0 + mov r4,#-149 // smallest non-zero single +round_single_nz_zero: + stmia r0,{r1-r4} + b fastfpe_next + +round_single_nz_denormalized: + mov r6,#0xffffffff + add r7,r4,#150 + mov r6,r6,lsr r7 + + teq r5,#0 + teqeq r3,#0 + tsteq r2,r6 + ldrne r8,[r10,#128] + orrne r8,r8,#16+8 // set inexact, underflow flag + strne r8,[r10,#128] + + adds r5,r5,#0xffffffff + adcs r3,r3,#0xffffffff + adcs r2,r2,r6 + bcs round_single_nz_denormalized_ov + + mov r3,#0 + bic r2,r2,r6 // removing bits not existing in single + stmia r0,{r1-r4} + b fastfpe_next + +round_single_nz_denormalized_ov: + add r4,r4,#1 + cmp r4,#-126 // left denormalized range ? + cmpge r2,#0x100 // yes -> overflow also without denormalisation ? + ldrge r5,[r10,#128] + bicge r5,r5,#8 // yes -> clear underflow flag + strge r5,[r10,#128] + mov r3,#0 + mov r2,#0x80000000 + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_double_ne: + mov r7,#0xffffffff // to generate e.g. 0x7ff + + cmp r4,#1024 + bge round_double_nz_ne_overflow + add r6,r4,#1024 + cmp r6,#-1022+1024 + blt round_double_ne_denormalized + + teq r5,#0 + tsteq r3,r7,lsr#32-11 // testing for inexact + ldrne r6,[r10,#128] + orrne r6,r6,#16 // set inexact flag + strne r6,[r10,#128] + + adds r3,r3,#0x400 // add 0x0.00000400.00000000 to + adcs r2,r2,#0 // mantissa and additional bits + bcs round_double_add_ov + + teq r5,#0 + tsteq r3,r7,lsr#32-11 + biceq r3,r3,#0x800 // the even thingy + + bic r3,r3,r7,lsr#32-11 // remove bits not existing in double + + stmia r0,{r1-r4} + b fastfpe_next + +round_double_ne_denormalized: + cmp r6,#-1022-52-1+1024 + blt round_double_z_ne_underflow + + adds r6,r6,#1022+53-32-1024 + + addmi r6,r6,#32 + movmi r6,r7,lsr r6 + + movpl r7,r7,lsr r6 + movpl r6,#0 + + teq r5,#0 + tsteq r3,r7 + tsteq r2,r6 // testing for tinyness + ldrne r8,[r10,#128] + orrne r8,r8,#16+8 // set inexact, undeflow flag + strne r8,[r10,#128] + + bics r8,r6,r6,lsr#1 // generate ...0001000... + movne r11,#0 // from ...0001111... + biceq r11,r7,r7,lsr#1 // 64bit + + adds r3,r3,r11 + adcs r2,r2,r8 + bcs round_double_ne_denormalized_ov + + teq r5,#0 + tsteq r3,r7 + tsteq r2,r6 + bne round_double_ne_denormalized_noeventhingy + adds r11,r11,r11 + adc r8,r8,r8 + bic r3,r3,r11 + bic r2,r2,r8 // the even thingy + +round_double_ne_denormalized_noeventhingy: + bic r3,r3,r7 // removing bits not existing in + bic r2,r2,r6 // denormalized double + stmia r0,{r1-r4} + b fastfpe_next + +round_double_ne_denormalized_ov: + add r6,r4,#1024 + cmp r6,#-1023-52+1024 + cmpeq r3,#0 + cmpeq r2,#0 + beq round_single_z_ne_underflow // 1.0*2^(-1023-52) to zero! + add r4,r4,#1 + cmp r6,#-1022-1+1024 // left denormalized range ? + cmpge r3,#0x400 // yes -> overflow also without denormalisation ? 
+ ldrge r5,[r10,#128] + bicge r5,r5,#8 // yes -> clear underflow flag + strge r5,[r10,#128] + mov r3,#0 + mov r2,#0x80000000 + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_double_p: + teq r1,#0 + beq round_double_nz + b round_double_z + +/*------------------------------------------------------------------------*/ + +round_double_m: + teq r1,#0 + beq round_double_z + b round_double_nz + +/*------------------------------------------------------------------------*/ + +round_double_z: + mov r7,#0xffffffff + + cmp r4,#1024 + bge round_double_z_overflow + add r6,r4,#1024 + cmp r6,#-1022+1024 + blt round_double_z_denormalized + + teq r5,#0 + tsteq r3,r7,lsr#32-11 // testing for inexact + ldrne r5,[r10,#128] + orrne r5,r5,#16 // set inexact flag + strne r5,[r10,#128] + + bic r3,r3,r7,lsr#32-11 // removing bits not existing in double + stmia r0,{r1-r4} + b fastfpe_next + +round_double_z_overflow: + cmp r4,#0x7fffffff + beq round_double_infnan + + ldrne r5,[r10,#128] + orrne r5,r5,#16+4 // set inexact,overflow flag + strne r5,[r10,#128] + mov r2,#0xffffffff + mov r3,r2,lsl#11 // 0xfffff800 + mov r4,#1024 + sub r4,r4,#1 // 1023; biggest non-infinity double + stmia r0,{r1-r4} + b fastfpe_next + +round_double_infnan: + orrs r5,r3,r2,lsl#1 // is it Inf? ignore MSB + beq round_double_infnan_store + tst r2,#0x40000000 // is it a SNaN? + beq round_double_infnan_create_qnan + bic r3,r3,r7,lsr#32-11 // clear bits not in double +round_double_infnan_store: + stmia r0,{r1-r4} + b fastfpe_next + +round_double_infnan_create_qnan: + mov r1,#0x80000000 + mov r2,#0x7fffffff + mov r3,r2,lsl#11 // 0xfffff800 + ldr r5,[r10,#128] + orr r5,r5,#1 // set invalid operation flag + str r5,[r10,#128] + b round_double_infnan_store + +round_double_z_ne_underflow: + cmp r4,#0x80000000 + beq round_double_z_zero + ldr r5,[r10,#128] + orr r5,r5,#16+8 // set inexact, underflow flag + str r5,[r10,#128] + mov r2,#0 + mov r3,#0 + mov r4,#0x80000000 +round_double_z_zero: + stmia r0,{r1-r4} + b fastfpe_next + +round_double_z_denormalized: + cmp r6,#-1022-52+1024 + blt round_double_z_ne_underflow + + adds r6,r6,#1022+53-32-1024 + + addmi r6,r6,#32 + movmi r6,r7,lsr r6 + + movpl r7,r7,lsr r6 + movpl r6,#0 + + teq r5,#0 + tsteq r3,r7 + tsteq r2,r6 // testing for tinyness + ldrne r5,[r10,#128] + orrne r5,r5,#16+8 // set inexact, undeflow flag + strne r5,[r10,#128] + + bic r3,r3,r7 // rmoving bits not existing in + bic r2,r2,r6 // denormalized double + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_double_nz: + mov r7,#0xffffffff // to generate e.g. 
0x7ff + + cmp r4,#1024 + bge round_double_nz_ne_overflow + add r6,r4,#1024 + cmp r6,#-1022+1024 + blt round_double_nz_denormalized + + teq r5,#0 + tsteq r3,r7,lsr#32-11 // testing for inexact + ldrne r6,[r10,#128] + orrne r6,r6,#16 // set inexact flag + strne r6,[r10,#128] + + adds r5,r5,#0xffffffff + adcs r3,r3,r7,lsr#32-11 // add 0x0.000007ff.ffffffff to + adcs r2,r2,#0 // mantissa and additional bits + bcs round_double_add_ov + + bic r3,r3,r7,lsr#32-11 // remove bits not existing in double + + stmia r0,{r1-r4} + b fastfpe_next + +round_double_add_ov: + add r4,r4,#1 + cmp r4,#1024 + bge round_double_nz_ne_overflow + +// ldrne r6,[r10,#128] +// orrne r6,r6,#16 // set inexact flag +// strne r6,[r10,#128] + mov r2,#0x80000000 + mov r3,#0 + stmia r0,{r1-r4} + b fastfpe_next + +round_double_nz_ne_overflow: + cmp r4,#0x7fffffff + beq round_double_infnan + + ldrne r5,[r10,#128] + orrne r5,r5,#16+4 // set inexact,overflow flag + strne r5,[r10,#128] + mov r2,#0x80000000 // set MSB + mov r3,#0 + mov r4,#0x7fffffff + stmia r0,{r1-r4} + b fastfpe_next + +round_double_nz_underflow: + cmp r4,#0x80000000 + beq round_double_nz_zero + + ldrne r5,[r10,#128] + orrne r5,r5,#16+8 // set inexact, underflow flag + strne r5,[r10,#128] + mov r2,#0x80000000 + mov r3,#0 + mov r4,#-1074+1024 + sub r4,r4,#1024 // smallest non-zero double +round_double_nz_zero: + stmia r0,{r1-r4} + b fastfpe_next + +round_double_nz_denormalized: + cmp r6,#-1022-52+1024 + blt round_double_nz_underflow + + adds r6,r6,#1022+53-32-1024 + + addmi r6,r6,#32 + movmi r6,r7,lsr r6 + + movpl r7,r7,lsr r6 + movpl r6,#0 + + teq r5,#0 + tsteq r3,r7 + tsteq r2,r6 // testing for tinyness + ldrne r8,[r10,#128] + orrne r8,r8,#16+8 // set inexact, undeflow flag + strne r8,[r10,#128] + + adds r5,r5,#0xffffffff + adcs r3,r3,r7 + adcs r2,r2,r6 + bcs round_double_nz_denormalized_ov + + bic r3,r3,r7 // rmoving bits not existing in + bic r2,r2,r6 // denormalized double + stmia r0,{r1-r4} + b fastfpe_next + +round_double_nz_denormalized_ov: + add r4,r4,#1 + add r6,r4,#1024 + cmp r6,#-1022+1024 // left denormalized range ? + cmpge r3,#0x800 // yes -> overflow also without denormalisation ? + ldrge r5,[r10,#128] + bicge r5,r5,#8 // yes -> clear underflow flag + strge r5,[r10,#128] + mov r3,#0 + mov r2,#0x80000000 + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_extended_ne: + mov r7,#0xffffffff // to generate e.g. 0x7ff + + cmp r4,#16384 + bge round_extended_nz_ne_overflow + add r6,r4,#16384 + cmp r6,#-16382+16384 + blt round_extended_ne_denormalized + + teq r5,#0 // testing for inexact + ldrne r6,[r10,#128] + orrne r6,r6,#16 // set inexact flag + strne r6,[r10,#128] + + adds r5,r5,#0x80000000 // add 0x0.00000400.00000000 to + adcs r3,r3,#0 // mantissa and additional bits + adcs r2,r2,#0 + bcs round_extended_add_ov + + teq r5,#0 + biceq r3,r3,#1 // the even thingy + + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_ne_denormalized: + cmp r6,#-16382-63-1+16384 + blt round_extended_z_ne_underflow + + adds r6,r6,#16382+64-32-16384 + + addmi r6,r6,#32 + movmi r6,r7,lsr r6 + + movpl r7,r7,lsr r6 + movpl r6,#0 + + teq r5,#0 + tsteq r3,r7 + tsteq r2,r6 // testing for tinyness + ldrne r8,[r10,#128] + orrne r8,r8,#16+8 // set inexact, undeflow flag + strne r8,[r10,#128] + + bics r8,r6,r6,lsr#1 // generate ...0001000... + movne r11,#0 // from ...0001111... 
+ biceq r11,r7,r7,lsr#1 // 64bit + + adds r3,r3,r11 + adcs r2,r2,r8 + bcs round_extended_ne_denormalized_ov + + teq r5,#0 + tsteq r3,r7 + tsteq r2,r6 + bne round_extended_ne_denormalized_noeventhingy + adds r11,r11,r11 + adc r8,r8,r8 + bic r3,r3,r11 + bic r2,r2,r8 // the even thingy + +round_extended_ne_denormalized_noeventhingy: + bic r3,r3,r7 // removing bits not existing in + bic r2,r2,r6 // denormalized extended + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_ne_denormalized_ov: + add r6,r4,#16384 + cmp r6,#-16383-63+16384 + cmpeq r5,#0 + cmpeq r3,#0 + cmpeq r2,#0 + beq round_single_z_ne_underflow // 1.0*2^(-16383-63) to zero! + add r4,r4,#1 + cmp r6,#-16382-1+16384 // left denormalized range ? + blt round_extended_ne_still_denormalized + cmp r5,#0x80000000 // FIXME yes -> overflow also without denormalisation ? + ldrcs r5,[r10,#128] + biccs r5,r5,#8 // yes -> clear underflow flag + strcs r5,[r10,#128] +round_extended_ne_still_denormalized: + mov r3,#0 + mov r2,#0x80000000 + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_extended_p: + teq r1,#0 + beq round_extended_nz + b round_extended_z + +/*------------------------------------------------------------------------*/ + +round_extended_m: + teq r1,#0 + beq round_extended_z + b round_extended_nz + +/*------------------------------------------------------------------------*/ + +round_extended_z: + mov r7,#0xffffffff + + cmp r4,#16384 + bge round_extended_z_overflow + add r6,r4,#16384 + cmp r6,#-16382+16384 + blt round_extended_z_denormalized + + teq r5,#0 // testing for inexact + ldrne r5,[r10,#128] + orrne r5,r5,#16 // set inexact flag + strne r5,[r10,#128] + + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_z_overflow: + cmp r4,#0x7fffffff + beq round_extended_infnan + + ldrne r5,[r10,#128] + orrne r5,r5,#16+4 // set inexact,overflow flag + strne r5,[r10,#128] + mov r2,#0xffffffff + mov r3,#0xffffffff + mov r4,#16384 + sub r4,r4,#1 // 16383; biggest non-infinity extended + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_infnan: + orrs r5,r3,r2,lsl#1 // is it Inf? ignore MSB + beq round_extended_infnan_store + tst r2,#0x40000000 // is it a SNaN? + beq round_extended_infnan_create_qnan + bic r3,r3,r7,lsr#32-11 // clear bits not in extended +round_extended_infnan_store: + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_infnan_create_qnan: + mov r1,#0x80000000 + mov r2,#0x7fffffff + mov r3,#0xffffffff + ldr r5,[r10,#128] + orr r5,r5,#1 // set invalid operation flag + str r5,[r10,#128] + b round_extended_infnan_store + +round_extended_z_ne_underflow: + cmp r4,#0x80000000 + beq round_extended_z_zero + ldr r5,[r10,#128] + orr r5,r5,#16+8 // set inexact, underflow flag + str r5,[r10,#128] + mov r2,#0 + mov r3,#0 + mov r4,#0x80000000 +round_extended_z_zero: + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_z_denormalized: + cmp r6,#-16382-63+16384 + blt round_extended_z_ne_underflow + + adds r6,r6,#16382+64-32-16384 + + addmi r6,r6,#32 + movmi r6,r7,lsr r6 + + movpl r7,r7,lsr r6 + movpl r6,#0 + + teq r5,#0 + tsteq r3,r7 + tsteq r2,r6 // testing for tinyness + ldrne r5,[r10,#128] + orrne r5,r5,#16+8 // set inexact, undeflow flag + strne r5,[r10,#128] + + bic r3,r3,r7 // removing bits not existing in + bic r2,r2,r6 // denormalized extended + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_extended_nz: + mov r7,#0xffffffff // to generate e.g. 
0x7ff + + cmp r4,#16384 + bge round_extended_nz_ne_overflow + add r6,r4,#16384 + cmp r6,#-16382+16384 + blt round_extended_nz_denormalized + + teq r5,#0 // testing for inexact + ldrne r6,[r10,#128] + orrne r6,r6,#16 // set inexact flag + strne r6,[r10,#128] + + adds r5,r5,#0xffffffff + adcs r3,r3,#0 // add 0x0.0.ffffffff to + adcs r2,r2,#0 // mantissa and additional bits + bcs round_extended_add_ov + + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_add_ov: + add r4,r4,#1 + cmp r4,#16384 + bge round_extended_nz_ne_overflow + +// ldrne r6,[r10,#128] +// orrne r6,r6,#16 // set inexact flag +// strne r6,[r10,#128] + mov r2,#0x80000000 + mov r3,#0 + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_nz_ne_overflow: + cmp r4,#0x7fffffff + beq round_extended_infnan + + ldrne r5,[r10,#128] + orrne r5,r5,#16+4 // set inexact,overflow flag + strne r5,[r10,#128] + mov r2,#0x80000000 // set MSB + mov r3,#0 + mov r4,#0x7fffffff + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_nz_underflow: + cmp r4,#0x80000000 + beq round_extended_nz_zero + + ldrne r5,[r10,#128] + orrne r5,r5,#16+8 // set inexact, underflow flag + strne r5,[r10,#128] + mov r2,#0x80000000 + mov r3,#0 + mov r4,#-16445+16384 + sub r4,r4,#16384 // smallest non-zero extended +round_extended_nz_zero: + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_nz_denormalized: + cmp r6,#-16382-63+16384 + blt round_extended_nz_underflow + + adds r6,r6,#16382+64-32-16384 + + addmi r6,r6,#32 + movmi r6,r7,lsr r6 + + movpl r7,r7,lsr r6 + movpl r6,#0 + + teq r5,#0 + tsteq r3,r7 + tsteq r2,r6 // testing for tinyness + ldrne r8,[r10,#128] + orrne r8,r8,#16+8 // set inexact, undeflow flag + strne r8,[r10,#128] + + adds r5,r5,#0xffffffff + adcs r3,r3,r7 + adcs r2,r2,r6 + bcs round_extended_nz_denormalized_ov + + bic r3,r3,r7 // removing bits not existing in + bic r2,r2,r6 // denormalized extended + stmia r0,{r1-r4} + b fastfpe_next + +round_extended_nz_denormalized_ov: + add r4,r4,#1 + add r6,r4,#16384 + cmp r6,#-16382+16384 // left denormalized range ? + cmpge r3,#1 // yes -> overflow also without denormalisation ? + ldrge r5,[r10,#128] + bicge r5,r5,#8 // yes -> clear underflow flag + strge r5,[r10,#128] + mov r3,#0 + mov r2,#0x80000000 + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/ + +round_undef: + stmia r0,{r1-r4} + b fastfpe_next + +/*------------------------------------------------------------------------*/
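
For reference, the following stand-alone C sketch (not part of the patch) illustrates the round-to-nearest-even step that round_single_ne above performs on the emulator's internal register image: keep the top 24 mantissa bits, set the inexact flag if anything was discarded, round up on "more than half", and break ties toward an even last bit. The struct and function names are invented for the example; only the 4-word sign / mantissa-high / mantissa-low / exponent layout and the FPSR bit assignment (bit 4 = inexact) are taken from the code above.

#include <stdint.h>

#define FPSR_INEXACT (1u << 4)          /* same bit as "orr ...,#16" in round.S */

struct fpe_reg {                        /* hypothetical mirror of the 4-word register image */
	uint32_t sign;                  /* bit 31 only */
	uint32_t mant_hi, mant_lo;      /* normalized 64-bit mantissa, high word first */
	int32_t  exponent;              /* signed, unbiased */
};

/* Nearest-even rounding to the 24 mantissa bits of an IEEE single, for the
 * plain case (no overflow, no denormal). 'extra' corresponds to the
 * additional low mantissa bits passed to the rounding functions in r5. */
static void round_single_ne_sketch(struct fpe_reg *r, uint32_t extra, uint32_t *fpsr)
{
	uint64_t m    = ((uint64_t)r->mant_hi << 32) | r->mant_lo;
	uint64_t kept = m >> 40;                     /* top 24 bits survive */
	uint64_t rest = m & ((1ull << 40) - 1);      /* discarded low 40 bits */
	uint64_t half = 1ull << 39;

	if (rest || extra)
		*fpsr |= FPSR_INEXACT;               /* something was thrown away */

	if (rest > half || (rest == half && (extra || (kept & 1))))
		kept++;                              /* round up; ties go to even */

	if (kept >> 24) {                            /* carried out of 24 bits */
		kept >>= 1;
		r->exponent++;
	}
	r->mant_hi = (uint32_t)kept << 8;            /* back into the high word */
	r->mant_lo = 0;
}

The assembly reaches the same result without widening to 64 bits: it adds 0x80 to the high mantissa word, folds any carry into the exponent, clears the low byte, and handles the tie case with the "even thingy" comparisons.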