diff options
Diffstat (limited to 'sysdeps/ia64/fpu/libm_sincos.S')
-rw-r--r-- | sysdeps/ia64/fpu/libm_sincos.S | 164 |
1 files changed, 82 insertions, 82 deletions
diff --git a/sysdeps/ia64/fpu/libm_sincos.S b/sysdeps/ia64/fpu/libm_sincos.S index a3f4c72743..3475b6281b 100644 --- a/sysdeps/ia64/fpu/libm_sincos.S +++ b/sysdeps/ia64/fpu/libm_sincos.S @@ -46,12 +46,13 @@ // 03/19/02 Added stack unwind around call to __libm_cis_large // 09/05/02 Work range is widened by reduction strengthen (3 parts of Pi/16) // 02/10/03 Reordered header: .section, .global, .proc, .align -// +// 08/08/03 Improved performance +// 02/11/04 cis is moved to the separate file. +// // API //============================================================== -// 1) double _Complex cis(double) -// 2) void sincos(double, double*s, double*c) -// 3) __libm_sincos - internal LIBM function, that accepts +// 1) void sincos(double, double*s, double*c) +// 2) __libm_sincos - internal LIBM function, that accepts // argument in f8 and returns cosine through f8, sine through f9 // // Overview of operation @@ -65,12 +66,12 @@ // nfloat = Round result to integer (round-to-nearest) // // r = x - nfloat * pi/2^k -// Do this as ((((x - nfloat * HIGH(pi/2^k))) - -// nfloat * LOW(pi/2^k)) - +// Do this as ((((x - nfloat * HIGH(pi/2^k))) - +// nfloat * LOW(pi/2^k)) - // nfloat * LOWEST(pi/2^k) for increased accuracy. // pi/2^k is stored as two numbers that when added make pi/2^k. // pi/2^k = HIGH(pi/2^k) + LOW(pi/2^k) -// HIGH and LOW parts are rounded to zero values, +// HIGH and LOW parts are rounded to zero values, // and LOWEST is rounded to nearest one. // // x = (nfloat * pi/2^k) + r @@ -166,15 +167,14 @@ // Registers used //============================================================== // general input registers: -// r14 -> r19 -// r32 -> r49 +// r14 -> r39 // predicate registers used: // p6 -> p14 - +// // floating-point registers used // f9 -> f15 -// f32 -> f100 +// f32 -> f67 // Assembly macros //============================================================== @@ -246,38 +246,32 @@ cis_Q = f67 cis_pResSin = r33 cis_pResCos = r34 -cis_exp_limit = r35 -cis_r_signexp = r36 -cis_AD_beta_table = r37 -cis_r_sincos = r38 - -cis_r_exp = r39 -cis_r_17_ones = r40 - cis_GR_sig_inv_pi_by_16 = r14 cis_GR_rshf_2to61 = r15 cis_GR_rshf = r16 cis_GR_exp_2tom61 = r17 cis_GR_n = r18 - cis_GR_n_sin = r19 -cis_GR_m_sin = r41 -cis_GR_32m_sin = r41 - -cis_GR_n_cos = r42 -cis_GR_m_cos = r43 -cis_GR_32m_cos = r43 - -cis_AD_2_sin = r44 -cis_AD_2_cos = r45 - -cis_gr_tmp = r46 -GR_SAVE_B0 = r47 -GR_SAVE_GP = r48 -rB0_SAVED = r49 -GR_SAVE_PFS = r50 -GR_SAVE_PR = r51 -cis_AD_1 = r52 +cis_exp_limit = r20 +cis_r_signexp = r21 +cis_AD_1 = r22 +cis_r_sincos = r23 +cis_r_exp = r24 +cis_r_17_ones = r25 +cis_GR_m_sin = r26 +cis_GR_32m_sin = r26 +cis_GR_n_cos = r27 +cis_GR_m_cos = r28 +cis_GR_32m_cos = r28 +cis_AD_2_sin = r29 +cis_AD_2_cos = r30 +cis_gr_tmp = r31 + +GR_SAVE_B0 = r35 +GR_SAVE_GP = r36 +rB0_SAVED = r37 +GR_SAVE_PFS = r38 +GR_SAVE_PR = r39 RODATA @@ -408,14 +402,14 @@ LOCAL_OBJECT_END(double_sin_cos_beta_k4) GLOBAL_IEEE754_ENTRY(sincos) // cis_GR_sig_inv_pi_by_16 = significand of 16/pi { .mlx - alloc GR_SAVE_PFS = ar.pfs, 0, 21, 0, 0 + getf.exp cis_r_signexp = cis_Arg movl cis_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A - + } // cis_GR_rshf_2to61 = 1.1000 2^(63+63-2) { .mlx addl cis_AD_1 = @ltoff(double_cis_pi), gp - movl cis_GR_rshf_2to61 = 0x47b8000000000000 + movl cis_GR_rshf_2to61 = 0x47b8000000000000 };; { .mfi @@ -430,12 +424,11 @@ GLOBAL_IEEE754_ENTRY(sincos) br.cond.sptk _CIS_COMMON };; GLOBAL_IEEE754_END(sincos) -LOCAL_LIBM_ENTRY(cis) -LOCAL_LIBM_END(cis) + GLOBAL_LIBM_ENTRY(__libm_sincos) // cis_GR_sig_inv_pi_by_16 = significand of 16/pi { .mlx - alloc GR_SAVE_PFS = ar.pfs,0,21,0,0 + getf.exp cis_r_signexp = cis_Arg movl cis_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A } // cis_GR_rshf_2to61 = 1.1000 2^(63+63-2) @@ -443,11 +436,12 @@ GLOBAL_LIBM_ENTRY(__libm_sincos) addl cis_AD_1 = @ltoff(double_cis_pi), gp movl cis_GR_rshf_2to61 = 0x47b8000000000000 };; + // p14 set for __libm_sincos and cis { .mfi ld8 cis_AD_1 = [cis_AD_1] fnorm.s1 cis_NORM_f8 = cis_Arg - cmp.eq p14, p13 = r0, r0 + cmp.eq p14, p13 = r0, r0 } // cis_GR_exp_2tom61 = exponent of scaling factor 2^-61 { .mib @@ -476,10 +470,15 @@ _CIS_COMMON: // 2^-61 for scaling Nfloat // 0x1001a is register_bias + 27. // So if f8 >= 2^27, go to large arguments routine -{ .mmi - getf.exp cis_r_signexp = cis_Arg - setf.exp cis_2TOM61 = cis_GR_exp_2tom61 +{ .mfi + alloc GR_SAVE_PFS = ar.pfs, 3, 5, 0, 0 + fclass.m p11,p0 = cis_Arg, 0x0b // Test for x=unorm mov cis_exp_limit = 0x1001a +} +{ .mib + setf.exp cis_2TOM61 = cis_GR_exp_2tom61 + nop.i 0 +(p6) br.cond.spnt _CIS_SPECIAL_ARGS };; // Load the two pieces of pi/16 @@ -488,9 +487,11 @@ _CIS_COMMON: { .mmb ldfe cis_Pi_by_16_hi = [cis_AD_1],16 setf.d cis_RSHF = cis_GR_rshf -(p6) br.cond.spnt _CIS_SPECIAL_ARGS +(p11) br.cond.spnt _CIS_UNORM // Branch if x=unorm };; +_CIS_COMMON2: +// Return here if x=unorm // Create constant inexact set { .mmi ldfe cis_Pi_by_16_lo = [cis_AD_1],16 @@ -498,23 +499,18 @@ _CIS_COMMON: nop.i 0 };; +// Select exponent (17 lsb) { .mfi ldfe cis_Pi_by_16_lowest = [cis_AD_1],16 nop.f 0 - nop.i 0 -};; - -// Start loading P, Q coefficients -{ .mib - ldfpd cis_P4,cis_Q4 = [cis_AD_1],16 dep.z cis_r_exp = cis_r_signexp, 0, 17 - nop.b 0 };; +// Start loading P, Q coefficients // p10 is true if we must call routines to handle larger arguments // p10 is true if f8 exp is > 0x1001a { .mmb - ldfpd cis_P3,cis_Q3 = [cis_AD_1],16 + ldfpd cis_P4,cis_Q4 = [cis_AD_1],16 cmp.ge p10, p0 = cis_r_exp, cis_exp_limit (p10) br.cond.spnt _CIS_LARGE_ARGS // go to |x| >= 2^27 path };; @@ -523,39 +519,33 @@ _CIS_COMMON: // Multiply x by scaled 16/pi and add large const to shift integer part of W to // rightmost bits of significand { .mfi - ldfpd cis_P2,cis_Q2 = [cis_AD_1],16 + ldfpd cis_P3,cis_Q3 = [cis_AD_1],16 fma.s1 cis_W_2TO61_RSH = cis_NORM_f8,cis_SIG_INV_PI_BY_16_2TO61,cis_RSHF_2TO61 nop.i 0 };; +// get N = (int)cis_int_Nfloat // cis_NFLOAT = Round_Int_Nearest(cis_W) -{ .mfi - ldfpd cis_P1,cis_Q1 = [cis_AD_1], 16 +{ .mmf + getf.sig cis_GR_n = cis_W_2TO61_RSH + ldfpd cis_P2,cis_Q2 = [cis_AD_1],16 fms.s1 cis_NFLOAT = cis_W_2TO61_RSH,cis_2TOM61,cis_RSHF - nop.i 0 -};; - -// get N = (int)cis_int_Nfloat -{ .mfi - getf.sig cis_GR_n = cis_W_2TO61_RSH - nop.f 0 - nop.i 0 };; -// Add 2^(k-1) (which is in cis_r_sincos) to N // cis_r = -cis_Nfloat * cis_Pi_by_16_hi + x -// cis_r = cis_r -cis_Nfloat * cis_Pi_by_16_lo { .mfi - add cis_GR_n_cos = 0x8, cis_GR_n + ldfpd cis_P1,cis_Q1 = [cis_AD_1], 16 fnma.s1 cis_r = cis_NFLOAT,cis_Pi_by_16_hi,cis_NORM_f8 nop.i 0 };; -//Get M (least k+1 bits of N) +// Add 2^(k-1) (which is in cis_r_sincos) to N { .mmi + add cis_GR_n_cos = 0x8, cis_GR_n +;; +//Get M (least k+1 bits of N) and cis_GR_m_sin = 0x1f,cis_GR_n and cis_GR_m_cos = 0x1f,cis_GR_n_cos - nop.i 0 };; { .mmi @@ -565,9 +555,10 @@ _CIS_COMMON: };; // Add 32*M to address of sin_cos_beta table -{ .mmi +// cis_r = cis_r -cis_Nfloat * cis_Pi_by_16_lo +{ .mfi add cis_AD_2_sin = cis_GR_32m_sin, cis_AD_1 - nop.m 0 + fnma.s1 cis_r = cis_NFLOAT, cis_Pi_by_16_lo, cis_r shl cis_GR_32m_cos = cis_GR_m_cos,5 };; @@ -580,7 +571,6 @@ _CIS_COMMON: { .mfi ldfe cis_Sm_cos = [cis_AD_2_cos], 16 - fnma.s1 cis_r = cis_NFLOAT, cis_Pi_by_16_lo, cis_r nop.i 0 };; @@ -604,7 +594,7 @@ _CIS_COMMON: { .mfi ldfe cis_Cm_cos = [cis_AD_2_cos] - fma.s1 cis_P_temp1 = cis_rsq, cis_P4, cis_P3 + fma.s1 cis_P_temp1 = cis_rsq, cis_P4, cis_P3 nop.i 0 } @@ -638,18 +628,18 @@ _CIS_COMMON: { .mfi nop.m 0 - fma.s1 cis_Q = cis_rsq, cis_Q_temp2, cis_Q1 + fmpy.s1 cis_rcub = cis_r_exact, cis_rsq // get r^3 nop.i 0 -} +};; + { .mfi nop.m 0 - fma.s1 cis_P = cis_rsq, cis_P_temp2, cis_P1 + fma.s1 cis_Q = cis_rsq, cis_Q_temp2, cis_Q1 nop.i 0 -};; - +} { .mfi nop.m 0 - fmpy.s1 cis_rcub = cis_r_exact, cis_rsq // get r^3 + fma.s1 cis_P = cis_rsq, cis_P_temp2, cis_P1 nop.i 0 };; @@ -717,7 +707,17 @@ _CIS_SPECIAL_ARGS: stfd [cis_pResCos] = cis_Cos_res br.ret.sptk b0 // common exit for sincos main path };; + +_CIS_UNORM: +// Here if x=unorm +{ .mfb + getf.exp cis_r_signexp = cis_NORM_f8 // Get signexp of x + fcmp.eq.s0 p11,p0 = cis_Arg, f0 // Dummy op to set denorm + br.cond.sptk _CIS_COMMON2 // Return to main path +};; + GLOBAL_LIBM_END(__libm_sincos) + //// |x| > 2^27 path /////// .proc _CIS_LARGE_ARGS _CIS_LARGE_ARGS: |