aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/ia64/fpu/e_acosl.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/ia64/fpu/e_acosl.S')
-rw-r--r--sysdeps/ia64/fpu/e_acosl.S1094
1 files changed, 1094 insertions, 0 deletions
diff --git a/sysdeps/ia64/fpu/e_acosl.S b/sysdeps/ia64/fpu/e_acosl.S
new file mode 100644
index 0000000000..81f56e41c8
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_acosl.S
@@ -0,0 +1,1094 @@
+.file "acosl.s"
+
+// Copyright (c) 2000, 2001, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
+// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://developer.intel.com/opensource.
+//
+// History
+//==============================================================
+// 2/02/00 Initial version
+// 2/07/00 Modified calculation of acos_corr to correct acosl
+// 4/04/00 Unwind support added
+// 8/15/00 Bundle added after call to __libm_error_support to properly
+// set [the previously overwritten] GR_Parameter_RESULT.
+// 12/20/00 Set denormal flag properly.
+//
+// API
+//==============================================================
+// double-extended = acosl (double-extended)
+// input floating point f8
+// output floating point f8
+//
+// Registers used
+//==============================================================
+//
+// predicate registers used:
+// p6 -> p12
+//
+// floating-point registers used:
+// f8 has input, then output
+// f8 -> f15, f32 ->f99
+//
+// general registers used:
+// r32 -> r47 (alloc: 1 input, 11 locals, 4 outputs)
+//
+// Overview of operation
+//==============================================================
+// There are three paths
+// 1. |x| < 2^-25 ACOS_TINY
+// 2. 2^-25 <= |x| < 1/4 ACOS_POLY
+// 3. 1/4 <= |x| < 1 ACOS_ATAN
+
+#include "libm_support.h"
+
+// Assembly macros
+//==============================================================
+
+// f8 is input, but acos_V must be put in f8
+// when __libm_atan2_reg is called, f8 must get V
+// f9 gets U when __libm_atan2_reg is called
+
+
+// __libm_atan2_reg returns
+// f8 = Z_hi
+// f10 = Z_lo
+// f11 = s_lo
+
+acos_Z_hi = f8 // high part of the atan2 result
+acos_Z_lo = f10 // low part of the atan2 result
+acos_S_lo = f11 // NOTE(review): the code uses the acos_s_lo spelling defined further down
+
+// The following values are needed again after the call to
+// __libm_atan2_reg, so they are kept in f12-f15:
+
+acos_corr = f12 // correction term U*v - V*u
+acos_X = f13 // copy of the input x
+acos_pi_hi = f14
+acos_pi_lo = f15
+
+// The rest of the assembly macros
+
+acos_P79 = f32
+acos_P59 = f33
+acos_P39 = f34
+acos_P19 = f35
+
+acos_P810 = f36
+acos_P610 = f37
+acos_P410 = f38
+acos_P210 = f39
+
+acos_A1 = f41
+acos_A2 = f42
+acos_A3 = f43
+acos_A4 = f44
+acos_A5 = f45
+acos_A6 = f46
+acos_A7 = f47
+acos_A8 = f48
+acos_A9 = f49
+acos_A10 = f50
+
+acos_X2 = f51
+acos_X4 = f52
+
+acos_B = f53
+acos_Bb = f54
+acos_A = f55
+acos_Aa = f56
+
+acos_1mA = f57
+
+acos_W = f58
+acos_Ww = f59
+
+acos_y0 = f60
+acos_y1 = f61
+acos_y2 = f62
+
+acos_H = f63
+acos_Hh = f64
+
+acos_t1 = f65
+acos_t2 = f66
+acos_t3 = f67
+acos_t4 = f68
+acos_t5 = f69
+
+acos_Pseries = f70
+acos_NORM_f8 = f71
+acos_ABS_NORM_f8 = f72
+
+acos_2 = f73
+acos_P1P2 = f74
+acos_HALF = f75
+acos_U = f76
+
+acos_1mB = f77
+acos_V = f78
+acos_S = f79
+
+acos_BmUU = f80
+acos_BmUUpb = f81
+acos_2U = f82
+acos_1d2U = f83
+
+acos_Dd = f84
+
+acos_pi_by_2_hi = f85
+acos_pi_by_2_lo = f86
+acos_xmpi_by_2_lo = f87
+acos_xPmw = f88
+
+acos_Uu = f89
+acos_AmVV = f90
+acos_AmVVpa = f91
+
+acos_2V = f92
+acos_1d2V = f93
+acos_Vv = f94
+
+acos_Vu = f95
+acos_Uv = f96
+
+acos_2_Z_hi = f97
+acos_s_lo_Z_lo = f98
+acos_result_lo = f99
+
+acos_Z_hi = f8 // repeats the definition at the top of this list
+acos_Z_lo = f10 // repeats the definition at the top of this list
+acos_s_lo = f11 // same register as acos_S_lo; this is the spelling the code uses
+
+acos_GR_17_ones = r33 // mask for the sign/exponent word
+acos_GR_16_ones = r34 // exponent bias 0xffff
+acos_GR_signexp_f8 = r35
+acos_GR_exp = r36 // biased exponent
+acos_GR_true_exp = r37 // unbiased exponent
+acos_GR_fffe = r38 // biased exponent of 2^-1
+
+GR_SAVE_PFS = r43
+GR_SAVE_B0 = r39
+GR_SAVE_GP = r41
+
+// r40 is address of table of coefficients
+// r42 (no alias is defined for it here)
+
+GR_Parameter_X = r44
+GR_Parameter_Y = r45
+GR_Parameter_RESULT = r46
+GR_Parameter_TAG = r47
+
+
+// 2^-40:
+// A true exponent of -40 (decimal) is
+// : -40 + register_bias
+// : -0x28 + 0xffff = 0xffd7
+
+// A true exponent of 1 is
+// : 1 + register_bias
+// : 1 + ffff = 10000
+
+// Data tables
+//==============================================================
+
+#ifdef _LIBC
+.rodata
+#else
+.data
+#endif
+
+.align 16 // each entry below is 16 bytes and is read with ldfe
+
+acos_coefficients: // entry order must match the ldfe load order in acosl
+ASM_TYPE_DIRECTIVE(acos_coefficients,@object)
+data8 0xc90fdaa22168c234, 0x00003FFF // pi_by_2_hi (64-bit significand, then sign/exponent word)
+data8 0xc4c6628b80dc1cd1, 0x00003FBF // pi_by_2_lo
+data8 0xc90fdaa22168c234, 0x00004000 // pi_hi
+data8 0xc4c6628b80dc1cd1, 0x00003FC0 // pi_lo
+
+data8 0xBB08911F2013961E, 0x00003FF8 // A10 (polynomial coefficients, highest order first)
+data8 0x981F1095A23A87D3, 0x00003FF8 // A9
+data8 0xBDF09C6C4177BCC6, 0x00003FF8 // A8
+data8 0xE4C3A60B049ACCEA, 0x00003FF8 // A7
+data8 0x8E2789F4E8A8F1AD, 0x00003FF9 // A6
+data8 0xB745D09B2B0E850B, 0x00003FF9 // A5
+data8 0xF8E38E3BC4C50920, 0x00003FF9 // A4
+data8 0xB6DB6DB6D89FCD81, 0x00003FFA // A3
+data8 0x99999999999AF376, 0x00003FFB // A2 (close to 3/40)
+data8 0xAAAAAAAAAAAAAA71, 0x00003FFC // A1 (close to 1/6)
+ASM_SIZE_DIRECTIVE(acos_coefficients)
+
+
+.align 32
+.global acosl#
+ASM_TYPE_DIRECTIVE(acosl#,@function)
+
+.section .text
+.proc acosl#
+.align 32
+
+
+acosl:
+
+// After normalizing f8, get its true exponent
+{ .mfi
+ alloc r32 = ar.pfs,1,11,4,0 // 1 input, 11 locals, 4 outputs
+(p0) fnorm.s1 acos_NORM_f8 = f8
+(p0) mov acos_GR_17_ones = 0x1ffff // mask applied to the sign/exponent word
+}
+
+{ .mmi
+(p0) mov acos_GR_16_ones = 0xffff // exponent bias
+(p0) addl r40 = @ltoff(acos_coefficients), gp // linkage-table slot for the table
+ nop.i 999
+}
+;;
+
+// Set denormal flag on denormal input with fcmp
+{ .mfi
+ ld8 r40 = [r40] // r40 = address of acos_coefficients
+ fcmp.eq p6,p0 = f8,f0 // p6 is overwritten by the range checks below
+ nop.i 999
+}
+;;
+
+
+// Load the constants pi_by_2 and pi.
+// Each is stored as hi and lo values
+// Also load the coefficients for ACOS_POLY
+// (r40 is post-incremented by 16 after every load)
+{ .mmi
+(p0) ldfe acos_pi_by_2_hi = [r40],16 ;;
+(p0) ldfe acos_pi_by_2_lo = [r40],16
+ nop.i 999 ;;
+}
+
+{ .mmi
+(p0) ldfe acos_pi_hi = [r40],16 ;;
+(p0) ldfe acos_pi_lo = [r40],16
+ nop.i 999 ;;
+}
+
+{ .mmi
+(p0) ldfe acos_A10 = [r40],16 ;;
+(p0) ldfe acos_A9 = [r40],16
+ nop.i 999 ;;
+}
+
+// Take the absolute value of f8
+{ .mmf
+ nop.m 999
+(p0) getf.exp acos_GR_signexp_f8 = acos_NORM_f8
+(p0) fmerge.s acos_ABS_NORM_f8 = f0, acos_NORM_f8 // sign of f0, magnitude of x
+}
+
+{ .mii
+(p0) ldfe acos_A8 = [r40],16
+ nop.i 999 ;;
+(p0) and acos_GR_exp = acos_GR_signexp_f8, acos_GR_17_ones ;;
+}
+
+// case 1: |x| < 2^-25 ==> p6 ACOS_TINY
+// case 2: 2^-25 <= |x| < 2^-2 ==> p8 ACOS_POLY
+// case 3: 2^-2 <= |x| < 1 ==> p10 ACOS_ATAN (p9 is set for cases 3 and 4)
+// case 4: 1 <= |x| ==> p11 ACOS_ERROR_RETURN
+// Admittedly |x| = 1 is not an error but this is where that case is
+// handled.
+
+{ .mii
+(p0) ldfe acos_A7 = [r40],16
+(p0) sub acos_GR_true_exp = acos_GR_exp, acos_GR_16_ones ;;
+(p0) cmp.ge.unc p6, p7 = -26, acos_GR_true_exp ;;
+}
+
+{ .mii
+(p0) ldfe acos_A6 = [r40],16
+(p7) cmp.ge.unc p8, p9 = -3, acos_GR_true_exp ;;
+(p9) cmp.ge.unc p10, p11 = -1, acos_GR_true_exp
+}
+
+{ .mmi
+(p0) ldfe acos_A5 = [r40],16 ;;
+(p0) ldfe acos_A4 = [r40],16
+ nop.i 999 ;;
+}
+
+{ .mmi
+(p0) ldfe acos_A3 = [r40],16 ;;
+(p0) ldfe acos_A2 = [r40],16
+ nop.i 999 ;;
+}
+
+// ACOS_ERROR_RETURN ==> p11 is true
+// case 4: |x| >= 1
+{ .mib
+(p0) ldfe acos_A1 = [r40],16
+ nop.i 999
+(p11) br.spnt L(ACOS_ERROR_RETURN) ;;
+}
+
+// ACOS_TINY ==> p6 is true
+// case 1: |x| < 2^-25, result = pi_by_2_hi - (x - pi_by_2_lo)
+{ .mfi
+ nop.m 999
+(p6) fms.s1 acos_xmpi_by_2_lo = acos_NORM_f8,f1, acos_pi_by_2_lo // x - pi_by_2_lo
+ nop.i 999 ;;
+}
+
+{ .mfb
+ nop.m 999
+(p6) fms.s0 f8 = acos_pi_by_2_hi,f1, acos_xmpi_by_2_lo // final rounding done in s0
+(p6) br.ret.spnt b0 ;;
+}
+
+
+
+// ACOS_POLY ==> p8 is true
+// case 2: 2^-25 <= |x| < 2^-2, result = (W + Ww) - x*Xsq*(Xsq*P2 + P1)
+{ .mfi
+ nop.m 999
+(p8) fms.s1 acos_W = acos_pi_by_2_hi, f1, acos_NORM_f8 // W = pi_by_2_hi - x
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p8) fma.s1 acos_X2 = f8,f8, f0 // Xsq = x*x
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p8) fms.s1 acos_Ww = acos_pi_by_2_hi, f1, acos_W // Ww = pi_by_2_hi - W
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p8) fma.s1 acos_X4 = acos_X2,acos_X2, f0 // X4 = Xsq*Xsq
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p8) fms.s1 acos_Ww = acos_Ww, f1, acos_NORM_f8 // Ww = (pi_by_2_hi - W) - x
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p8) fma.s1 acos_P810 = acos_X4, acos_A10, acos_A8
+ nop.i 999
+}
+
+// acos_P79 = X4*A9 + A7
+// acos_P810 = X4*A10 + A8
+{ .mfi
+ nop.m 999
+(p8) fma.s1 acos_P79 = acos_X4, acos_A9, acos_A7
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p8) fma.s1 acos_Ww = acos_Ww, f1, acos_pi_by_2_lo // Ww = Ww + pi_by_2_lo
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p8) fma.s1 acos_P610 = acos_X4, acos_P810, acos_A6
+ nop.i 999
+}
+
+
+// acos_P59 = X4*(X4*A9 + A7) + A5
+// acos_P610 = X4*(X4*A10 + A8) + A6
+{ .mfi
+ nop.m 999
+(p8) fma.s1 acos_P59 = acos_X4, acos_P79, acos_A5
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p8) fma.s1 acos_P410 = acos_X4, acos_P610, acos_A4
+ nop.i 999
+}
+
+// acos_P39 = X4*(X4*(X4*A9 + A7) + A5) + A3
+// acos_P410 = X4*(X4*(X4*A10 + A8) + A6) + A4
+{ .mfi
+ nop.m 999
+(p8) fma.s1 acos_P39 = acos_X4, acos_P59, acos_A3
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p8) fma.s1 acos_P210 = acos_X4, acos_P410, acos_A2
+ nop.i 999
+}
+
+// acos_P19 = X4*(X4*(X4*(X4*A9 + A7) + A5) + A3) + A1 = P1
+// acos_P210 = X4*(X4*(X4*(X4*A10 + A8) + A6) + A4) + A2 = P2
+{ .mfi
+ nop.m 999
+(p8) fma.s1 acos_P19 = acos_X4, acos_P39, acos_A1
+ nop.i 999 ;;
+}
+
+// First: acos_P1P2 = Xsq*P2 + P1
+// Then: acos_P1P2 = Xsq*(Xsq*P2 + P1)
+{ .mfi
+ nop.m 999
+(p8) fma.s1 acos_P1P2 = acos_X2, acos_P210, acos_P19
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p8) fma.s1 acos_P1P2 = acos_X2, acos_P1P2, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p8) fms.s1 acos_xPmw = acos_NORM_f8, acos_P1P2, acos_Ww // x*P1P2 - Ww
+ nop.i 999 ;;
+}
+
+{ .mfb
+ nop.m 999
+(p8) fms.s0 f8 = acos_W, f1, acos_xPmw // W - (x*P1P2 - Ww), final rounding in s0
+(p8) br.ret.spnt b0 ;;
+}
+
+
+// ACOS_ATAN
+// case 3: 2^-2 <= |x| < 1
+// Reached by fall-through; every instruction below is predicated on p0
+
+// Step 1.1: Get A,B and a,b
+// A + a = 1- |X|
+// B + b = 1+ |X|
+// Note also that we will use acos_corr (f12)
+// and acos_W
+
+// Step 2
+// Call __libm_atan2_reg (the call itself is issued in __libm_callout)
+
+
+{ .mfi
+(p0) mov acos_GR_fffe = 0xfffe
+(p0) fma.s1 acos_B = f1,f1, acos_ABS_NORM_f8 // B = 1 + |x|
+(p0) mov GR_SAVE_B0 = b0 ;;
+}
+
+{ .mmf
+(p0) mov GR_SAVE_GP = gp
+ nop.m 999
+(p0) fms.s1 acos_A = f1,f1, acos_ABS_NORM_f8 // A = 1 - |x|
+}
+
+{ .mfi
+(p0) setf.exp acos_HALF = acos_GR_fffe // HALF = 2^-1
+ nop.f 999
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fms.s1 acos_1mB = f1,f1, acos_B // 1 - B
+ nop.i 999 ;;
+}
+
+// We want atan2(V,U)
+// so put V in f8 and U in f9
+// but save X in acos_X
+
+{ .mfi
+ nop.m 999
+(p0) fmerge.se acos_X = f8, f8 // register copy of x
+ nop.i 999 ;;
+}
+
+// Step 1.2:
+/////////////////////////
+// Get U = sqrt(B)
+/////////////////////////
+
+{ .mfi
+ nop.m 999
+(p0) frsqrta.s1 acos_y0,p8 = acos_B // initial approximation of 1/sqrt(B)
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+(p0) fms.s1 acos_1mA = f1,f1, acos_A // 1 - A
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_Bb = acos_1mB,f1, acos_ABS_NORM_f8 // b = (1 - B) + |x|
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_Hh = acos_HALF, acos_B, f0 // Hh = B/2
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_t1 = acos_y0, acos_y0, f0
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+(p0) fms.s1 acos_Aa = acos_1mA,f1, acos_ABS_NORM_f8 // a = (1 - A) - |x|
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fnma.s1 acos_t2 = acos_t1, acos_Hh, acos_HALF // 1/2 - (B/2)*y0*y0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_y1 = acos_t2, acos_y0, acos_y0 // refined 1/sqrt(B)
+ nop.i 999
+}
+
+
+// Step 1.2:
+/////////////////////////
+// Get V = sqrt(A)
+/////////////////////////
+{ .mfi
+ nop.m 999
+(p0) frsqrta.s1 acos_y0,p8 = acos_A // y0 now approximates 1/sqrt(A)
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_t3 = acos_y1, acos_Hh, f0 // still the B iteration (Hh = B/2)
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_t1 = acos_y0, acos_y0, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fnma.s1 acos_t4 = acos_t3, acos_y1, acos_HALF
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_y2 = acos_t4, acos_y1, acos_y1
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_S = acos_B, acos_y2, f0 // S = B*y2, approximates sqrt(B)
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_H = acos_y2, acos_HALF, f0 // H = y2/2
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_t5 = acos_Hh, acos_y2, f0
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_Hh = acos_HALF, acos_A, f0 // Hh = A/2 from here on
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fnma.s1 acos_Dd = acos_S, acos_S, acos_B // Dd = B - S*S
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fnma.s1 acos_t2 = acos_t1, acos_Hh, acos_HALF
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_U = acos_Dd, acos_H, acos_S // U = S + Dd*H
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_y1 = acos_t2, acos_y0, acos_y0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_2U = acos_U, f1, acos_U // 2U
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_t3 = acos_y1, acos_Hh, f0
+ nop.i 999
+}
+
+
+// Step 1.3:
+// sqrt(A + a) = V + v
+// sqrt(B + b) = U + u
+
+/////////////////////////
+// Get u
+/////////////////////////
+
+// acos_BmUU = B - UU
+// acos_BmUUpb = (B - UU) + b
+
+{ .mfi
+ nop.m 999
+(p0) fnma.s1 acos_BmUU = acos_U, acos_U, acos_B
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fmerge.se f9 = acos_U, acos_U // f9 = U, second argument for __libm_atan2_reg
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fnma.s1 acos_t4 = acos_t3, acos_y1, acos_HALF
+ nop.i 999 ;;
+}
+
+// acos_1d2U = frcpa(2U)
+{ .mfi
+ nop.m 999
+(p0) frcpa.s1 acos_1d2U,p9 = f1, acos_2U
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_BmUUpb = acos_BmUU, f1, acos_Bb
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_y2 = acos_t4, acos_y1, acos_y1
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+// acos_Uu = ((B - UU) + b) * frcpa(2U)
+(p0) fma.s1 acos_Uu = acos_BmUUpb, acos_1d2U, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_S = acos_A, acos_y2, f0 // S = A*y2, approximates sqrt(A)
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_H = acos_y2, acos_HALF, f0 // H = y2/2
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_t5 = acos_Hh, acos_y2, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fnma.s1 acos_Dd = acos_S, acos_S, acos_A // Dd = A - S*S
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_V = acos_Dd, acos_H, acos_S // V = S + Dd*H
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_2V = acos_V, f1, acos_V // 2V
+ nop.i 999
+}
+
+// Step 3
+/////////////////////////
+// Calculate the correction, acos_corr
+/////////////////////////
+// acos_corr = U*v - (V*u)
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_Vu = acos_V,acos_Uu, f0 // V*u
+ nop.i 999 ;;
+}
+
+/////////////////////////
+// Get v
+/////////////////////////
+// acos_AmVV = A - VV
+// acos_AmVVpa = (A - VV) + a
+
+{ .mfi
+ nop.m 999
+(p0) fnma.s1 acos_AmVV = acos_V, acos_V, acos_A
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fmerge.se f8 = acos_V, acos_V // f8 = V, first argument for __libm_atan2_reg
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_AmVVpa = acos_AmVV, f1, acos_Aa
+ nop.i 999 ;;
+}
+
+// acos_1d2V = frcpa(2V)
+{ .mfi
+ nop.m 999
+(p0) frcpa.s1 acos_1d2V,p9 = f1, acos_2V
+ nop.i 999 ;;
+}
+
+// acos_Vv = ((A - VV) + a) * frcpa(2V)
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_Vv = acos_AmVVpa, acos_1d2V, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_Uv = acos_U,acos_Vv, f0 // U*v
+ nop.i 999 ;;
+}
+
+
+.endp acosl# // no return on this path: control falls through into __libm_callout
+ASM_SIZE_DIRECTIVE(acosl#)
+
+
+.proc __libm_callout
+__libm_callout:
+.prologue
+{ .mfi
+ nop.m 0
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs
+}
+;;
+
+{ .mfi
+ mov GR_SAVE_GP=gp
+ nop.f 0
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0
+}
+
+.body
+{ .mfb
+ nop.m 999
+(p0) fms.s1 acos_corr = acos_Uv,f1, acos_Vu // corr = U*v - V*u
+(p0) br.call.sptk.many b0=__libm_atan2_reg# ;;
+}
+
+
+// p6 ==> X is negative
+// p7 ==> x is positive
+// We know that |X| >= 1/4
+
+{ .mfi
+(p0) mov gp = GR_SAVE_GP
+(p0) fcmp.lt.unc p6,p7 = acos_X , f0
+(p0) mov b0 = GR_SAVE_B0 ;;
+}
+
+// acos_2_Z_hi = 2 * acos_Z_hi
+// acos_s_lo_Z_lo = s_lo * Z_lo
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_2_Z_hi = acos_Z_hi, f1, acos_Z_hi
+(p0) mov ar.pfs = GR_SAVE_PFS
+}
+
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_s_lo_Z_lo = acos_s_lo, acos_Z_lo, f0
+ nop.i 999 ;;
+}
+
+// 2 is a constant needed later
+{ .mfi
+ nop.m 999
+(p0) fma.s1 acos_2 = f1,f1,f1
+ nop.i 999 ;;
+}
+
+// X >= 1/4
+// acos_result_lo = 2(s_lo * Z_lo) + corr
+// f8 = (2*Z_hi) + (2(s_lo * Z_lo) + corr)
+
+{ .mfi
+ nop.m 999
+(p7) fma.s1 acos_result_lo = acos_s_lo_Z_lo, acos_2, acos_corr
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p7) fma.s0 f8 = acos_2_Z_hi, f1, acos_result_lo
+ nop.i 999
+}
+
+// acos_result_lo = (pi_lo - corr)
+// acos_result_lo = (pi_lo - corr) + acos_Ww
+{ .mfi
+ nop.m 999
+(p6) fms.s1 acos_result_lo = acos_pi_lo, f1, acos_corr
+ nop.i 999 ;;
+}
+
+// X <= -1/4
+// acos_W = pi_hi - 2 * Z_hi
+{ .mfi
+ nop.m 999
+(p6) fnma.s1 acos_W = acos_2, acos_Z_hi, acos_pi_hi
+ nop.i 999 ;;
+}
+
+// acos_Ww = pi_hi - W
+// acos_Ww = (pi_hi - W) - (2 * Z_hi)
+{ .mfi
+ nop.m 999
+(p6) fms.s1 acos_Ww = acos_pi_hi, f1, acos_W
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p6) fms.s1 acos_Ww = acos_Ww, f1, acos_2_Z_hi
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p6) fma.s1 acos_result_lo = acos_result_lo, f1, acos_Ww
+ nop.i 999 ;;
+}
+
+// acos_Z_lo = ((pi_lo - corr) + acos_Ww) - 2 * (s_lo * Z_lo)
+{ .mfi
+ nop.m 999
+(p6) fnma.s1 acos_Z_lo = acos_s_lo_Z_lo, acos_2, acos_result_lo
+ nop.i 999 ;;
+}
+
+{ .mfb
+ nop.m 999
+(p6) fma.s0 f8 = acos_W, f1, acos_Z_lo // f8 = W + Z_lo, final rounding in s0
+(p0) br.ret.sptk b0 ;;
+}
+.endp __libm_callout
+ASM_SIZE_DIRECTIVE(__libm_callout)
+
+.proc SPECIAL
+SPECIAL:
+L(ACOS_NAN): // NOTE(review): no branch to this label is visible in this file
+{ .mfb
+ nop.m 999
+(p0) fma.s0 f8 = f8,f1,f0
+(p0) br.ret.sptk b0 ;;
+}
+
+L(ACOS_ERROR_RETURN):
+// Save ar.pfs, b0, and gp; restore on exit
+
+// qnan snan inf norm unorm 0 -+
+// 1 1 0 0 0 0 11 = 0xc3
+
+// Coming in as |X| >= 1 (or NaN)
+// What should we return?
+
+// If X is 1, return +0; if X is -1, return pi
+
+
+{ .mfi
+ nop.m 999
+(p0) fcmp.eq.unc p6,p7 = acos_ABS_NORM_f8,f1 // p6: |x| == 1
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p6) fcmp.lt.unc p8,p9 = f8,f0 // p8: x == -1, p9: x == +1
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p8) fma.s0 f8 = acos_pi_hi, f1, acos_pi_lo // acos(-1) = pi
+ nop.i 999
+}
+
+{ .mfb
+ nop.m 999
+(p9) fmerge.s f8 = f8,f0 // acos(+1): sign of x with the magnitude of f0, i.e. +0
+(p6) br.ret.spnt b0 ;;
+}
+
+// If X is a NAN, leave
+{ .mfi
+ nop.m 999
+(p0) fclass.m.unc p12,p0 = f8, 0xc3
+ nop.i 999 ;;
+}
+
+{ .mfb
+ nop.m 999
+(p12) fma.s0 f8 = f8,f1,f0
+(p12) br.ret.spnt b0 ;;
+}
+
+{ .mfi
+(p0) mov GR_Parameter_TAG = 57 // tag handed to __libm_error_support
+(p0) frcpa f10, p6 = f0, f0 // 0/0; f10 is stored as Parameter 3 in __libm_error_region
+nop.i 999
+};;
+
+.endp SPECIAL
+ASM_SIZE_DIRECTIVE(SPECIAL)
+
+.proc __libm_error_region
+__libm_error_region:
+.prologue
+// (1) Save state and create a 64-byte frame
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address (sp+32 once the frame exists)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+
+
+// (2) Store Parameter 2 and form the Parameter 1 address
+{ .mmi
+ stfe [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
+// (3) Store Parameters 1 and 3, then call the error handler
+{ .mib
+ stfe [GR_Parameter_X] = f8 // Store Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{ .mib
+ stfe [GR_Parameter_Y] = f10 // Store Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // Recompute the result address after the call (History 8/15/00)
+};;
+
+// (4) Load the result and restore the saved state
+{ .mmi
+ ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+.endp __libm_error_region
+ASM_SIZE_DIRECTIVE(__libm_error_region)
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
+.type __libm_atan2_reg#,@function
+.global __libm_atan2_reg#