Add optimized implementations of 18 math functions for x86 and x86_64

Change-Id: I31bf601448a9427f825517f3a0ff24de47f49bfa
Signed-off-by: Jingwei Zhang <jingwei.zhang@intel.com>
Signed-off-by: Mingwei Shi <mingwei.shi@intel.com>
diff --git a/libm/x86/e_hypot.S b/libm/x86/e_hypot.S
new file mode 100644
index 0000000..aa6ab64
--- /dev/null
+++ b/libm/x86/e_hypot.S
@@ -0,0 +1,221 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/******************************************************************************/
+//                     ALGORITHM DESCRIPTION
+//                     ---------------------
+//
+// X87 version (the variant implemented in this file):
+// Use 80-bit FPU precision fmul, fsqrt to compute square and sqrt.
+//
+// SSE version:
+// Swap x, y if |x|<|y|
+// For x=2^k*x, get y=y*2^(-k)
+// Get S ~ sqrt(x^2+y^2)  (leading 1 + leading 25 mantissa bits)
+//
+// Get D = ( RN(x^2+y^2) - S^2 ) + ( x^2 - RN(x^2) ) +
+//                               + ( y^2 - (RN(x^2+y^2)-RN(x^2)) )
+//
+// Result is 2^k*(S + Se),  where Se = S*e
+//        S*e is approximated as (D/2S)*( 1 - (D/2S)^2*1.0/S )
+//
+// Return 2^k*(S+Se)
+//
+// For |y/x|<2^(-64), return x
+//
+// For cases where maximum biased exponent is either greater than 7fdh or
+// below 32, take a special path to check for special cases (0, NaN, Inf),
+// possible overflow, and more accurate computation for denormal results
+//
+// Special cases:
+//  hypot(x,y), hypot(y,x), and hypot(x,-y) are equivalent
+//  hypot(x,+-0) is equivalent to fabs(x)
+//  hypot(x,y) = y if (x==NaN or x==INF) and y==INF
+//  hypot(x,y) = x if (x==NaN or x==INF) and y!=INF (even if y==NaN!)
+//  hypot(x,y) = y if (x!=NaN and x!=INF) and (y==NaN or y==INF)
+//
+/******************************************************************************/
+
+#include <private/bionic_asm.h>
+# -- Begin  static_func
+        .text
+        .align __bionic_asm_align
+        .type static_func, @function
+static_func: /* position-independent helper: returns the address of static_const_table in %eax */
+..B1.1:
+        call      ..L2 /* pushes the address of ..L2 as the return address */
+..L2:
+        popl      %eax /* %eax = run-time address of ..L2 */
+        lea       _GLOBAL_OFFSET_TABLE_+[. - ..L2](%eax), %eax /* %eax = address of the global offset table */
+        lea       static_const_table@GOTOFF(%eax), %eax /* add the GOT-relative offset of the table */
+        ret
+        .size   static_func,.-static_func
+# -- End  static_func
+
+# -- Begin  hypot
+ENTRY(hypot)
+# parameter 1: 8 + %ebp
+# parameter 2: 16 + %ebp
+..B2.1: /* hypot(x, y): computes sqrt(x*x + y*y) with x87 arithmetic and leaves the result in st0 */
+..B2.2: /* SSE2 is used only to classify the operands (large, Inf, NaN); %ebx is saved and restored */
+        pushl     %ebp
+        movl      %esp, %ebp
+        subl      $152, %esp /* local frame; x is now at 160(%esp) and y at 168(%esp) */
+        movl      %ebx, 96(%esp) /* save %ebx; it is reloaded in the epilogue */
+        call      static_func /* %eax = address of static_const_table */
+        movl      %eax, %ebx /* %ebx = table base for the rest of the function */
+        movapd    (%ebx), %xmm3 /* 0x7fffffffffffffff in both lanes: clears the sign bit */
+        movsd     160(%esp), %xmm0 /* x */
+        movsd     168(%esp), %xmm1 /* y */
+        andpd     %xmm3, %xmm0 /* |x| */
+        andpd     %xmm3, %xmm1 /* |y| */
+        pextrw    $3, %xmm0, %eax /* top 16 bits of |x|: 11 exponent bits and 4 mantissa bits */
+        pextrw    $3, %xmm1, %edx /* top 16 bits of |y| */
+        cmpl      $24528, %eax /* 24528 = 0x5fd0; operands at or below it go straight to the fast path */
+        ja        .L_2TAG_PACKET_0.0.2 /* |x| is above the threshold: classify further */
+        cmpl      $24528, %edx
+        ja        .L_2TAG_PACKET_0.0.2 /* |y| is above the threshold: classify further */
+.L_2TAG_PACKET_1.0.2: /* fast path: no overflow check is made on the result */
+        fldl      160(%esp) /* st0 = x */
+        fldl      168(%esp) /* st0 = y, st1 = x */
+        fxch      %st(1) /* st0 = x, st1 = y */
+        fmul      %st(0), %st /* st0 = x*x */
+        fxch      %st(1) /* st0 = y, st1 = x*x */
+        nop       
+        fmul      %st(0), %st /* st0 = y*y */
+        faddp     %st, %st(1) /* st0 = x*x + y*y, one entry left on the stack */
+        fsqrt     /* st0 = sqrt(x*x + y*y) */
+        jmp       .L_2TAG_PACKET_2.0.2 /* return st0 as is */
+.L_2TAG_PACKET_0.0.2: /* at least one operand has top bits above 0x5fd0 */
+        cmpl      $32752, %eax /* 32752 = 0x7ff0: exponent field all ones means Inf or NaN */
+        movl      %eax, %ecx /* mov leaves the flags of the compare intact */
+        jae       .L_2TAG_PACKET_3.0.2 /* x is Inf or NaN */
+        subl      %edx, %ecx /* %ecx = top16(|x|) - top16(|y|) */
+        cmpl      $32752, %edx
+        jae       .L_2TAG_PACKET_3.0.2 /* y is Inf or NaN */
+        addl      $928, %ecx /* bias the difference so one unsigned compare checks both signs */
+        addl      %edx, %eax /* %eax = top16(|x|) + top16(|y|) */
+        cmpl      $1856, %ecx /* 1856 = 2 * 928, and 928 = 58 << 4 */
+        ja        .L_2TAG_PACKET_4.0.2 /* top-16 values differ by more than 928: use |x| + |y| */
+        cmpl      $49056, %eax /* 49056 = 0xbfa0 */
+        jb        .L_2TAG_PACKET_1.0.2 /* sum below the threshold: fast path without overflow check */
+        fldl      160(%esp) /* same x87 sequence as the fast path, followed by an overflow check */
+        fldl      168(%esp) /* st0 = y, st1 = x */
+        fxch      %st(1) /* st0 = x, st1 = y */
+        fmul      %st(0), %st /* st0 = x*x */
+        fxch      %st(1) /* st0 = y, st1 = x*x */
+        nop       
+        fmul      %st(0), %st /* st0 = y*y */
+        faddp     %st, %st(1) /* st0 = x*x + y*y */
+        fsqrt     /* st0 = sqrt(x*x + y*y) */
+.L_2TAG_PACKET_5.0.2: /* st0 = candidate result; check whether its exponent fits a double */
+        fstl      (%esp) /* store st0 rounded to double at (%esp); st0 stays on the stack */
+        fstpt     16(%esp) /* store the 80-bit value at 16(%esp) and pop it */
+        xorl      %eax, %eax
+        movw      24(%esp), %ax /* sign and 15-bit exponent word of the 80-bit copy */
+        cmpl      $17407, %eax /* 17407 = 0x43ff = extended bias 0x3fff + 0x400 */
+        jae       .L_2TAG_PACKET_6.0.2 /* exponent is outside the finite double range */
+        fldl      (%esp) /* st0 = result rounded to double */
+        jmp       .L_2TAG_PACKET_7.0.2
+.L_2TAG_PACKET_4.0.2: /* the operands differ widely in magnitude */
+        movsd     %xmm0, 32(%esp) /* |x| */
+        movsd     %xmm1, 40(%esp) /* |y| */
+        fldl      32(%esp) /* st0 = |x| */
+        faddl     40(%esp) /* st0 = |x| + |y| */
+        jmp       .L_2TAG_PACKET_5.0.2 /* run the same overflow check on the sum */
+.L_2TAG_PACKET_6.0.2: /* overflow case */
+        movl      $46, %edx /* NOTE(review): %edx is not read after this; looks like a leftover error code, confirm */
+.L_2TAG_PACKET_8.0.2:
+        movsd     160(%esp), %xmm0 /* NOTE(review): reloaded x is not used before the return */
+        movsd     168(%esp), %xmm1 /* NOTE(review): reloaded y is not used before the return */
+        fldl      (%esp) /* st0 = the double-rounded value stored by fstl above */
+        jmp       .L_2TAG_PACKET_7.0.2
+.L_2TAG_PACKET_3.0.2: /* at least one operand is Inf or NaN */
+        shufpd    $0, %xmm1, %xmm0 /* %xmm0 = { low: |x|, high: |y| } */
+        movdqa    %xmm0, %xmm2 /* copy for the NaN test */
+        movdqa    16(%ebx), %xmm3 /* 0x7ff0000000000000 (+Inf) in both lanes */
+        movsd     %xmm0, 32(%esp) /* |x| */
+        movsd     %xmm1, 40(%esp) /* |y| */
+        cmppd     $3, %xmm0, %xmm2 /* predicate 3 (unordered): lane set where the operand is NaN */
+        cmppd     $0, %xmm0, %xmm3 /* predicate 0 (equal): lane set where the operand is Inf */
+        movmskpd  %xmm2, %edx /* bit 0: x is NaN, bit 1: y is NaN */
+        movmskpd  %xmm3, %eax /* bit 0: x is Inf, bit 1: y is Inf */
+        testl     %edx, %edx
+        je        .L_2TAG_PACKET_9.0.2 /* no NaN, so at least one operand is Inf */
+        fldl      32(%esp) /* st0 = |x| */
+        fmull     40(%esp) /* st0 = |x| * |y|, with at least one NaN factor */
+        testl     $1, %eax
+        jne       .L_2TAG_PACKET_10.0.2 /* x is Inf: the result is Inf even though y is NaN */
+        testl     $2, %eax
+        jne       .L_2TAG_PACKET_11.0.2 /* y is Inf: the result is Inf even though x is NaN */
+        jmp       .L_2TAG_PACKET_2.0.2 /* neither is Inf: return the product */
+.L_2TAG_PACKET_9.0.2: /* Inf without NaN */
+        fldl      32(%esp) /* st0 = |x| */
+        faddl     40(%esp) /* st0 = |x| + |y| = +Inf */
+        jmp       .L_2TAG_PACKET_2.0.2
+.L_2TAG_PACKET_10.0.2: /* x is Inf, y is NaN */
+        fstpl     40(%esp) /* pop the product off the x87 stack (overwrites the |y| slot) */
+        fldl      32(%esp) /* st0 = |x| = +Inf */
+        jmp       .L_2TAG_PACKET_7.0.2
+.L_2TAG_PACKET_11.0.2: /* y is Inf, x is NaN */
+        fstpl     32(%esp) /* pop the product off the x87 stack (overwrites the |x| slot) */
+        fldl      40(%esp) /* st0 = |y| = +Inf */
+        jmp       .L_2TAG_PACKET_7.0.2
+.L_2TAG_PACKET_2.0.2:
+.L_2TAG_PACKET_7.0.2: /* common epilogue: the result is already in st0 */
+        movl      96(%esp), %ebx /* restore the saved %ebx */
+        movl      %ebp, %esp /* drop the local frame */
+        popl      %ebp
+        ret       
+..B2.3:
+END(hypot)
+# -- End  hypot
+
+# Start file scope ASM
+.weak hypotl /* hypotl is emitted as a weak symbol */
+.equ hypotl, hypot /* same address as hypot; NOTE(review): assumes long double matches double on this target, confirm */
+# End file scope ASM
+	.section .rodata, "a"
+	.align 16
+	.type	static_const_table,@object
+	.size	static_const_table,32
+static_const_table:
+	/* +0: two copies of 0x7fffffffffffffff, the mask hypot uses to clear sign bits */
+	.long	0xffffffff, 0x7fffffff
+	.long	0xffffffff, 0x7fffffff
+	/* +16: two copies of 0x7ff0000000000000, the bit pattern of +Inf */
+	.long	0x00000000, 0x7ff00000
+	.long	0x00000000, 0x7ff00000
+	/* Each 64-bit entry is written as its low 32-bit word followed by its high word. */
+	/* hypot reads both halves with 16-byte aligned loads (movapd, movdqa), */
+	/* so the .align 16 above must stay in place. */
+	.data
+	.section .note.GNU-stack, ""
+# End