/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/******************************************************************************/
//                     ALGORITHM DESCRIPTION
//                     ---------------------
//
// Description:
//  Let K = 64 (table size).
//
//  Four sub-domains:
//    1. |x| < 1/(2*K)
//         expm1(x) ~ P(x)
//    2. 1/(2*K) <= |x| <= 56*log(2)
//         e^x - 1 = 2^(x/log(2)) - 1 = 2^n * T[j] * (1 + P(y)) - 1
//    3. 56*log(2) < x < MAX_LOG
//         e^x - 1 ~ e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
//    4. x < -56*log(2)
//         e^x - 1 = -1 + e^x ~ -1
//  where
//    x = m*log(2)/K + y,   y in [-log(2)/K..log(2)/K]
//    m = n*K + j,          m, n, j - signed integers, j in [-K/2..K/2]
//    values of 2^(j/K) are tabulated as T[j] = T_hi[j] * (1 + T_lo[j]).
//
//  P(y) is a minimax polynomial approximation of exp(y)-1
//  on the small interval [-log(2)/K..log(2)/K] (the coefficients were
//  calculated with Maple V).
//
//  In case 3, to avoid problems with arithmetic overflow and underflow,
//  the value of 2^n is safely computed as 2^n1 * 2^n2, where n1 is in
//  [-BIAS/2..BIAS/2] and BIAS is the value of the exponent bias.
//
// Special cases:
//  expm1(NaN) is NaN
//  expm1(+INF) is +INF
//  expm1(-INF) is -1
//  expm1(x) is x for subnormals
//  for finite arguments, only expm1(0)=0 is exact.
//  For IEEE double
//    if x > 709.782712893383973096 then expm1(x) overflows
//
/******************************************************************************/

#include <private/bionic_asm.h>
# -- Begin expm1
#
# expm1: computes e^x - 1 for the double passed in %xmm0 and returns the
# result in the low lane of %xmm0.  Syntax is AT&T (source, destination).
#
# Stack frame (56 bytes, offsets relative to %rsp after the subq):
#   - offset  0: saved x87 control word
#   - offset  4: x87 control word with the precision-control bits set
#   - offset  8: status code slot (41 on overflow, 42 on underflow); it is
#                only written, nothing in this routine reads it back
#   - offset 16: scratch for SSE <-> x87 transfers
#   - offset 24: scratch for SSE <-> x87 transfers
#   - offset 32: copy of the argument x
#   - offset 40: result slot used by the flagged exit path
#
# Registers written: %eax, %ecx, %edx, %r11, %xmm0-%xmm7 and the flags.
# The large-exponent path additionally uses the x87 register stack and
# temporarily replaces the x87 control word (restored before returning).
#
# The ..___tag_value_expm1.* labels are referenced by the hand-encoded
# .eh_frame data of this file and must stay attached to the stack
# adjustments they mark.
ENTRY(expm1)
# parameter 1: %xmm0
..B1.1:
..___tag_value_expm1.1:
        subq      $56, %rsp
..___tag_value_expm1.3:
        movsd     %xmm0, 32(%rsp)             # keep a copy of x on the stack
..B1.2:
        unpcklpd  %xmm0, %xmm0                # both lanes of xmm0 = x
        movapd    cv(%rip), %xmm1             # 64/log(2)
        movapd    Shifter(%rip), %xmm6        # 1.5*2^52 rounding shifter
        movapd    16+cv(%rip), %xmm2          # log(2)/64, high part
        movapd    32+cv(%rip), %xmm3          # log(2)/64, low part
        pextrw    $3, %xmm0, %eax             # top 16 bits of x
        andl      $32767, %eax                # drop the sign bit
        movl      $16527, %edx
        subl      %eax, %edx                  # negative when eax > 0x408f (|x| >= 1024)
        subl      $16304, %eax                # negative when eax < 0x3fb0 (|x| < 2^-4)
        orl       %eax, %edx
        cmpl      $-2147483648, %edx          # sign bit set in either difference?
        jae       .L_2TAG_PACKET_0.0.2        # yes: small, huge or non-finite argument

# Main path: 2^-4 <= |x| < 1024.
        mulpd     %xmm0, %xmm1                # x * 64/log(2)
        addpd     %xmm6, %xmm1                # rounds to integer m, held in the low bits
        movapd    %xmm1, %xmm7                # keep the shifted value (integer bits of m)
        subpd     %xmm6, %xmm1                # m as a double
        mulpd     %xmm1, %xmm2                # m * (log(2)/64)_hi
        movapd    48+cv(%rip), %xmm4
        mulpd     %xmm1, %xmm3                # m * (log(2)/64)_lo
        movapd    64+cv(%rip), %xmm5
        subpd     %xmm2, %xmm0
        movd      %xmm7, %eax                 # eax = m
        movl      %eax, %ecx
        andl      $63, %ecx                   # j = m & 63
        shll      $4, %ecx                    # byte offset of 16-byte table entry j
        sarl      $6, %eax                    # n = m >> 6 (arithmetic)
        movl      %eax, %edx
        subpd     %xmm3, %xmm0                # y = x - m*log(2)/64
        lea       Tbl_addr(%rip), %r11
        movapd    (%rcx,%r11), %xmm2          # table entry j (two doubles)
        movq      80+cv(%rip), %xmm3
        mulpd     %xmm0, %xmm4
        movapd    %xmm0, %xmm1                # xmm1 = y
        mulpd     %xmm0, %xmm0                # y^2
        mulsd     %xmm0, %xmm3
        addpd     %xmm4, %xmm5
        mulsd     %xmm0, %xmm0                # low lane becomes y^4
        movq      %xmm2, %xmm4                # low double of the table entry
        unpckhpd  %xmm2, %xmm2                # high double of the table entry
        movdqa    mmask(%rip), %xmm6
        pand      %xmm6, %xmm7                # keep m with its 6 index bits cleared
        movdqa    bias(%rip), %xmm6
        paddq     %xmm6, %xmm7                # add 1023*64
        psllq     $46, %xmm7                  # exponent field = n + 1023, i.e. 2^n
        mulsd     %xmm0, %xmm3
        mulpd     %xmm5, %xmm0
        addl      $894, %edx
        cmpl      $1916, %edx                 # unsigned test: is n outside [-894, 1022]?
        ja        .L_2TAG_PACKET_1.0.2        # yes: use the split-scaling path

# n in [-894, 1022]: combine 2^n, the table value and the polynomial.
        addsd     %xmm3, %xmm0
        xorpd     %xmm3, %xmm3
        movl      $16368, %eax
        pinsrw    $3, %eax, %xmm3             # xmm3 = 1.0 (0x3ff0 in the top word)
        orpd      %xmm7, %xmm2                # merge 2^n exponent into the table value
        mulsd     %xmm4, %xmm7                # 2^n * low double of the table entry
        movq      %xmm3, %xmm6
        addsd     %xmm1, %xmm3                # 1 + y
        pextrw    $3, %xmm2, %edx             # top 16 bits of the scaled table value
        pshufd    $238, %xmm0, %xmm5          # high lane of xmm0 into xmm5
        psrlq     $38, %xmm3
        psllq     $38, %xmm3                  # clear the low 38 bits of 1 + y
        movq      %xmm2, %xmm4
        subsd     %xmm3, %xmm6
        addsd     %xmm5, %xmm0
        addsd     %xmm6, %xmm1
        addsd     %xmm7, %xmm4
        mulsd     %xmm3, %xmm7
        mulsd     %xmm2, %xmm3
        xorpd     %xmm5, %xmm5
        movl      $16368, %eax
        pinsrw    $3, %eax, %xmm5             # xmm5 = 1.0, subtracted below
        addsd     %xmm1, %xmm0
# The three variants below add the same terms and differ only in where the
# 1.0 is subtracted; the choice depends on the top word of the scaled table
# value (presumably to limit cancellation error -- confirm against the
# original algorithm notes).
        movl      $17184, %ecx
        subl      %edx, %ecx                  # negative when edx > 0x4320
        subl      $16256, %edx                # negative when edx < 0x3f80
        orl       %edx, %ecx
        jl        .L_2TAG_PACKET_2.0.2
        mulsd     %xmm4, %xmm0
        subsd     %xmm5, %xmm3
        addsd     %xmm7, %xmm0
        addsd     %xmm3, %xmm0
.L_2TAG_PACKET_3.0.2:
        jmp       ..B1.5
.L_2TAG_PACKET_2.0.2:
        cmpl      $0, %edx
        jl        .L_2TAG_PACKET_4.0.2
        mulsd     %xmm4, %xmm0
        subsd     %xmm5, %xmm7
        addsd     %xmm7, %xmm0
        addsd     %xmm3, %xmm0
        jmp       ..B1.5
.L_2TAG_PACKET_4.0.2:
        mulsd     %xmm4, %xmm0
        addsd     %xmm7, %xmm0
        addsd     %xmm3, %xmm0
        subsd     %xmm5, %xmm0
        jmp       ..B1.5

# n outside [-894, 1022].  Negative x returns -1.  For positive x the scale
# 2^n is applied as 2^n1 * 2^n2 and the combination is done on the x87 unit;
# no 1.0 is subtracted on this path.
.L_2TAG_PACKET_1.0.2:
        movl      36(%rsp), %ecx              # high dword of x (sign in bit 31)
        addsd     %xmm0, %xmm1
        unpckhpd  %xmm0, %xmm0
        addsd     %xmm1, %xmm0                # polynomial value
        cmpl      $0, %ecx
        jl        .L_2TAG_PACKET_5.0.2        # x negative: result is -1
        fstcw     (%rsp)                      # save the current x87 control word
        movw      (%rsp), %dx
        orw       $768, %dx                   # set both precision-control bits (0x0300)
        movw      %dx, 4(%rsp)
        fldcw     4(%rsp)
        movl      %eax, %edx                  # eax still holds n here
        sarl      $1, %eax                    # n1 = n >> 1
        subl      %eax, %edx                  # n2 = n - n1
        movdqa    emask(%rip), %xmm6
        pandn     %xmm2, %xmm6                # table value without sign/exponent bits
        addl      $1023, %eax
        movd      %eax, %xmm3
        psllq     $52, %xmm3                  # xmm3 = 2^n1
        orpd      %xmm3, %xmm6                # 2^n1 * high table value
        mulsd     %xmm3, %xmm4                # 2^n1 * low table value
        movsd     %xmm0, 16(%rsp)
        fldl      16(%rsp)                    # push polynomial value
        movsd     %xmm6, 24(%rsp)
        fldl      24(%rsp)                    # push scaled high table value
        movsd     %xmm4, 16(%rsp)
        fldl      16(%rsp)                    # push scaled low table value
        addl      $1023, %edx
        movd      %edx, %xmm4
        psllq     $52, %xmm4                  # xmm4 = 2^n2
        faddp     %st, %st(1)                 # T = high + low
        fmul      %st, %st(1)                 # st(1) = poly * T
        faddp     %st, %st(1)                 # T + poly * T
        movsd     %xmm4, 24(%rsp)
        fldl      24(%rsp)
        fmulp     %st, %st(1)                 # times 2^n2
        fstpl     16(%rsp)                    # round to double
        movsd     16(%rsp), %xmm0
        fldcw     (%rsp)                      # restore the saved control word
        pextrw    $3, %xmm0, %ecx
        andl      $32752, %ecx                # exponent field of the result
        cmpl      $32752, %ecx
        jae       .L_2TAG_PACKET_6.0.2        # all ones: the result overflowed
        jmp       ..B1.5
.L_2TAG_PACKET_6.0.2:
        movl      $41, 8(%rsp)                # status code 41 (overflow)
        jmp       .L_2TAG_PACKET_7.0.2

# Positive argument with high dword >= 0x40900000 (x >= 1024); eax holds
# the high dword of x.
.L_2TAG_PACKET_8.0.2:
        cmpl      $2146435072, %eax           # 0x7ff00000: Inf or NaN?
        jae       .L_2TAG_PACKET_9.0.2
        movsd     XMAX(%rip), %xmm0
        mulsd     %xmm0, %xmm0                # largest finite double squared: overflows
        movl      $41, 8(%rsp)                # status code 41 (overflow)
        jmp       .L_2TAG_PACKET_7.0.2

# Argument has an all-ones exponent: separate Inf from NaN.
.L_2TAG_PACKET_9.0.2:
        movl      36(%rsp), %eax              # high dword of x
        movl      32(%rsp), %edx              # low dword of x
        movl      %eax, %ecx
        andl      $2147483647, %eax           # clear the sign bit
        cmpl      $2146435072, %eax
        ja        .L_2TAG_PACKET_10.0.2       # mantissa bits in the high dword: NaN
        cmpl      $0, %edx
        jne       .L_2TAG_PACKET_10.0.2       # mantissa bits in the low dword: NaN
        cmpl      $0, %ecx
        jl        .L_2TAG_PACKET_11.0.2       # -Inf
        movq      INF(%rip), %xmm0            # +Inf returns +Inf
        jmp       ..B1.5
.L_2TAG_PACKET_11.0.2:
        jmp       .L_2TAG_PACKET_5.0.2        # -Inf returns -1
.L_2TAG_PACKET_10.0.2:
        movsd     32(%rsp), %xmm0
        addsd     %xmm0, %xmm0                # NaN + NaN: returns a NaN
        jmp       ..B1.5

# Small argument, |x| < 2^-4: polynomial in x using the cvl coefficients.
.L_2TAG_PACKET_12.0.2:
        addl      $16304, %eax                # undo the earlier subtraction
        cmpl      $15504, %eax                # 0x3c90: |x| < 2^-54?
        jb        .L_2TAG_PACKET_13.0.2
        movapd    cvl(%rip), %xmm2
        pshufd    $68, %xmm0, %xmm1           # x in both lanes of xmm1
        movapd    16+cvl(%rip), %xmm3
        movapd    32+cvl(%rip), %xmm4
        movq      48+cvl(%rip), %xmm5
        mulsd     %xmm1, %xmm1
        xorpd     %xmm6, %xmm6
        movl      $16352, %eax
        pinsrw    $3, %eax, %xmm6             # xmm6 = 0.5 (0x3fe0 in the top word)
        mulpd     %xmm0, %xmm2
        xorpd     %xmm7, %xmm7
        movl      $16368, %edx
        pinsrw    $3, %edx, %xmm7             # xmm7 = 1.0
        addpd     %xmm3, %xmm2
        mulsd     %xmm1, %xmm5
        pshufd    $228, %xmm1, %xmm3          # plain copy of xmm1
        mulpd     %xmm1, %xmm1
        mulsd     %xmm0, %xmm6                # x/2
        mulpd     %xmm0, %xmm2
        addpd     %xmm4, %xmm2
        movq      %xmm7, %xmm4
        addsd     %xmm6, %xmm7                # 1 + x/2
        mulpd     %xmm3, %xmm1
        psrlq     $27, %xmm7
        psllq     $27, %xmm7                  # clear the low 27 bits of 1 + x/2
        movq      HIGHMASK(%rip), %xmm3
        subsd     %xmm7, %xmm4
        mulpd     %xmm1, %xmm2
        addsd     %xmm4, %xmm6
        pshufd    $238, %xmm2, %xmm1
        addsd     %xmm2, %xmm6
        andpd     %xmm0, %xmm3                # x with its low 26 bits cleared
        movq      %xmm0, %xmm4
        addsd     %xmm6, %xmm1
        subsd     %xmm3, %xmm0                # remaining low part of x
        addsd     %xmm5, %xmm1
        mulsd     %xmm7, %xmm3
        mulsd     %xmm7, %xmm0
        mulsd     %xmm1, %xmm4
        addsd     %xmm4, %xmm0
        addsd     %xmm3, %xmm0
        jmp       ..B1.5

# Tiny argument, |x| < 2^-54: the result is x itself.
.L_2TAG_PACKET_13.0.2:
        cmpl      $16, %eax
        jae       .L_2TAG_PACKET_3.0.2        # exponent field non-zero: return x
        movq      %xmm0, %xmm2
        movd      %xmm0, %eax                 # bits 0..31 of x
        psrlq     $31, %xmm2
        movd      %xmm2, %ecx                 # bits 31..62 of x
        orl       %ecx, %eax
        je        .L_2TAG_PACKET_3.0.2        # x is +0 or -0: return x
        movl      $16, %edx
        xorpd     %xmm1, %xmm1
        pinsrw    $3, %edx, %xmm1             # xmm1 = 2^-1022 (0x0010 in the top word)
        mulsd     %xmm1, %xmm1                # product underflows; value is unused
                                              # (presumably raises the underflow flag)
        movl      $42, 8(%rsp)                # status code 42 (underflow)
        jmp       .L_2TAG_PACKET_7.0.2

# Arguments outside the main range; eax = (top 16 bits of |x|) - 0x3fb0.
.L_2TAG_PACKET_0.0.2:
        cmpl      $0, %eax
        jl        .L_2TAG_PACKET_12.0.2       # |x| < 2^-4
        movl      36(%rsp), %eax              # high dword of x
        cmpl      $1083179008, %eax           # signed compare with 0x40900000 (1024.0)
        jge       .L_2TAG_PACKET_8.0.2        # x >= 1024, +Inf or positive NaN
        cmpl      $-1048576, %eax             # unsigned compare with 0xfff00000
        jae       .L_2TAG_PACKET_9.0.2        # -Inf or negative NaN
# Remaining case: finite x <= -1024.  Also the shared "return -1" exit.
.L_2TAG_PACKET_5.0.2:
        xorpd     %xmm0, %xmm0
        movl      $49136, %eax
        pinsrw    $3, %eax, %xmm0             # xmm0 = -1.0 (0xbff0 in the top word)
        jmp       ..B1.5

# Exit used after a status code was stored: the result goes through its
# stack slot and is reloaded before the common epilogue.
.L_2TAG_PACKET_7.0.2:
        movq      %xmm0, 40(%rsp)
..B1.3:
        movq      40(%rsp), %xmm0
.L_2TAG_PACKET_14.0.2:
..B1.5:
        addq      $56, %rsp
..___tag_value_expm1.4:
        ret
..___tag_value_expm1.5:
END(expm1)
# -- End expm1
# Read-only constants for expm1.
#
# cv: constants for the main (table-driven) path.  Each double is stored as
# two .long values, low dword first.  Offsets are the ones used by the code
# (cv, 16+cv, 32+cv, 48+cv, 64+cv, 80+cv).
        .section .rodata, "a"
        .align 16
        .align 16
cv:
        .long   1697350398                    # cv+0:  0x40571547652b82fe = 64/log(2)
        .long   1079448903
        .long   1697350398                    #        (same value, second lane)
        .long   1079448903
        .long   4277796864                    # cv+16: 0x3f862e42fefa0000 = log(2)/64, high part
        .long   1065758274
        .long   4277796864                    #        (same value, second lane)
        .long   1065758274
        .long   3164486458                    # cv+32: 0x3d1cf79abc9e3b3a = log(2)/64, low part
        .long   1025308570
        .long   3164486458                    #        (same value, second lane)
        .long   1025308570
        .long   1963358694                    # cv+48: polynomial coefficient close to 1/120
        .long   1065423121
        .long   1431655765                    # cv+56: polynomial coefficient close to 1/6
        .long   1069897045
        .long   1431655765                    # cv+64: polynomial coefficient close to 1/24
        .long   1067799893
        .long   0                             # cv+72: 0.5
        .long   1071644672
        .long   381774871                     # cv+80: polynomial coefficient close to 1/720
        .long   1062650220
        .long   381774871                     #        (same value, second lane)
        .long   1062650220
        .type   cv,@object
        .size   cv,96
# Shifter: 0x4338000000000000 = 1.5 * 2^52 in both lanes.  The code adds it
# to x*64/log(2) and subtracts it again, which leaves the product rounded to
# an integer whose bits sit in the low dword of the intermediate sum.
        .align 16
Shifter:
        .long   0
        .long   1127743488
        .long   0
        .long   1127743488
        .type   Shifter,@object
        .size   Shifter,16
# Tbl_addr: 64 entries of 16 bytes, indexed by j = m & 63 (the code scales
# j by 16).  Each entry holds two doubles, low dword first:
#   - first double:  small correction term, used as the low part of T[j]
#   - second double: the high part of T[j] stored with a zero exponent
#                    field, so that OR-ing in the exponent bits of 2^n
#                    yields the scaled value directly
# Per the algorithm description, T[j] tabulates 2^(j/64).
        .align 16
Tbl_addr:
        .long   0                             # j = 0
        .long   0
        .long   0
        .long   0
        .long   1000070955                    # j = 1
        .long   1042145304
        .long   1040187392
        .long   11418
        .long   988267849                     # j = 2
        .long   1039500660
        .long   3539992576
        .long   22960
        .long   36755401                      # j = 3
        .long   1042114290
        .long   402653184
        .long   34629
        .long   3634769483                    # j = 4
        .long   1042178627
        .long   1820327936
        .long   46424
        .long   2155991225                    # j = 5
        .long   1041560680
        .long   847249408
        .long   58348
        .long   2766913307                    # j = 6
        .long   1039293264
        .long   3489660928
        .long   70401
        .long   3651174602                    # j = 7
        .long   1040488175
        .long   2927624192
        .long   82586
        .long   3073892131                    # j = 8
        .long   1042240606
        .long   1006632960
        .long   94904
        .long   1328391742                    # j = 9
        .long   1042019037
        .long   3942645760
        .long   107355
        .long   2650893825                    # j = 10
        .long   1041903210
        .long   822083584
        .long   119943
        .long   2397289153                    # j = 11
        .long   1041802037
        .long   2281701376
        .long   132667
        .long   430997175                     # j = 12
        .long   1042110606
        .long   1845493760
        .long   145530
        .long   1230936525                    # j = 13
        .long   1041801015
        .long   1702887424
        .long   158533
        .long   740675935                     # j = 14
        .long   1040178913
        .long   4110417920
        .long   171677
        .long   3489810261                    # j = 15
        .long   1041825986
        .long   2793406464
        .long   184965
        .long   2532600530                    # j = 16
        .long   1040767882
        .long   167772160
        .long   198398
        .long   3542557060                    # j = 17
        .long   1041827263
        .long   2986344448
        .long   211976
        .long   1401563777                    # j = 18
        .long   1041061093
        .long   922746880
        .long   225703
        .long   3129406026                    # j = 19
        .long   1041852413
        .long   880803840
        .long   239579
        .long   900993572                     # j = 20
        .long   1039283234
        .long   1275068416
        .long   253606
        .long   2115029358                    # j = 21
        .long   1042140042
        .long   562036736
        .long   267786
        .long   1086643152                    # j = 22
        .long   1041785419
        .long   1610612736
        .long   282120
        .long   82864366                      # j = 23
        .long   1041256244
        .long   3045064704
        .long   296610
        .long   2392968152                    # j = 24
        .long   1040913683
        .long   3573547008
        .long   311258
        .long   2905856183                    # j = 25
        .long   1040002214
        .long   1988100096
        .long   326066
        .long   3742008261                    # j = 26
        .long   1040011137
        .long   1451229184
        .long   341035
        .long   863393794                     # j = 27
        .long   1040880621
        .long   914358272
        .long   356167
        .long   1446136837                    # j = 28
        .long   1041372426
        .long   3707764736
        .long   371463
        .long   927855201                     # j = 29
        .long   1040617636
        .long   360710144
        .long   386927
        .long   1492679939                    # j = 30
        .long   1041050306
        .long   2952790016
        .long   402558
        .long   608827001                     # j = 31
        .long   1041582217
        .long   2181038080
        .long   418360
        .long   606260204                     # j = 32
        .long   1042271987
        .long   1711276032
        .long   434334
        .long   3163044019                    # j = 33
        .long   1041843851
        .long   1006632960
        .long   450482
        .long   4148747325                    # j = 34
        .long   1041962972
        .long   3900702720
        .long   466805
        .long   802924201                     # j = 35
        .long   1041275378
        .long   1442840576
        .long   483307
        .long   3052749833                    # j = 36
        .long   1041940577
        .long   1937768448
        .long   499988
        .long   2216116399                    # j = 37
        .long   1041486744
        .long   914358272
        .long   516851
        .long   2729697836                    # j = 38
        .long   1041445764
        .long   2566914048
        .long   533897
        .long   540608356                     # j = 39
        .long   1041310907
        .long   2600468480
        .long   551129
        .long   2916344493                    # j = 40
        .long   1040535661
        .long   1107296256
        .long   568549
        .long   731391814                     # j = 41
        .long   1039497014
        .long   2566914048
        .long   586158
        .long   1024722704                    # j = 42
        .long   1041461625
        .long   2961178624
        .long   603959
        .long   3806831748                    # j = 43
        .long   1041732499
        .long   2675965952
        .long   621954
        .long   238953304                     # j = 44
        .long   1040316488
        .long   2189426688
        .long   640145
        .long   749123235                     # j = 45
        .long   1041725785
        .long   2063597568
        .long   658534
        .long   1168187977                    # j = 46
        .long   1041175214
        .long   2986344448
        .long   677123
        .long   3506096399                    # j = 47
        .long   1042186095
        .long   1426063360
        .long   695915
        .long   1470221620                    # j = 48
        .long   1041675499
        .long   2566914048
        .long   714911
        .long   3182425146                    # j = 49
        .long   1041483134
        .long   3087007744
        .long   734114
        .long   3131698208                    # j = 50
        .long   1042208657
        .long   4068474880
        .long   753526
        .long   2300504125                    # j = 51
        .long   1041428596
        .long   2415919104
        .long   773150
        .long   2290297931                    # j = 52
        .long   1037388400
        .long   3716153344
        .long   792987
        .long   3532148223                    # j = 53
        .long   1041626194
        .long   771751936
        .long   813041
        .long   1161884404                    # j = 54
        .long   1042015258
        .long   3699376128
        .long   833312
        .long   876383176                     # j = 55
        .long   1037968878
        .long   1241513984
        .long   853805
        .long   3379986796                    # j = 56
        .long   1042213153
        .long   3699376128
        .long   874520
        .long   1545797737                    # j = 57
        .long   1041681569
        .long   58720256
        .long   895462
        .long   2925146801                    # j = 58
        .long   1042212567
        .long   855638016
        .long   916631
        .long   1316627971                    # j = 59
        .long   1038516204
        .long   3883925504
        .long   938030
        .long   3267869137                    # j = 60
        .long   1040337004
        .long   2726297600
        .long   959663
        .long   3720868999                    # j = 61
        .long   1041782409
        .long   3992977408
        .long   981531
        .long   433316142                     # j = 62
        .long   1041994064
        .long   1526726656
        .long   1003638
        .long   781232103                     # j = 63
        .long   1040093400
        .long   2172649472
        .long   1025985
        .type   Tbl_addr,@object
        .size   Tbl_addr,1024
# mmask: 0x00000000ffffffc0 in both lanes.  ANDed with the shifted product
# to keep the low dword of m with its six table-index bits cleared and to
# drop the upper dword.
        .align 16
mmask:
        .long   4294967232
        .long   0
        .long   4294967232
        .long   0
        .type   mmask,@object
        .size   mmask,16
# bias: 65472 = 1023 * 64 in both lanes.  Added to the masked m before the
# left shift by 46, so that the exponent field ends up holding n + 1023.
        .align 16
bias:
        .long   65472
        .long   0
        .long   65472
        .long   0
        .type   bias,@object
        .size   bias,16
# emask: 0xfff0000000000000 in both lanes (sign and exponent bits of a
# double).  Used with pandn to keep only the mantissa bits of a value.
        .align 16
emask:
        .long   0
        .long   4293918720
        .long   0
        .long   4293918720
        .type   emask,@object
        .size   emask,16
# cvl: polynomial coefficients for the small-argument path (|x| < 2^-4).
# Each double is stored low dword first; the code loads them at cvl,
# 16+cvl, 32+cvl (packed pairs) and 48+cvl (scalar).  The values are close
# to reciprocal factorials; the exact bit patterns must be preserved.
        .align 16
cvl:
        .long   2773927732                    # cvl+0:  close to 1/9!
        .long   1053236707
        .long   381774871                     # cvl+8:  close to 1/6!
        .long   1062650220
        .long   379653899                     # cvl+16: close to 1/8!
        .long   1056571845
        .long   286331153                     # cvl+24: close to 1/5!
        .long   1065423121
        .long   436314138                     # cvl+32: close to 1/7!
        .long   1059717536
        .long   1431655765                    # cvl+40: close to 1/4!
        .long   1067799893
        .long   1431655765                    # cvl+48: close to 1/3!
        .long   1069897045
        .long   0                             # cvl+56: 0.5
        .long   1071644672
        .type   cvl,@object
        .size   cvl,64
# XMAX: 0x7fefffffffffffff, the largest finite double.  The code squares it
# to produce an overflowed result for large positive arguments.
        .align 8
XMAX:
        .long   4294967295
        .long   2146435071
        .type   XMAX,@object
        .size   XMAX,8
# INF: 0x7ff0000000000000, positive infinity; returned for expm1(+Inf).
        .align 8
INF:
        .long   0
        .long   2146435072
        .type   INF,@object
        .size   INF,8
# HIGHMASK: 0xfffffffffc000000.  ANDed with x in the small-argument path to
# obtain x with its low 26 bits cleared.
        .align 8
HIGHMASK:
        .long   4227858432
        .long   4294967295
        .type   HIGHMASK,@object
        .size   HIGHMASK,8
# Section trailers and hand-encoded unwind data for expm1.  The byte-level
# notes below decode the values per the DWARF call-frame encoding; multi-byte
# values are little-endian.
        .data
        .section .note.GNU-stack, ""
// -- Begin DWARF2 SEGMENT .eh_frame
        .section .eh_frame,"a",@progbits
.eh_frame_seg:
        .align 1
        .4byte 0x00000014                     # CIE: length 20
        .8byte 0x00527a0100000000             # CIE id 0, version 1, augmentation string zR
        .8byte 0x08070c1b01107801             # code align 1, data align -8, return reg 16,
                                              # augmentation length 1, pointer encoding 0x1b,
                                              # DW_CFA_def_cfa reg 7 offset 8
        .4byte 0x00000190                     # DW_CFA_offset reg 16 at CFA-8, then padding
        .4byte 0x0000001c                     # FDE: length 28
        .4byte 0x0000001c                     # offset back to the CIE
        .4byte ..___tag_value_expm1.1-.       # start of expm1, pc-relative
        .4byte ..___tag_value_expm1.5-..___tag_value_expm1.1   # size of expm1
        .2byte 0x0400                         # augmentation data length 0, DW_CFA_advance_loc4
        .4byte ..___tag_value_expm1.3-..___tag_value_expm1.1   # ... past the subq
        .2byte 0x400e                         # DW_CFA_def_cfa_offset 64 (56-byte frame + 8)
        .byte 0x04                            # DW_CFA_advance_loc4
        .4byte ..___tag_value_expm1.4-..___tag_value_expm1.3   # ... past the addq
        .2byte 0x080e                         # DW_CFA_def_cfa_offset 8
        .byte 0x00                            # padding
# End
727# End