Implement single-precision round intrinsic in x86 Rationale: X86 does not provide a direct instruction for the required rounding and NaN and large positive numbers must be dealt with too. This CL generates code that correctly implements SP round in a reasonably efficient manner (I hope....) Test: 580-checker-round BUG=26327751 Change-Id: Ic5f4d9cff9c27c855a8ad577c51ed3ed37fb60cd

commit: 2c9f495e8f8c1b367662a5a7a32d216b70d3bb18 [log] [tgz]
author: Aart Bik <ajcbik@google.com> Mon Aug 01 16:52:27 2016 -0700
committer: Aart Bik <ajcbik@google.com> Thu Aug 04 09:53:03 2016 -0700
tree: 69da2e9c6d337fe22273ae8bb2c417fb7177ce75
parent: 89bd8358c0a69e8cd6456d81d88ef366e261f732 [diff]
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 65f4def..83cc278 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc

@@ -752,20 +752,20 @@
   GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0);
 }
 
-// Note that 32 bit x86 doesn't have the capability to inline MathRoundDouble,
-// as it needs 64 bit instructions.
 void IntrinsicLocationsBuilderX86::VisitMathRoundFloat(HInvoke* invoke) {
-  // See intrinsics.h.
-  if (!kRoundIsPlusPointFive) {
-    return;
-  }
-
   // Do we have instruction support?
   if (codegen_->GetInstructionSetFeatures().HasSSE4_1()) {
+    HInvokeStaticOrDirect* static_or_direct = invoke->AsInvokeStaticOrDirect();
+    DCHECK(static_or_direct != nullptr);
     LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                               LocationSummary::kNoCall,
                                                               kIntrinsified);
     locations->SetInAt(0, Location::RequiresFpuRegister());
+    if (static_or_direct->HasSpecialInput() &&
+        invoke->InputAt(
+            static_or_direct->GetSpecialInputIndex())->IsX86ComputeBaseMethodAddress()) {
+      locations->SetInAt(1, Location::RequiresRegister());
+    }
     locations->SetOut(Location::RequiresRegister());
     locations->AddTemp(Location::RequiresFpuRegister());
     locations->AddTemp(Location::RequiresFpuRegister());
@@ -774,7 +774,7 @@
 
   // We have to fall back to a call to the intrinsic.
   LocationSummary* locations = new (arena_) LocationSummary(invoke,
-                                                           LocationSummary::kCallOnMainOnly);
+                                                            LocationSummary::kCallOnMainOnly);
   InvokeRuntimeCallingConvention calling_convention;
   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
   locations->SetOut(Location::RegisterLocation(EAX));
@@ -784,47 +784,42 @@
 
 void IntrinsicCodeGeneratorX86::VisitMathRoundFloat(HInvoke* invoke) {
   LocationSummary* locations = invoke->GetLocations();
-  if (locations->WillCall()) {
+  if (locations->WillCall()) {  // TODO: can we reach this?
     InvokeOutOfLineIntrinsic(codegen_, invoke);
     return;
   }
 
-  // Implement RoundFloat as t1 = floor(input + 0.5f);  convert to int.
   XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  Register constant_area = locations->InAt(1).AsRegister<Register>();
+  XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+  XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
   Register out = locations->Out().AsRegister<Register>();
-  XmmRegister maxInt = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
-  XmmRegister inPlusPointFive = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
-  NearLabel done, nan;
+  NearLabel skip_incr, done;
   X86Assembler* assembler = GetAssembler();
 
-  // Generate 0.5 into inPlusPointFive.
-  __ movl(out, Immediate(bit_cast<int32_t, float>(0.5f)));
-  __ movd(inPlusPointFive, out);
+  // Since no direct x86 rounding instruction matches the required semantics,
+  // this intrinsic is implemented as follows:
+  //  result = floor(in);
+  //  if (in - result >= 0.5f)
+  //    result = result + 1.0f;
+  __ movss(t2, in);
+  __ roundss(t1, in, Immediate(1));
+  __ subss(t2, t1);
+  __ comiss(t2, codegen_->LiteralInt32Address(bit_cast<int32_t, float>(0.5f), constant_area));
+  __ j(kBelow, &skip_incr);
+  __ addss(t1, codegen_->LiteralInt32Address(bit_cast<int32_t, float>(1.0f), constant_area));
+  __ Bind(&skip_incr);
 
-  // Add in the input.
-  __ addss(inPlusPointFive, in);
-
-  // And truncate to an integer.
-  __ roundss(inPlusPointFive, inPlusPointFive, Immediate(1));
-
+  // Final conversion to an integer. Unfortunately this also does not have a
+  // direct x86 instruction, since NaN should map to 0 and large positive
+  // values need to be clipped to the extreme value.
   __ movl(out, Immediate(kPrimIntMax));
-  // maxInt = int-to-float(out)
-  __ cvtsi2ss(maxInt, out);
-
-  // if inPlusPointFive >= maxInt goto done
-  __ comiss(inPlusPointFive, maxInt);
-  __ j(kAboveEqual, &done);
-
-  // if input == NaN goto nan
-  __ j(kUnordered, &nan);
-
-  // output = float-to-int-truncate(input)
-  __ cvttss2si(out, inPlusPointFive);
-  __ jmp(&done);
-  __ Bind(&nan);
-
-  //  output = 0
-  __ xorl(out, out);
+  __ cvtsi2ss(t2, out);
+  __ comiss(t1, t2);
+  __ j(kAboveEqual, &done);  // clipped to max (already in out), does not jump on unordered
+  __ movl(out, Immediate(0));  // does not change flags
+  __ j(kUnordered, &done);  // NaN mapped to 0 (just moved in out)
+  __ cvttss2si(out, t1);
   __ Bind(&done);
 }
 

diff --git a/compiler/optimizing/pc_relative_fixups_x86.cc b/compiler/optimizing/pc_relative_fixups_x86.cc
index 921f3df..ad0921d 100644
--- a/compiler/optimizing/pc_relative_fixups_x86.cc
+++ b/compiler/optimizing/pc_relative_fixups_x86.cc

@@ -227,6 +227,7 @@
       case Intrinsics::kMathMaxFloatFloat:
       case Intrinsics::kMathMinDoubleDouble:
       case Intrinsics::kMathMinFloatFloat:
+      case Intrinsics::kMathRoundFloat:
         if (!base_added) {
           DCHECK(invoke_static_or_direct != nullptr);
           DCHECK(!invoke_static_or_direct->HasCurrentMethodInput());

diff --git a/test/580-checker-round/src/Main.java b/test/580-checker-round/src/Main.java
index 9e248ef..83bc55c 100644
--- a/test/580-checker-round/src/Main.java
+++ b/test/580-checker-round/src/Main.java

@@ -36,7 +36,8 @@
     expectEquals32(-2, round32(-1.51f));
     expectEquals32(-1, round32(-1.2f));
     expectEquals32(-1, round32(-1.0f));
-    expectEquals32(-1, round32(-0.51f));
+    expectEquals32(-1, round32(-0.5000001f));
+    expectEquals32(0, round32(-0.5f));
     expectEquals32(0, round32(-0.2f));
     expectEquals32(0, round32(-0.0f));
     expectEquals32(0, round32(+0.0f));
@@ -47,11 +48,23 @@
     expectEquals32(2, round32(+1.5f));
     expectEquals32(2147483647, round32(Float.POSITIVE_INFINITY));
 
+    // Near minint.
+    expectEquals32(-2147483648, round32(Math.nextAfter(-2147483648.0f, Float.NEGATIVE_INFINITY)));
+    expectEquals32(-2147483648, round32(-2147483648.0f));
+    expectEquals32(-2147483520, round32(Math.nextAfter(-2147483648.0f, Float.POSITIVE_INFINITY)));
+
+    // Near maxint.
+    expectEquals32(2147483520, round32(Math.nextAfter(2147483648.0f, Float.NEGATIVE_INFINITY)));
+    expectEquals32(2147483647, round32(2147483648.0f));
+    expectEquals32(2147483647, round32(Math.nextAfter(2147483648.0f, Float.POSITIVE_INFINITY)));
+
     // Some others.
     for (int i = -100; i <= 100; ++i) {
       expectEquals32(i - 1, round32((float) i - 0.51f));
+      expectEquals32(i, round32((float) i - 0.5f));
       expectEquals32(i, round32((float) i));
       expectEquals32(i + 1, round32((float) i + 0.5f));
+      expectEquals32(i + 1, round32((float) i + 0.51f));
     }
     for (float f = -1.5f; f <= -1.499f; f = Math.nextAfter(f, Float.POSITIVE_INFINITY)) {
       expectEquals32(-1, round32(f));
@@ -61,8 +74,10 @@
     float[] fvals = {
       -16777215.5f,
       -16777215.0f,
-      -0.4999f,
-      0.4999f,
+      -0.49999998f,
+      -0.4999999701976776123046875f,
+      0.4999999701976776123046875f,
+      0.49999998f,
       16777215.0f,
       16777215.5f
     };
@@ -71,6 +86,8 @@
       -16777215,
       0,
       0,
+      0,
+      0,
       16777215,
       16777216
     };
@@ -98,7 +115,8 @@
     expectEquals64(-2L, round64(-1.51d));
     expectEquals64(-1L, round64(-1.2d));
     expectEquals64(-1L, round64(-1.0d));
-    expectEquals64(-1L, round64(-0.51d));
+    expectEquals64(-1L, round64(-0.5000001f));
+    expectEquals64(0L, round64(-0.5d));
     expectEquals64(0L, round64(-0.2d));
     expectEquals64(0L, round64(-0.0d));
     expectEquals64(0L, round64(+0.0d));
@@ -109,11 +127,27 @@
     expectEquals64(2L, round64(+1.5d));
     expectEquals64(9223372036854775807L, round64(Double.POSITIVE_INFINITY));
 
+    // Near minlong.
+    expectEquals64(-9223372036854775808L,
+        round64(Math.nextAfter(-9223372036854775808.0, Double.NEGATIVE_INFINITY)));
+    expectEquals64(-9223372036854775808L, round64(-9223372036854775808.0));
+    expectEquals64(-9223372036854774784L,
+        round64(Math.nextAfter(-9223372036854775809.0, Double.POSITIVE_INFINITY)));
+
+    // Near maxlong.
+    expectEquals64(9223372036854774784L,
+        round64(Math.nextAfter(9223372036854775808.0, Double.NEGATIVE_INFINITY)));
+    expectEquals64(9223372036854775807L, round64(9223372036854775808.0));
+    expectEquals64(9223372036854775807L,
+        round64(Math.nextAfter(9223372036854775808.0, Double.POSITIVE_INFINITY)));
+
     // Some others.
     for (long l = -100; l <= 100; ++l) {
       expectEquals64(l - 1, round64((double) l - 0.51d));
+      expectEquals64(l, round64((double) l - 0.5d));
+      expectEquals64(l, round64((double) l));
       expectEquals64(l + 1, round64((double) l + 0.5d));
-      expectEquals64(l + 1, round64((double) l + 0.5d));
+      expectEquals64(l + 1, round64((double) l + 0.51d));
     }
     for (double d = -1.5d; d <= -1.49999999999d; d = Math.nextAfter(d, Double.POSITIVE_INFINITY)) {
       expectEquals64(-1L, round64(d));
@@ -123,8 +157,10 @@
     double[] dvals = {
       -9007199254740991.5d,
       -9007199254740991.0d,
+      -0.49999999999999997d,
       -0.49999999999999994d,
       0.49999999999999994d,
+      0.49999999999999997d,
       9007199254740991.0d,
       9007199254740991.5d
     };
@@ -133,6 +169,8 @@
       -9007199254740991L,
       0L,
       0L,
+      0L,
+      0L,
       9007199254740991L,
       9007199254740992L
     };
commit	2c9f495e8f8c1b367662a5a7a32d216b70d3bb18	[log] [tgz]
author	Aart Bik <ajcbik@google.com>	Mon Aug 01 16:52:27 2016 -0700
committer	Aart Bik <ajcbik@google.com>	Thu Aug 04 09:53:03 2016 -0700
tree	69da2e9c6d337fe22273ae8bb2c417fb7177ce75
parent	89bd8358c0a69e8cd6456d81d88ef366e261f732 [diff]