Speedup div/rem by constants on x86 and x86_64

This is done using the algorithms in Hacker's Delight chapter 10.

Change-Id: I7bacefe10067569769ed31a1f7834f796fb41119
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 8d0ca0b..dac7221 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -16,6 +16,7 @@
 
 #include "code_generator_x86.h"
 
+#include "code_generator_utils.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "entrypoints/quick/quick_entrypoints_enum.h"
 #include "gc/accounting/card_table.h"
@@ -2222,6 +2223,134 @@
   __ addl(ESP, Immediate(2 * elem_size));
 }
 
+
+void InstructionCodeGeneratorX86::DivRemOneOrMinusOne(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+
+  LocationSummary* locations = instruction->GetLocations();
+  DCHECK(locations->InAt(1).IsConstant());
+
+  Register out_register = locations->Out().AsRegister<Register>();
+  Register input_register = locations->InAt(0).AsRegister<Register>();
+  int imm = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
+
+  DCHECK(imm == 1 || imm == -1);
+
+  if (instruction->IsRem()) {
+    __ xorl(out_register, out_register);
+  } else {
+    __ movl(out_register, input_register);
+    if (imm == -1) {
+      __ negl(out_register);
+    }
+  }
+}
+
+
+void InstructionCodeGeneratorX86::DivByPowerOfTwo(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv());
+
+  LocationSummary* locations = instruction->GetLocations();
+
+  Register out_register = locations->Out().AsRegister<Register>();
+  Register input_register = locations->InAt(0).AsRegister<Register>();
+  int imm = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
+
+  DCHECK(instruction->IsDiv() && IsPowerOfTwo(std::abs(imm)));
+  Register num = locations->GetTemp(0).AsRegister<Register>();
+
+  __ leal(num, Address(input_register, std::abs(imm) - 1));
+  __ testl(input_register, input_register);
+  __ cmovl(kGreaterEqual, num, input_register);
+  int shift = CTZ(imm);
+  __ sarl(num, Immediate(shift));
+
+  if (imm < 0) {
+    __ negl(num);
+  }
+
+  __ movl(out_register, num);
+}
+
+void InstructionCodeGeneratorX86::GenerateDivRemWithAnyConstant(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+
+  LocationSummary* locations = instruction->GetLocations();
+  int imm = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
+
+  Register eax = locations->InAt(0).AsRegister<Register>();
+  Register out = locations->Out().AsRegister<Register>();
+  Register num;
+  Register edx;
+
+  if (instruction->IsDiv()) {
+    edx = locations->GetTemp(0).AsRegister<Register>();
+    num = locations->GetTemp(1).AsRegister<Register>();
+  } else {
+    edx = locations->Out().AsRegister<Register>();
+    num = locations->GetTemp(0).AsRegister<Register>();
+  }
+
+  DCHECK_EQ(EAX, eax);
+  DCHECK_EQ(EDX, edx);
+  if (instruction->IsDiv()) {
+    DCHECK_EQ(EAX, out);
+  } else {
+    DCHECK_EQ(EDX, out);
+  }
+
+  int64_t magic;
+  int shift;
+  CalculateMagicAndShiftForDivRem(imm, false /* is_long */, &magic, &shift);
+
+  Label ndiv;
+  Label end;
+  // If numerator is 0, the result is 0, no computation needed.
+  __ testl(eax, eax);
+  __ j(kNotEqual, &ndiv);
+
+  __ xorl(out, out);
+  __ jmp(&end);
+
+  __ Bind(&ndiv);
+
+  // Save the numerator.
+  __ movl(num, eax);
+
+  // EAX = magic
+  __ movl(eax, Immediate(magic));
+
+  // EDX:EAX = magic * numerator
+  __ imull(num);
+
+  if (imm > 0 && magic < 0) {
+    // EDX += num
+    __ addl(edx, num);
+  } else if (imm < 0 && magic > 0) {
+    __ subl(edx, num);
+  }
+
+  // Shift if needed.
+  if (shift != 0) {
+    __ sarl(edx, Immediate(shift));
+  }
+
+  // EDX += 1 if EDX < 0
+  __ movl(eax, edx);
+  __ shrl(edx, Immediate(31));
+  __ addl(edx, eax);
+
+  if (instruction->IsRem()) {
+    __ movl(eax, num);
+    __ imull(edx, Immediate(imm));
+    __ subl(eax, edx);
+    __ movl(edx, eax);
+  } else {
+    __ movl(eax, edx);
+  }
+  __ Bind(&end);
+}
+
 void InstructionCodeGeneratorX86::GenerateDivRemIntegral(HBinaryOperation* instruction) {
   DCHECK(instruction->IsDiv() || instruction->IsRem());
 
@@ -2233,28 +2362,42 @@
 
   switch (instruction->GetResultType()) {
     case Primitive::kPrimInt: {
-      Register second_reg = second.AsRegister<Register>();
       DCHECK_EQ(EAX, first.AsRegister<Register>());
       DCHECK_EQ(is_div ? EAX : EDX, out.AsRegister<Register>());
 
-      SlowPathCodeX86* slow_path =
+      if (second.IsConstant()) {
+        int imm = second.GetConstant()->AsIntConstant()->GetValue();
+
+        if (imm == 0) {
+          // Do not generate anything for 0. DivZeroCheck would forbid any generated code.
+        } else if (imm == 1 || imm == -1) {
+          DivRemOneOrMinusOne(instruction);
+        } else if (is_div && IsPowerOfTwo(std::abs(imm))) {
+          DivByPowerOfTwo(instruction);
+        } else {
+          DCHECK(imm <= -2 || imm >= 2);
+          GenerateDivRemWithAnyConstant(instruction);
+        }
+      } else {
+        SlowPathCodeX86* slow_path =
           new (GetGraph()->GetArena()) DivRemMinusOneSlowPathX86(out.AsRegister<Register>(),
-                                                                 is_div);
-      codegen_->AddSlowPath(slow_path);
+              is_div);
+        codegen_->AddSlowPath(slow_path);
 
-      // 0x80000000/-1 triggers an arithmetic exception!
-      // Dividing by -1 is actually negation and -0x800000000 = 0x80000000 so
-      // it's safe to just use negl instead of more complex comparisons.
+        Register second_reg = second.AsRegister<Register>();
+        // 0x80000000/-1 triggers an arithmetic exception!
+        // Dividing by -1 is actually negation and -0x800000000 = 0x80000000 so
+        // it's safe to just use negl instead of more complex comparisons.
 
-      __ cmpl(second_reg, Immediate(-1));
-      __ j(kEqual, slow_path->GetEntryLabel());
+        __ cmpl(second_reg, Immediate(-1));
+        __ j(kEqual, slow_path->GetEntryLabel());
 
-      // edx:eax <- sign-extended of eax
-      __ cdq();
-      // eax = quotient, edx = remainder
-      __ idivl(second_reg);
-
-      __ Bind(slow_path->GetExitLabel());
+        // edx:eax <- sign-extended of eax
+        __ cdq();
+        // eax = quotient, edx = remainder
+        __ idivl(second_reg);
+        __ Bind(slow_path->GetExitLabel());
+      }
       break;
     }
 
@@ -2294,10 +2437,16 @@
   switch (div->GetResultType()) {
     case Primitive::kPrimInt: {
       locations->SetInAt(0, Location::RegisterLocation(EAX));
-      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RegisterOrConstant(div->InputAt(1)));
       locations->SetOut(Location::SameAsFirstInput());
       // Intel uses edx:eax as the dividend.
       locations->AddTemp(Location::RegisterLocation(EDX));
+      // We need to save the numerator while we tweak eax and edx. As we are using imul in a way
+      // which enforces results to be in EAX and EDX, things are simpler if we use EAX also as
+      // output and request another temp.
+      if (div->InputAt(1)->IsConstant()) {
+        locations->AddTemp(Location::RequiresRegister());
+      }
       break;
     }
     case Primitive::kPrimLong: {
@@ -2355,6 +2504,7 @@
 
 void LocationsBuilderX86::VisitRem(HRem* rem) {
   Primitive::Type type = rem->GetResultType();
+
   LocationSummary::CallKind call_kind = (rem->GetResultType() == Primitive::kPrimLong)
       ? LocationSummary::kCall
       : LocationSummary::kNoCall;
@@ -2363,8 +2513,14 @@
   switch (type) {
     case Primitive::kPrimInt: {
       locations->SetInAt(0, Location::RegisterLocation(EAX));
-      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RegisterOrConstant(rem->InputAt(1)));
       locations->SetOut(Location::RegisterLocation(EDX));
+      // We need to save the numerator while we tweak eax and edx. As we are using imul in a way
+      // which enforces results to be in EAX and EDX, things are simpler if we use EDX also as
+      // output and request another temp.
+      if (rem->InputAt(1)->IsConstant()) {
+        locations->AddTemp(Location::RequiresRegister());
+      }
       break;
     }
     case Primitive::kPrimLong: {