Compiler changes for bitstring-based type checks.

We guard the use of this feature with a compile-time flag,
set to true in this CL.

Boot image size for aosp_taimen-userdebug in AOSP master:
  - before:
    arm boot*.oat: 63604740
    arm64 boot*.oat: 74237864
  - after:
    arm boot*.oat: 63531172 (-72KiB, -0.1%)
    arm64 boot*.oat: 74135008 (-100KiB, -0.1%)

The new TypeCheckBenchmark yields the following changes
using the little cores of taimen fixed at 1.4016GHz:
                               32-bit        64-bit
  timeCheckCastLevel1ToLevel1  11.48->15.80 11.47->15.78
  timeCheckCastLevel2ToLevel1  15.08->15.79 15.08->15.79
  timeCheckCastLevel3ToLevel1  19.01->15.82 17.94->15.81
  timeCheckCastLevel9ToLevel1  42.55->15.79 42.63->15.81
  timeCheckCastLevel9ToLevel2  39.70->14.36 39.70->14.35
  timeInstanceOfLevel1ToLevel1 13.74->17.93 13.76->17.95
  timeInstanceOfLevel2ToLevel1 17.02->17.95 16.99->17.93
  timeInstanceOfLevel3ToLevel1 24.03->17.95 24.45->17.95
  timeInstanceOfLevel9ToLevel1 47.13->17.95 47.14->18.00
  timeInstanceOfLevel9ToLevel2 44.19->16.52 44.27->16.51
This suggests that the bitstring type check should not be
used for exact type checks, which are equivalent to the
"Level1ToLevel1" benchmark. Whether the implementation is,
on average, a beneficial replacement for kClassHierarchyCheck
and kAbstractClassCheck depends on how many levels a typical
object's class is from the target class (or from Object, for
a negative result).

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing --jit
Test: testrunner.py --host -t 670-bitstring-type-check
Test: Pixel 2 XL boots.
Test: testrunner.py --target --optimizing --jit
Test: testrunner.py --target -t 670-bitstring-type-check
Bug: 64692057
Bug: 71853552
Bug: 26687569
Change-Id: I538d7e036b5a8ae2cc3fe77662a5903d74854562
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 13bbffa..b47a5cf 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -2112,6 +2112,26 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+void InstructionCodeGeneratorARM64::GenerateBitstringTypeCheckCompare(
+    HTypeCheckInstruction* check, vixl::aarch64::Register temp) {
+  uint32_t path_to_root = check->GetBitstringPathToRoot();
+  uint32_t mask = check->GetBitstringMask();
+  DCHECK(IsPowerOfTwo(mask + 1));  // I.e. `mask` is a contiguous run of low bits (2^k - 1).
+  size_t mask_bits = WhichPowerOf2(mask + 1);
+
+  if (mask_bits == 16u) {
+    // 16-bit mask: a half-word load from the status field yields the bitstring bits directly.
+    __ Ldrh(temp, HeapOperand(temp, mirror::Class::StatusOffset()));
+  } else {
+    // /* uint32_t */ temp = temp->status_ (on entry `temp` is used as the class pointer).
+    __ Ldr(temp, HeapOperand(temp, mirror::Class::StatusOffset()));
+    // Keep only the low `mask_bits` bits of the status word, i.e. the bitstring bits.
+    __ Ubfx(temp, temp, 0, mask_bits);
+  }
+  // Compare the bitstring bits to `path_to_root`; the callers then test the eq/ne condition.
+  __ Cmp(temp, path_to_root);
+}
+
 void CodeGeneratorARM64::GenerateMemoryBarrier(MemBarrierKind kind) {
   BarrierType type = BarrierAll;
 
@@ -3840,6 +3860,8 @@
     case TypeCheckKind::kInterfaceCheck:
       call_kind = LocationSummary::kCallOnSlowPath;
       break;
+    case TypeCheckKind::kBitstringCheck:
+      break;
   }
 
   LocationSummary* locations =
@@ -3848,7 +3870,13 @@
     locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
   }
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
+  if (type_check_kind == TypeCheckKind::kBitstringCheck) {
+    locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant()));
+    locations->SetInAt(2, Location::ConstantLocation(instruction->InputAt(2)->AsConstant()));
+    locations->SetInAt(3, Location::ConstantLocation(instruction->InputAt(3)->AsConstant()));
+  } else {
+    locations->SetInAt(1, Location::RequiresRegister());
+  }
   // The "out" register is used as a temporary, so it overlaps with the inputs.
   // Note that TypeCheckSlowPathARM64 uses this register too.
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
@@ -3861,7 +3889,9 @@
   LocationSummary* locations = instruction->GetLocations();
   Location obj_loc = locations->InAt(0);
   Register obj = InputRegisterAt(instruction, 0);
-  Register cls = InputRegisterAt(instruction, 1);
+  Register cls = (type_check_kind == TypeCheckKind::kBitstringCheck)
+      ? Register()
+      : InputRegisterAt(instruction, 1);
   Location out_loc = locations->Out();
   Register out = OutputRegister(instruction);
   const size_t num_temps = NumberOfInstanceOfTemps(type_check_kind);
@@ -4047,6 +4077,23 @@
       }
       break;
     }
+
+    case TypeCheckKind::kBitstringCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        out_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp_loc,
+                                        kWithoutReadBarrier);
+
+      GenerateBitstringTypeCheckCompare(instruction, out);
+      __ Cset(out, eq);
+      if (zero.IsLinked()) {
+        __ B(&done);
+      }
+      break;
+    }
   }
 
   if (zero.IsLinked()) {
@@ -4069,7 +4116,13 @@
   LocationSummary* locations =
       new (GetGraph()->GetAllocator()) LocationSummary(instruction, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
+  if (type_check_kind == TypeCheckKind::kBitstringCheck) {
+    locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant()));
+    locations->SetInAt(2, Location::ConstantLocation(instruction->InputAt(2)->AsConstant()));
+    locations->SetInAt(3, Location::ConstantLocation(instruction->InputAt(3)->AsConstant()));
+  } else {
+    locations->SetInAt(1, Location::RequiresRegister());
+  }
   // Add temps for read barriers and other uses. One is used by TypeCheckSlowPathARM64.
   locations->AddRegisterTemps(NumberOfCheckCastTemps(type_check_kind));
 }
@@ -4079,7 +4132,9 @@
   LocationSummary* locations = instruction->GetLocations();
   Location obj_loc = locations->InAt(0);
   Register obj = InputRegisterAt(instruction, 0);
-  Register cls = InputRegisterAt(instruction, 1);
+  Register cls = (type_check_kind == TypeCheckKind::kBitstringCheck)
+      ? Register()
+      : InputRegisterAt(instruction, 1);
   const size_t num_temps = NumberOfCheckCastTemps(type_check_kind);
   DCHECK_GE(num_temps, 1u);
   DCHECK_LE(num_temps, 3u);
@@ -4260,6 +4315,20 @@
       __ B(ne, &start_loop);
       break;
     }
+
+    case TypeCheckKind::kBitstringCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp2_loc,
+                                        kWithoutReadBarrier);
+
+      GenerateBitstringTypeCheckCompare(instruction, temp);
+      __ B(ne, type_check_slow_path->GetEntryLabel());
+      break;
+    }
   }
   __ Bind(&done);