Merge "ART: Expose vreg and visitor in RootInfo"
diff --git a/build/Android.common_test.mk b/build/Android.common_test.mk
index d2e3371..291db8b 100644
--- a/build/Android.common_test.mk
+++ b/build/Android.common_test.mk
@@ -124,12 +124,17 @@
 ART_TEST_RUN_TEST_MULTI_IMAGE ?= $(ART_TEST_FULL)
 
 # Define the command run on test failure. $(1) is the name of the test. Executed by the shell.
+# If the test was a top-level make target (e.g. `test-art-host-gtest-codegen_test64`), the command
+# fails with exit status 1 (returned by the last `grep` statement below).
+# Otherwise (e.g., if the test was run as a prerequisite of a compound test command, such as
+# `test-art-host-gtest-codegen_test`), the command does not fail, as that would break rules running
+# ART_TEST_PREREQ_FINISHED as one of their actions, which expect *all* prerequisites *not* to fail.
 define ART_TEST_FAILED
   ( [ -f $(ART_HOST_TEST_DIR)/skipped/$(1) ] || \
     (mkdir -p $(ART_HOST_TEST_DIR)/failed/ && touch $(ART_HOST_TEST_DIR)/failed/$(1) && \
       echo $(ART_TEST_KNOWN_FAILING) | grep -q $(1) \
         && (echo -e "$(1) \e[91mKNOWN FAILURE\e[0m") \
-        || (echo -e "$(1) \e[91mFAILED\e[0m" >&2 )))
+        || (echo -e "$(1) \e[91mFAILED\e[0m" >&2; echo $(MAKECMDGOALS) | grep -q -v $(1))))
 endef
 
 ifeq ($(ART_TEST_QUIET),true)
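As a hedged illustration of the exit-status logic described in the comment above: the final
`echo $(MAKECMDGOALS) | grep -q -v $(1)` exits non-zero exactly when the test name appears among
the top-level make goals, so only directly requested tests fail the recipe. A small C++ model of
that decision (names are illustrative, not part of the build):

    #include <string>

    // Models the exit status of the last `grep` in ART_TEST_FAILED.
    // `goals` stands in for $(MAKECMDGOALS), `test` for $(1).
    int FailureExitStatus(const std::string& goals, const std::string& test) {
      // `grep -q -v <test>` over the single echoed line exits 0 iff the line
      // does NOT contain the test name, i.e. the test was not a top-level goal.
      return goals.find(test) == std::string::npos ? 0 : 1;
    }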
diff --git a/compiler/Android.bp b/compiler/Android.bp
index f1bf27e..6edb639 100644
--- a/compiler/Android.bp
+++ b/compiler/Android.bp
@@ -42,6 +42,7 @@
         "linker/vector_output_stream.cc",
         "linker/relative_patcher.cc",
         "jit/jit_compiler.cc",
+        "jit/jit_logger.cc",
         "jni/quick/calling_convention.cc",
         "jni/quick/jni_compiler.cc",
         "optimizing/block_builder.cc",
@@ -418,6 +419,7 @@
         },
         mips: {
             srcs: [
+                "optimizing/emit_swap_mips_test.cc",
                 "utils/mips/assembler_mips_test.cc",
                 "utils/mips/assembler_mips32r6_test.cc",
             ],
diff --git a/compiler/jit/jit_compiler.cc b/compiler/jit/jit_compiler.cc
index f83d37c..9dfb434 100644
--- a/compiler/jit/jit_compiler.cc
+++ b/compiler/jit/jit_compiler.cc
@@ -171,19 +171,10 @@
 
   size_t thread_count = compiler_driver_->GetThreadCount();
   if (compiler_options_->GetGenerateDebugInfo()) {
-#ifdef ART_TARGET_ANDROID
-    const char* prefix = "/data/misc/trace";
-#else
-    const char* prefix = "/tmp";
-#endif
     DCHECK_EQ(thread_count, 1u)
         << "Generating debug info only works with one compiler thread";
-    std::string perf_filename = std::string(prefix) + "/perf-" + std::to_string(getpid()) + ".map";
-    perf_file_.reset(OS::CreateEmptyFileWriteOnly(perf_filename.c_str()));
-    if (perf_file_ == nullptr) {
-      LOG(ERROR) << "Could not create perf file at " << perf_filename <<
-                    " Are you on a user build? Perf only works on userdebug/eng builds";
-    }
+    jit_logger_.reset(new JitLogger());
+    jit_logger_->OpenLog();
   }
 
   size_t inline_depth_limit = compiler_driver_->GetCompilerOptions().GetInlineDepthLimit();
@@ -192,9 +183,8 @@
 }
 
 JitCompiler::~JitCompiler() {
-  if (perf_file_ != nullptr) {
-    UNUSED(perf_file_->Flush());
-    UNUSED(perf_file_->Close());
+  if (compiler_options_->GetGenerateDebugInfo()) {
+    jit_logger_->CloseLog();
   }
 }
 
@@ -218,19 +208,8 @@
     TimingLogger::ScopedTiming t2("Compiling", &logger);
     JitCodeCache* const code_cache = runtime->GetJit()->GetCodeCache();
     success = compiler_driver_->GetCompiler()->JitCompile(self, code_cache, method, osr);
-    if (success && (perf_file_ != nullptr)) {
-      const void* ptr = method->GetEntryPointFromQuickCompiledCode();
-      std::ostringstream stream;
-      stream << std::hex
-             << reinterpret_cast<uintptr_t>(ptr)
-             << " "
-             << code_cache->GetMemorySizeOfCodePointer(ptr)
-             << " "
-             << method->PrettyMethod()
-             << std::endl;
-      std::string str = stream.str();
-      bool res = perf_file_->WriteFully(str.c_str(), str.size());
-      CHECK(res);
+    if (success && (jit_logger_ != nullptr)) {
+      jit_logger_->WriteLog(code_cache, method);
     }
   }
 
diff --git a/compiler/jit/jit_compiler.h b/compiler/jit/jit_compiler.h
index ea2747c..f0f24d3 100644
--- a/compiler/jit/jit_compiler.h
+++ b/compiler/jit/jit_compiler.h
@@ -19,6 +19,7 @@
 
 #include "base/mutex.h"
 #include "compiled_method.h"
+#include "jit_logger.h"
 #include "driver/compiler_driver.h"
 #include "driver/compiler_options.h"
 
@@ -50,7 +51,7 @@
   std::unique_ptr<CumulativeLogger> cumulative_logger_;
   std::unique_ptr<CompilerDriver> compiler_driver_;
   std::unique_ptr<const InstructionSetFeatures> instruction_set_features_;
-  std::unique_ptr<File> perf_file_;
+  std::unique_ptr<JitLogger> jit_logger_;
 
   JitCompiler();
 
diff --git a/compiler/jit/jit_logger.cc b/compiler/jit/jit_logger.cc
new file mode 100644
index 0000000..9ce3b0c
--- /dev/null
+++ b/compiler/jit/jit_logger.cc
@@ -0,0 +1,312 @@
+/*
+ * Copyright 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "jit_logger.h"
+
+#include "arch/instruction_set.h"
+#include "art_method-inl.h"
+#include "base/time_utils.h"
+#include "base/unix_file/fd_file.h"
+#include "driver/compiler_driver.h"
+#include "jit/jit.h"
+#include "jit/jit_code_cache.h"
+
+namespace art {
+namespace jit {
+
+#ifdef ART_TARGET_ANDROID
+static const char* kLogPrefix = "/data/misc/trace";
+#else
+static const char* kLogPrefix = "/tmp";
+#endif
+
+// File format of perf-PID.map:
+// +---------------------+
+// |ADDR SIZE symbolname1|
+// |ADDR SIZE symbolname2|
+// |...                  |
+// +---------------------+
+void JitLogger::OpenPerfMapLog() {
+  std::string pid_str = std::to_string(getpid());
+  std::string perf_filename = std::string(kLogPrefix) + "/perf-" + pid_str + ".map";
+  perf_file_.reset(OS::CreateEmptyFileWriteOnly(perf_filename.c_str()));
+  if (perf_file_ == nullptr) {
+    LOG(ERROR) << "Could not create perf file at " << perf_filename <<
+      " Are you on a user build? Perf only works on userdebug/eng builds";
+  }
+}
+
+void JitLogger::WritePerfMapLog(JitCodeCache* code_cache, ArtMethod* method) {
+  if (perf_file_ != nullptr) {
+    const void* ptr = method->GetEntryPointFromQuickCompiledCode();
+    size_t code_size = code_cache->GetMemorySizeOfCodePointer(ptr);
+    std::string method_name = method->PrettyMethod();
+
+    std::ostringstream stream;
+    stream << std::hex
+           << reinterpret_cast<uintptr_t>(ptr)
+           << " "
+           << code_size
+           << " "
+           << method_name
+           << std::endl;
+    std::string str = stream.str();
+    bool res = perf_file_->WriteFully(str.c_str(), str.size());
+    if (!res) {
+      LOG(WARNING) << "Failed to write jitted method info in log: write failure.";
+    }
+  } else {
+    LOG(WARNING) << "Failed to write jitted method info in log: log file doesn't exist.";
+  }
+}
+
+void JitLogger::ClosePerfMapLog() {
+  if (perf_file_ != nullptr) {
+    UNUSED(perf_file_->Flush());
+    UNUSED(perf_file_->Close());
+  }
+}
+
+//  File format of jit-PID.dump:
+//
+//  +--------------------------------+
+//  |  PerfJitHeader                 |
+//  +--------------------------------+
+//  |  PerfJitCodeLoad {             | .
+//  |    struct PerfJitBase;         |  .
+//  |    uint32_t process_id_;       |   .
+//  |    uint32_t thread_id_;        |   .
+//  |    uint64_t vma_;              |   .
+//  |    uint64_t code_address_;     |   .
+//  |    uint64_t code_size_;        |   .
+//  |    uint64_t code_id_;          |   .
+//  |  }                             |   .
+//  +-                              -+   .
+//  |  method_name'\0'               |   +--> one jitted method
+//  +-                              -+   .
+//  |  jitted code binary            |   .
+//  |  ...                           |   .
+//  +--------------------------------+   .
+//  |  PerfJitCodeDebugInfo     {    |   .
+//  |    struct PerfJitBase;         |   .
+//  |    uint64_t address_;          |   .
+//  |    uint64_t entry_count_;      |   .
+//  |    struct PerfJitDebugEntry;   |  .
+//  |  }                             | .
+//  +--------------------------------+
+//  |  PerfJitCodeLoad               |
+//     ...
+//
+struct PerfJitHeader {
+  uint32_t magic_;            // Characters "JiTD"
+  uint32_t version_;          // Header version
+  uint32_t size_;             // Total size of header
+  uint32_t elf_mach_target_;  // Elf mach target
+  uint32_t reserved_;         // Reserved, currently not used
+  uint32_t process_id_;       // Process ID of the JIT compiler
+  uint64_t time_stamp_;       // Timestamp when the header is generated
+  uint64_t flags_;            // Currently the flags are only used for choosing the clock for
+                              // timestamps; 0 tells perf that we use the CLOCK_MONOTONIC clock.
+  static const uint32_t kMagic = 0x4A695444;  // "JiTD"
+  static const uint32_t kVersion = 1;
+};
+
+// Each record starts with the following basic information: event type, total size, and timestamp.
+struct PerfJitBase {
+  enum PerfJitEvent {
+    // A jitted code load event.
+    // In ART JIT, it logs that a new method has been JIT-compiled and committed to the
+    // jit-code-cache.
+    // Note that kLoad events also support code cache GC in ART JIT:
+    // every kLoad event recorded in jit-PID.dump and every perf sample recorded in perf.data
+    // carries a timestamp. If a code cache GC happens in ART JIT and a newly jitted method is
+    // committed to the same address as a previously deleted method, the timestamps let the
+    // profiler tell whether a sample belongs to the first jitted method or to the second.
+    // JitCodeCache doesn't have to record any event on 'code delete'.
+    kLoad = 0,
+
+    // A jitted code move event, i.e. jitted code moved from one address to another.
+    // It helps the profiler map samples to the right symbol even after the code is moved.
+    // In ART JIT, this event can help log such behavior:
+    // A jitted method is recorded in previous kLoad event, but due to some reason,
+    // it is moved to another address in jit-code-cache.
+    kMove = 1,
+
+    // Logs debug line/column information.
+    kDebugInfo = 2,
+
+    // Logs JIT VM end of life event.
+    kClose = 3
+  };
+  uint32_t event_;       // Must be one of the events defined in PerfJitEvent.
+  uint32_t size_;        // Total size of this event record.
+                         // For example, for kLoad event, size of the event record is:
+                         // sizeof(PerfJitCodeLoad) + method_name.size() + compiled code size.
+  uint64_t time_stamp_;  // Timestamp for the event.
+};
+
+// Logs a jitted code load event (kLoad).
+// In ART JIT, it logs that a new method is JIT-compiled and committed to the jit-code-cache.
+struct PerfJitCodeLoad : PerfJitBase {
+  uint32_t process_id_;    // ID of the process that performs the jit code load.
+                           // In ART JIT, it is the pid of the JIT compiler.
+  uint32_t thread_id_;     // ID of the thread that performs the jit code load.
+                           // In ART JIT, it is the tid of the JIT compiler.
+  uint64_t vma_;           // Address of the code section. In ART JIT, because code_address_
+                           // uses an absolute address, this field is 0.
+  uint64_t code_address_;  // Address where the jitted code is loaded.
+  uint64_t code_size_;     // Size of the jitted code.
+  uint64_t code_id_;       // Unique ID for each jitted code.
+};
+
+// This structure is for source line/column mapping.
+// This feature is not yet implemented in ART JIT.
+struct PerfJitDebugEntry {
+  uint64_t address_;      // Code address which maps to the line/column in source.
+  uint32_t line_number_;  // Source line number starting at 1.
+  uint32_t column_;       // Column discriminator, default 0.
+  const char name_[0];    // Followed by null-terminated name or \0xff\0 if same as previous.
+};
+
+// Logs debug line information (kDebugInfo).
+// This structure is for source line/column mapping.
+// This feature is not yet implemented in ART JIT.
+struct PerfJitCodeDebugInfo : PerfJitBase {
+  uint64_t address_;              // Starting code address which the debug info describes.
+  uint64_t entry_count_;          // How many instances of PerfJitDebugEntry.
+  PerfJitDebugEntry entries_[0];  // Followed by entry_count_ instances of PerfJitDebugEntry.
+};
+
+static uint32_t GetElfMach() {
+#if defined(__arm__)
+  static const uint32_t kElfMachARM = 0x28;
+  return kElfMachARM;
+#elif defined(__aarch64__)
+  static const uint32_t kElfMachARM64 = 0xB7;
+  return kElfMachARM64;
+#elif defined(__i386__)
+  static const uint32_t kElfMachIA32 = 0x3;
+  return kElfMachIA32;
+#elif defined(__x86_64__)
+  static const uint32_t kElfMachX64 = 0x3E;
+  return kElfMachX64;
+#else
+  UNIMPLEMENTED(WARNING) << "Unsupported architecture in JitLogger";
+  return 0;
+#endif
+}
+
+void JitLogger::OpenMarkerFile() {
+  int fd = jit_dump_file_->Fd();
+  // The 'perf inject' tool requires that the jit-PID.dump file
+  // have an mmap(PROT_READ|PROT_EXEC) record in perf.data.
+  marker_address_ = mmap(nullptr, kPageSize, PROT_READ | PROT_EXEC, MAP_PRIVATE, fd, 0);
+  if (marker_address_ == MAP_FAILED) {
+    LOG(WARNING) << "Failed to create record in perf.data. JITed code profiling will not work.";
+    return;
+  }
+}
+
+void JitLogger::CloseMarkerFile() {
+  if (marker_address_ != nullptr) {
+    munmap(marker_address_, kPageSize);
+  }
+}
+
+void JitLogger::WriteJitDumpDebugInfo() {
+  // In the future, we can add Java source file line/column mapping here.
+}
+
+void JitLogger::WriteJitDumpHeader() {
+  PerfJitHeader header;
+
+  std::memset(&header, 0, sizeof(header));
+  header.magic_ = PerfJitHeader::kMagic;
+  header.version_ = PerfJitHeader::kVersion;
+  header.size_ = sizeof(header);
+  header.elf_mach_target_ = GetElfMach();
+  header.process_id_ = static_cast<uint32_t>(getpid());
+  header.time_stamp_ = art::NanoTime();  // CLOCK_MONOTONIC clock is required.
+  header.flags_ = 0;
+
+  bool res = jit_dump_file_->WriteFully(reinterpret_cast<const char*>(&header), sizeof(header));
+  if (!res) {
+    LOG(WARNING) << "Failed to write profiling log. The 'perf inject' tool will not work.";
+  }
+}
+
+void JitLogger::OpenJitDumpLog() {
+  std::string pid_str = std::to_string(getpid());
+  std::string jitdump_filename = std::string(kLogPrefix) + "/jit-" + pid_str + ".dump";
+
+  jit_dump_file_.reset(OS::CreateEmptyFile(jitdump_filename.c_str()));
+  if (jit_dump_file_ == nullptr) {
+    LOG(ERROR) << "Could not create jit dump file at " << jitdump_filename <<
+      " Are you on a user build? Perf only works on userdebug/eng builds";
+    return;
+  }
+
+  OpenMarkerFile();
+
+  // Continue to write the jit-PID.dump file even if OpenMarkerFile() above fails.
+  // Even if that means the 'perf inject' tool cannot work, developers can still use other
+  // tools to map the samples in perf.data to the information (symbol, address, code)
+  // recorded in the jit-PID.dump file and proceed with the jitted code analysis.
+  WriteJitDumpHeader();
+}
+
+void JitLogger::WriteJitDumpLog(JitCodeCache* code_cache, ArtMethod* method) {
+  if (jit_dump_file_ != nullptr) {
+    const void* code = method->GetEntryPointFromQuickCompiledCode();
+    size_t code_size = code_cache->GetMemorySizeOfCodePointer(code);
+    std::string method_name = method->PrettyMethod();
+
+    PerfJitCodeLoad jit_code;
+    std::memset(&jit_code, 0, sizeof(jit_code));
+    jit_code.event_ = PerfJitCodeLoad::kLoad;
+    jit_code.size_ = sizeof(jit_code) + method_name.size() + 1 + code_size;
+    jit_code.time_stamp_ = art::NanoTime();    // CLOCK_MONOTONIC clock is required.
+    jit_code.process_id_ = static_cast<uint32_t>(getpid());
+    jit_code.thread_id_ = static_cast<uint32_t>(art::GetTid());
+    jit_code.vma_ = 0x0;
+    jit_code.code_address_ = reinterpret_cast<uint64_t>(code);
+    jit_code.code_size_ = code_size;
+    jit_code.code_id_ = code_index_++;
+
+    // Write one complete jitted method info, including:
+    // - PerfJitCodeLoad structure
+    // - Method name
+    // - Complete generated code of this method
+    //
+    // Use UNUSED() here to avoid compiler warnings.
+    UNUSED(jit_dump_file_->WriteFully(reinterpret_cast<const char*>(&jit_code), sizeof(jit_code)));
+    UNUSED(jit_dump_file_->WriteFully(method_name.c_str(), method_name.size() + 1));
+    UNUSED(jit_dump_file_->WriteFully(code, code_size));
+
+    WriteJitDumpDebugInfo();
+  }
+}
+
+void JitLogger::CloseJitDumpLog() {
+  if (jit_dump_file_ != nullptr) {
+    CloseMarkerFile();
+    UNUSED(jit_dump_file_->Flush());
+    UNUSED(jit_dump_file_->Close());
+  }
+}
+
+}  // namespace jit
+}  // namespace art
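For readers consuming the files written above: perf-PID.map is plain text, one
`hex_address size name` line per method, while jit-PID.dump starts with a PerfJitHeader. A
minimal, hedged C++ sketch of how a tool might sanity-check the dump header (the struct mirrors
PerfJitHeader; `LooksLikeJitDump` and the path handling are illustrative):

    #include <cstdint>
    #include <cstdio>

    // Mirrors the PerfJitHeader layout above. A real reader should also check
    // elf_mach_target_ against its own target and handle endianness.
    struct JitDumpHeader {
      uint32_t magic_, version_, size_, elf_mach_target_, reserved_, process_id_;
      uint64_t time_stamp_, flags_;
    };

    bool LooksLikeJitDump(const char* path) {
      FILE* f = fopen(path, "rb");
      if (f == nullptr) return false;
      JitDumpHeader h;
      bool ok = fread(&h, sizeof(h), 1, f) == 1 &&
                h.magic_ == 0x4A695444u &&  // "JiTD" (PerfJitHeader::kMagic)
                h.version_ == 1u;           // PerfJitHeader::kVersion
      fclose(f);
      return ok;
    }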
diff --git a/compiler/jit/jit_logger.h b/compiler/jit/jit_logger.h
new file mode 100644
index 0000000..0f8cfe4
--- /dev/null
+++ b/compiler/jit/jit_logger.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_JIT_JIT_LOGGER_H_
+#define ART_COMPILER_JIT_JIT_LOGGER_H_
+
+#include "base/mutex.h"
+#include "compiled_method.h"
+#include "driver/compiler_driver.h"
+#include "driver/compiler_options.h"
+
+namespace art {
+
+class ArtMethod;
+
+namespace jit {
+
+//
+// JitLogger supports two approaches to perf profiling.
+//
+// (1) perf-map:
+//     The perf-map mechanism generates perf-PID.map file,
+//     which provides simple "address, size, method_name" information to perf,
+//     and allows perf to map samples in jit-code-cache to jitted method symbols.
+//
+//     Command line Example:
+//       $ perf record dalvikvm -Xcompiler-option --generate-debug-info -cp <classpath> Test
+//       $ perf report
+//     NOTE:
+//       - Make sure that the perf-PID.map file is available for the 'perf report' tool to access,
+//         so that jitted methods can be displayed.
+//
+//
+// (2) perf-inject:
+//     The perf-inject mechanism generates a jit-PID.dump file,
+//     which provides rich information about jitted methods.
+//     It allows perf or other profiling tools to do advanced analysis on jitted code,
+//     for example instruction-level profiling.
+//
+//     Command line Example:
+//       $ perf record -k mono dalvikvm -Xcompiler-option --generate-debug-info -cp <classpath> Test
+//       $ perf inject -i perf.data -o perf.data.jitted
+//       $ perf report -i perf.data.jitted
+//       $ perf annotate -i perf.data.jitted
+//     NOTE:
+//       REQUIREMENTS
+//       - The 'perf record -k mono' option requires a 4.1 (or higher) Linux kernel.
+//       - The 'perf inject' feature (generating JIT ELF files) requires perf 4.6 (or higher).
+//       PERF RECORD
+//       - The '-k mono' option tells 'perf record' to use the CLOCK_MONOTONIC clock during
+//         sampling, which is required by 'perf inject' to make sure that both perf.data and
+//         jit-PID.dump have a unified clock source for timestamps.
+//       PERF INJECT
+//       - The 'perf inject' tool injects information from jit-PID.dump into the perf.data file
+//         and generates small ELF files (jitted-TID-CODEID.so) for each jitted method.
+//       - On Android devices, the jit-PID.dump file is generated in the /data/misc/trace/ folder,
+//         and that location is recorded in the perf.data file.
+//         The 'perf inject' tool will look for jit-PID.dump and generate the small ELF files in
+//         this /data/misc/trace/ folder.
+//         Make sure that you have read/write access to the /data/misc/trace/ folder.
+//       - On non-Android devices, the jit-PID.dump file is generated in the /tmp/ folder, and
+//         the 'perf inject' tool operates on that folder.
+//         Make sure that you have read/write access to the /tmp/ folder.
+//       - If you are executing 'perf inject' on non-Android devices (host), but the perf.data and
+//         jit-PID.dump files are adb-pulled from Android devices, make sure that there is a
+//         /data/misc/trace/ folder on the host and that the jit-PID.dump file is copied into it.
+//       - Currently 'perf inject' doesn't provide an option to change the path for jit-PID.dump
+//         and the generated ELF files.
+//       PERF ANNOTATE
+//       - The 'perf annotate' tool displays assembly level profiling report.
+//         Source code can also be displayed if the ELF file has debug symbols.
+//       - Make sure the above small ELF files are available for the 'perf annotate' tool to
+//         access, so that jitted code can be displayed in the assembly view.
+//
+class JitLogger {
+  public:
+    JitLogger() : code_index_(0), marker_address_(nullptr) {}
+
+    void OpenLog() {
+      OpenPerfMapLog();
+      OpenJitDumpLog();
+    }
+
+    void WriteLog(JitCodeCache* code_cache, ArtMethod* method)
+        REQUIRES_SHARED(Locks::mutator_lock_) {
+      WritePerfMapLog(code_cache, method);
+      WriteJitDumpLog(code_cache, method);
+    }
+
+    void CloseLog() {
+      ClosePerfMapLog();
+      CloseJitDumpLog();
+    }
+
+  private:
+    // For perf-map profiling
+    void OpenPerfMapLog();
+    void WritePerfMapLog(JitCodeCache* code_cache, ArtMethod* method)
+        REQUIRES_SHARED(Locks::mutator_lock_);
+    void ClosePerfMapLog();
+
+    // For perf-inject profiling
+    void OpenJitDumpLog();
+    void WriteJitDumpLog(JitCodeCache* code_cache, ArtMethod* method)
+        REQUIRES_SHARED(Locks::mutator_lock_);
+    void CloseJitDumpLog();
+
+    void OpenMarkerFile();
+    void CloseMarkerFile();
+    void WriteJitDumpHeader();
+    void WriteJitDumpDebugInfo();
+
+    std::unique_ptr<File> perf_file_;
+    std::unique_ptr<File> jit_dump_file_;
+    uint64_t code_index_;
+    void* marker_address_;
+
+    DISALLOW_COPY_AND_ASSIGN(JitLogger);
+};
+
+}  // namespace jit
+}  // namespace art
+
+#endif  // ART_COMPILER_JIT_JIT_LOGGER_H_
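The class is driven by jit_compiler.cc above through a simple open/write/close lifecycle. A
hedged usage sketch (assumes `code_cache` and `method` are in scope, as in
JitCompiler::CompileMethod):

    // Created only when --generate-debug-info is set (see jit_compiler.cc).
    std::unique_ptr<jit::JitLogger> jit_logger(new jit::JitLogger());
    jit_logger->OpenLog();                     // opens perf-PID.map and jit-PID.dump
    // ... after each successful JIT compilation:
    jit_logger->WriteLog(code_cache, method);  // appends to both logs
    // ... at JIT shutdown:
    jit_logger->CloseLog();                    // flushes and closes both files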
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index a81f24e..bf246ad 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -30,6 +30,7 @@
 #include "memory_region.h"
 #include "nodes.h"
 #include "optimizing_compiler_stats.h"
+#include "read_barrier_option.h"
 #include "stack_map_stream.h"
 #include "utils/label.h"
 
@@ -50,6 +51,9 @@
 // Maximum value for a primitive long.
 static int64_t constexpr kPrimLongMax = INT64_C(0x7fffffffffffffff);
 
+static constexpr ReadBarrierOption kCompilerReadBarrierOption =
+    kEmitCompilerReadBarrier ? kWithReadBarrier : kWithoutReadBarrier;
+
 class Assembler;
 class CodeGenerator;
 class CompilerDriver;
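The new kCompilerReadBarrierOption constant lets call sites that previously passed a raw
`kEmitCompilerReadBarrier` boolean state their intent as an enum instead. A self-contained,
hedged sketch of the pattern (the enum mirrors read_barrier_option.h; `OptionForLoadClass` is
illustrative):

    enum ReadBarrierOption { kWithReadBarrier, kWithoutReadBarrier };
    constexpr bool kEmitCompilerReadBarrier = true;  // illustrative configuration
    constexpr ReadBarrierOption kCompilerReadBarrierOption =
        kEmitCompilerReadBarrier ? kWithReadBarrier : kWithoutReadBarrier;

    ReadBarrierOption OptionForLoadClass(bool in_boot_image) {
      // Boot-image classes never need a read barrier, so they opt out
      // unconditionally; other loads follow the global configuration.
      return in_boot_image ? kWithoutReadBarrier : kCompilerReadBarrierOption;
    }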
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index b6d7b80..7c72d00 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -620,8 +620,10 @@
 // reference (different from `ref`) in `obj.field`).
 class ReadBarrierMarkSlowPathARM : public SlowPathCodeARM {
  public:
-  ReadBarrierMarkSlowPathARM(HInstruction* instruction, Location ref)
-      : SlowPathCodeARM(instruction), ref_(ref) {
+  ReadBarrierMarkSlowPathARM(HInstruction* instruction,
+                             Location ref,
+                             Location entrypoint = Location::NoLocation())
+      : SlowPathCodeARM(instruction), ref_(ref), entrypoint_(entrypoint) {
     DCHECK(kEmitCompilerReadBarrier);
   }
 
@@ -676,10 +678,15 @@
     //
     //   rX <- ReadBarrierMarkRegX(rX)
     //
-    int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg);
-    // This runtime call does not require a stack map.
-    arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
+    if (entrypoint_.IsValid()) {
+      arm_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this);
+      __ blx(entrypoint_.AsRegister<Register>());
+    } else {
+      int32_t entry_point_offset =
+          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg);
+      // This runtime call does not require a stack map.
+      arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
+    }
     __ b(GetExitLabel());
   }
 
@@ -687,6 +694,9 @@
   // The location (register) of the marked object reference.
   const Location ref_;
 
+  // The location of the entrypoint if already loaded.
+  const Location entrypoint_;
+
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM);
 };
 
@@ -4864,16 +4874,21 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimChar:
     case Primitive::kPrimInt: {
+      Register length;
+      if (maybe_compressed_char_at) {
+        length = locations->GetTemp(0).AsRegister<Register>();
+        uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+        __ LoadFromOffset(kLoadWord, length, obj, count_offset);
+        codegen_->MaybeRecordImplicitNullCheck(instruction);
+      }
       if (index.IsConstant()) {
         int32_t const_index = index.GetConstant()->AsIntConstant()->GetValue();
         if (maybe_compressed_char_at) {
-          Register length = IP;
           Label uncompressed_load, done;
-          uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
-          __ LoadFromOffset(kLoadWord, length, obj, count_offset);
-          codegen_->MaybeRecordImplicitNullCheck(instruction);
-          __ cmp(length, ShifterOperand(0));
-          __ b(&uncompressed_load, GE);
+          __ Lsrs(length, length, 1u);  // LSRS has a 16-bit encoding, TST (immediate) does not.
+          static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                        "Expecting 0=compressed, 1=uncompressed");
+          __ b(&uncompressed_load, CS);
           __ LoadFromOffset(kLoadUnsignedByte,
                             out_loc.AsRegister<Register>(),
                             obj,
@@ -4908,12 +4923,10 @@
         }
         if (maybe_compressed_char_at) {
           Label uncompressed_load, done;
-          uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
-          Register length = locations->GetTemp(0).AsRegister<Register>();
-          __ LoadFromOffset(kLoadWord, length, obj, count_offset);
-          codegen_->MaybeRecordImplicitNullCheck(instruction);
-          __ cmp(length, ShifterOperand(0));
-          __ b(&uncompressed_load, GE);
+          __ Lsrs(length, length, 1u);  // LSRS has a 16-bit encoding, TST (immediate) does not.
+          static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                        "Expecting 0=compressed, 1=uncompressed");
+          __ b(&uncompressed_load, CS);
           __ ldrb(out_loc.AsRegister<Register>(),
                   Address(temp, index.AsRegister<Register>(), Shift::LSL, 0));
           __ b(&done);
@@ -5318,7 +5331,7 @@
   codegen_->MaybeRecordImplicitNullCheck(instruction);
   // Mask out compression flag from String's array length.
   if (mirror::kUseStringCompression && instruction->IsStringLength()) {
-    __ bic(out, out, ShifterOperand(1u << 31));
+    __ Lsr(out, out, 1u);
   }
 }
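The hunks above switch String's length decoding to the new count encoding, where bit 0 is the
compression flag (0 = compressed, per the static_asserts elsewhere in this change) and the
length occupies the upper 31 bits, so a logical shift right by one recovers it. A hedged C++
model of the encoding (helper names are illustrative):

    #include <cstdint>

    // Bit 0 of String::count_: 0 = compressed (8-bit chars), 1 = uncompressed.
    bool IsCompressed(uint32_t count) { return (count & 1u) == 0u; }

    // Length lives in bits [31:1]; shifting right by one recovers it, which is
    // what the generated `Lsr(out, out, 1u)` above does.
    uint32_t DecodeLength(uint32_t count) { return count >> 1; }

    uint32_t EncodeCount(uint32_t length, bool compressed) {
      return (length << 1) | (compressed ? 0u : 1u);
    }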
 
@@ -5745,7 +5758,9 @@
   Location out_loc = locations->Out();
   Register out = out_loc.AsRegister<Register>();
 
-  const bool requires_read_barrier = kEmitCompilerReadBarrier && !cls->IsInBootImage();
+  const ReadBarrierOption read_barrier_option = cls->IsInBootImage()
+      ? kWithoutReadBarrier
+      : kCompilerReadBarrierOption;
   bool generate_null_check = false;
   switch (cls->GetLoadKind()) {
     case HLoadClass::LoadKind::kReferrersClass: {
@@ -5757,17 +5772,17 @@
                               out_loc,
                               current_method,
                               ArtMethod::DeclaringClassOffset().Int32Value(),
-                              requires_read_barrier);
+                              read_barrier_option);
       break;
     }
     case HLoadClass::LoadKind::kBootImageLinkTimeAddress: {
-      DCHECK(!requires_read_barrier);
+      DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
       __ LoadLiteral(out, codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(),
                                                                     cls->GetTypeIndex()));
       break;
     }
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: {
-      DCHECK(!requires_read_barrier);
+      DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
       CodeGeneratorARM::PcRelativePatchInfo* labels =
           codegen_->NewPcRelativeTypePatch(cls->GetDexFile(), cls->GetTypeIndex());
       __ BindTrackedLabel(&labels->movw_label);
@@ -5779,7 +5794,7 @@
       break;
     }
     case HLoadClass::LoadKind::kBootImageAddress: {
-      DCHECK(!requires_read_barrier);
+      DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
       DCHECK_NE(cls->GetAddress(), 0u);
       uint32_t address = dchecked_integral_cast<uint32_t>(cls->GetAddress());
       __ LoadLiteral(out, codegen_->DeduplicateBootImageAddressLiteral(address));
@@ -5799,7 +5814,7 @@
       uint32_t offset = address & MaxInt<uint32_t>(offset_bits);
       __ LoadLiteral(out, codegen_->DeduplicateDexCacheAddressLiteral(base_address));
       // /* GcRoot<mirror::Class> */ out = *(base_address + offset)
-      GenerateGcRootFieldLoad(cls, out_loc, out, offset, requires_read_barrier);
+      GenerateGcRootFieldLoad(cls, out_loc, out, offset, read_barrier_option);
       generate_null_check = !cls->IsInDexCache();
       break;
     }
@@ -5808,7 +5823,7 @@
       HArmDexCacheArraysBase* base = cls->InputAt(0)->AsArmDexCacheArraysBase();
       int32_t offset = cls->GetDexCacheElementOffset() - base->GetElementOffset();
       // /* GcRoot<mirror::Class> */ out = *(dex_cache_arrays_base + offset)
-      GenerateGcRootFieldLoad(cls, out_loc, base_reg, offset, requires_read_barrier);
+      GenerateGcRootFieldLoad(cls, out_loc, base_reg, offset, read_barrier_option);
       generate_null_check = !cls->IsInDexCache();
       break;
     }
@@ -5822,7 +5837,7 @@
                         ArtMethod::DexCacheResolvedTypesOffset(kArmPointerSize).Int32Value());
       // /* GcRoot<mirror::Class> */ out = out[type_index]
       size_t offset = CodeGenerator::GetCacheOffset(cls->GetTypeIndex());
-      GenerateGcRootFieldLoad(cls, out_loc, out, offset, requires_read_barrier);
+      GenerateGcRootFieldLoad(cls, out_loc, out, offset, read_barrier_option);
       generate_null_check = !cls->IsInDexCache();
     }
   }
@@ -5965,7 +5980,7 @@
       __ movt(temp, /* placeholder */ 0u);
       __ BindTrackedLabel(&labels->add_pc_label);
       __ add(temp, temp, ShifterOperand(PC));
-      GenerateGcRootFieldLoad(load, out_loc, temp, /* offset */ 0, kEmitCompilerReadBarrier);
+      GenerateGcRootFieldLoad(load, out_loc, temp, /* offset */ 0, kCompilerReadBarrierOption);
       SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM(load);
       codegen_->AddSlowPath(slow_path);
       __ CompareAndBranchIfZero(out, slow_path->GetEntryLabel());
@@ -6021,12 +6036,26 @@
   CheckEntrypointTypes<kQuickDeliverException, void, mirror::Object*>();
 }
 
-static bool TypeCheckNeedsATemporary(TypeCheckKind type_check_kind) {
-  return kEmitCompilerReadBarrier &&
-      (kUseBakerReadBarrier ||
-       type_check_kind == TypeCheckKind::kAbstractClassCheck ||
-       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
-       type_check_kind == TypeCheckKind::kArrayObjectCheck);
+// Temp is used for read barrier.
+static size_t NumberOfInstanceOfTemps(TypeCheckKind type_check_kind) {
+  if (kEmitCompilerReadBarrier &&
+       (kUseBakerReadBarrier ||
+          type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+          type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+          type_check_kind == TypeCheckKind::kArrayObjectCheck)) {
+    return 1;
+  }
+  return 0;
+}
+
+// The interface case has 3 temps: one to hold the number of interfaces, one for the current
+// interface pointer, and one for loading the current interface.
+// The other checks have one temp for loading the object's class.
+static size_t NumberOfCheckCastTemps(TypeCheckKind type_check_kind) {
+  if (type_check_kind == TypeCheckKind::kInterfaceCheck) {
+    return 3;
+  }
+  return 1 + NumberOfInstanceOfTemps(type_check_kind);
 }
 
 void LocationsBuilderARM::VisitInstanceOf(HInstanceOf* instruction) {
@@ -6058,11 +6087,7 @@
   // The "out" register is used as a temporary, so it overlaps with the inputs.
   // Note that TypeCheckSlowPathARM uses this register too.
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
-  // When read barriers are enabled, we need a temporary register for
-  // some cases.
-  if (TypeCheckNeedsATemporary(type_check_kind)) {
-    locations->AddTemp(Location::RequiresRegister());
-  }
+  locations->AddRegisterTemps(NumberOfInstanceOfTemps(type_check_kind));
 }
 
 void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) {
@@ -6073,9 +6098,9 @@
   Register cls = locations->InAt(1).AsRegister<Register>();
   Location out_loc = locations->Out();
   Register out = out_loc.AsRegister<Register>();
-  Location maybe_temp_loc = TypeCheckNeedsATemporary(type_check_kind) ?
-      locations->GetTemp(0) :
-      Location::NoLocation();
+  const size_t num_temps = NumberOfInstanceOfTemps(type_check_kind);
+  DCHECK_LE(num_temps, 1u);
+  Location maybe_temp_loc = (num_temps >= 1) ? locations->GetTemp(0) : Location::NoLocation();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
@@ -6090,7 +6115,12 @@
   }
 
   // /* HeapReference<Class> */ out = obj->klass_
-  GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset, maybe_temp_loc);
+  GenerateReferenceLoadTwoRegisters(instruction,
+                                    out_loc,
+                                    obj_loc,
+                                    class_offset,
+                                    maybe_temp_loc,
+                                    kCompilerReadBarrierOption);
 
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck: {
@@ -6108,7 +6138,11 @@
       Label loop;
       __ Bind(&loop);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kCompilerReadBarrierOption);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ CompareAndBranchIfZero(out, &done);
       __ cmp(out, ShifterOperand(cls));
@@ -6127,7 +6161,11 @@
       __ cmp(out, ShifterOperand(cls));
       __ b(&success, EQ);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kCompilerReadBarrierOption);
       __ CompareAndBranchIfNonZero(out, &loop);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ b(&done);
@@ -6146,7 +6184,11 @@
       __ b(&exact_check, EQ);
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ out = out->component_type_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, component_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       component_offset,
+                                       maybe_temp_loc,
+                                       kCompilerReadBarrierOption);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ CompareAndBranchIfZero(out, &done);
       __ LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset);
@@ -6242,13 +6284,7 @@
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  // Note that TypeCheckSlowPathARM uses this "temp" register too.
-  locations->AddTemp(Location::RequiresRegister());
-  // When read barriers are enabled, we need an additional temporary
-  // register for some cases.
-  if (TypeCheckNeedsATemporary(type_check_kind)) {
-    locations->AddTemp(Location::RequiresRegister());
-  }
+  locations->AddRegisterTemps(NumberOfCheckCastTemps(type_check_kind));
 }
 
 void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) {
@@ -6259,20 +6295,31 @@
   Register cls = locations->InAt(1).AsRegister<Register>();
   Location temp_loc = locations->GetTemp(0);
   Register temp = temp_loc.AsRegister<Register>();
-  Location maybe_temp2_loc = TypeCheckNeedsATemporary(type_check_kind) ?
-      locations->GetTemp(1) :
-      Location::NoLocation();
-  uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
-  uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
-  uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
-  uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
+  const size_t num_temps = NumberOfCheckCastTemps(type_check_kind);
+  DCHECK_LE(num_temps, 3u);
+  Location maybe_temp2_loc = (num_temps >= 2) ? locations->GetTemp(1) : Location::NoLocation();
+  Location maybe_temp3_loc = (num_temps >= 3) ? locations->GetTemp(2) : Location::NoLocation();
+  const uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
+  const uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
+  const uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
+  const uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
+  const uint32_t iftable_offset = mirror::Class::IfTableOffset().Uint32Value();
+  const uint32_t array_length_offset = mirror::Array::LengthOffset().Uint32Value();
+  const uint32_t object_array_data_offset =
+      mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value();
 
-  bool is_type_check_slow_path_fatal =
-      (type_check_kind == TypeCheckKind::kExactCheck ||
-       type_check_kind == TypeCheckKind::kAbstractClassCheck ||
-       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
-       type_check_kind == TypeCheckKind::kArrayObjectCheck) &&
-      !instruction->CanThrowIntoCatchBlock();
+  // Always false when read barriers are enabled, since we may need to go to the entrypoint for
+  // non-fatal cases arising from false negatives. The false negatives may come from avoiding
+  // read barriers below, which is done for performance and code size reasons.
+  bool is_type_check_slow_path_fatal = false;
+  if (!kEmitCompilerReadBarrier) {
+    is_type_check_slow_path_fatal =
+        (type_check_kind == TypeCheckKind::kExactCheck ||
+         type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+         type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+         type_check_kind == TypeCheckKind::kArrayObjectCheck) &&
+        !instruction->CanThrowIntoCatchBlock();
+  }
   SlowPathCodeARM* type_check_slow_path =
       new (GetGraph()->GetArena()) TypeCheckSlowPathARM(instruction,
                                                         is_type_check_slow_path_fatal);
@@ -6284,12 +6331,17 @@
     __ CompareAndBranchIfZero(obj, &done);
   }
 
-  // /* HeapReference<Class> */ temp = obj->klass_
-  GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
-
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kArrayCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp2_loc,
+                                        kWithoutReadBarrier);
+
       __ cmp(temp, ShifterOperand(cls));
       // Jump to slow path for throwing the exception or doing a
       // more involved array check.
@@ -6298,12 +6350,24 @@
     }
 
     case TypeCheckKind::kAbstractClassCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp2_loc,
+                                        kWithoutReadBarrier);
+
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
       Label loop;
       __ Bind(&loop);
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       kWithoutReadBarrier);
 
       // If the class reference currently in `temp` is null, jump to the slow path to throw the
       // exception.
@@ -6316,6 +6380,14 @@
     }
 
     case TypeCheckKind::kClassHierarchyCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp2_loc,
+                                        kWithoutReadBarrier);
+
       // Walk over the class hierarchy to find a match.
       Label loop;
       __ Bind(&loop);
@@ -6323,7 +6395,11 @@
       __ b(&done, EQ);
 
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       kWithoutReadBarrier);
 
       // If the class reference currently in `temp` is null, jump to the slow path to throw the
       // exception.
@@ -6334,13 +6410,25 @@
     }
 
     case TypeCheckKind::kArrayObjectCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp2_loc,
+                                        kWithoutReadBarrier);
+
       // Do an exact check.
       __ cmp(temp, ShifterOperand(cls));
       __ b(&done, EQ);
 
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ temp = temp->component_type_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, component_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       component_offset,
+                                       maybe_temp2_loc,
+                                       kWithoutReadBarrier);
       // If the component type is null, jump to the slow path to throw the exception.
       __ CompareAndBranchIfZero(temp, type_check_slow_path->GetEntryLabel());
       // Otherwise,the object is indeed an array, jump to label `check_non_primitive_component_type`
@@ -6352,10 +6440,7 @@
     }
 
     case TypeCheckKind::kUnresolvedCheck:
-    case TypeCheckKind::kInterfaceCheck:
-      // We always go into the type check slow path for the unresolved
-      // and interface check cases.
-      //
+      // We always go into the type check slow path for the unresolved check case.
       // We cannot directly call the CheckCast runtime entry point
       // without resorting to a type checking slow path here (i.e. by
       // calling InvokeRuntime directly), as it would require to
@@ -6363,8 +6448,54 @@
       // instruction (following the runtime calling convention), which
       // might be cluttered by the potential first read barrier
       // emission at the beginning of this method.
+
       __ b(type_check_slow_path->GetEntryLabel());
       break;
+
+    case TypeCheckKind::kInterfaceCheck: {
+      // Avoid read barriers to improve the performance of the fast path. We cannot get false
+      // positives by doing this.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp2_loc,
+                                        kWithoutReadBarrier);
+
+      // /* HeapReference<Class> */ temp = temp->iftable_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        temp_loc,
+                                        iftable_offset,
+                                        maybe_temp2_loc,
+                                        kWithoutReadBarrier);
+      Label is_null;
+      // Null iftable means it is empty and will always fail the check.
+      // Not cbz since the temp may not be a low register.
+      __ CompareAndBranchIfZero(temp, &is_null);
+
+      // Loop through the iftable and check if any class matches.
+      __ ldr(maybe_temp2_loc.AsRegister<Register>(), Address(temp, array_length_offset));
+
+      Label start_loop;
+      __ Bind(&start_loop);
+      __ ldr(maybe_temp3_loc.AsRegister<Register>(), Address(temp, object_array_data_offset));
+      __ MaybeUnpoisonHeapReference(maybe_temp3_loc.AsRegister<Register>());
+      __ cmp(cls, ShifterOperand(maybe_temp3_loc.AsRegister<Register>()));
+      __ b(&done, EQ);  // Return if same class.
+      // Go to next interface.
+      __ add(temp, temp, ShifterOperand(2 * kHeapReferenceSize));
+      __ sub(maybe_temp2_loc.AsRegister<Register>(),
+             maybe_temp2_loc.AsRegister<Register>(),
+             ShifterOperand(2));
+      // Not cbnz since the temp may not be a low register.
+      __ CompareAndBranchIfNonZero(maybe_temp2_loc.AsRegister<Register>(), &start_loop);
+      __ Bind(&is_null);
+
+      __ b(type_check_slow_path->GetEntryLabel());
+      break;
+    }
   }
   __ Bind(&done);
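The interface-check fast path emitted above is a linear scan of the class's iftable, which
stores (interface, method array) pairs, hence the stride of 2 * kHeapReferenceSize and the
decrement by 2. A hedged C++ model of what the generated loop computes (names are illustrative,
not ART's mirror API):

    // Conceptual equivalent of the emitted iftable scan; falls through to the
    // slow path (which may throw) when no interface matches.
    bool IfTableContains(const void* const* iftable_pairs,  // (interface, methods) pairs
                         int32_t pair_count,                // iftable length / 2
                         const void* cls) {
      if (iftable_pairs == nullptr) return false;  // null iftable: always fails
      for (int32_t i = 0; i < pair_count; ++i) {
        if (iftable_pairs[2 * i] == cls) return true;  // matched the interface
      }
      return false;
    }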
 
@@ -6638,12 +6769,15 @@
   }
 }
 
-void InstructionCodeGeneratorARM::GenerateReferenceLoadOneRegister(HInstruction* instruction,
-                                                                   Location out,
-                                                                   uint32_t offset,
-                                                                   Location maybe_temp) {
+void InstructionCodeGeneratorARM::GenerateReferenceLoadOneRegister(
+    HInstruction* instruction,
+    Location out,
+    uint32_t offset,
+    Location maybe_temp,
+    ReadBarrierOption read_barrier_option) {
   Register out_reg = out.AsRegister<Register>();
-  if (kEmitCompilerReadBarrier) {
+  if (read_barrier_option == kWithReadBarrier) {
+    CHECK(kEmitCompilerReadBarrier);
     DCHECK(maybe_temp.IsRegister()) << maybe_temp;
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
@@ -6668,14 +6802,17 @@
   }
 }
 
-void InstructionCodeGeneratorARM::GenerateReferenceLoadTwoRegisters(HInstruction* instruction,
-                                                                    Location out,
-                                                                    Location obj,
-                                                                    uint32_t offset,
-                                                                    Location maybe_temp) {
+void InstructionCodeGeneratorARM::GenerateReferenceLoadTwoRegisters(
+    HInstruction* instruction,
+    Location out,
+    Location obj,
+    uint32_t offset,
+    Location maybe_temp,
+    ReadBarrierOption read_barrier_option) {
   Register out_reg = out.AsRegister<Register>();
   Register obj_reg = obj.AsRegister<Register>();
-  if (kEmitCompilerReadBarrier) {
+  if (read_barrier_option == kWithReadBarrier) {
+    CHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       DCHECK(maybe_temp.IsRegister()) << maybe_temp;
       // Load with fast path based Baker's read barrier.
@@ -6700,17 +6837,18 @@
                                                           Location root,
                                                           Register obj,
                                                           uint32_t offset,
-                                                          bool requires_read_barrier) {
+                                                          ReadBarrierOption read_barrier_option) {
   Register root_reg = root.AsRegister<Register>();
-  if (requires_read_barrier) {
+  if (read_barrier_option == kWithReadBarrier) {
     DCHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
       // Baker's read barrier are used:
       //
       //   root = obj.field;
-      //   if (Thread::Current()->GetIsGcMarking()) {
-      //     root = ReadBarrier::Mark(root)
+      //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+      //   if (temp != null) {
+      //     root = temp(root)
       //   }
 
       // /* GcRoot<mirror::Object> */ root = *(obj + offset)
@@ -6724,14 +6862,23 @@
                     "have different sizes.");
 
       // Slow path marking the GC root `root`.
+      Location temp = Location::RegisterLocation(LR);
       SlowPathCodeARM* slow_path =
-          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, root);
+          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(
+              instruction,
+              root,
+              /*entrypoint*/ temp);
       codegen_->AddSlowPath(slow_path);
 
-      // IP = Thread::Current()->GetIsGcMarking()
-      __ LoadFromOffset(
-          kLoadWord, IP, TR, Thread::IsGcMarkingOffset<kArmPointerSize>().Int32Value());
-      __ CompareAndBranchIfNonZero(IP, slow_path->GetEntryLabel());
+      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+      const int32_t entry_point_offset =
+          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
+      // Loading the entrypoint does not require a load acquire since it is only changed when
+      // threads are suspended or running a checkpoint.
+      __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset);
+      // The entrypoint is null when the GC is not marking; this saves one load compared to
+      // checking GetIsGcMarking.
+      __ CompareAndBranchIfNonZero(temp.AsRegister<Register>(), slow_path->GetEntryLabel());
       __ Bind(slow_path->GetExitLabel());
     } else {
       // GC root loaded through a slow path for read barriers other
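The new GC-root fast path above relies on a protocol where each core register R has a dedicated
Thread entrypoint pReadBarrierMarkRegR that is non-null only while the GC is marking, so a
single load plus null check replaces the old GetIsGcMarking load. A hedged pseudocode model in
C++ (names are descriptive, not the runtime's exact fields):

    // Conceptual shape of the emitted root load (see the hunk above):
    //   root = *(obj + offset);
    //   mark_fn = self->pReadBarrierMarkReg[root_reg];  // null unless GC is marking
    //   if (mark_fn != nullptr) root = mark_fn(root);   // slow path marks the root
    using MarkFn = void* (*)(void*);
    void* LoadGcRoot(void** slot, MarkFn mark_fn_for_reg) {
      void* root = *slot;                // plain GC-root load
      if (mark_fn_for_reg != nullptr) {  // entrypoint pre-loaded into LR above
        root = mark_fn_for_reg(root);    // mark/forward the reference
      }
      return root;
    }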
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 3d46aab..f95dd57 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -263,7 +263,8 @@
   void GenerateReferenceLoadOneRegister(HInstruction* instruction,
                                         Location out,
                                         uint32_t offset,
-                                        Location maybe_temp);
+                                        Location maybe_temp,
+                                        ReadBarrierOption read_barrier_option);
   // Generate a heap reference load using two different registers
   // `out` and `obj`:
   //
@@ -278,17 +279,18 @@
                                          Location out,
                                          Location obj,
                                          uint32_t offset,
-                                         Location maybe_temp);
+                                         Location maybe_temp,
+                                         ReadBarrierOption read_barrier_option);
   // Generate a GC root reference load:
   //
   //   root <- *(obj + offset)
   //
-  // while honoring read barriers if `requires_read_barrier` is true.
+  // while honoring read barriers based on read_barrier_option.
   void GenerateGcRootFieldLoad(HInstruction* instruction,
                                Location root,
                                Register obj,
                                uint32_t offset,
-                               bool requires_read_barrier);
+                               ReadBarrierOption read_barrier_option);
   void GenerateTestAndBranch(HInstruction* instruction,
                              size_t condition_input_index,
                              Label* true_target,
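All three reference-load helpers declared above now share the same three-way dispatch on
ReadBarrierOption. A hedged sketch of how the load strategy is chosen (mirrors the if/else
structure in code_generator_arm.cc; `Mode` and `PickLoadMode` are illustrative):

    enum class Mode { kBakerFast, kSlowPathBarrier, kPlain };

    Mode PickLoadMode(bool with_read_barrier, bool use_baker) {
      if (with_read_barrier) {
        // kWithReadBarrier: instrumented load, Baker fast path when available,
        // otherwise a plain load followed by an explicit read barrier slow path.
        return use_baker ? Mode::kBakerFast : Mode::kSlowPathBarrier;
      }
      return Mode::kPlain;  // plain load, with unpoisoning where applicable
    }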
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 97224e0..35b1605 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -607,10 +607,16 @@
 // probably still be a from-space reference (unless it gets updated by
 // another thread, or if another thread installed another object
 // reference (different from `ref`) in `obj.field`).
+// If `entrypoint` is a valid location, it is assumed to already hold the entrypoint. The
+// entrypoint is passed in this way for the GcRoot read barrier.
 class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 {
  public:
-  ReadBarrierMarkSlowPathARM64(HInstruction* instruction, Location ref)
-      : SlowPathCodeARM64(instruction), ref_(ref) {
+  ReadBarrierMarkSlowPathARM64(HInstruction* instruction,
+                               Location ref,
+                               Location entrypoint = Location::NoLocation())
+      : SlowPathCodeARM64(instruction),
+        ref_(ref),
+        entrypoint_(entrypoint) {
     DCHECK(kEmitCompilerReadBarrier);
   }
 
@@ -665,10 +671,16 @@
     //
     //   rX <- ReadBarrierMarkRegX(rX)
     //
-    int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref_.reg());
-    // This runtime call does not require a stack map.
-    arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
+    if (entrypoint_.IsValid()) {
+      arm64_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this);
+      __ Blr(XRegisterFrom(entrypoint_));
+    } else {
+      // The entrypoint is not already loaded; load it from the thread.
+      int32_t entry_point_offset =
+          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref_.reg());
+      // This runtime call does not require a stack map.
+      arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
+    }
     __ B(GetExitLabel());
   }
 
@@ -676,6 +688,9 @@
   // The location (register) of the marked object reference.
   const Location ref_;
 
+  // The location of the entrypoint if it is already loaded.
+  const Location entrypoint_;
+
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM64);
 };
 
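A minimal standalone sketch (plain C++ with illustrative names, not ART's API) of the optional-entrypoint pattern this slow path now uses: a defaulted invalid Location means "load the entrypoint from the thread", while a valid one means "the caller already loaded it, branch to it directly".

    #include <cstdio>

    // Illustrative stand-in for ART's Location; only validity matters here.
    struct Location {
      int reg = -1;  // -1 encodes "no location"
      static Location NoLocation() { return Location(); }
      bool IsValid() const { return reg >= 0; }
    };

    // Mirrors the slow-path constructor's defaulted `entrypoint` argument.
    void EmitMark(int ref_reg, Location entrypoint = Location::NoLocation()) {
      if (entrypoint.IsValid()) {
        // GC root case: the entrypoint is already in a register, call it.
        std::printf("blr x%d\n", entrypoint.reg);
      } else {
        // Default case: load the per-register entrypoint from the thread.
        std::printf("ldr ip0, [tr, #mark_entry(r%d)]; blr ip0\n", ref_reg);
      }
    }

    int main() {
      EmitMark(0);            // field/array load: entrypoint loaded in the slow path
      Location lr_loc;
      lr_loc.reg = 30;        // x30 == lr
      EmitMark(0, lr_loc);    // GC root load: entrypoint preloaded into lr
    }
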
@@ -2333,13 +2348,22 @@
     if (maybe_compressed_char_at) {
       uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
       length = temps.AcquireW();
-      __ Ldr(length, HeapOperand(obj, count_offset));
+      if (instruction->GetArray()->IsIntermediateAddress()) {
+        DCHECK_LT(count_offset, offset);
+        int64_t adjusted_offset = static_cast<int64_t>(count_offset) - static_cast<int64_t>(offset);
+        // Note that `adjusted_offset` is negative, so this will be an LDUR.
+        __ Ldr(length, MemOperand(obj.X(), adjusted_offset));
+      } else {
+        __ Ldr(length, HeapOperand(obj, count_offset));
+      }
       codegen_->MaybeRecordImplicitNullCheck(instruction);
     }
     if (index.IsConstant()) {
       if (maybe_compressed_char_at) {
         vixl::aarch64::Label uncompressed_load, done;
-        __ Tbz(length.W(), kWRegSize - 1, &uncompressed_load);
+        static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                      "Expecting 0=compressed, 1=uncompressed");
+        __ Tbnz(length.W(), 0, &uncompressed_load);
         __ Ldrb(Register(OutputCPURegister(instruction)),
                 HeapOperand(obj, offset + Int64ConstantFrom(index)));
         __ B(&done);
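The adjusted-offset arithmetic above can be checked in isolation. A sketch with made-up offsets (the real values come from mirror::String):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Made-up layout for illustration only.
      const uint32_t count_offset = 8;   // offset of the count field
      const uint32_t data_offset = 16;   // offset of the character data
      uintptr_t obj = 0x1000;            // object base address

      // With HIntermediateAddress the register already holds obj + data_offset,
      // so reaching the count field needs a negative displacement (hence LDUR).
      uintptr_t intermediate = obj + data_offset;
      int64_t adjusted_offset =
          static_cast<int64_t>(count_offset) - static_cast<int64_t>(data_offset);
      assert(adjusted_offset < 0);
      assert(intermediate + adjusted_offset == obj + count_offset);
    }
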
@@ -2367,7 +2391,9 @@
       }
       if (maybe_compressed_char_at) {
         vixl::aarch64::Label uncompressed_load, done;
-        __ Tbz(length.W(), kWRegSize - 1, &uncompressed_load);
+        static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                      "Expecting 0=compressed, 1=uncompressed");
+        __ Tbnz(length.W(), 0, &uncompressed_load);
         __ Ldrb(Register(OutputCPURegister(instruction)),
                 HeapOperand(temp, XRegisterFrom(index), LSL, 0));
         __ B(&done);
@@ -2412,7 +2438,7 @@
   codegen_->MaybeRecordImplicitNullCheck(instruction);
   // Mask out compression flag from String's array length.
   if (mirror::kUseStringCompression && instruction->IsStringLength()) {
-    __ And(out.W(), out.W(), Operand(static_cast<int32_t>(INT32_MAX)));
+    __ Lsr(out.W(), out.W(), 1u);
   }
 }
 
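Both changes in this area assume the same count-field encoding, stated by the static_asserts: bit 0 is the compression flag (0 = compressed, 1 = uncompressed) and the character count lives in the upper bits. A sketch of that encoding:

    #include <cassert>
    #include <cstdint>

    // Assumed encoding, matching the static_asserts in this change:
    // bit 0 = compression flag (0 = compressed), bits 31..1 = character count.
    constexpr uint32_t EncodeCount(uint32_t length, bool compressed) {
      return (length << 1) | (compressed ? 0u : 1u);
    }

    int main() {
      uint32_t count_field = EncodeCount(5, /* compressed */ true);
      // String.length(): shift the flag out -- the new `Lsr(out, out, 1)`.
      assert((count_field >> 1) == 5u);
      // charAt(): test bit 0 to pick the load width -- the new `Tbnz(length, 0, ...)`.
      assert((count_field & 1u) == 0u);  // compressed: use byte loads
    }
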
@@ -3321,12 +3347,26 @@
   HandleFieldSet(instruction, instruction->GetFieldInfo(), instruction->GetValueCanBeNull());
 }
 
-static bool TypeCheckNeedsATemporary(TypeCheckKind type_check_kind) {
-  return kEmitCompilerReadBarrier &&
+// One temp is used for the read barrier.
+static size_t NumberOfInstanceOfTemps(TypeCheckKind type_check_kind) {
+  if (kEmitCompilerReadBarrier &&
       (kUseBakerReadBarrier ||
-       type_check_kind == TypeCheckKind::kAbstractClassCheck ||
-       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
-       type_check_kind == TypeCheckKind::kArrayObjectCheck);
+          type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+          type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+          type_check_kind == TypeCheckKind::kArrayObjectCheck)) {
+    return 1;
+  }
+  return 0;
+}
+
+// The interface check case needs 3 temps: one to hold the number of interfaces, one for the
+// current interface pointer, and one for loading the current interface.
+// The other checks need one temp for loading the object's class.
+static size_t NumberOfCheckCastTemps(TypeCheckKind type_check_kind) {
+  if (type_check_kind == TypeCheckKind::kInterfaceCheck) {
+    return 3;
+  }
+  return 1 + NumberOfInstanceOfTemps(type_check_kind);
 }
 
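A standalone restatement of the two temp-count helpers, under assumed build flags, showing the invariants the DCHECKs below rely on (at most 1 temp for InstanceOf, 1 to 3 for CheckCast):

    #include <cassert>
    #include <cstddef>

    enum class TypeCheckKind {
      kExactCheck, kAbstractClassCheck, kClassHierarchyCheck, kArrayObjectCheck,
      kArrayCheck, kUnresolvedCheck, kInterfaceCheck
    };

    // Assumed configuration for this sketch; in ART these are build constants.
    constexpr bool kEmitCompilerReadBarrier = true;
    constexpr bool kUseBakerReadBarrier = true;

    constexpr size_t NumberOfInstanceOfTemps(TypeCheckKind k) {
      return (kEmitCompilerReadBarrier &&
              (kUseBakerReadBarrier ||
               k == TypeCheckKind::kAbstractClassCheck ||
               k == TypeCheckKind::kClassHierarchyCheck ||
               k == TypeCheckKind::kArrayObjectCheck)) ? 1u : 0u;
    }

    constexpr size_t NumberOfCheckCastTemps(TypeCheckKind k) {
      // Interface checks need 3 temps for the iftable walk; all other kinds
      // need 1 (for the slow path) plus the read-barrier temp, if any.
      return (k == TypeCheckKind::kInterfaceCheck)
          ? 3u : 1u + NumberOfInstanceOfTemps(k);
    }

    int main() {
      assert(NumberOfInstanceOfTemps(TypeCheckKind::kExactCheck) <= 1u);
      assert(NumberOfCheckCastTemps(TypeCheckKind::kInterfaceCheck) == 3u);
      assert(NumberOfCheckCastTemps(TypeCheckKind::kExactCheck) == 2u);  // Baker RB
    }
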
 void LocationsBuilderARM64::VisitInstanceOf(HInstanceOf* instruction) {
@@ -3358,11 +3398,8 @@
   // The "out" register is used as a temporary, so it overlaps with the inputs.
   // Note that TypeCheckSlowPathARM64 uses this register too.
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
-  // When read barriers are enabled, we need a temporary register for
-  // some cases.
-  if (TypeCheckNeedsATemporary(type_check_kind)) {
-    locations->AddTemp(Location::RequiresRegister());
-  }
+  // Add temps if necessary for read barriers.
+  locations->AddRegisterTemps(NumberOfInstanceOfTemps(type_check_kind));
 }
 
 void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) {
@@ -3373,9 +3410,9 @@
   Register cls = InputRegisterAt(instruction, 1);
   Location out_loc = locations->Out();
   Register out = OutputRegister(instruction);
-  Location maybe_temp_loc = TypeCheckNeedsATemporary(type_check_kind) ?
-      locations->GetTemp(0) :
-      Location::NoLocation();
+  const size_t num_temps = NumberOfInstanceOfTemps(type_check_kind);
+  DCHECK_LE(num_temps, 1u);
+  Location maybe_temp_loc = (num_temps >= 1) ? locations->GetTemp(0) : Location::NoLocation();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
@@ -3391,7 +3428,12 @@
   }
 
   // /* HeapReference<Class> */ out = obj->klass_
-  GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset, maybe_temp_loc);
+  GenerateReferenceLoadTwoRegisters(instruction,
+                                    out_loc,
+                                    obj_loc,
+                                    class_offset,
+                                    maybe_temp_loc,
+                                    kCompilerReadBarrierOption);
 
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck: {
@@ -3409,7 +3451,11 @@
       vixl::aarch64::Label loop, success;
       __ Bind(&loop);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kCompilerReadBarrierOption);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ Cbz(out, &done);
       __ Cmp(out, cls);
@@ -3428,7 +3474,11 @@
       __ Cmp(out, cls);
       __ B(eq, &success);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kCompilerReadBarrierOption);
       __ Cbnz(out, &loop);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ B(&done);
@@ -3447,7 +3497,11 @@
       __ B(eq, &exact_check);
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ out = out->component_type_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, component_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       component_offset,
+                                       maybe_temp_loc,
+                                       kCompilerReadBarrierOption);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ Cbz(out, &done);
       __ Ldrh(out, HeapOperand(out, primitive_offset));
@@ -3543,13 +3597,8 @@
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  // Note that TypeCheckSlowPathARM64 uses this "temp" register too.
-  locations->AddTemp(Location::RequiresRegister());
-  // When read barriers are enabled, we need an additional temporary
-  // register for some cases.
-  if (TypeCheckNeedsATemporary(type_check_kind)) {
-    locations->AddTemp(Location::RequiresRegister());
-  }
+  // Add temps for read barriers and other uses. One is used by TypeCheckSlowPathARM64.
+  locations->AddRegisterTemps(NumberOfCheckCastTemps(type_check_kind));
 }
 
 void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) {
@@ -3558,22 +3607,34 @@
   Location obj_loc = locations->InAt(0);
   Register obj = InputRegisterAt(instruction, 0);
   Register cls = InputRegisterAt(instruction, 1);
+  const size_t num_temps = NumberOfCheckCastTemps(type_check_kind);
+  DCHECK_GE(num_temps, 1u);
+  DCHECK_LE(num_temps, 3u);
   Location temp_loc = locations->GetTemp(0);
-  Location maybe_temp2_loc = TypeCheckNeedsATemporary(type_check_kind) ?
-      locations->GetTemp(1) :
-      Location::NoLocation();
+  Location maybe_temp2_loc = (num_temps >= 2) ? locations->GetTemp(1) : Location::NoLocation();
+  Location maybe_temp3_loc = (num_temps >= 3) ? locations->GetTemp(2) : Location::NoLocation();
   Register temp = WRegisterFrom(temp_loc);
-  uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
-  uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
-  uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
-  uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
+  const uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
+  const uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
+  const uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
+  const uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
+  const uint32_t iftable_offset = mirror::Class::IfTableOffset().Uint32Value();
+  const uint32_t array_length_offset = mirror::Array::LengthOffset().Uint32Value();
+  const uint32_t object_array_data_offset =
+      mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value();
 
-  bool is_type_check_slow_path_fatal =
-      (type_check_kind == TypeCheckKind::kExactCheck ||
-       type_check_kind == TypeCheckKind::kAbstractClassCheck ||
-       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
-       type_check_kind == TypeCheckKind::kArrayObjectCheck) &&
-      !instruction->CanThrowIntoCatchBlock();
+  bool is_type_check_slow_path_fatal = false;
+  // Always false for read barriers: the checks below skip read barriers (for performance and
+  // code size reasons), which can produce false negatives, and those non-fatal cases must be
+  // able to fall back to the runtime entrypoint.
+  if (!kEmitCompilerReadBarrier) {
+    is_type_check_slow_path_fatal =
+        (type_check_kind == TypeCheckKind::kExactCheck ||
+         type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+         type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+         type_check_kind == TypeCheckKind::kArrayObjectCheck) &&
+        !instruction->CanThrowIntoCatchBlock();
+  }
   SlowPathCodeARM64* type_check_slow_path =
       new (GetGraph()->GetArena()) TypeCheckSlowPathARM64(instruction,
                                                           is_type_check_slow_path_fatal);
@@ -3585,12 +3646,17 @@
     __ Cbz(obj, &done);
   }
 
-  // /* HeapReference<Class> */ temp = obj->klass_
-  GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
-
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kArrayCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp2_loc,
+                                        kWithoutReadBarrier);
+
       __ Cmp(temp, cls);
       // Jump to slow path for throwing the exception or doing a
       // more involved array check.
@@ -3599,12 +3665,24 @@
     }
 
     case TypeCheckKind::kAbstractClassCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp2_loc,
+                                        kWithoutReadBarrier);
+
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
       vixl::aarch64::Label loop;
       __ Bind(&loop);
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       kWithoutReadBarrier);
 
       // If the class reference currently in `temp` is null, jump to the slow path to throw the
       // exception.
@@ -3616,6 +3694,14 @@
     }
 
     case TypeCheckKind::kClassHierarchyCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp2_loc,
+                                        kWithoutReadBarrier);
+
       // Walk over the class hierarchy to find a match.
       vixl::aarch64::Label loop;
       __ Bind(&loop);
@@ -3623,7 +3709,11 @@
       __ B(eq, &done);
 
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       kWithoutReadBarrier);
 
       // If the class reference currently in `temp` is not null, jump
       // back at the beginning of the loop.
@@ -3634,13 +3724,25 @@
     }
 
     case TypeCheckKind::kArrayObjectCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp2_loc,
+                                        kWithoutReadBarrier);
+
       // Do an exact check.
       __ Cmp(temp, cls);
       __ B(eq, &done);
 
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ temp = temp->component_type_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, component_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       component_offset,
+                                       maybe_temp2_loc,
+                                       kWithoutReadBarrier);
 
       // If the component type is null, jump to the slow path to throw the exception.
       __ Cbz(temp, type_check_slow_path->GetEntryLabel());
@@ -3653,9 +3755,7 @@
     }
 
     case TypeCheckKind::kUnresolvedCheck:
-    case TypeCheckKind::kInterfaceCheck:
-      // We always go into the type check slow path for the unresolved
-      // and interface check cases.
+      // We always go into the type check slow path for the unresolved check case.
       //
       // We cannot directly call the CheckCast runtime entry point
       // without resorting to a type checking slow path here (i.e. by
@@ -3666,6 +3766,44 @@
       // emission at the beginning of this method.
       __ B(type_check_slow_path->GetEntryLabel());
       break;
+    case TypeCheckKind::kInterfaceCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        maybe_temp2_loc,
+                                        kWithoutReadBarrier);
+
+      // /* HeapReference<Class> */ temp = temp->iftable_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        temp_loc,
+                                        iftable_offset,
+                                        maybe_temp2_loc,
+                                        kWithoutReadBarrier);
+      vixl::aarch64::Label is_null;
+      // A null iftable means it is empty, so the check always fails.
+      __ Cbz(temp, &is_null);
+
+      // Loop through the iftable and check if any class matches.
+      __ Ldr(WRegisterFrom(maybe_temp2_loc), HeapOperand(temp.W(), array_length_offset));
+
+      vixl::aarch64::Label start_loop;
+      __ Bind(&start_loop);
+      __ Ldr(WRegisterFrom(maybe_temp3_loc), HeapOperand(temp.W(), object_array_data_offset));
+      GetAssembler()->MaybeUnpoisonHeapReference(WRegisterFrom(maybe_temp3_loc));
+      __ Cmp(cls, WRegisterFrom(maybe_temp3_loc));
+      __ B(eq, &done);  // Return if same class.
+      // Go to next interface.
+      __ Add(temp, temp, 2 * kHeapReferenceSize);
+      __ Sub(WRegisterFrom(maybe_temp2_loc), WRegisterFrom(maybe_temp2_loc), 2);
+      __ Cbnz(WRegisterFrom(maybe_temp2_loc), &start_loop);
+      __ Bind(&is_null);
+
+      __ B(type_check_slow_path->GetEntryLabel());
+      break;
+    }
   }
   __ Bind(&done);
 
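What the new kInterfaceCheck fast path computes, as a sketch. The flat pair layout, and treating a null iftable as "implements nothing", are assumptions taken from the comments and the 2 * kHeapReferenceSize stride above:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct Class {};

    // Assumed layout: a flat array of (interface class, method array) pairs,
    // matching the loop's stride of 2 * kHeapReferenceSize.
    struct IfTableEntry {
      const Class* interface;
      const void* method_array;
    };

    bool ImplementsInterface(const std::vector<IfTableEntry>* iftable,
                             const Class* cls) {
      if (iftable == nullptr) {
        return false;                      // Cbz(temp, &is_null)
      }
      size_t remaining = iftable->size();  // Ldr(maybe_temp2, array_length)
      for (size_t i = 0; remaining != 0; ++i, --remaining) {
        if ((*iftable)[i].interface == cls) {
          return true;                     // Cmp + B(eq, &done)
        }
        // Add(temp, temp, 2 * kHeapReferenceSize); Sub(maybe_temp2, ..., 2)
      }
      return false;                        // falls through to the slow path
    }

    int main() {
      Class a, b;
      std::vector<IfTableEntry> table = {{&a, nullptr}};
      assert(ImplementsInterface(&table, &a));
      assert(!ImplementsInterface(&table, &b));
      assert(!ImplementsInterface(nullptr, &a));
    }
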
@@ -4228,7 +4366,9 @@
   Location out_loc = cls->GetLocations()->Out();
   Register out = OutputRegister(cls);
 
-  const bool requires_read_barrier = kEmitCompilerReadBarrier && !cls->IsInBootImage();
+  const ReadBarrierOption read_barrier_option = cls->IsInBootImage()
+      ? kWithoutReadBarrier
+      : kCompilerReadBarrierOption;
   bool generate_null_check = false;
   switch (cls->GetLoadKind()) {
     case HLoadClass::LoadKind::kReferrersClass: {
@@ -4241,16 +4381,16 @@
                               current_method,
                               ArtMethod::DeclaringClassOffset().Int32Value(),
                               /* fixup_label */ nullptr,
-                              requires_read_barrier);
+                              read_barrier_option);
       break;
     }
     case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
-      DCHECK(!requires_read_barrier);
+      DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
       __ Ldr(out, codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(),
                                                             cls->GetTypeIndex()));
       break;
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: {
-      DCHECK(!requires_read_barrier);
+      DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
       // Add ADRP with its PC-relative type patch.
       const DexFile& dex_file = cls->GetDexFile();
       uint32_t type_index = cls->GetTypeIndex();
@@ -4263,7 +4403,7 @@
       break;
     }
     case HLoadClass::LoadKind::kBootImageAddress: {
-      DCHECK(!requires_read_barrier);
+      DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
       DCHECK(cls->GetAddress() != 0u && IsUint<32>(cls->GetAddress()));
       __ Ldr(out.W(), codegen_->DeduplicateBootImageAddressLiteral(cls->GetAddress()));
       break;
@@ -4286,7 +4426,7 @@
                               out.X(),
                               offset,
                               /* fixup_label */ nullptr,
-                              requires_read_barrier);
+                              read_barrier_option);
       generate_null_check = !cls->IsInDexCache();
       break;
     }
@@ -4306,7 +4446,7 @@
                               out.X(),
                               /* offset placeholder */ 0,
                               ldr_label,
-                              requires_read_barrier);
+                              read_barrier_option);
       generate_null_check = !cls->IsInDexCache();
       break;
     }
@@ -4323,7 +4463,7 @@
                               out.X(),
                               CodeGenerator::GetCacheOffset(cls->GetTypeIndex()),
                               /* fixup_label */ nullptr,
-                              requires_read_barrier);
+                              read_barrier_option);
       generate_null_check = !cls->IsInDexCache();
       break;
     }
@@ -4460,7 +4600,7 @@
                               temp,
                               /* offset placeholder */ 0u,
                               ldr_label,
-                              kEmitCompilerReadBarrier);
+                              kCompilerReadBarrierOption);
       SlowPathCodeARM64* slow_path =
           new (GetGraph()->GetArena()) LoadStringSlowPathARM64(load, temp, adrp_label);
       codegen_->AddSlowPath(slow_path);
@@ -5162,13 +5302,16 @@
   }
 }
 
-void InstructionCodeGeneratorARM64::GenerateReferenceLoadOneRegister(HInstruction* instruction,
-                                                                     Location out,
-                                                                     uint32_t offset,
-                                                                     Location maybe_temp) {
+void InstructionCodeGeneratorARM64::GenerateReferenceLoadOneRegister(
+    HInstruction* instruction,
+    Location out,
+    uint32_t offset,
+    Location maybe_temp,
+    ReadBarrierOption read_barrier_option) {
   Primitive::Type type = Primitive::kPrimNot;
   Register out_reg = RegisterFrom(out, type);
-  if (kEmitCompilerReadBarrier) {
+  if (read_barrier_option == kWithReadBarrier) {
+    CHECK(kEmitCompilerReadBarrier);
     Register temp_reg = RegisterFrom(maybe_temp, type);
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
@@ -5198,15 +5341,18 @@
   }
 }
 
-void InstructionCodeGeneratorARM64::GenerateReferenceLoadTwoRegisters(HInstruction* instruction,
-                                                                      Location out,
-                                                                      Location obj,
-                                                                      uint32_t offset,
-                                                                      Location maybe_temp) {
+void InstructionCodeGeneratorARM64::GenerateReferenceLoadTwoRegisters(
+    HInstruction* instruction,
+    Location out,
+    Location obj,
+    uint32_t offset,
+    Location maybe_temp,
+    ReadBarrierOption read_barrier_option) {
   Primitive::Type type = Primitive::kPrimNot;
   Register out_reg = RegisterFrom(out, type);
   Register obj_reg = RegisterFrom(obj, type);
-  if (kEmitCompilerReadBarrier) {
+  if (read_barrier_option == kWithReadBarrier) {
+    CHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       Register temp_reg = RegisterFrom(maybe_temp, type);
@@ -5232,23 +5378,25 @@
   }
 }
 
-void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(HInstruction* instruction,
-                                                            Location root,
-                                                            Register obj,
-                                                            uint32_t offset,
-                                                            vixl::aarch64::Label* fixup_label,
-                                                            bool requires_read_barrier) {
+void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(
+    HInstruction* instruction,
+    Location root,
+    Register obj,
+    uint32_t offset,
+    vixl::aarch64::Label* fixup_label,
+    ReadBarrierOption read_barrier_option) {
   DCHECK(fixup_label == nullptr || offset == 0u);
   Register root_reg = RegisterFrom(root, Primitive::kPrimNot);
-  if (requires_read_barrier) {
+  if (read_barrier_option == kWithReadBarrier) {
     DCHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
       // Baker's read barrier are used:
       //
       //   root = obj.field;
-      //   if (Thread::Current()->GetIsGcMarking()) {
-      //     root = ReadBarrier::Mark(root)
+      //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+      //   if (temp != null) {
+      //     root = temp(root)
       //   }
 
       // /* GcRoot<mirror::Object> */ root = *(obj + offset)
@@ -5265,16 +5413,22 @@
                     "art::mirror::CompressedReference<mirror::Object> and int32_t "
                     "have different sizes.");
 
-      // Slow path marking the GC root `root`.
-      SlowPathCodeARM64* slow_path =
-          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, root);
-      codegen_->AddSlowPath(slow_path);
+      Register temp = lr;
 
-      MacroAssembler* masm = GetVIXLAssembler();
-      UseScratchRegisterScope temps(masm);
-      Register temp = temps.AcquireW();
-      // temp = Thread::Current()->GetIsGcMarking()
-      __ Ldr(temp, MemOperand(tr, Thread::IsGcMarkingOffset<kArm64PointerSize>().Int32Value()));
+      // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
+      SlowPathCodeARM64* slow_path =
+          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction,
+                                                                    root,
+                                                                    LocationFrom(temp));
+      codegen_->AddSlowPath(slow_path);
+      const int32_t entry_point_offset =
+          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(root.reg());
+      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+      // Loading the entrypoint does not require a load acquire since it is only changed when
+      // threads are suspended or running a checkpoint.
+      __ Ldr(temp, MemOperand(tr, entry_point_offset));
+      // The entrypoint is null when the GC is not marking; this saves one load compared to
+      // checking GetIsGcMarking.
       __ Cbnz(temp, slow_path->GetEntryLabel());
       __ Bind(slow_path->GetExitLabel());
     } else {
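A behavioral sketch of the new GC-root fast path: the thread holds one mark entrypoint per register, null while the GC is not marking, so a single load plus compare-and-branch replaces the old IsGcMarking load followed by an entrypoint load (names are illustrative):

    #include <cassert>

    struct Obj {};

    // Assumed thread-local slot: non-null only while the GC is marking.
    using MarkFn = Obj* (*)(Obj*);
    static MarkFn pReadBarrierMarkReg = nullptr;

    static bool g_marked = false;
    Obj* MarkStub(Obj* ref) { g_marked = true; return ref; }

    Obj* LoadGcRoot(Obj** address) {
      Obj* root = *address;                // root = *(obj + offset)
      MarkFn temp = pReadBarrierMarkReg;   // Ldr(temp, [tr, #entry_point_offset])
      if (temp != nullptr) {               // Cbnz(temp, slow_path)
        root = temp(root);                 // Blr(temp): entrypoint preloaded
      }
      return root;
    }

    int main() {
      Obj o;
      Obj* slot = &o;
      assert(LoadGcRoot(&slot) == &o && !g_marked);  // not marking: no call
      pReadBarrierMarkReg = MarkStub;                // GC starts marking
      assert(LoadGcRoot(&slot) == &o && g_marked);   // slow path taken
    }
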
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 7f54b4b..0e8d4fd 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -269,7 +269,8 @@
   void GenerateReferenceLoadOneRegister(HInstruction* instruction,
                                         Location out,
                                         uint32_t offset,
-                                        Location maybe_temp);
+                                        Location maybe_temp,
+                                        ReadBarrierOption read_barrier_option);
   // Generate a heap reference load using two different registers
   // `out` and `obj`:
   //
@@ -284,18 +285,19 @@
                                          Location out,
                                          Location obj,
                                          uint32_t offset,
-                                         Location maybe_temp);
+                                         Location maybe_temp,
+                                         ReadBarrierOption read_barrier_option);
   // Generate a GC root reference load:
   //
   //   root <- *(obj + offset)
   //
-  // while honoring read barriers if `requires_read_barrier` is true.
+  // while honoring read barriers based on `read_barrier_option`.
   void GenerateGcRootFieldLoad(HInstruction* instruction,
                                Location root,
                                vixl::aarch64::Register obj,
                                uint32_t offset,
                                vixl::aarch64::Label* fixup_label,
-                               bool requires_read_barrier);
+                               ReadBarrierOption read_barrier_option);
 
   // Generate a floating-point comparison.
   void GenerateFcmp(HInstruction* instruction);
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index b9814b6..1c1cc95 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -458,7 +458,7 @@
     __ Bind(GetEntryLabel());
 
     if (!is_fatal_) {
-      TODO_VIXL32(FATAL);
+      SaveLiveRegisters(codegen, locations);
     }
 
     // We're moving two locations to locations that could overlap, so we need a parallel
@@ -472,7 +472,13 @@
                                LocationFrom(calling_convention.GetRegisterAt(1)),
                                Primitive::kPrimNot);
     if (instruction_->IsInstanceOf()) {
-      TODO_VIXL32(FATAL);
+      arm_codegen->InvokeRuntime(kQuickInstanceofNonTrivial,
+                                 instruction_,
+                                 instruction_->GetDexPc(),
+                                 this);
+      CheckEntrypointTypes<
+          kQuickInstanceofNonTrivial, size_t, mirror::Class*, mirror::Class*>();
+      arm_codegen->Move32(locations->Out(), LocationFrom(r0));
     } else {
       DCHECK(instruction_->IsCheckCast());
       arm_codegen->InvokeRuntime(kQuickCheckInstanceOf,
@@ -483,7 +489,8 @@
     }
 
     if (!is_fatal_) {
-      TODO_VIXL32(FATAL);
+      RestoreLiveRegisters(codegen, locations);
+      __ B(GetExitLabel());
     }
   }
 
@@ -850,9 +857,9 @@
   }
 }
 
-void CodeGeneratorARMVIXL::MoveConstant(Location destination ATTRIBUTE_UNUSED,
-                                        int32_t value ATTRIBUTE_UNUSED) {
-  TODO_VIXL32(FATAL);
+void CodeGeneratorARMVIXL::MoveConstant(Location location, int32_t value) {
+  DCHECK(location.IsRegister());
+  __ Mov(RegisterFrom(location), value);
 }
 
 void CodeGeneratorARMVIXL::MoveLocation(Location dst, Location src, Primitive::Type dst_type) {
@@ -863,9 +870,15 @@
   GetMoveResolver()->EmitNativeCode(&move);
 }
 
-void CodeGeneratorARMVIXL::AddLocationAsTemp(Location location ATTRIBUTE_UNUSED,
-                                             LocationSummary* locations ATTRIBUTE_UNUSED) {
-  TODO_VIXL32(FATAL);
+void CodeGeneratorARMVIXL::AddLocationAsTemp(Location location, LocationSummary* locations) {
+  if (location.IsRegister()) {
+    locations->AddTemp(location);
+  } else if (location.IsRegisterPair()) {
+    locations->AddTemp(LocationFrom(LowRegisterFrom(location)));
+    locations->AddTemp(LocationFrom(HighRegisterFrom(location)));
+  } else {
+    UNIMPLEMENTED(FATAL) << "AddLocationAsTemp not implemented for location " << location;
+  }
 }
 
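A toy model of the AddLocationAsTemp behavior just implemented: a core register contributes one temp, a register pair (a 64-bit value on 32-bit ARM) contributes its low and high words as two temps:

    #include <cassert>
    #include <vector>

    enum class Kind { kRegister, kRegisterPair };
    struct Location {
      Kind kind;
      int low;    // register, or low word of a pair
      int high;   // high word of a pair (unused for kRegister)
    };

    void AddLocationAsTemp(const Location& loc, std::vector<int>* temps) {
      if (loc.kind == Kind::kRegister) {
        temps->push_back(loc.low);
      } else {
        temps->push_back(loc.low);   // LowRegisterFrom(location)
        temps->push_back(loc.high);  // HighRegisterFrom(location)
      }
    }

    int main() {
      std::vector<int> temps;
      AddLocationAsTemp({Kind::kRegister, 4, -1}, &temps);     // r4
      AddLocationAsTemp({Kind::kRegisterPair, 0, 1}, &temps);  // r0/r1
      assert(temps.size() == 3u);
    }
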
 void CodeGeneratorARMVIXL::InvokeRuntime(QuickEntrypointEnum entrypoint,
@@ -1478,6 +1491,17 @@
   codegen_->GenerateFrameExit();
 }
 
+void LocationsBuilderARMVIXL::VisitInvokeUnresolved(HInvokeUnresolved* invoke) {
+  // The trampoline uses the same calling convention as dex calling conventions,
+  // except instead of loading arg0/r0 with the target Method*, arg0/r0 will contain
+  // the method_idx.
+  HandleInvoke(invoke);
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitInvokeUnresolved(HInvokeUnresolved* invoke) {
+  codegen_->GenerateInvokeUnresolvedRuntimeCall(invoke);
+}
+
 void LocationsBuilderARMVIXL::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
   // Explicit clinit checks triggered by static invokes must have been pruned by
   // art::PrepareForRegisterAllocation.
@@ -1548,6 +1572,63 @@
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
 }
 
+void LocationsBuilderARMVIXL::VisitInvokeInterface(HInvokeInterface* invoke) {
+  HandleInvoke(invoke);
+  // Add the hidden argument.
+  invoke->GetLocations()->AddTemp(LocationFrom(r12));
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitInvokeInterface(HInvokeInterface* invoke) {
+  // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError.
+  LocationSummary* locations = invoke->GetLocations();
+  vixl32::Register temp = RegisterFrom(locations->GetTemp(0));
+  vixl32::Register hidden_reg = RegisterFrom(locations->GetTemp(1));
+  Location receiver = locations->InAt(0);
+  uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
+
+  DCHECK(!receiver.IsStackSlot());
+
+  // /* HeapReference<Class> */ temp = receiver->klass_
+  GetAssembler()->LoadFromOffset(kLoadWord, temp, RegisterFrom(receiver), class_offset);
+
+  codegen_->MaybeRecordImplicitNullCheck(invoke);
+  // Instead of simply (possibly) unpoisoning `temp` here, we should
+  // emit a read barrier for the previous class reference load.
+  // However this is not required in practice, as this is an
+  // intermediate/temporary reference and because the current
+  // concurrent copying collector keeps the from-space memory
+  // intact/accessible until the end of the marking phase (the
+  // concurrent copying collector may not in the future).
+  GetAssembler()->MaybeUnpoisonHeapReference(temp);
+  GetAssembler()->LoadFromOffset(kLoadWord,
+                                 temp,
+                                 temp,
+                                 mirror::Class::ImtPtrOffset(kArmPointerSize).Uint32Value());
+  uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement(
+      invoke->GetImtIndex(), kArmPointerSize));
+  // temp = temp->GetImtEntryAt(method_offset);
+  GetAssembler()->LoadFromOffset(kLoadWord, temp, temp, method_offset);
+  uint32_t entry_point =
+      ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize).Int32Value();
+  // LR = temp->GetEntryPoint();
+  GetAssembler()->LoadFromOffset(kLoadWord, lr, temp, entry_point);
+
+  // Set the hidden argument (in r12). This is done here, right before the BLX, to prevent other
+  // instructions from clobbering it, as they might use r12 as a scratch register.
+  DCHECK(hidden_reg.Is(r12));
+  __ Mov(hidden_reg, invoke->GetDexMethodIndex());
+
+  {
+    AssemblerAccurateScope aas(GetVIXLAssembler(),
+                               kArmInstrMaxSizeInBytes,
+                               CodeBufferCheckScope::kMaximumSize);
+    // Call the entry point loaded into LR.
+    __ blx(lr);
+    DCHECK(!codegen_->IsLeafMethod());
+    codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
+  }
+}
+
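The dispatch sequence emitted above, restated as a sketch with made-up structures (field names follow the comments, not ART's real headers); the hidden argument models r12:

    #include <cstddef>
    #include <cstdio>

    struct ArtMethod {
      void (*entry_point)(unsigned hidden_method_index);
    };
    constexpr size_t kImtSize = 43;  // assumption for the sketch
    struct ImTable { ArtMethod* entries[kImtSize]; };
    struct Class { ImTable* imt; };
    struct Object { Class* klass; };

    void InvokeInterface(Object* receiver, size_t imt_index, unsigned dex_method_index) {
      Class* temp_class = receiver->klass;          // temp = receiver->klass_
      ImTable* imt = temp_class->imt;               // temp = temp->imt_ptr_
      ArtMethod* method = imt->entries[imt_index];  // temp = temp->GetImtEntryAt(...)
      // The hidden argument (r12) is set immediately before the call so no
      // other instruction can clobber it; conflict stubs use it to resolve
      // the actual interface method.
      method->entry_point(dex_method_index);        // blx lr
    }

    void Target(unsigned hidden) { std::printf("hidden arg: %u\n", hidden); }

    int main() {
      ArtMethod m{Target};
      ImTable imt{};
      imt.entries[7] = &m;
      Class c{&imt};
      Object obj{&c};
      InvokeInterface(&obj, 7, 42);
    }
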
 void LocationsBuilderARMVIXL::VisitNeg(HNeg* neg) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(neg, LocationSummary::kNoCall);
@@ -3592,6 +3673,74 @@
   HandleFieldSet(instruction, instruction->GetFieldInfo(), instruction->GetValueCanBeNull());
 }
 
+void LocationsBuilderARMVIXL::VisitUnresolvedInstanceFieldGet(
+    HUnresolvedInstanceFieldGet* instruction) {
+  FieldAccessCallingConventionARMVIXL calling_convention;
+  codegen_->CreateUnresolvedFieldLocationSummary(
+      instruction, instruction->GetFieldType(), calling_convention);
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitUnresolvedInstanceFieldGet(
+    HUnresolvedInstanceFieldGet* instruction) {
+  FieldAccessCallingConventionARMVIXL calling_convention;
+  codegen_->GenerateUnresolvedFieldAccess(instruction,
+                                          instruction->GetFieldType(),
+                                          instruction->GetFieldIndex(),
+                                          instruction->GetDexPc(),
+                                          calling_convention);
+}
+
+void LocationsBuilderARMVIXL::VisitUnresolvedInstanceFieldSet(
+    HUnresolvedInstanceFieldSet* instruction) {
+  FieldAccessCallingConventionARMVIXL calling_convention;
+  codegen_->CreateUnresolvedFieldLocationSummary(
+      instruction, instruction->GetFieldType(), calling_convention);
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitUnresolvedInstanceFieldSet(
+    HUnresolvedInstanceFieldSet* instruction) {
+  FieldAccessCallingConventionARMVIXL calling_convention;
+  codegen_->GenerateUnresolvedFieldAccess(instruction,
+                                          instruction->GetFieldType(),
+                                          instruction->GetFieldIndex(),
+                                          instruction->GetDexPc(),
+                                          calling_convention);
+}
+
+void LocationsBuilderARMVIXL::VisitUnresolvedStaticFieldGet(
+    HUnresolvedStaticFieldGet* instruction) {
+  FieldAccessCallingConventionARMVIXL calling_convention;
+  codegen_->CreateUnresolvedFieldLocationSummary(
+      instruction, instruction->GetFieldType(), calling_convention);
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitUnresolvedStaticFieldGet(
+    HUnresolvedStaticFieldGet* instruction) {
+  FieldAccessCallingConventionARMVIXL calling_convention;
+  codegen_->GenerateUnresolvedFieldAccess(instruction,
+                                          instruction->GetFieldType(),
+                                          instruction->GetFieldIndex(),
+                                          instruction->GetDexPc(),
+                                          calling_convention);
+}
+
+void LocationsBuilderARMVIXL::VisitUnresolvedStaticFieldSet(
+    HUnresolvedStaticFieldSet* instruction) {
+  FieldAccessCallingConventionARMVIXL calling_convention;
+  codegen_->CreateUnresolvedFieldLocationSummary(
+      instruction, instruction->GetFieldType(), calling_convention);
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitUnresolvedStaticFieldSet(
+    HUnresolvedStaticFieldSet* instruction) {
+  FieldAccessCallingConventionARMVIXL calling_convention;
+  codegen_->GenerateUnresolvedFieldAccess(instruction,
+                                          instruction->GetFieldType(),
+                                          instruction->GetFieldIndex(),
+                                          instruction->GetDexPc(),
+                                          calling_convention);
+}
+
 void LocationsBuilderARMVIXL::VisitNullCheck(HNullCheck* instruction) {
   // TODO(VIXL): https://android-review.googlesource.com/#/c/275337/
   LocationSummary::CallKind call_kind = instruction->CanThrowIntoCatchBlock()
@@ -3798,16 +3947,21 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimChar:
     case Primitive::kPrimInt: {
+      vixl32::Register length;
+      if (maybe_compressed_char_at) {
+        length = RegisterFrom(locations->GetTemp(0));
+        uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+        GetAssembler()->LoadFromOffset(kLoadWord, length, obj, count_offset);
+        codegen_->MaybeRecordImplicitNullCheck(instruction);
+      }
       if (index.IsConstant()) {
         int32_t const_index = index.GetConstant()->AsIntConstant()->GetValue();
         if (maybe_compressed_char_at) {
-          vixl32::Register length = temps.Acquire();
           vixl32::Label uncompressed_load, done;
-          uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
-          GetAssembler()->LoadFromOffset(kLoadWord, length, obj, count_offset);
-          codegen_->MaybeRecordImplicitNullCheck(instruction);
-          __ Cmp(length, 0);
-          __ B(ge, &uncompressed_load);
+          __ Lsrs(length, length, 1u);  // LSRS has a 16-bit encoding, TST (immediate) does not.
+          static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                        "Expecting 0=compressed, 1=uncompressed");
+          __ B(cs, &uncompressed_load);
           GetAssembler()->LoadFromOffset(kLoadUnsignedByte,
                                          RegisterFrom(out_loc),
                                          obj,
@@ -3835,12 +3989,10 @@
         }
         if (maybe_compressed_char_at) {
           vixl32::Label uncompressed_load, done;
-          uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
-          vixl32::Register length = RegisterFrom(locations->GetTemp(0));
-          GetAssembler()->LoadFromOffset(kLoadWord, length, obj, count_offset);
-          codegen_->MaybeRecordImplicitNullCheck(instruction);
-          __ Cmp(length, 0);
-          __ B(ge, &uncompressed_load);
+          __ Lsrs(length, length, 1u);  // LSRS has a 16-bit encoding, TST (immediate) does not.
+          static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                        "Expecting 0=compressed, 1=uncompressed");
+          __ B(cs, &uncompressed_load);
           __ Ldrb(RegisterFrom(out_loc), MemOperand(temp, RegisterFrom(index), vixl32::LSL, 0));
           __ B(&done);
           __ Bind(&uncompressed_load);
@@ -4219,7 +4371,7 @@
   codegen_->MaybeRecordImplicitNullCheck(instruction);
   // Mask out compression flag from String's array length.
   if (mirror::kUseStringCompression && instruction->IsStringLength()) {
-    __ Bic(out, out, 1u << 31);
+    __ Lsr(out, out, 1u);
   }
 }
 
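The LSRS trick above, in isolation: a single flag-setting shift both produces the decoded length and moves the compression bit into the carry flag, which B(cs) then tests. A sketch of the flag semantics assumed:

    #include <cassert>
    #include <cstdint>

    // Models ARM's LSRS r, r, #1: the last bit shifted out lands in carry.
    uint32_t LsrsByOne(uint32_t value, bool* carry) {
      *carry = (value & 1u) != 0u;
      return value >> 1;
    }

    int main() {
      // Encoding assumed by this change: bit 0 set means uncompressed.
      uint32_t count_field = (5u << 1) | 1u;  // length 5, uncompressed
      bool carry = false;
      uint32_t length = LsrsByOne(count_field, &carry);
      assert(length == 5u);
      assert(carry);  // B(cs, &uncompressed_load) is taken
    }
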
@@ -4762,6 +4914,196 @@
        type_check_kind == TypeCheckKind::kArrayObjectCheck);
 }
 
+
+void LocationsBuilderARMVIXL::VisitInstanceOf(HInstanceOf* instruction) {
+  LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  bool baker_read_barrier_slow_path = false;
+  switch (type_check_kind) {
+    case TypeCheckKind::kExactCheck:
+    case TypeCheckKind::kAbstractClassCheck:
+    case TypeCheckKind::kClassHierarchyCheck:
+    case TypeCheckKind::kArrayObjectCheck:
+      call_kind =
+          kEmitCompilerReadBarrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall;
+      baker_read_barrier_slow_path = kUseBakerReadBarrier;
+      break;
+    case TypeCheckKind::kArrayCheck:
+    case TypeCheckKind::kUnresolvedCheck:
+    case TypeCheckKind::kInterfaceCheck:
+      call_kind = LocationSummary::kCallOnSlowPath;
+      break;
+  }
+
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
+  if (baker_read_barrier_slow_path) {
+    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  }
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  // The "out" register is used as a temporary, so it overlaps with the inputs.
+  // Note that TypeCheckSlowPathARMVIXL uses this register too.
+  locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
+  // When read barriers are enabled, we need a temporary register for
+  // some cases.
+  if (TypeCheckNeedsATemporary(type_check_kind)) {
+    locations->AddTemp(Location::RequiresRegister());
+  }
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitInstanceOf(HInstanceOf* instruction) {
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  LocationSummary* locations = instruction->GetLocations();
+  Location obj_loc = locations->InAt(0);
+  vixl32::Register obj = InputRegisterAt(instruction, 0);
+  vixl32::Register cls = InputRegisterAt(instruction, 1);
+  Location out_loc = locations->Out();
+  vixl32::Register out = OutputRegister(instruction);
+  Location maybe_temp_loc = TypeCheckNeedsATemporary(type_check_kind) ?
+      locations->GetTemp(0) :
+      Location::NoLocation();
+  uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
+  uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
+  uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
+  uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
+  vixl32::Label done, zero;
+  SlowPathCodeARMVIXL* slow_path = nullptr;
+
+  // Return 0 if `obj` is null.
+  // Avoid the null check if we know `obj` is not null.
+  if (instruction->MustDoNullCheck()) {
+    __ Cbz(obj, &zero);
+  }
+
+  // /* HeapReference<Class> */ out = obj->klass_
+  GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset, maybe_temp_loc);
+
+  switch (type_check_kind) {
+    case TypeCheckKind::kExactCheck: {
+      __ Cmp(out, cls);
+      // Classes must be equal for the instanceof to succeed.
+      __ B(ne, &zero);
+      __ Mov(out, 1);
+      __ B(&done);
+      break;
+    }
+
+    case TypeCheckKind::kAbstractClassCheck: {
+      // If the class is abstract, we eagerly fetch the super class of the
+      // object to avoid doing a comparison we know will fail.
+      vixl32::Label loop;
+      __ Bind(&loop);
+      // /* HeapReference<Class> */ out = out->super_class_
+      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      // If `out` is null, we use it for the result, and jump to `done`.
+      __ Cbz(out, &done);
+      __ Cmp(out, cls);
+      __ B(ne, &loop);
+      __ Mov(out, 1);
+      if (zero.IsReferenced()) {
+        __ B(&done);
+      }
+      break;
+    }
+
+    case TypeCheckKind::kClassHierarchyCheck: {
+      // Walk over the class hierarchy to find a match.
+      vixl32::Label loop, success;
+      __ Bind(&loop);
+      __ Cmp(out, cls);
+      __ B(eq, &success);
+      // /* HeapReference<Class> */ out = out->super_class_
+      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      __ Cbnz(out, &loop);
+      // If `out` is null, we use it for the result, and jump to `done`.
+      __ B(&done);
+      __ Bind(&success);
+      __ Mov(out, 1);
+      if (zero.IsReferenced()) {
+        __ B(&done);
+      }
+      break;
+    }
+
+    case TypeCheckKind::kArrayObjectCheck: {
+      // Do an exact check.
+      vixl32::Label exact_check;
+      __ Cmp(out, cls);
+      __ B(eq, &exact_check);
+      // Otherwise, we need to check that the object's class is a non-primitive array.
+      // /* HeapReference<Class> */ out = out->component_type_
+      GenerateReferenceLoadOneRegister(instruction, out_loc, component_offset, maybe_temp_loc);
+      // If `out` is null, we use it for the result, and jump to `done`.
+      __ Cbz(out, &done);
+      GetAssembler()->LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset);
+      static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
+      __ Cbnz(out, &zero);
+      __ Bind(&exact_check);
+      __ Mov(out, 1);
+      __ B(&done);
+      break;
+    }
+
+    case TypeCheckKind::kArrayCheck: {
+      __ Cmp(out, cls);
+      DCHECK(locations->OnlyCallsOnSlowPath());
+      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathARMVIXL(instruction,
+                                                                        /* is_fatal */ false);
+      codegen_->AddSlowPath(slow_path);
+      __ B(ne, slow_path->GetEntryLabel());
+      __ Mov(out, 1);
+      if (zero.IsReferenced()) {
+        __ B(&done);
+      }
+      break;
+    }
+
+    case TypeCheckKind::kUnresolvedCheck:
+    case TypeCheckKind::kInterfaceCheck: {
+      // Note that we indeed only call on the slow path, but we always go
+      // into the slow path for the unresolved and interface check
+      // cases.
+      //
+      // We cannot directly call the InstanceofNonTrivial runtime
+      // entry point without resorting to a type checking slow path
+      // here (i.e. by calling InvokeRuntime directly), as it would
+      // require to assign fixed registers for the inputs of this
+      // HInstanceOf instruction (following the runtime calling
+      // convention), which might be cluttered by the potential first
+      // read barrier emission at the beginning of this method.
+      //
+      // TODO: Introduce a new runtime entry point taking the object
+      // to test (instead of its class) as argument, and let it deal
+      // with the read barrier issues. This will let us refactor this
+      // case of the `switch` code as it was previously (with a direct
+      // call to the runtime not using a type checking slow path).
+      // This should also be beneficial for the other cases above.
+      DCHECK(locations->OnlyCallsOnSlowPath());
+      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathARMVIXL(instruction,
+                                                                        /* is_fatal */ false);
+      codegen_->AddSlowPath(slow_path);
+      __ B(slow_path->GetEntryLabel());
+      if (zero.IsReferenced()) {
+        __ B(&done);
+      }
+      break;
+    }
+  }
+
+  if (zero.IsReferenced()) {
+    __ Bind(&zero);
+    __ Mov(out, 0);
+  }
+
+  if (done.IsReferenced()) {
+    __ Bind(&done);
+  }
+
+  if (slow_path != nullptr) {
+    __ Bind(slow_path->GetExitLabel());
+  }
+}
+
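The kAbstractClassCheck and kClassHierarchyCheck paths above both amount to a superclass walk. A sketch of the computed predicate:

    #include <cassert>

    struct Class {
      const Class* super_class;  // null for java.lang.Object
    };

    bool IsSubclassOf(const Class* klass, const Class* target) {
      for (const Class* k = klass; k != nullptr; k = k->super_class) {
        if (k == target) {
          return true;   // Cmp(out, cls); B(eq, ...) or loop exit
        }
      }
      return false;      // `out` reached null: reused directly as result 0
    }

    int main() {
      Class object{nullptr};
      Class base{&object};
      Class derived{&base};
      assert(IsSubclassOf(&derived, &base));
      assert(!IsSubclassOf(&object, &derived));
    }
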
 void LocationsBuilderARMVIXL::VisitCheckCast(HCheckCast* instruction) {
   LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
   bool throws_into_catch = instruction->CanThrowIntoCatchBlock();
@@ -4807,6 +5149,9 @@
       locations->GetTemp(1) :
       Location::NoLocation();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
+  uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
+  uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
+  uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
 
   bool is_type_check_slow_path_fatal =
       (type_check_kind == TypeCheckKind::kExactCheck ||
@@ -4839,23 +5184,72 @@
     }
 
     case TypeCheckKind::kAbstractClassCheck: {
-      TODO_VIXL32(FATAL);
+      // If the class is abstract, we eagerly fetch the super class of the
+      // object to avoid doing a comparison we know will fail.
+      vixl32::Label loop;
+      __ Bind(&loop);
+      // /* HeapReference<Class> */ temp = temp->super_class_
+      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+
+      // If the class reference currently in `temp` is null, jump to the slow path to throw the
+      // exception.
+      __ Cbz(temp, type_check_slow_path->GetEntryLabel());
+
+      // Otherwise, compare the classes.
+      __ Cmp(temp, cls);
+      __ B(ne, &loop);
       break;
     }
 
     case TypeCheckKind::kClassHierarchyCheck: {
-      TODO_VIXL32(FATAL);
+      // Walk over the class hierarchy to find a match.
+      vixl32::Label loop;
+      __ Bind(&loop);
+      __ Cmp(temp, cls);
+      __ B(eq, &done);
+
+      // /* HeapReference<Class> */ temp = temp->super_class_
+      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+
+      // If the class reference currently in `temp` is null, jump to the slow path to throw the
+      // exception.
+      __ Cbz(temp, type_check_slow_path->GetEntryLabel());
+      // Otherwise, jump to the beginning of the loop.
+      __ B(&loop);
       break;
     }
 
-    case TypeCheckKind::kArrayObjectCheck: {
-      TODO_VIXL32(FATAL);
+    case TypeCheckKind::kArrayObjectCheck: {
+      // Do an exact check.
+      __ Cmp(temp, cls);
+      __ B(eq, &done);
+
+      // Otherwise, we need to check that the object's class is a non-primitive array.
+      // /* HeapReference<Class> */ temp = temp->component_type_
+      GenerateReferenceLoadOneRegister(instruction, temp_loc, component_offset, maybe_temp2_loc);
+      // If the component type is null, jump to the slow path to throw the exception.
+      __ Cbz(temp, type_check_slow_path->GetEntryLabel());
+      // Otherwise, the object is indeed an array: further check that its component type is
+      // not a primitive type.
+      GetAssembler()->LoadFromOffset(kLoadUnsignedHalfword, temp, temp, primitive_offset);
+      static_assert(Primitive::kPrimNot == 0, "Expected 0 for art::Primitive::kPrimNot");
+      __ Cbnz(temp, type_check_slow_path->GetEntryLabel());
       break;
     }
 
     case TypeCheckKind::kUnresolvedCheck:
     case TypeCheckKind::kInterfaceCheck:
-      TODO_VIXL32(FATAL);
+      // We always go into the type check slow path for the unresolved
+      // and interface check cases.
+      //
+      // We cannot directly call the CheckCast runtime entry point
+      // without resorting to a type checking slow path here (i.e. by
+      // calling InvokeRuntime directly), as it would require to
+      // assign fixed registers for the inputs of this HInstanceOf
+      // instruction (following the runtime calling convention), which
+      // might be cluttered by the potential first read barrier
+      // emission at the beginning of this method.
+      __ B(type_check_slow_path->GetEntryLabel());
       break;
   }
   __ Bind(&done);
@@ -5034,6 +5428,22 @@
   }
 }
 
+void InstructionCodeGeneratorARMVIXL::GenerateReferenceLoadOneRegister(
+    HInstruction* instruction ATTRIBUTE_UNUSED,
+    Location out,
+    uint32_t offset,
+    Location maybe_temp ATTRIBUTE_UNUSED) {
+  vixl32::Register out_reg = RegisterFrom(out);
+  if (kEmitCompilerReadBarrier) {
+    TODO_VIXL32(FATAL);
+  } else {
+    // Plain load with no read barrier.
+    // /* HeapReference<Object> */ out = *(out + offset)
+    GetAssembler()->LoadFromOffset(kLoadWord, out_reg, out_reg, offset);
+    GetAssembler()->MaybeUnpoisonHeapReference(out_reg);
+  }
+}
+
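MaybeUnpoisonHeapReference is a no-op unless heap poisoning is compiled in; to my understanding the poisoned form is the negated 32-bit reference, so unpoisoning applies the same negation again. Treat the exact transform as an assumption of this sketch:

    #include <cassert>
    #include <cstdint>

    constexpr bool kPoisonHeapReferences = true;  // build-time option

    uint32_t PoisonHeapReference(uint32_t ref) {
      return kPoisonHeapReferences
          ? static_cast<uint32_t>(-static_cast<int32_t>(ref)) : ref;
    }

    uint32_t MaybeUnpoisonHeapReference(uint32_t stored) {
      // Negation is its own inverse, so unpoisoning is the same operation.
      return PoisonHeapReference(stored);
    }

    int main() {
      uint32_t ref = 0x12345678u;
      uint32_t stored = PoisonHeapReference(ref);
      assert(stored != ref);
      assert(MaybeUnpoisonHeapReference(stored) == ref);
    }
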
 void InstructionCodeGeneratorARMVIXL::GenerateReferenceLoadTwoRegisters(
     HInstruction* instruction ATTRIBUTE_UNUSED,
     Location out,
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index c583a44..2ccc30f 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -18,6 +18,7 @@
 #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_ARM_VIXL_H_
 
 #include "code_generator_arm.h"
+#include "common_arm.h"
 #include "utils/arm/assembler_arm_vixl.h"
 
 // TODO(VIXL): make vixl clean wrt -Wshadow.
@@ -131,8 +132,11 @@
   M(If)                                         \
   M(InstanceFieldGet)                           \
   M(InstanceFieldSet)                           \
+  M(InstanceOf)                                 \
   M(IntConstant)                                \
+  M(InvokeInterface)                            \
   M(InvokeStaticOrDirect)                       \
+  M(InvokeUnresolved)                           \
   M(InvokeVirtual)                              \
   M(LessThan)                                   \
   M(LessThanOrEqual)                            \
@@ -166,24 +170,25 @@
   M(Throw)                                      \
   M(TryBoundary)                                \
   M(TypeConversion)                             \
+  M(UnresolvedInstanceFieldGet)                 \
+  M(UnresolvedInstanceFieldSet)                 \
+  M(UnresolvedStaticFieldGet)                   \
+  M(UnresolvedStaticFieldSet)                   \
   M(UShr)                                       \
   M(Xor)                                        \
 
 // TODO: Remove once the VIXL32 backend is implemented completely.
 #define FOR_EACH_UNIMPLEMENTED_INSTRUCTION(M)   \
+  M(ArmDexCacheArraysBase)                      \
+  M(BitwiseNegatedRight)                        \
   M(BoundType)                                  \
   M(ClassTableGet)                              \
-  M(InstanceOf)                                 \
-  M(InvokeInterface)                            \
-  M(InvokeUnresolved)                           \
+  M(IntermediateAddress)                        \
   M(MonitorOperation)                           \
+  M(MultiplyAccumulate)                         \
   M(NativeDebugInfo)                            \
   M(PackedSwitch)                               \
   M(Rem)                                        \
-  M(UnresolvedInstanceFieldGet)                 \
-  M(UnresolvedInstanceFieldSet)                 \
-  M(UnresolvedStaticFieldGet)                   \
-  M(UnresolvedStaticFieldSet)                   \
 
 class CodeGeneratorARMVIXL;
 
@@ -215,6 +220,38 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionARMVIXL);
 };
 
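+// Calling convention used when a field access is compiled into a runtime call (unresolved
+// field get/set). The register assignments below mirror those of
+// FieldAccessCallingConventionARM in code_generator_arm.h.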
+class FieldAccessCallingConventionARMVIXL : public FieldAccessCallingConvention {
+ public:
+  FieldAccessCallingConventionARMVIXL() {}
+
+  Location GetObjectLocation() const OVERRIDE {
+    return helpers::LocationFrom(vixl::aarch32::r1);
+  }
+  Location GetFieldIndexLocation() const OVERRIDE {
+    return helpers::LocationFrom(vixl::aarch32::r0);
+  }
+  Location GetReturnLocation(Primitive::Type type) const OVERRIDE {
+    return Primitive::Is64BitType(type)
+        ? helpers::LocationFrom(vixl::aarch32::r0, vixl::aarch32::r1)
+        : helpers::LocationFrom(vixl::aarch32::r0);
+  }
+  Location GetSetValueLocation(Primitive::Type type, bool is_instance) const OVERRIDE {
+    return Primitive::Is64BitType(type)
+        ? helpers::LocationFrom(vixl::aarch32::r2, vixl::aarch32::r3)
+        : (is_instance
+            ? helpers::LocationFrom(vixl::aarch32::r2)
+            : helpers::LocationFrom(vixl::aarch32::r1));
+  }
+  Location GetFpuLocation(Primitive::Type type) const OVERRIDE {
+    return Primitive::Is64BitType(type)
+        ? helpers::LocationFrom(vixl::aarch32::s0, vixl::aarch32::s1)
+        : helpers::LocationFrom(vixl::aarch32::s0);
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(FieldAccessCallingConventionARMVIXL);
+};
+
 class SlowPathCodeARMVIXL : public SlowPathCode {
  public:
   explicit SlowPathCodeARMVIXL(HInstruction* instruction)
@@ -344,6 +381,19 @@
                       bool value_can_be_null);
   void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info);
 
+  // Generate a heap reference load using one register `out`:
+  //
+  //   out <- *(out + offset)
+  //
+  // while honoring heap poisoning and/or read barriers (if any).
+  //
+  // Location `maybe_temp` is used when generating a read barrier and
+  // shall be a register in that case; it may be an invalid location
+  // otherwise.
+  void GenerateReferenceLoadOneRegister(HInstruction* instruction,
+                                        Location out,
+                                        uint32_t offset,
+                                        Location maybe_temp);
   // Generate a heap reference load using two different registers
   // `out` and `obj`:
   //
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index f19e2fe..f169eb0 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -568,8 +568,7 @@
     DCHECK_EQ(type, Primitive::kPrimFloat);  // Can only swap a float.
     FRegister f1 = loc1.IsFpuRegister() ? loc1.AsFpuRegister<FRegister>()
                                         : loc2.AsFpuRegister<FRegister>();
-    Register r2 = loc1.IsRegister() ? loc1.AsRegister<Register>()
-                                    : loc2.AsRegister<Register>();
+    Register r2 = loc1.IsRegister() ? loc1.AsRegister<Register>() : loc2.AsRegister<Register>();
     __ Move(TMP, r2);
     __ Mfc1(r2, f1);
     __ Mtc1(TMP, f1);
@@ -610,10 +609,8 @@
     Exchange(loc1.GetStackIndex(), loc2.GetStackIndex(), /* double_slot */ true);
   } else if ((loc1.IsRegister() && loc2.IsStackSlot()) ||
              (loc1.IsStackSlot() && loc2.IsRegister())) {
-    Register reg = loc1.IsRegister() ? loc1.AsRegister<Register>()
-                                     : loc2.AsRegister<Register>();
-    intptr_t offset = loc1.IsStackSlot() ? loc1.GetStackIndex()
-                                         : loc2.GetStackIndex();
+    Register reg = loc1.IsRegister() ? loc1.AsRegister<Register>() : loc2.AsRegister<Register>();
+    intptr_t offset = loc1.IsStackSlot() ? loc1.GetStackIndex() : loc2.GetStackIndex();
     __ Move(TMP, reg);
     __ LoadFromOffset(kLoadWord, reg, SP, offset);
     __ StoreToOffset(kStoreWord, TMP, SP, offset);
@@ -623,8 +620,7 @@
                                            : loc2.AsRegisterPairLow<Register>();
     Register reg_h = loc1.IsRegisterPair() ? loc1.AsRegisterPairHigh<Register>()
                                            : loc2.AsRegisterPairHigh<Register>();
-    intptr_t offset_l = loc1.IsDoubleStackSlot() ? loc1.GetStackIndex()
-                                                 : loc2.GetStackIndex();
+    intptr_t offset_l = loc1.IsDoubleStackSlot() ? loc1.GetStackIndex() : loc2.GetStackIndex();
     intptr_t offset_h = loc1.IsDoubleStackSlot() ? loc1.GetHighStackIndex(kMipsWordSize)
                                                  : loc2.GetHighStackIndex(kMipsWordSize);
     __ Move(TMP, reg_l);
@@ -633,6 +629,20 @@
     __ Move(TMP, reg_h);
     __ LoadFromOffset(kLoadWord, reg_h, SP, offset_h);
     __ StoreToOffset(kStoreWord, TMP, SP, offset_h);
+  } else if (loc1.IsFpuRegister() || loc2.IsFpuRegister()) {
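+    // Swap an FPU register with a stack slot: stash the register in FTMP, reload it from
+    // the slot, then store FTMP back to the slot.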
+    FRegister reg = loc1.IsFpuRegister() ? loc1.AsFpuRegister<FRegister>()
+                                         : loc2.AsFpuRegister<FRegister>();
+    intptr_t offset = loc1.IsFpuRegister() ? loc2.GetStackIndex() : loc1.GetStackIndex();
+    if (type == Primitive::kPrimFloat) {
+      __ MovS(FTMP, reg);
+      __ LoadSFromOffset(reg, SP, offset);
+      __ StoreSToOffset(FTMP, SP, offset);
+    } else {
+      DCHECK_EQ(type, Primitive::kPrimDouble);
+      __ MovD(FTMP, reg);
+      __ LoadDFromOffset(reg, SP, offset);
+      __ StoreDToOffset(FTMP, SP, offset);
+    }
   } else {
     LOG(FATAL) << "Swap between " << loc1 << " and " << loc2 << " is unsupported";
   }
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 4e343f1..2a9e21d 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -151,7 +151,7 @@
       }
       __ movl(length_loc.AsRegister<Register>(), array_len);
       if (mirror::kUseStringCompression) {
-        __ andl(length_loc.AsRegister<Register>(), Immediate(INT32_MAX));
+        __ shrl(length_loc.AsRegister<Register>(), Immediate(1));
       }
     }
     x86_codegen->EmitParallelMoves(
@@ -5243,9 +5243,11 @@
         // Branch into the compressed and uncompressed cases for each index type.
         uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
         NearLabel done, not_compressed;
-        __ cmpl(Address(obj, count_offset), Immediate(0));
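+        // The String `count` field now holds (length << 1) | compression_flag, so testing
+        // bit 0 distinguishes the compressed (0) and uncompressed (1) forms.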
+        __ testl(Address(obj, count_offset), Immediate(1));
         codegen_->MaybeRecordImplicitNullCheck(instruction);
-        __ j(kGreaterEqual, &not_compressed);
+        static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                      "Expecting 0=compressed, 1=uncompressed");
+        __ j(kNotZero, &not_compressed);
         __ movzxb(out, CodeGeneratorX86::ArrayAddress(obj, index, TIMES_1, data_offset));
         __ jmp(&done);
         __ Bind(&not_compressed);
@@ -5595,7 +5597,7 @@
   codegen_->MaybeRecordImplicitNullCheck(instruction);
   // Shift out the compression flag (stored in the least significant bit) in case the array
   // is String's array of char.
   if (mirror::kUseStringCompression && instruction->IsStringLength()) {
-    __ andl(out, Immediate(INT32_MAX));
+    __ shrl(out, Immediate(1));
   }
 }
 
@@ -5654,10 +5656,12 @@
       Location array_loc = array_length->GetLocations()->InAt(0);
       Address array_len(array_loc.AsRegister<Register>(), len_offset);
       if (is_string_compressed_char_at) {
+        // TODO: if index_loc.IsConstant(), compare twice the index (to compensate for
+        // the string compression flag) with the in-memory length and avoid the temporary.
         Register length_reg = locations->GetTemp(0).AsRegister<Register>();
         __ movl(length_reg, array_len);
         codegen_->MaybeRecordImplicitNullCheck(array_length);
-        __ andl(length_reg, Immediate(INT32_MAX));
+        __ shrl(length_reg, Immediate(1));
         codegen_->GenerateIntCompare(length_reg, index_loc);
       } else {
         // Checking bounds for general case:
@@ -6073,7 +6077,9 @@
   Register out = out_loc.AsRegister<Register>();
 
   bool generate_null_check = false;
-  const bool requires_read_barrier = kEmitCompilerReadBarrier && !cls->IsInBootImage();
+  const ReadBarrierOption read_barrier_option = cls->IsInBootImage()
+      ? kWithoutReadBarrier
+      : kCompilerReadBarrierOption;
   switch (cls->GetLoadKind()) {
     case HLoadClass::LoadKind::kReferrersClass: {
       DCHECK(!cls->CanCallRuntime());
@@ -6085,24 +6091,24 @@
           out_loc,
           Address(current_method, ArtMethod::DeclaringClassOffset().Int32Value()),
           /* fixup_label */ nullptr,
-          requires_read_barrier);
+          read_barrier_option);
       break;
     }
     case HLoadClass::LoadKind::kBootImageLinkTimeAddress: {
-      DCHECK(!requires_read_barrier);
+      DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
       __ movl(out, Immediate(/* placeholder */ 0));
       codegen_->RecordTypePatch(cls);
       break;
     }
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: {
-      DCHECK(!requires_read_barrier);
+      DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
       Register method_address = locations->InAt(0).AsRegister<Register>();
       __ leal(out, Address(method_address, CodeGeneratorX86::kDummy32BitOffset));
       codegen_->RecordTypePatch(cls);
       break;
     }
     case HLoadClass::LoadKind::kBootImageAddress: {
-      DCHECK(!requires_read_barrier);
+      DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
       DCHECK_NE(cls->GetAddress(), 0u);
       uint32_t address = dchecked_integral_cast<uint32_t>(cls->GetAddress());
       __ movl(out, Immediate(address));
@@ -6117,7 +6123,7 @@
                               out_loc,
                               Address::Absolute(address),
                               /* fixup_label */ nullptr,
-                              requires_read_barrier);
+                              read_barrier_option);
       generate_null_check = !cls->IsInDexCache();
       break;
     }
@@ -6130,7 +6136,7 @@
                               out_loc,
                               Address(base_reg, CodeGeneratorX86::kDummy32BitOffset),
                               fixup_label,
-                              requires_read_barrier);
+                              read_barrier_option);
       generate_null_check = !cls->IsInDexCache();
       break;
     }
@@ -6145,7 +6151,7 @@
                               out_loc,
                               Address(out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex())),
                               /* fixup_label */ nullptr,
-                              requires_read_barrier);
+                              read_barrier_option);
       generate_null_check = !cls->IsInDexCache();
       break;
     }
@@ -6283,7 +6289,7 @@
       Address address = Address(method_address, CodeGeneratorX86::kDummy32BitOffset);
       Label* fixup_label = codegen_->NewStringBssEntryPatch(load);
       // /* GcRoot<mirror::Class> */ out = *address  /* PC-relative */
-      GenerateGcRootFieldLoad(load, out_loc, address, fixup_label, kEmitCompilerReadBarrier);
+      GenerateGcRootFieldLoad(load, out_loc, address, fixup_label, kCompilerReadBarrierOption);
       SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86(load);
       codegen_->AddSlowPath(slow_path);
       __ testl(out, out);
@@ -6337,12 +6343,26 @@
   CheckEntrypointTypes<kQuickDeliverException, void, mirror::Object*>();
 }
 
-static bool TypeCheckNeedsATemporary(TypeCheckKind type_check_kind) {
-  return kEmitCompilerReadBarrier &&
+// A temp is used for the read barrier.
+static size_t NumberOfInstanceOfTemps(TypeCheckKind type_check_kind) {
+  if (kEmitCompilerReadBarrier &&
       !kUseBakerReadBarrier &&
       (type_check_kind == TypeCheckKind::kAbstractClassCheck ||
        type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
-       type_check_kind == TypeCheckKind::kArrayObjectCheck);
+       type_check_kind == TypeCheckKind::kArrayObjectCheck)) {
+    return 1;
+  }
+  return 0;
+}
+
+// Interface case has 2 temps, one for the remaining interface count and one for the iftable
+// pointer, which also serves as the loop cursor; the current interface is compared against
+// `cls` directly in memory.
+// The other checks have one temp for loading the object's class.
+static size_t NumberOfCheckCastTemps(TypeCheckKind type_check_kind) {
+  if (type_check_kind == TypeCheckKind::kInterfaceCheck && !kPoisonHeapReferences) {
+    return 2;
+  }
+  return 1 + NumberOfInstanceOfTemps(type_check_kind);
 }
 
 void LocationsBuilderX86::VisitInstanceOf(HInstanceOf* instruction) {
@@ -6373,11 +6393,8 @@
   locations->SetInAt(1, Location::Any());
   // Note that TypeCheckSlowPathX86 uses this "out" register too.
   locations->SetOut(Location::RequiresRegister());
-  // When read barriers are enabled, we need a temporary register for
-  // some cases.
-  if (TypeCheckNeedsATemporary(type_check_kind)) {
-    locations->AddTemp(Location::RequiresRegister());
-  }
+  // When read barriers are enabled, we need a temporary register for some cases.
+  locations->AddRegisterTemps(NumberOfInstanceOfTemps(type_check_kind));
 }
 
 void InstructionCodeGeneratorX86::VisitInstanceOf(HInstanceOf* instruction) {
@@ -6388,9 +6405,9 @@
   Location cls = locations->InAt(1);
   Location out_loc = locations->Out();
   Register out = out_loc.AsRegister<Register>();
-  Location maybe_temp_loc = TypeCheckNeedsATemporary(type_check_kind) ?
-      locations->GetTemp(0) :
-      Location::NoLocation();
+  const size_t num_temps = NumberOfInstanceOfTemps(type_check_kind);
+  DCHECK_LE(num_temps, 1u);
+  Location maybe_temp_loc = (num_temps >= 1) ? locations->GetTemp(0) : Location::NoLocation();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
@@ -6406,7 +6423,11 @@
   }
 
   // /* HeapReference<Class> */ out = obj->klass_
-  GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset);
+  GenerateReferenceLoadTwoRegisters(instruction,
+                                    out_loc,
+                                    obj_loc,
+                                    class_offset,
+                                    kCompilerReadBarrierOption);
 
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck: {
@@ -6430,7 +6451,11 @@
       NearLabel loop;
       __ Bind(&loop);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kCompilerReadBarrierOption);
       __ testl(out, out);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ j(kEqual, &done);
@@ -6460,7 +6485,11 @@
       }
       __ j(kEqual, &success);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kCompilerReadBarrierOption);
       __ testl(out, out);
       __ j(kNotEqual, &loop);
       // If `out` is null, we use it for the result, and jump to `done`.
@@ -6485,7 +6514,11 @@
       __ j(kEqual, &exact_check);
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ out = out->component_type_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, component_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       component_offset,
+                                       maybe_temp_loc,
+                                       kCompilerReadBarrierOption);
       __ testl(out, out);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ j(kEqual, &done);
@@ -6562,35 +6595,43 @@
   }
 }
 
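+// Returns whether the type check slow path for the given kind is fatal, i.e. always throws
+// and never returns to the fast path; a fatal slow path needs no live-register saving
+// (LocationSummary::kNoCall below).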
+static bool IsTypeCheckSlowPathFatal(TypeCheckKind type_check_kind, bool throws_into_catch) {
+  switch (type_check_kind) {
+    case TypeCheckKind::kExactCheck:
+    case TypeCheckKind::kAbstractClassCheck:
+    case TypeCheckKind::kClassHierarchyCheck:
+    case TypeCheckKind::kArrayObjectCheck:
+      return !throws_into_catch && !kEmitCompilerReadBarrier;
+    case TypeCheckKind::kInterfaceCheck:
+      return !throws_into_catch && !kEmitCompilerReadBarrier && !kPoisonHeapReferences;
+    case TypeCheckKind::kArrayCheck:
+    case TypeCheckKind::kUnresolvedCheck:
+      return false;
+  }
+  LOG(FATAL) << "Unreachable";
+  UNREACHABLE();
+}
+
 void LocationsBuilderX86::VisitCheckCast(HCheckCast* instruction) {
-  LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
   bool throws_into_catch = instruction->CanThrowIntoCatchBlock();
   TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
-  switch (type_check_kind) {
-    case TypeCheckKind::kExactCheck:
-    case TypeCheckKind::kAbstractClassCheck:
-    case TypeCheckKind::kClassHierarchyCheck:
-    case TypeCheckKind::kArrayObjectCheck:
-      call_kind = (throws_into_catch || kEmitCompilerReadBarrier) ?
-          LocationSummary::kCallOnSlowPath :
-          LocationSummary::kNoCall;  // In fact, call on a fatal (non-returning) slow path.
-      break;
-    case TypeCheckKind::kArrayCheck:
-    case TypeCheckKind::kUnresolvedCheck:
-    case TypeCheckKind::kInterfaceCheck:
-      call_kind = LocationSummary::kCallOnSlowPath;
-      break;
-  }
+  LocationSummary::CallKind call_kind =
+      IsTypeCheckSlowPathFatal(type_check_kind, throws_into_catch)
+          ? LocationSummary::kNoCall
+          : LocationSummary::kCallOnSlowPath;
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::Any());
+  if (type_check_kind == TypeCheckKind::kInterfaceCheck) {
+    // Require a register for the interface check since there is a loop that compares the class to
+    // a memory address.
+    locations->SetInAt(1, Location::RequiresRegister());
+  } else {
+    locations->SetInAt(1, Location::Any());
+  }
-  // Note that TypeCheckSlowPathX86 uses this "temp" register too.
-  locations->AddTemp(Location::RequiresRegister());
-  // When read barriers are enabled, we need an additional temporary
-  // register for some cases.
-  if (TypeCheckNeedsATemporary(type_check_kind)) {
-    locations->AddTemp(Location::RequiresRegister());
-  }
+  // Add temps for read barriers and other uses. One is used by TypeCheckSlowPathX86.
+  locations->AddRegisterTemps(NumberOfCheckCastTemps(type_check_kind));
 }
 
 void InstructionCodeGeneratorX86::VisitCheckCast(HCheckCast* instruction) {
@@ -6601,20 +6642,25 @@
   Location cls = locations->InAt(1);
   Location temp_loc = locations->GetTemp(0);
   Register temp = temp_loc.AsRegister<Register>();
-  Location maybe_temp2_loc = TypeCheckNeedsATemporary(type_check_kind) ?
-      locations->GetTemp(1) :
-      Location::NoLocation();
-  uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
-  uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
-  uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
-  uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
+  const size_t num_temps = NumberOfCheckCastTemps(type_check_kind);
+  DCHECK_GE(num_temps, 1u);
+  DCHECK_LE(num_temps, 2u);
+  Location maybe_temp2_loc = (num_temps >= 2) ? locations->GetTemp(1) : Location::NoLocation();
+  const uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
+  const uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
+  const uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
+  const uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
+  const uint32_t iftable_offset = mirror::Class::IfTableOffset().Uint32Value();
+  const uint32_t array_length_offset = mirror::Array::LengthOffset().Uint32Value();
+  const uint32_t object_array_data_offset =
+      mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value();
 
+  // Always false when read barriers are enabled: the fast paths below skip read barriers (for
+  // performance and code size reasons) and may therefore produce false negatives, which must
+  // be able to fall through to the non-fatal slow path.
   bool is_type_check_slow_path_fatal =
-      (type_check_kind == TypeCheckKind::kExactCheck ||
-       type_check_kind == TypeCheckKind::kAbstractClassCheck ||
-       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
-       type_check_kind == TypeCheckKind::kArrayObjectCheck) &&
-      !instruction->CanThrowIntoCatchBlock();
+      IsTypeCheckSlowPathFatal(type_check_kind, instruction->CanThrowIntoCatchBlock());
+
   SlowPathCode* type_check_slow_path =
       new (GetGraph()->GetArena()) TypeCheckSlowPathX86(instruction,
                                                         is_type_check_slow_path_fatal);
@@ -6627,12 +6673,16 @@
     __ j(kEqual, &done);
   }
 
-  // /* HeapReference<Class> */ temp = obj->klass_
-  GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
-
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kArrayCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        kWithoutReadBarrier);
+
       if (cls.IsRegister()) {
         __ cmpl(temp, cls.AsRegister<Register>());
       } else {
@@ -6646,12 +6696,23 @@
     }
 
     case TypeCheckKind::kAbstractClassCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        kWithoutReadBarrier);
+
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
       NearLabel loop;
       __ Bind(&loop);
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       kWithoutReadBarrier);
 
       // If the class reference currently in `temp` is null, jump to the slow path to throw the
       // exception.
@@ -6670,6 +6731,13 @@
     }
 
     case TypeCheckKind::kClassHierarchyCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        kWithoutReadBarrier);
+
       // Walk over the class hierarchy to find a match.
       NearLabel loop;
       __ Bind(&loop);
@@ -6682,7 +6750,11 @@
       __ j(kEqual, &done);
 
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       kWithoutReadBarrier);
 
       // If the class reference currently in `temp` is not null, jump
       // back to the beginning of the loop.
@@ -6694,6 +6766,13 @@
     }
 
     case TypeCheckKind::kArrayObjectCheck: {
+      // /* HeapReference<Class> */ temp = obj->klass_
+      GenerateReferenceLoadTwoRegisters(instruction,
+                                        temp_loc,
+                                        obj_loc,
+                                        class_offset,
+                                        kWithoutReadBarrier);
+
       // Do an exact check.
       if (cls.IsRegister()) {
         __ cmpl(temp, cls.AsRegister<Register>());
@@ -6705,7 +6784,11 @@
 
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ temp = temp->component_type_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, component_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       component_offset,
+                                       maybe_temp2_loc,
+                                       kWithoutReadBarrier);
 
       // If the component type is null (i.e. the object is not an array), jump to the slow path
       // to throw the exception. Otherwise proceed with the check.
@@ -6718,10 +6801,7 @@
     }
 
     case TypeCheckKind::kUnresolvedCheck:
-    case TypeCheckKind::kInterfaceCheck:
-      // We always go into the type check slow path for the unresolved
-      // and interface check cases.
-      //
+      // We always go into the type check slow path for the unresolved check case.
       // We cannot directly call the CheckCast runtime entry point
       // without resorting to a type checking slow path here (i.e. by
       // calling InvokeRuntime directly), as it would require us to
       // assign fixed registers for the inputs of this HCheckCast
       // instruction (following the runtime calling convention), which
       // might be cluttered by the potential first read barrier
       // emission at the beginning of this method.
-      //
-      // TODO: Introduce a new runtime entry point taking the object
-      // to test (instead of its class) as argument, and let it deal
-      // with the read barrier issues. This will let us refactor this
-      // case of the `switch` code as it was previously (with a direct
-      // call to the runtime not using a type checking slow path).
-      // This should also be beneficial for the other cases above.
       __ jmp(type_check_slow_path->GetEntryLabel());
       break;
+
+    case TypeCheckKind::kInterfaceCheck: {
+      // Fast path for the interface check. Since the inner loop compares against a memory
+      // location, `cls` would need to be poisoned to match; however, unpoisoning `cls` would
+      // clobber the condition flags and make the conditional jump incorrect. Therefore we
+      // simply jump to the slow path when heap poisoning is enabled.
+      if (!kPoisonHeapReferences) {
+        // Try to avoid read barriers to improve the fast path. We cannot get false positives
+        // by doing this.
+        // /* HeapReference<Class> */ temp = obj->klass_
+        GenerateReferenceLoadTwoRegisters(instruction,
+                                          temp_loc,
+                                          obj_loc,
+                                          class_offset,
+                                          kWithoutReadBarrier);
+
+        // /* HeapReference<Class> */ temp = temp->iftable_
+        GenerateReferenceLoadTwoRegisters(instruction,
+                                          temp_loc,
+                                          temp_loc,
+                                          iftable_offset,
+                                          kWithoutReadBarrier);
+        NearLabel is_null;
+        // Null iftable means it is empty.
+        __ testl(temp, temp);
+        __ j(kZero, &is_null);
+
+        // Loop through the iftable and check if any class matches.
+        __ movl(maybe_temp2_loc.AsRegister<Register>(), Address(temp, array_length_offset));
+
+        NearLabel start_loop;
+        __ Bind(&start_loop);
+        __ cmpl(cls.AsRegister<Register>(), Address(temp, object_array_data_offset));
+        __ j(kEqual, &done);  // Return if same class.
+        // Go to next interface.
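+        // Each IfTable entry spans two heap references (the interface class and its method
+        // array), hence the stride of two references and the decrement by two.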
+        __ addl(temp, Immediate(2 * kHeapReferenceSize));
+        __ subl(maybe_temp2_loc.AsRegister<Register>(), Immediate(2));
+        __ j(kNotZero, &start_loop);
+        __ Bind(&is_null);
+      }
+
+      __ jmp(type_check_slow_path->GetEntryLabel());
+      break;
+    }
   }
   __ Bind(&done);
 
@@ -6896,12 +7013,15 @@
   }
 }
 
-void InstructionCodeGeneratorX86::GenerateReferenceLoadOneRegister(HInstruction* instruction,
-                                                                   Location out,
-                                                                   uint32_t offset,
-                                                                   Location maybe_temp) {
+void InstructionCodeGeneratorX86::GenerateReferenceLoadOneRegister(
+    HInstruction* instruction,
+    Location out,
+    uint32_t offset,
+    Location maybe_temp,
+    ReadBarrierOption read_barrier_option) {
   Register out_reg = out.AsRegister<Register>();
-  if (kEmitCompilerReadBarrier) {
+  if (read_barrier_option == kWithReadBarrier) {
+    CHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(out + offset)
@@ -6926,13 +7046,16 @@
   }
 }
 
-void InstructionCodeGeneratorX86::GenerateReferenceLoadTwoRegisters(HInstruction* instruction,
-                                                                    Location out,
-                                                                    Location obj,
-                                                                    uint32_t offset) {
+void InstructionCodeGeneratorX86::GenerateReferenceLoadTwoRegisters(
+    HInstruction* instruction,
+    Location out,
+    Location obj,
+    uint32_t offset,
+    ReadBarrierOption read_barrier_option) {
   Register out_reg = out.AsRegister<Register>();
   Register obj_reg = obj.AsRegister<Register>();
-  if (kEmitCompilerReadBarrier) {
+  if (read_barrier_option == kWithReadBarrier) {
+    CHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(obj + offset)
@@ -6952,13 +7075,14 @@
   }
 }
 
-void InstructionCodeGeneratorX86::GenerateGcRootFieldLoad(HInstruction* instruction,
-                                                          Location root,
-                                                          const Address& address,
-                                                          Label* fixup_label,
-                                                          bool requires_read_barrier) {
+void InstructionCodeGeneratorX86::GenerateGcRootFieldLoad(
+    HInstruction* instruction,
+    Location root,
+    const Address& address,
+    Label* fixup_label,
+    ReadBarrierOption read_barrier_option) {
   Register root_reg = root.AsRegister<Register>();
-  if (requires_read_barrier) {
+  if (read_barrier_option == kWithReadBarrier) {
     DCHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 1b51999..164231b 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -240,7 +240,8 @@
   void GenerateReferenceLoadOneRegister(HInstruction* instruction,
                                         Location out,
                                         uint32_t offset,
-                                        Location maybe_temp);
+                                        Location maybe_temp,
+                                        ReadBarrierOption read_barrier_option);
   // Generate a heap reference load using two different registers
   // `out` and `obj`:
   //
@@ -254,17 +255,18 @@
   void GenerateReferenceLoadTwoRegisters(HInstruction* instruction,
                                          Location out,
                                          Location obj,
-                                         uint32_t offset);
+                                         uint32_t offset,
+                                         ReadBarrierOption read_barrier_option);
   // Generate a GC root reference load:
   //
   //   root <- *address
   //
-  // while honoring read barriers if `requires_read_barrier` is true.
+  // while honoring read barriers based on `read_barrier_option`.
   void GenerateGcRootFieldLoad(HInstruction* instruction,
                                Location root,
                                const Address& address,
                                Label* fixup_label,
-                               bool requires_read_barrier);
+                               ReadBarrierOption read_barrier_option);
 
   // Push value to FPU stack. `is_fp` specifies whether the value is floating point or not.
   // `is_wide` specifies whether it is long/double or not.
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index f3ee7cf..cb89e50 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -199,7 +199,7 @@
       }
       __ movl(length_loc.AsRegister<CpuRegister>(), array_len);
       if (mirror::kUseStringCompression) {
-        __ andl(length_loc.AsRegister<CpuRegister>(), Immediate(INT32_MAX));
+        __ shrl(length_loc.AsRegister<CpuRegister>(), Immediate(1));
       }
     }
 
@@ -4732,9 +4732,11 @@
         // Branch into the compressed and uncompressed cases for each index type.
         uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
         NearLabel done, not_compressed;
-        __ cmpl(Address(obj, count_offset), Immediate(0));
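+        // The String `count` field now holds (length << 1) | compression_flag, so testing
+        // bit 0 distinguishes the compressed (0) and uncompressed (1) forms.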
+        __ testl(Address(obj, count_offset), Immediate(1));
         codegen_->MaybeRecordImplicitNullCheck(instruction);
-        __ j(kGreaterEqual, &not_compressed);
+        static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                      "Expecting 0=compressed, 1=uncompressed");
+        __ j(kNotZero, &not_compressed);
         __ movzxb(out, CodeGeneratorX86_64::ArrayAddress(obj, index, TIMES_1, data_offset));
         __ jmp(&done);
         __ Bind(&not_compressed);
@@ -5066,7 +5068,7 @@
   codegen_->MaybeRecordImplicitNullCheck(instruction);
   // Shift out the compression flag (stored in the least significant bit) in case the array
   // is String's array of char.
   if (mirror::kUseStringCompression && instruction->IsStringLength()) {
-    __ andl(out, Immediate(INT32_MAX));
+    __ shrl(out, Immediate(1));
   }
 }
 
@@ -5118,10 +5120,12 @@
       Location array_loc = array_length->GetLocations()->InAt(0);
       Address array_len(array_loc.AsRegister<CpuRegister>(), len_offset);
       if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+        // TODO: if index_loc.IsConstant(), compare twice the index (to compensate for
+        // the string compression flag) with the in-memory length and avoid the temporary.
         CpuRegister length_reg = CpuRegister(TMP);
         __ movl(length_reg, array_len);
         codegen_->MaybeRecordImplicitNullCheck(array_length);
-        __ andl(length_reg, Immediate(INT32_MAX));
+        __ shrl(length_reg, Immediate(1));
         codegen_->GenerateIntCompare(length_reg, index_loc);
       } else {
         // Checking the bound for general case:
@@ -5497,7 +5501,9 @@
   Location out_loc = locations->Out();
   CpuRegister out = out_loc.AsRegister<CpuRegister>();
 
-  const bool requires_read_barrier = kEmitCompilerReadBarrier && !cls->IsInBootImage();
+  const ReadBarrierOption read_barrier_option = cls->IsInBootImage()
+      ? kWithoutReadBarrier
+      : kCompilerReadBarrierOption;
   bool generate_null_check = false;
   switch (cls->GetLoadKind()) {
     case HLoadClass::LoadKind::kReferrersClass: {
@@ -5510,16 +5516,16 @@
           out_loc,
           Address(current_method, ArtMethod::DeclaringClassOffset().Int32Value()),
           /* fixup_label */ nullptr,
-          requires_read_barrier);
+          read_barrier_option);
       break;
     }
     case HLoadClass::LoadKind::kBootImageLinkTimePcRelative:
-      DCHECK(!requires_read_barrier);
+      DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
       __ leal(out, Address::Absolute(CodeGeneratorX86_64::kDummy32BitOffset, /* no_rip */ false));
       codegen_->RecordTypePatch(cls);
       break;
     case HLoadClass::LoadKind::kBootImageAddress: {
-      DCHECK(!requires_read_barrier);
+      DCHECK_EQ(read_barrier_option, kWithoutReadBarrier);
       DCHECK_NE(cls->GetAddress(), 0u);
       uint32_t address = dchecked_integral_cast<uint32_t>(cls->GetAddress());
       __ movl(out, Immediate(address));  // Zero-extended.
@@ -5535,7 +5541,7 @@
                                 out_loc,
                                 address,
                                 /* fixup_label */ nullptr,
-                                requires_read_barrier);
+                                read_barrier_option);
       } else {
         // TODO: Consider using opcode A1, i.e. movl eax, moff32 (with 64-bit address).
         __ movq(out, Immediate(cls->GetAddress()));
@@ -5543,7 +5549,7 @@
                                 out_loc,
                                 Address(out, 0),
                                 /* fixup_label */ nullptr,
-                                requires_read_barrier);
+                                read_barrier_option);
       }
       generate_null_check = !cls->IsInDexCache();
       break;
@@ -5554,7 +5560,7 @@
       Address address = Address::Absolute(CodeGeneratorX86_64::kDummy32BitOffset,
                                           /* no_rip */ false);
       // /* GcRoot<mirror::Class> */ out = *address  /* PC-relative */
-      GenerateGcRootFieldLoad(cls, out_loc, address, fixup_label, requires_read_barrier);
+      GenerateGcRootFieldLoad(cls, out_loc, address, fixup_label, read_barrier_option);
       generate_null_check = !cls->IsInDexCache();
       break;
     }
@@ -5571,7 +5577,7 @@
           out_loc,
           Address(out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex())),
           /* fixup_label */ nullptr,
-          requires_read_barrier);
+          read_barrier_option);
       generate_null_check = !cls->IsInDexCache();
       break;
     }
@@ -5684,7 +5690,7 @@
                                           /* no_rip */ false);
       Label* fixup_label = codegen_->NewStringBssEntryPatch(load);
       // /* GcRoot<mirror::Class> */ out = *address  /* PC-relative */
-      GenerateGcRootFieldLoad(load, out_loc, address, fixup_label, kEmitCompilerReadBarrier);
+      GenerateGcRootFieldLoad(load, out_loc, address, fixup_label, kCompilerReadBarrierOption);
       SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86_64(load);
       codegen_->AddSlowPath(slow_path);
       __ testl(out, out);
@@ -5825,7 +5831,7 @@
                                     out_loc,
                                     obj_loc,
                                     class_offset,
-                                    kEmitCompilerReadBarrier);
+                                    kCompilerReadBarrierOption);
 
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck: {
@@ -5854,7 +5860,11 @@
       NearLabel loop, success;
       __ Bind(&loop);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kCompilerReadBarrierOption);
       __ testl(out, out);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ j(kEqual, &done);
@@ -5884,7 +5894,11 @@
       }
       __ j(kEqual, &success);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kCompilerReadBarrierOption);
       __ testl(out, out);
       __ j(kNotEqual, &loop);
       // If `out` is null, we use it for the result, and jump to `done`.
@@ -5909,7 +5923,11 @@
       __ j(kEqual, &exact_check);
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ out = out->component_type_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, component_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       component_offset,
+                                       maybe_temp_loc,
+                                       kCompilerReadBarrierOption);
       __ testl(out, out);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ j(kEqual, &done);
@@ -5986,7 +6004,7 @@
   }
 }
 
-bool IsTypeCheckSlowPathFatal(TypeCheckKind type_check_kind, bool throws_into_catch) {
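+// Returns whether the type check slow path for the given kind is fatal, i.e. always throws
+// and never returns to the fast path; a fatal slow path needs no live-register saving.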
+static bool IsTypeCheckSlowPathFatal(TypeCheckKind type_check_kind, bool throws_into_catch) {
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kAbstractClassCheck:
@@ -6046,8 +6064,12 @@
   const uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
   const uint32_t iftable_offset = mirror::Class::IfTableOffset().Uint32Value();
   const uint32_t array_length_offset = mirror::Array::LengthOffset().Uint32Value();
-  const int object_array_data_offset = mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value();
+  const uint32_t object_array_data_offset =
+      mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value();
 
+  // Always false when read barriers are enabled: the fast paths below skip read barriers (for
+  // performance and code size reasons) and may therefore produce false negatives, which must
+  // be able to fall through to the non-fatal slow path.
   bool is_type_check_slow_path_fatal =
       IsTypeCheckSlowPathFatal(type_check_kind, instruction->CanThrowIntoCatchBlock());
   SlowPathCode* type_check_slow_path =
@@ -6055,22 +6077,23 @@
                                                            is_type_check_slow_path_fatal);
   codegen_->AddSlowPath(type_check_slow_path);
 
+  NearLabel done;
+  // Avoid null check if we know obj is not null.
+  if (instruction->MustDoNullCheck()) {
+    __ testl(obj, obj);
+    __ j(kEqual, &done);
+  }
+
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kArrayCheck: {
-      NearLabel done;
-      // Avoid null check if we know obj is not null.
-      if (instruction->MustDoNullCheck()) {
-        __ testl(obj, obj);
-        __ j(kEqual, &done);
-      }
-
       // /* HeapReference<Class> */ temp = obj->klass_
       GenerateReferenceLoadTwoRegisters(instruction,
                                         temp_loc,
                                         obj_loc,
                                         class_offset,
-                                        kEmitCompilerReadBarrier);
+                                        kWithoutReadBarrier);
       if (cls.IsRegister()) {
         __ cmpl(temp, cls.AsRegister<CpuRegister>());
       } else {
@@ -6080,30 +6103,26 @@
       // Jump to slow path for throwing the exception or doing a
       // more involved array check.
       __ j(kNotEqual, type_check_slow_path->GetEntryLabel());
-      __ Bind(&done);
       break;
     }
 
     case TypeCheckKind::kAbstractClassCheck: {
-      NearLabel done;
-      // Avoid null check if we know obj is not null.
-      if (instruction->MustDoNullCheck()) {
-        __ testl(obj, obj);
-        __ j(kEqual, &done);
-      }
-
       // /* HeapReference<Class> */ temp = obj->klass_
       GenerateReferenceLoadTwoRegisters(instruction,
                                         temp_loc,
                                         obj_loc,
                                         class_offset,
-                                        kEmitCompilerReadBarrier);
+                                        kWithoutReadBarrier);
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
       NearLabel loop;
       __ Bind(&loop);
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       kWithoutReadBarrier);
 
       // If the class reference currently in `temp` is null, jump to the slow path to throw the
       // exception.
@@ -6117,24 +6136,16 @@
         __ cmpl(temp, Address(CpuRegister(RSP), cls.GetStackIndex()));
       }
       __ j(kNotEqual, &loop);
-      __ Bind(&done);
       break;
     }
 
     case TypeCheckKind::kClassHierarchyCheck: {
-      NearLabel done;
-      // Avoid null check if we know obj is not null.
-      if (instruction->MustDoNullCheck()) {
-        __ testl(obj, obj);
-        __ j(kEqual, &done);
-      }
-
       // /* HeapReference<Class> */ temp = obj->klass_
       GenerateReferenceLoadTwoRegisters(instruction,
                                         temp_loc,
                                         obj_loc,
                                         class_offset,
-                                        kEmitCompilerReadBarrier);
+                                        kWithoutReadBarrier);
       // Walk over the class hierarchy to find a match.
       NearLabel loop;
       __ Bind(&loop);
@@ -6147,7 +6158,11 @@
       __ j(kEqual, &done);
 
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       kWithoutReadBarrier);
 
       // If the class reference currently in `temp` is not null, jump
       // back to the beginning of the loop.
@@ -6155,28 +6170,16 @@
       __ j(kNotZero, &loop);
       // Otherwise, jump to the slow path to throw the exception.
       __ jmp(type_check_slow_path->GetEntryLabel());
-      __ Bind(&done);
       break;
     }
 
     case TypeCheckKind::kArrayObjectCheck: {
-      // We cannot use a NearLabel here, as its range might be too
-      // short in some cases when read barriers are enabled.  This has
-      // been observed for instance when the code emitted for this
-      // case uses high x86-64 registers (R8-R15).
-      Label done;
-      // Avoid null check if we know obj is not null.
-      if (instruction->MustDoNullCheck()) {
-        __ testl(obj, obj);
-        __ j(kEqual, &done);
-      }
-
       // /* HeapReference<Class> */ temp = obj->klass_
       GenerateReferenceLoadTwoRegisters(instruction,
                                         temp_loc,
                                         obj_loc,
                                         class_offset,
-                                        kEmitCompilerReadBarrier);
+                                        kWithoutReadBarrier);
       // Do an exact check.
       NearLabel check_non_primitive_component_type;
       if (cls.IsRegister()) {
@@ -6189,7 +6192,11 @@
 
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ temp = temp->component_type_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, component_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       component_offset,
+                                       maybe_temp2_loc,
+                                       kWithoutReadBarrier);
 
       // If the component type is not null (i.e. the object is indeed
       // an array), jump to label `check_non_primitive_component_type`
@@ -6200,21 +6207,10 @@
       __ j(kZero, type_check_slow_path->GetEntryLabel());
       __ cmpw(Address(temp, primitive_offset), Immediate(Primitive::kPrimNot));
       __ j(kNotEqual, type_check_slow_path->GetEntryLabel());
-      __ Bind(&done);
       break;
     }
 
-    case TypeCheckKind::kUnresolvedCheck:
-    case TypeCheckKind::kInterfaceCheck:
-      NearLabel done;
-
-      // Avoid null check if we know obj is not null.
-      if (instruction->MustDoNullCheck()) {
-        __ testl(obj, obj);
-        __ j(kEqual, &done);
-      }
-
-
+    case TypeCheckKind::kUnresolvedCheck: {
       // We always go into the type check slow path for the unresolved case.
       //
       // We cannot directly call the CheckCast runtime entry point
@@ -6224,18 +6220,14 @@
       // instruction (following the runtime calling convention), which
       // might be cluttered by the potential first read barrier
       // emission at the beginning of this method.
-      //
-      // TODO: Introduce a new runtime entry point taking the object
-      // to test (instead of its class) as argument, and let it deal
-      // with the read barrier issues. This will let us refactor this
-      // case of the `switch` code as it was previously (with a direct
-      // call to the runtime not using a type checking slow path).
-      // This should also be beneficial for the other cases above.
+      __ jmp(type_check_slow_path->GetEntryLabel());
+      break;
+    }
 
-      // Fast path for the interface check. Since we compare with a memory location in the inner
-      // loop we would need to have cls poisoned. However unpoisoning cls would reset the
-      // conditional flags and cause the conditional jump to be incorrect.
-      if (type_check_kind == TypeCheckKind::kInterfaceCheck && !kPoisonHeapReferences) {
+    case TypeCheckKind::kInterfaceCheck:
+      // Fast path for the interface check. We always take the slow path under heap poisoning,
+      // since unpoisoning `cls` would require an extra temp.
+      if (!kPoisonHeapReferences) {
         // Try to avoid read barriers to improve the fast path. We cannot get false positives by
         // doing this.
         // /* HeapReference<Class> */ temp = obj->klass_
@@ -6243,39 +6235,40 @@
                                           temp_loc,
                                           obj_loc,
                                           class_offset,
-                                          /*emit_read_barrier*/ false);
+                                          kWithoutReadBarrier);
 
         // /* HeapReference<Class> */ temp = temp->iftable_
         GenerateReferenceLoadTwoRegisters(instruction,
                                           temp_loc,
                                           temp_loc,
                                           iftable_offset,
-                                          /*emit_read_barrier*/ false);
+                                          kWithoutReadBarrier);
         NearLabel is_null;
         // Null iftable means it is empty.
-        __ testl(temp_loc.AsRegister<CpuRegister>(), temp_loc.AsRegister<CpuRegister>());
+        __ testl(temp, temp);
         __ j(kZero, &is_null);
 
         // Loop through the iftable and check if any class matches.
-        __ movl(maybe_temp2_loc.AsRegister<CpuRegister>(),
-                Address(temp_loc.AsRegister<CpuRegister>(), array_length_offset));
+        __ movl(maybe_temp2_loc.AsRegister<CpuRegister>(), Address(temp, array_length_offset));
 
         NearLabel start_loop;
         __ Bind(&start_loop);
-        __ cmpl(cls.AsRegister<CpuRegister>(),
-                Address(temp_loc.AsRegister<CpuRegister>(), object_array_data_offset));
+        __ cmpl(cls.AsRegister<CpuRegister>(), Address(temp, object_array_data_offset));
         __ j(kEqual, &done);  // Return if same class.
         // Go to next interface.
-        __ addq(temp_loc.AsRegister<CpuRegister>(), Immediate(2 * kHeapReferenceSize));
-        __ subq(maybe_temp2_loc.AsRegister<CpuRegister>(), Immediate(2));
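+        // 32-bit arithmetic suffices: heap references are 32 bits even on x86-64 (the managed
+        // heap lives in the low 4 GiB), and each IfTable entry spans two of them (the
+        // interface class and its method array).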
+        __ addl(temp, Immediate(2 * kHeapReferenceSize));
+        __ subl(maybe_temp2_loc.AsRegister<CpuRegister>(), Immediate(2));
         __ j(kNotZero, &start_loop);
         __ Bind(&is_null);
       }
       __ jmp(type_check_slow_path->GetEntryLabel());
-      __ Bind(&done);
       break;
   }
 
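+  // The `done` label is now shared by all cases above; bind it only if some fast path
+  // actually branched to it.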
+  if (done.IsLinked()) {
+    __ Bind(&done);
+  }
+
   __ Bind(type_check_slow_path->GetExitLabel());
 }
 
@@ -6412,12 +6405,15 @@
   }
 }
 
-void InstructionCodeGeneratorX86_64::GenerateReferenceLoadOneRegister(HInstruction* instruction,
-                                                                      Location out,
-                                                                      uint32_t offset,
-                                                                      Location maybe_temp) {
+void InstructionCodeGeneratorX86_64::GenerateReferenceLoadOneRegister(
+    HInstruction* instruction,
+    Location out,
+    uint32_t offset,
+    Location maybe_temp,
+    ReadBarrierOption read_barrier_option) {
   CpuRegister out_reg = out.AsRegister<CpuRegister>();
-  if (kEmitCompilerReadBarrier) {
+  if (read_barrier_option == kWithReadBarrier) {
+    CHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(out + offset)
@@ -6442,14 +6438,16 @@
   }
 }
 
-void InstructionCodeGeneratorX86_64::GenerateReferenceLoadTwoRegisters(HInstruction* instruction,
-                                                                       Location out,
-                                                                       Location obj,
-                                                                       uint32_t offset,
-                                                                       bool emit_read_barrier) {
+void InstructionCodeGeneratorX86_64::GenerateReferenceLoadTwoRegisters(
+    HInstruction* instruction,
+    Location out,
+    Location obj,
+    uint32_t offset,
+    ReadBarrierOption read_barrier_option) {
   CpuRegister out_reg = out.AsRegister<CpuRegister>();
   CpuRegister obj_reg = obj.AsRegister<CpuRegister>();
-  if (emit_read_barrier) {
+  if (read_barrier_option == kWithReadBarrier) {
+    CHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(obj + offset)
@@ -6469,13 +6467,14 @@
   }
 }
 
-void InstructionCodeGeneratorX86_64::GenerateGcRootFieldLoad(HInstruction* instruction,
-                                                             Location root,
-                                                             const Address& address,
-                                                             Label* fixup_label,
-                                                             bool requires_read_barrier) {
+void InstructionCodeGeneratorX86_64::GenerateGcRootFieldLoad(
+    HInstruction* instruction,
+    Location root,
+    const Address& address,
+    Label* fixup_label,
+    ReadBarrierOption read_barrier_option) {
   CpuRegister root_reg = root.AsRegister<CpuRegister>();
-  if (requires_read_barrier) {
+  if (read_barrier_option == kWithReadBarrier) {
     DCHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 5a6dc54..e5a4152 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -234,7 +234,8 @@
   void GenerateReferenceLoadOneRegister(HInstruction* instruction,
                                         Location out,
                                         uint32_t offset,
-                                        Location maybe_temp);
+                                        Location maybe_temp,
+                                        ReadBarrierOption read_barrier_option);
   // Generate a heap reference load using two different registers
   // `out` and `obj`:
   //
@@ -249,17 +250,17 @@
                                          Location out,
                                          Location obj,
                                          uint32_t offset,
-                                         bool emit_read_barrier);
+                                         ReadBarrierOption read_barrier_option);
   // Generate a GC root reference load:
   //
   //   root <- *address
   //
-  // while honoring read barriers if `requires_read_barrier` is true.
+  // while honoring read barriers based on `read_barrier_option`.
   void GenerateGcRootFieldLoad(HInstruction* instruction,
                                Location root,
                                const Address& address,
                                Label* fixup_label,
-                               bool requires_read_barrier);
+                               ReadBarrierOption read_barrier_option);
 
   void PushOntoFPStack(Location source, uint32_t temp_offset,
                        uint32_t stack_adjustment, bool is_float);
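
The three helpers above trade their `bool` parameters for the two-valued `ReadBarrierOption` enum, making call sites self-describing. A sketch of the assumed shape (the real definition lives elsewhere in the tree and is not part of this diff):

    // Assumed shape, inferred from the kWithReadBarrier / kWithoutReadBarrier
    // uses in the hunks above.
    enum ReadBarrierOption {
      kWithReadBarrier,     // Emit the read barrier sequence for this load.
      kWithoutReadBarrier,  // Plain load; the caller guarantees none is needed.
    };

A call site such as `GenerateGcRootFieldLoad(instr, root, address, fixup_label, kWithoutReadBarrier)` now documents itself, where the old `false` argument did not.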
diff --git a/compiler/optimizing/common_arm.h b/compiler/optimizing/common_arm.h
index 13824ad..5129daf 100644
--- a/compiler/optimizing/common_arm.h
+++ b/compiler/optimizing/common_arm.h
@@ -39,22 +39,22 @@
 
 inline vixl::aarch32::Register HighRegisterFrom(Location location) {
   DCHECK(location.IsRegisterPair()) << location;
-  return vixl::aarch32::Register(location.AsRegisterPairHigh<vixl32::Register>());
+  return vixl::aarch32::Register(location.AsRegisterPairHigh<vixl::aarch32::Register>());
 }
 
 inline vixl::aarch32::DRegister HighDRegisterFrom(Location location) {
   DCHECK(location.IsFpuRegisterPair()) << location;
-  return vixl::aarch32::DRegister(location.AsFpuRegisterPairHigh<vixl32::DRegister>());
+  return vixl::aarch32::DRegister(location.AsFpuRegisterPairHigh<vixl::aarch32::DRegister>());
 }
 
 inline vixl::aarch32::Register LowRegisterFrom(Location location) {
   DCHECK(location.IsRegisterPair()) << location;
-  return vixl::aarch32::Register(location.AsRegisterPairLow<vixl32::Register>());
+  return vixl::aarch32::Register(location.AsRegisterPairLow<vixl::aarch32::Register>());
 }
 
 inline vixl::aarch32::SRegister LowSRegisterFrom(Location location) {
   DCHECK(location.IsFpuRegisterPair()) << location;
-  return vixl::aarch32::SRegister(location.AsFpuRegisterPairLow<vixl32::SRegister>());
+  return vixl::aarch32::SRegister(location.AsFpuRegisterPairLow<vixl::aarch32::SRegister>());
 }
 
 inline vixl::aarch32::Register RegisterFrom(Location location) {
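
The common_arm.h hunk replaces the `vixl32::` shorthand inside the template arguments with fully qualified `vixl::aarch32::` names, presumably because the `vixl32` namespace alias is declared in a source file and is not guaranteed to be visible to every includer of this header. The failure mode in miniature (self-contained, hypothetical names):

    namespace vixl { namespace aarch32 { struct Register { int code; }; } }

    // The alias is only usable in translation units that see its declaration:
    namespace vixl32 = vixl::aarch32;

    // A header meant to be included anywhere must spell out the full name:
    inline vixl::aarch32::Register ZeroRegister() {
      return vixl::aarch32::Register{0};
    }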
diff --git a/compiler/optimizing/emit_swap_mips_test.cc b/compiler/optimizing/emit_swap_mips_test.cc
new file mode 100644
index 0000000..9dc53e6
--- /dev/null
+++ b/compiler/optimizing/emit_swap_mips_test.cc
@@ -0,0 +1,354 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "base/arena_allocator.h"
+#include "code_generator_mips.h"
+#include "optimizing_unit_test.h"
+#include "parallel_move_resolver.h"
+#include "utils/assembler_test_base.h"
+#include "utils/mips/assembler_mips.h"
+
+#include "gtest/gtest.h"
+
+namespace art {
+
+class EmitSwapMipsTest : public ::testing::Test {
+ public:
+  void SetUp() OVERRIDE {
+    allocator_.reset(new ArenaAllocator(&pool_));
+    graph_ = CreateGraph(allocator_.get());
+    isa_features_ = MipsInstructionSetFeatures::FromCppDefines();
+    codegen_ = new (graph_->GetArena()) mips::CodeGeneratorMIPS(graph_,
+                                                                *isa_features_.get(),
+                                                                CompilerOptions());
+    moves_ = new (allocator_.get()) HParallelMove(allocator_.get());
+    test_helper_.reset(
+        new AssemblerTestInfrastructure(GetArchitectureString(),
+                                        GetAssemblerCmdName(),
+                                        GetAssemblerParameters(),
+                                        GetObjdumpCmdName(),
+                                        GetObjdumpParameters(),
+                                        GetDisassembleCmdName(),
+                                        GetDisassembleParameters(),
+                                        GetAssemblyHeader()));
+  }
+
+  void TearDown() OVERRIDE {
+    allocator_.reset();
+    test_helper_.reset();
+  }
+
+  // Get the typically used name for this architecture.
+  std::string GetArchitectureString() {
+    return "mips";
+  }
+
+  // Get the name of the assembler.
+  std::string GetAssemblerCmdName() {
+    return "as";
+  }
+
+  // Switches passed to the assembler command.
+  std::string GetAssemblerParameters() {
+    return " --no-warn -32 -march=mips32r2";
+  }
+
+  // Get the name of the objdump.
+  std::string GetObjdumpCmdName() {
+    return "objdump";
+  }
+
+  // Switches passed to the objdump command.
+  std::string GetObjdumpParameters() {
+    return " -h";
+  }
+
+  // Get the name of the disassembler.
+  std::string GetDisassembleCmdName() {
+    return "objdump";
+  }
+
+  // Switches passed to the disassembler command.
+  std::string GetDisassembleParameters() {
+    return " -D -bbinary -mmips:isa32r2";
+  }
+
+  // No need for assembly header here.
+  const char* GetAssemblyHeader() {
+    return nullptr;
+  }
+
+  void DriverWrapper(HParallelMove* move, std::string assembly_text, std::string test_name) {
+    codegen_->GetMoveResolver()->EmitNativeCode(move);
+    assembler_ = codegen_->GetAssembler();
+    assembler_->FinalizeCode();
+    std::unique_ptr<std::vector<uint8_t>> data(new std::vector<uint8_t>(assembler_->CodeSize()));
+    MemoryRegion code(&(*data)[0], data->size());
+    assembler_->FinalizeInstructions(code);
+    test_helper_->Driver(*data, assembly_text, test_name);
+  }
+
+ protected:
+  ArenaPool pool_;
+  HGraph* graph_;
+  HParallelMove* moves_;
+  mips::CodeGeneratorMIPS* codegen_;
+  mips::MipsAssembler* assembler_;
+  std::unique_ptr<ArenaAllocator> allocator_;
+  std::unique_ptr<AssemblerTestInfrastructure> test_helper_;
+  std::unique_ptr<const MipsInstructionSetFeatures> isa_features_;
+};
+
+TEST_F(EmitSwapMipsTest, TwoRegisters) {
+  moves_->AddMove(
+      Location::RegisterLocation(4),
+      Location::RegisterLocation(5),
+      Primitive::kPrimInt,
+      nullptr);
+  moves_->AddMove(
+      Location::RegisterLocation(5),
+      Location::RegisterLocation(4),
+      Primitive::kPrimInt,
+      nullptr);
+  const char* expected =
+      "or $t8, $a1, $zero\n"
+      "or $a1, $a0, $zero\n"
+      "or $a0, $t8, $zero\n";
+  DriverWrapper(moves_, expected, "TwoRegisters");
+}
+
+TEST_F(EmitSwapMipsTest, TwoRegisterPairs) {
+  moves_->AddMove(
+      Location::RegisterPairLocation(4, 5),
+      Location::RegisterPairLocation(6, 7),
+      Primitive::kPrimLong,
+      nullptr);
+  moves_->AddMove(
+      Location::RegisterPairLocation(6, 7),
+      Location::RegisterPairLocation(4, 5),
+      Primitive::kPrimLong,
+      nullptr);
+  const char* expected =
+      "or $t8, $a2, $zero\n"
+      "or $a2, $a0, $zero\n"
+      "or $a0, $t8, $zero\n"
+      "or $t8, $a3, $zero\n"
+      "or $a3, $a1, $zero\n"
+      "or $a1, $t8, $zero\n";
+  DriverWrapper(moves_, expected, "TwoRegisterPairs");
+}
+
+TEST_F(EmitSwapMipsTest, TwoFpuRegistersFloat) {
+  moves_->AddMove(
+      Location::FpuRegisterLocation(4),
+      Location::FpuRegisterLocation(6),
+      Primitive::kPrimFloat,
+      nullptr);
+  moves_->AddMove(
+      Location::FpuRegisterLocation(6),
+      Location::FpuRegisterLocation(4),
+      Primitive::kPrimFloat,
+      nullptr);
+  const char* expected =
+      "mov.s $f8, $f6\n"
+      "mov.s $f6, $f4\n"
+      "mov.s $f4, $f8\n";
+  DriverWrapper(moves_, expected, "TwoFpuRegistersFloat");
+}
+
+TEST_F(EmitSwapMipsTest, TwoFpuRegistersDouble) {
+  moves_->AddMove(
+      Location::FpuRegisterLocation(4),
+      Location::FpuRegisterLocation(6),
+      Primitive::kPrimDouble,
+      nullptr);
+  moves_->AddMove(
+      Location::FpuRegisterLocation(6),
+      Location::FpuRegisterLocation(4),
+      Primitive::kPrimDouble,
+      nullptr);
+  const char* expected =
+      "mov.d $f8, $f6\n"
+      "mov.d $f6, $f4\n"
+      "mov.d $f4, $f8\n";
+  DriverWrapper(moves_, expected, "TwoFpuRegistersDouble");
+}
+
+TEST_F(EmitSwapMipsTest, RegisterAndFpuRegister) {
+  moves_->AddMove(
+      Location::RegisterLocation(4),
+      Location::FpuRegisterLocation(6),
+      Primitive::kPrimFloat,
+      nullptr);
+  moves_->AddMove(
+      Location::FpuRegisterLocation(6),
+      Location::RegisterLocation(4),
+      Primitive::kPrimFloat,
+      nullptr);
+  const char* expected =
+      "or $t8, $a0, $zero\n"
+      "mfc1 $a0, $f6\n"
+      "mtc1 $t8, $f6\n";
+  DriverWrapper(moves_, expected, "RegisterAndFpuRegister");
+}
+
+TEST_F(EmitSwapMipsTest, RegisterPairAndFpuRegister) {
+  moves_->AddMove(
+      Location::RegisterPairLocation(4, 5),
+      Location::FpuRegisterLocation(4),
+      Primitive::kPrimDouble,
+      nullptr);
+  moves_->AddMove(
+      Location::FpuRegisterLocation(4),
+      Location::RegisterPairLocation(4, 5),
+      Primitive::kPrimDouble,
+      nullptr);
+  const char* expected =
+      "mfc1 $t8, $f4\n"
+      "mfc1 $at, $f5\n"
+      "mtc1 $a0, $f4\n"
+      "mtc1 $a1, $f5\n"
+      "or $a0, $t8, $zero\n"
+      "or $a1, $at, $zero\n";
+  DriverWrapper(moves_, expected, "RegisterPairAndFpuRegister");
+}
+
+TEST_F(EmitSwapMipsTest, TwoStackSlots) {
+  moves_->AddMove(
+      Location::StackSlot(52),
+      Location::StackSlot(48),
+      Primitive::kPrimInt,
+      nullptr);
+  moves_->AddMove(
+      Location::StackSlot(48),
+      Location::StackSlot(52),
+      Primitive::kPrimInt,
+      nullptr);
+  const char* expected =
+      "addiu $sp, $sp, -4\n"
+      "sw $v0, 0($sp)\n"
+      "lw $v0, 56($sp)\n"
+      "lw $t8, 52($sp)\n"
+      "sw $v0, 52($sp)\n"
+      "sw $t8, 56($sp)\n"
+      "lw $v0, 0($sp)\n"
+      "addiu $sp, $sp, 4\n";
+  DriverWrapper(moves_, expected, "TwoStackSlots");
+}
+
+TEST_F(EmitSwapMipsTest, TwoDoubleStackSlots) {
+  moves_->AddMove(
+      Location::DoubleStackSlot(56),
+      Location::DoubleStackSlot(48),
+      Primitive::kPrimLong,
+      nullptr);
+  moves_->AddMove(
+      Location::DoubleStackSlot(48),
+      Location::DoubleStackSlot(56),
+      Primitive::kPrimLong,
+      nullptr);
+  const char* expected =
+      "addiu $sp, $sp, -4\n"
+      "sw $v0, 0($sp)\n"
+      "lw $v0, 60($sp)\n"
+      "lw $t8, 52($sp)\n"
+      "sw $v0, 52($sp)\n"
+      "sw $t8, 60($sp)\n"
+      "lw $v0, 64($sp)\n"
+      "lw $t8, 56($sp)\n"
+      "sw $v0, 56($sp)\n"
+      "sw $t8, 64($sp)\n"
+      "lw $v0, 0($sp)\n"
+      "addiu $sp, $sp, 4\n";
+  DriverWrapper(moves_, expected, "TwoDoubleStackSlots");
+}
+
+TEST_F(EmitSwapMipsTest, RegisterAndStackSlot) {
+  moves_->AddMove(
+      Location::RegisterLocation(4),
+      Location::StackSlot(48),
+      Primitive::kPrimInt,
+      nullptr);
+  moves_->AddMove(
+      Location::StackSlot(48),
+      Location::RegisterLocation(4),
+      Primitive::kPrimInt,
+      nullptr);
+  const char* expected =
+      "or $t8, $a0, $zero\n"
+      "lw $a0, 48($sp)\n"
+      "sw $t8, 48($sp)\n";
+  DriverWrapper(moves_, expected, "RegisterAndStackSlot");
+}
+
+TEST_F(EmitSwapMipsTest, RegisterPairAndDoubleStackSlot) {
+  moves_->AddMove(
+      Location::RegisterPairLocation(4, 5),
+      Location::DoubleStackSlot(32),
+      Primitive::kPrimLong,
+      nullptr);
+  moves_->AddMove(
+      Location::DoubleStackSlot(32),
+      Location::RegisterPairLocation(4, 5),
+      Primitive::kPrimLong,
+      nullptr);
+  const char* expected =
+      "or $t8, $a0, $zero\n"
+      "lw $a0, 32($sp)\n"
+      "sw $t8, 32($sp)\n"
+      "or $t8, $a1, $zero\n"
+      "lw $a1, 36($sp)\n"
+      "sw $t8, 36($sp)\n";
+  DriverWrapper(moves_, expected, "RegisterPairAndDoubleStackSlot");
+}
+
+TEST_F(EmitSwapMipsTest, FpuRegisterAndStackSlot) {
+  moves_->AddMove(
+      Location::FpuRegisterLocation(4),
+      Location::StackSlot(48),
+      Primitive::kPrimFloat,
+      nullptr);
+  moves_->AddMove(
+      Location::StackSlot(48),
+      Location::FpuRegisterLocation(4),
+      Primitive::kPrimFloat,
+      nullptr);
+  const char* expected =
+      "mov.s $f8, $f4\n"
+      "lwc1 $f4, 48($sp)\n"
+      "swc1 $f8, 48($sp)\n";
+  DriverWrapper(moves_, expected, "FpuRegisterAndStackSlot");
+}
+
+TEST_F(EmitSwapMipsTest, FpuRegisterAndDoubleStackSlot) {
+  moves_->AddMove(
+      Location::FpuRegisterLocation(4),
+      Location::DoubleStackSlot(48),
+      Primitive::kPrimDouble,
+      nullptr);
+  moves_->AddMove(
+      Location::DoubleStackSlot(48),
+      Location::FpuRegisterLocation(4),
+      Primitive::kPrimDouble,
+      nullptr);
+  const char* expected =
+      "mov.d $f8, $f4\n"
+      "ldc1 $f4, 48($sp)\n"
+      "sdc1 $f8, 48($sp)\n";
+  DriverWrapper(moves_, expected, "FpuRegisterAndDoubleStackSlot");
+}
+
+}  // namespace art
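
The `or`-triples expected by the register tests above are the classic three-move swap through one scratch register (on MIPS, `or rd, rs, $zero` is a register move). The same dance in plain C++, with `$t8` playing the role of `scratch`:

    void SwapViaScratch(int& a0, int& a1) {
      int scratch = a1;  // or $t8, $a1, $zero
      a1 = a0;           // or $a1, $a0, $zero
      a0 = scratch;      // or $a0, $t8, $zero
    }

The stack-slot tests follow the same pattern, except that a spill slot (hence the `addiu $sp, $sp, -4` prologue) stands in when a second temporary register is not available.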
diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc
index 9e81623..7fe54b9 100644
--- a/compiler/optimizing/inliner.cc
+++ b/compiler/optimizing/inliner.cc
@@ -1226,12 +1226,22 @@
 
   // Skip the entry block, it does not contain instructions that prevent inlining.
   for (HBasicBlock* block : callee_graph->GetReversePostOrderSkipEntryBlock()) {
-    if (block->IsLoopHeader() && block->GetLoopInformation()->IsIrreducible()) {
-      // Don't inline methods with irreducible loops, they could prevent some
-      // optimizations to run.
-      VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index)
-                     << " could not be inlined because it contains an irreducible loop";
-      return false;
+    if (block->IsLoopHeader()) {
+      if (block->GetLoopInformation()->IsIrreducible()) {
+        // Don't inline methods with irreducible loops; they could prevent
+        // some optimizations from running.
+        VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index)
+                       << " could not be inlined because it contains an irreducible loop";
+        return false;
+      }
+      if (!block->GetLoopInformation()->HasExitEdge()) {
+        // Don't inline methods containing loops with no exit edge, since such
+        // loops cause the loop information to be computed incorrectly when it
+        // is updated after inlining.
+        VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index)
+                       << " could not be inlined because it contains a loop with no exit";
+        return false;
+      }
     }
 
     for (HInstructionIterator instr_it(block->GetInstructions());
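
The new bail-out rejects inlinees whose loops never exit. A minimal sketch of what `HasExitEdge()` plausibly checks (not the actual ART implementation): some block inside the loop must have a successor outside it.

    #include <set>
    #include <vector>

    // Blocks are ints here; successors[b] lists the successor blocks of b.
    bool HasExitEdge(const std::set<int>& loop_blocks,
                     const std::vector<std::vector<int>>& successors) {
      for (int block : loop_blocks) {
        for (int succ : successors[block]) {
          if (loop_blocks.count(succ) == 0) {
            return true;  // An edge leaves the loop.
          }
        }
      }
      return false;  // E.g. `while (true) {}` compiled down to a self-loop.
    }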
diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc
index d0dd650..6d107d5 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.cc
+++ b/compiler/optimizing/instruction_simplifier_arm64.cc
@@ -140,13 +140,6 @@
 
 void InstructionSimplifierArm64Visitor::VisitArrayGet(HArrayGet* instruction) {
   size_t data_offset = CodeGenerator::GetArrayDataOffset(instruction);
-  // Don't move the array pointer if it is charAt because we need to take the count first.
-  // TODO: Implement reading (length + compression) for String compression feature from
-  // negative offset (count_offset - data_offset) using LDP and clobbering an extra temporary.
-  // Note that "LDR (Immediate)" does not have a "signed offset" encoding.
-  if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
-    return;
-  }
   if (TryExtractArrayAccessAddress(instruction,
                                    instruction->GetArray(),
                                    instruction->GetIndex(),
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 0c39223..8234b24 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -1058,7 +1058,6 @@
   // Need temporary registers for String compression's feature.
   if (mirror::kUseStringCompression) {
     locations->AddTemp(Location::RequiresRegister());
-    locations->AddTemp(Location::RequiresRegister());
   }
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
 }
@@ -1074,10 +1073,9 @@
   Register temp0 = locations->GetTemp(0).AsRegister<Register>();
   Register temp1 = locations->GetTemp(1).AsRegister<Register>();
   Register temp2 = locations->GetTemp(2).AsRegister<Register>();
-  Register temp3, temp4;
+  Register temp3;
   if (mirror::kUseStringCompression) {
     temp3 = locations->GetTemp(3).AsRegister<Register>();
-    temp4 = locations->GetTemp(4).AsRegister<Register>();
   }
 
   Label loop;
@@ -1104,41 +1102,42 @@
   // Reference equality check, return 0 if same reference.
   __ subs(out, str, ShifterOperand(arg));
   __ b(&end, EQ);
+
   if (mirror::kUseStringCompression) {
-    // Load lengths of this and argument strings.
+    // Load `count` fields of this and argument strings.
     __ ldr(temp3, Address(str, count_offset));
-    __ ldr(temp4, Address(arg, count_offset));
-    // Clean out compression flag from lengths.
-    __ bic(temp0, temp3, ShifterOperand(0x80000000));
-    __ bic(IP, temp4, ShifterOperand(0x80000000));
+    __ ldr(temp2, Address(arg, count_offset));
+    // Extract lengths from the `count` fields.
+    __ Lsr(temp0, temp3, 1u);
+    __ Lsr(temp1, temp2, 1u);
   } else {
     // Load lengths of this and argument strings.
     __ ldr(temp0, Address(str, count_offset));
-    __ ldr(IP, Address(arg, count_offset));
+    __ ldr(temp1, Address(arg, count_offset));
   }
   // out = length diff.
-  __ subs(out, temp0, ShifterOperand(IP));
+  __ subs(out, temp0, ShifterOperand(temp1));
   // temp0 = min(len(str), len(arg)).
   __ it(GT);
-  __ mov(temp0, ShifterOperand(IP), GT);
+  __ mov(temp0, ShifterOperand(temp1), GT);
   // Shorter string is empty?
   __ CompareAndBranchIfZero(temp0, &end);
 
   if (mirror::kUseStringCompression) {
     // Check if both strings using same compression style to use this comparison loop.
-    __ eors(temp3, temp3, ShifterOperand(temp4));
-    __ b(&different_compression, MI);
-  }
-  // Store offset of string value in preparation for comparison loop.
-  __ mov(temp1, ShifterOperand(value_offset));
-  if (mirror::kUseStringCompression) {
+    __ eor(temp2, temp2, ShifterOperand(temp3));
+    __ Lsrs(temp2, temp2, 1u);
+    __ b(&different_compression, CS);
     // For string compression, calculate the number of bytes to compare (not chars).
     // This could in theory exceed INT32_MAX, so treat temp0 as unsigned.
-    __ cmp(temp4, ShifterOperand(0));
-    __ it(GE);
-    __ add(temp0, temp0, ShifterOperand(temp0), GE);
+    __ Lsls(temp3, temp3, 31u);  // Extract purely the compression flag.
+    __ it(NE);
+    __ add(temp0, temp0, ShifterOperand(temp0), NE);
   }
 
+  // Store offset of string value in preparation for comparison loop.
+  __ mov(temp1, ShifterOperand(value_offset));
+
   // Assertions that must hold in order to compare multiple characters at a time.
   CHECK_ALIGNED(value_offset, 8);
   static_assert(IsAligned<8>(kObjectAlignment),
@@ -1198,69 +1197,80 @@
   // The comparison is unsigned for string compression, otherwise signed.
   __ cmp(temp0, ShifterOperand(temp1, LSR, mirror::kUseStringCompression ? 3 : 4));
   __ b(&end, mirror::kUseStringCompression ? LS : LE);
+
   // Extract the characters and calculate the difference.
-  Label uncompressed_string, continue_process;
   if (mirror::kUseStringCompression) {
-    __ cmp(temp4, ShifterOperand(0));
-    __ b(&uncompressed_string, GE);
-    __ bic(temp1, temp1, ShifterOperand(0x7));
-    __ b(&continue_process);
+    // For compressed strings we need to clear 0x7 from temp1, for uncompressed we need to clear
+    // 0xf. We also need to prepare the character extraction mask `uncompressed ? 0xffffu : 0xffu`.
+    // The compression flag is now in the highest bit of temp3, so let's play some tricks.
+    __ orr(temp3, temp3, ShifterOperand(0xffu << 23));  // uncompressed ? 0xff800000u : 0x7f800000u
+    __ bic(temp1, temp1, ShifterOperand(temp3, LSR, 31 - 3));  // &= ~(uncompressed ? 0xfu : 0x7u)
+    __ Asr(temp3, temp3, 7u);                           // uncompressed ? 0xffff0000u : 0xff0000u.
+    __ Lsr(temp2, temp2, temp1);                        // Extract second character.
+    __ Lsr(temp3, temp3, 16u);                          // uncompressed ? 0xffffu : 0xffu
+    __ Lsr(out, IP, temp1);                             // Extract first character.
+    __ and_(temp2, temp2, ShifterOperand(temp3));
+    __ and_(out, out, ShifterOperand(temp3));
+  } else {
+    __ bic(temp1, temp1, ShifterOperand(0xf));
+    __ Lsr(temp2, temp2, temp1);
+    __ Lsr(out, IP, temp1);
+    __ movt(temp2, 0);
+    __ movt(out, 0);
   }
-  __ Bind(&uncompressed_string);
-  __ bic(temp1, temp1, ShifterOperand(0xf));
-  __ Bind(&continue_process);
 
-  __ Lsr(temp2, temp2, temp1);
-  __ Lsr(IP, IP, temp1);
-  Label calculate_difference, uncompressed_string_extract_chars;
-  if (mirror::kUseStringCompression) {
-    __ cmp(temp4, ShifterOperand(0));
-    __ b(&uncompressed_string_extract_chars, GE);
-    __ ubfx(temp2, temp2, 0, 8);
-    __ ubfx(IP, IP, 0, 8);
-    __ b(&calculate_difference);
-  }
-  __ Bind(&uncompressed_string_extract_chars);
-  __ movt(temp2, 0);
-  __ movt(IP, 0);
-  __ Bind(&calculate_difference);
-  __ sub(out, IP, ShifterOperand(temp2));
-  __ b(&end);
+  __ sub(out, out, ShifterOperand(temp2));
 
   if (mirror::kUseStringCompression) {
+    __ b(&end);
+    __ Bind(&different_compression);
+
+    // Comparison for different compression style.
     const size_t c_char_size = Primitive::ComponentSize(Primitive::kPrimByte);
     DCHECK_EQ(c_char_size, 1u);
-    Label loop_arg_compressed, loop_this_compressed, find_diff;
-    // Comparison for different compression style.
-    // This part is when THIS is compressed and ARG is not.
-    __ Bind(&different_compression);
-    __ add(temp2, str, ShifterOperand(value_offset));
-    __ add(temp3, arg, ShifterOperand(value_offset));
-    __ cmp(temp4, ShifterOperand(0));
-    __ b(&loop_arg_compressed, LT);
 
-    __ Bind(&loop_this_compressed);
-    __ ldrb(IP, Address(temp2, c_char_size, Address::PostIndex));
-    __ ldrh(temp4, Address(temp3, char_size, Address::PostIndex));
-    __ cmp(IP, ShifterOperand(temp4));
-    __ b(&find_diff, NE);
-    __ subs(temp0, temp0, ShifterOperand(1));
-    __ b(&loop_this_compressed, GT);
-    __ b(&end);
+    // We want to free up temp3, currently holding `str.count`, for comparison.
+    // So, we move it to the bottom bit of the iteration count `temp0`, which we
+    // then need to treat as unsigned. Start by freeing the bit with an ADD and
+    // continue further down with a LSRS+SBC, which flips the meaning of the flag
+    // but allows `subs temp0, #2; bhi different_compression_loop` to serve as
+    // the loop condition.
+    __ add(temp0, temp0, ShifterOperand(temp0));  // Unlike LSL, this ADD is always 16-bit.
+    // `temp1` will hold the compressed data pointer, `temp2` the uncompressed data pointer.
+    __ mov(temp1, ShifterOperand(str));
+    __ mov(temp2, ShifterOperand(arg));
+    __ Lsrs(temp3, temp3, 1u);                // Continue the move of the compression flag.
+    __ it(CS, kItThen);                       // Interleave with selection of temp1 and temp2.
+    __ mov(temp1, ShifterOperand(arg), CS);   // Preserves flags.
+    __ mov(temp2, ShifterOperand(str), CS);   // Preserves flags.
+    __ sbc(temp0, temp0, ShifterOperand(0));  // Complete the move of the compression flag.
 
-    // This part is when THIS is not compressed and ARG is.
-    __ Bind(&loop_arg_compressed);
-    __ ldrh(IP, Address(temp2, char_size, Address::PostIndex));
-    __ ldrb(temp4, Address(temp3, c_char_size, Address::PostIndex));
-    __ cmp(IP, ShifterOperand(temp4));
-    __ b(&find_diff, NE);
-    __ subs(temp0, temp0, ShifterOperand(1));
-    __ b(&loop_arg_compressed, GT);
+    // Adjust temp1 and temp2 from string pointers to data pointers.
+    __ add(temp1, temp1, ShifterOperand(value_offset));
+    __ add(temp2, temp2, ShifterOperand(value_offset));
+
+    Label different_compression_loop;
+    Label different_compression_diff;
+
+    // Main loop for different compression.
+    __ Bind(&different_compression_loop);
+    __ ldrb(IP, Address(temp1, c_char_size, Address::PostIndex));
+    __ ldrh(temp3, Address(temp2, char_size, Address::PostIndex));
+    __ cmp(IP, ShifterOperand(temp3));
+    __ b(&different_compression_diff, NE);
+    __ subs(temp0, temp0, ShifterOperand(2));
+    __ b(&different_compression_loop, HI);
     __ b(&end);
 
     // Calculate the difference.
-    __ Bind(&find_diff);
-    __ sub(out, IP, ShifterOperand(temp4));
+    __ Bind(&different_compression_diff);
+    __ sub(out, IP, ShifterOperand(temp3));
+    // Flip the difference if `arg` is compressed.
+    // `temp0` contains the inverted `str` compression flag, i.e. the same as
+    // the `arg` compression flag.
+    __ Lsrs(temp0, temp0, 1u);
+    static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                  "Expecting 0=compressed, 1=uncompressed");
+    __ it(CC);
+    __ rsb(out, out, ShifterOperand(0), CC);
   }
 
   __ Bind(&end);
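
All of the `Lsr #1` and bit-0 tests in these string-compression hunks assume the new `count` encoding: the compression flag moved from the sign bit to bit 0 (0 = compressed, 1 = uncompressed, per the static_asserts above), with the character length in the remaining bits. A self-contained sketch of that encoding:

    #include <cstdint>

    constexpr uint32_t kCompressedFlag   = 0u;  // Matches the static_asserts.
    constexpr uint32_t kUncompressedFlag = 1u;

    constexpr uint32_t EncodeCount(uint32_t length, uint32_t flag) {
      return (length << 1) | flag;
    }
    constexpr uint32_t LengthOf(uint32_t count) { return count >> 1; }  // Lsr #1
    constexpr bool IsCompressed(uint32_t count) {
      return (count & 1u) == kCompressedFlag;  // tst #1 / Tbz bit 0
    }

    static_assert(LengthOf(EncodeCount(5, kUncompressedFlag)) == 5, "round-trip");
    static_assert(IsCompressed(EncodeCount(0, kCompressedFlag)), "empty is compressed");

This is also why equal `count` fields now imply equal length and equal compression style in a single comparison.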
@@ -1298,7 +1308,7 @@
   Register temp1 = locations->GetTemp(1).AsRegister<Register>();
   Register temp2 = locations->GetTemp(2).AsRegister<Register>();
 
-  Label loop, preloop;
+  Label loop;
   Label end;
   Label return_true;
   Label return_false;
@@ -1317,6 +1327,10 @@
     __ CompareAndBranchIfZero(arg, &return_false);
   }
 
+  // Reference equality check, return true if same reference.
+  __ cmp(str, ShifterOperand(arg));
+  __ b(&return_true, EQ);
+
   if (!optimizations.GetArgumentIsString()) {
     // Instanceof check for the argument by comparing class fields.
     // All string objects must have the same type since String cannot be subclassed.
@@ -1328,48 +1342,44 @@
     __ b(&return_false, NE);
   }
 
-  // Load lengths of this and argument strings.
+  // Load `count` fields of this and argument strings.
   __ ldr(temp, Address(str, count_offset));
   __ ldr(temp1, Address(arg, count_offset));
-  // Check if lengths are equal, return false if they're not.
+  // Check if `count` fields are equal, return false if they're not.
   // Also compares the compression style, if differs return false.
   __ cmp(temp, ShifterOperand(temp1));
   __ b(&return_false, NE);
-  // Return true if both strings are empty.
-  if (mirror::kUseStringCompression) {
-    // Length needs to be masked out first because 0 is treated as compressed.
-    __ bic(temp, temp, ShifterOperand(0x80000000));
-  }
+  // Return true if both strings are empty. Even with string compression `count == 0` means empty.
+  static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                "Expecting 0=compressed, 1=uncompressed");
   __ cbz(temp, &return_true);
-  // Reference equality check, return true if same reference.
-  __ cmp(str, ShifterOperand(arg));
-  __ b(&return_true, EQ);
 
-  // Assertions that must hold in order to compare strings 2 characters at a time.
+  // Assertions that must hold in order to compare strings 4 bytes at a time.
   DCHECK_ALIGNED(value_offset, 4);
   static_assert(IsAligned<4>(kObjectAlignment), "String data must be aligned for fast compare.");
 
   if (mirror::kUseStringCompression) {
-    // If not compressed, directly to fast compare. Else do preprocess on length.
-    __ cmp(temp1, ShifterOperand(0));
-    __ b(&preloop, GT);
-    // Mask out compression flag and adjust length for compressed string (8-bit)
-    // as if it is a 16-bit data, new_length = (length + 1) / 2.
-    __ add(temp, temp, ShifterOperand(1));
-    __ Lsr(temp, temp, 1);
-    __ Bind(&preloop);
+    // For string compression, calculate the number of bytes to compare (not chars).
+    // This could in theory exceed INT32_MAX, so treat temp as unsigned.
+    __ Lsrs(temp, temp, 1u);                        // Extract length and check compression flag.
+    __ it(CS);                                      // If uncompressed,
+    __ add(temp, temp, ShifterOperand(temp), CS);   //   double the byte count.
   }
-  // Loop to compare strings 2 characters at a time starting at the front of the string.
-  // Ok to do this because strings with an odd length are zero-padded.
+
+  // Store offset of string value in preparation for comparison loop.
   __ LoadImmediate(temp1, value_offset);
+
+  // Loop to compare strings 4 bytes at a time starting at the front of the string.
+  // Ok to do this because strings are zero-padded to kObjectAlignment.
   __ Bind(&loop);
   __ ldr(out, Address(str, temp1));
   __ ldr(temp2, Address(arg, temp1));
+  __ add(temp1, temp1, ShifterOperand(sizeof(uint32_t)));
   __ cmp(out, ShifterOperand(temp2));
   __ b(&return_false, NE);
-  __ add(temp1, temp1, ShifterOperand(sizeof(uint32_t)));
-  __ subs(temp, temp, ShifterOperand(sizeof(uint32_t) /  sizeof(uint16_t)));
-  __ b(&loop, GT);
+  // With string compression, we have compared 4 bytes, otherwise 2 chars.
+  __ subs(temp, temp, ShifterOperand(mirror::kUseStringCompression ? 4 : 2));
+  __ b(&loop, HI);
 
   // Return true and exit the function.
   // If loop does not result in returning false, we return true.
@@ -2477,8 +2487,8 @@
     const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
     // String's length.
     __ ldr(IP, Address(srcObj, count_offset));
-    __ cmp(IP, ShifterOperand(0));
-    __ b(&compressed_string_preloop, LT);
+    __ tst(IP, ShifterOperand(1));
+    __ b(&compressed_string_preloop, EQ);
   }
   __ add(src_ptr, src_ptr, ShifterOperand(srcBegin, LSL, 1));
 
@@ -2513,9 +2523,10 @@
   __ subs(num_chr, num_chr, ShifterOperand(1));
   __ strh(IP, Address(dst_ptr, char_size, Address::PostIndex));
   __ b(&remainder, GT);
-  __ b(&done);
 
   if (mirror::kUseStringCompression) {
+    __ b(&done);
+
     const size_t c_char_size = Primitive::ComponentSize(Primitive::kPrimByte);
     DCHECK_EQ(c_char_size, 1u);
     // Copy loop for compressed src, copying 1 character (8-bit) to (16-bit) at a time.
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index b9424a3..451abc5 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -1243,7 +1243,6 @@
   // Need temporary registers for String compression's feature.
   if (mirror::kUseStringCompression) {
     locations->AddTemp(Location::RequiresRegister());
-    locations->AddTemp(Location::RequiresRegister());
   }
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
 }
@@ -1261,10 +1260,9 @@
   Register temp0 = WRegisterFrom(locations->GetTemp(0));
   Register temp1 = WRegisterFrom(locations->GetTemp(1));
   Register temp2 = WRegisterFrom(locations->GetTemp(2));
-  Register temp3, temp5;
+  Register temp3;
   if (mirror::kUseStringCompression) {
     temp3 = WRegisterFrom(locations->GetTemp(3));
-    temp5 = WRegisterFrom(locations->GetTemp(4));
   }
 
   vixl::aarch64::Label loop;
@@ -1291,68 +1289,65 @@
   // Reference equality check, return 0 if same reference.
   __ Subs(out, str, arg);
   __ B(&end, eq);
+
   if (mirror::kUseStringCompression) {
-    // Load lengths of this and argument strings.
+    // Load `count` fields of this and argument strings.
     __ Ldr(temp3, HeapOperand(str, count_offset));
-    __ Ldr(temp5, HeapOperand(arg, count_offset));
+    __ Ldr(temp2, HeapOperand(arg, count_offset));
     // Clean out compression flag from lengths.
-    __ Bic(temp0, temp3, Operand(static_cast<int32_t>(0x80000000)));
-    __ Bic(temp1, temp5, Operand(static_cast<int32_t>(0x80000000)));
+    __ Lsr(temp0, temp3, 1u);
+    __ Lsr(temp1, temp2, 1u);
   } else {
     // Load lengths of this and argument strings.
     __ Ldr(temp0, HeapOperand(str, count_offset));
     __ Ldr(temp1, HeapOperand(arg, count_offset));
   }
-  // Return zero if both strings are empty.
-  __ Orr(out, temp0, temp1);
-  __ Cbz(out, &end);
   // out = length diff.
   __ Subs(out, temp0, temp1);
-  // temp2 = min(len(str), len(arg)).
-  __ Csel(temp2, temp1, temp0, ge);
+  // temp0 = min(len(str), len(arg)).
+  __ Csel(temp0, temp1, temp0, ge);
   // Shorter string is empty?
-  __ Cbz(temp2, &end);
+  __ Cbz(temp0, &end);
 
   if (mirror::kUseStringCompression) {
     // Check if both strings using same compression style to use this comparison loop.
-    __ Eor(temp3.W(), temp3, Operand(temp5));
-    __ Tbnz(temp3.W(), kWRegSize - 1, &different_compression);
+    __ Eor(temp2, temp2, Operand(temp3));
+    // Interleave with the compression flag extraction, which is needed for both
+    // paths, and also set the condition flags, which are needed only for the
+    // different-compression path.
+    __ Ands(temp3.W(), temp3.W(), Operand(1));
+    __ Tbnz(temp2, 0, &different_compression);  // Does not use flags.
   }
   // Store offset of string value in preparation for comparison loop.
   __ Mov(temp1, value_offset);
   if (mirror::kUseStringCompression) {
     // For string compression, calculate the number of bytes to compare (not chars).
-    // This could be in theory exceed INT32_MAX, so treat temp2 as unsigned.
-    vixl::aarch64::Label let_it_signed;
-    __ Cmp(temp5, Operand(0));
-    __ B(lt, &let_it_signed);
-    __ Add(temp2, temp2, Operand(temp2));
-    __ Bind(&let_it_signed);
+    // This could in theory exceed INT32_MAX, so treat temp0 as unsigned.
+    __ Lsl(temp0, temp0, temp3);
   }
 
   UseScratchRegisterScope scratch_scope(masm);
   Register temp4 = scratch_scope.AcquireX();
 
-  // Assertions that must hold in order to compare strings 4 characters at a time.
+  // Assertions that must hold in order to compare strings 8 bytes at a time.
   DCHECK_ALIGNED(value_offset, 8);
   static_assert(IsAligned<8>(kObjectAlignment), "String of odd length is not zero padded");
 
   const size_t char_size = Primitive::ComponentSize(Primitive::kPrimChar);
   DCHECK_EQ(char_size, 2u);
 
-  // Promote temp0 to an X reg, ready for LDR.
-  temp0 = temp0.X();
+  // Promote temp2 to an X reg, ready for LDR.
+  temp2 = temp2.X();
 
   // Loop to compare 4x16-bit characters at a time (ok because of string data alignment).
   __ Bind(&loop);
   __ Ldr(temp4, MemOperand(str.X(), temp1.X()));
-  __ Ldr(temp0, MemOperand(arg.X(), temp1.X()));
-  __ Cmp(temp4, temp0);
+  __ Ldr(temp2, MemOperand(arg.X(), temp1.X()));
+  __ Cmp(temp4, temp2);
   __ B(ne, &find_char_diff);
   __ Add(temp1, temp1, char_size * 4);
   // With string compression, we have compared 8 bytes, otherwise 4 chars.
-  __ Subs(temp2, temp2, (mirror::kUseStringCompression) ? 8 : 4);
-  __ B(hi, &loop);
+  __ Subs(temp0, temp0, (mirror::kUseStringCompression) ? 8 : 4);
+  __ B(&loop, hi);
   __ B(&end);
 
   // Promote temp1 to an X reg, ready for EOR.
@@ -1361,78 +1356,85 @@
   // Find the single character difference.
   __ Bind(&find_char_diff);
   // Get the bit position of the first character that differs.
-  __ Eor(temp1, temp0, temp4);
+  __ Eor(temp1, temp2, temp4);
   __ Rbit(temp1, temp1);
   __ Clz(temp1, temp1);
+
   // If the number of chars remaining <= the index where the difference occurs (0-3), then
   // the difference occurs outside the remaining string data, so just return length diff (out).
   // Unlike ARM, we're doing the comparison in one go here, without the subtraction at the
   // find_char_diff_2nd_cmp path, so it doesn't matter whether the comparison is signed or
   // unsigned when string compression is disabled.
   // When it's enabled, the comparison must be unsigned.
-  __ Cmp(temp2, Operand(temp1.W(), LSR, (mirror::kUseStringCompression) ? 3 : 4));
+  __ Cmp(temp0, Operand(temp1.W(), LSR, (mirror::kUseStringCompression) ? 3 : 4));
   __ B(ls, &end);
+
   // Extract the characters and calculate the difference.
-  vixl::aarch64::Label uncompressed_string, continue_process;
   if (mirror::kUseStringCompression) {
-    __ Tbz(temp5, kWRegSize - 1, &uncompressed_string);
     __ Bic(temp1, temp1, 0x7);
-    __ B(&continue_process);
+    __ Bic(temp1, temp1, Operand(temp3.X(), LSL, 3u));
+  } else {
+    __ Bic(temp1, temp1, 0xf);
   }
-  __ Bind(&uncompressed_string);
-  __ Bic(temp1, temp1, 0xf);
-  __ Bind(&continue_process);
-
-  __ Lsr(temp0, temp0, temp1);
+  __ Lsr(temp2, temp2, temp1);
   __ Lsr(temp4, temp4, temp1);
-  vixl::aarch64::Label uncompressed_string_extract_chars;
   if (mirror::kUseStringCompression) {
-    __ Tbz(temp5, kWRegSize - 1, &uncompressed_string_extract_chars);
-    __ And(temp4, temp4, 0xff);
-    __ Sub(out, temp4.W(), Operand(temp0.W(), UXTB));
-    __ B(&end);
+    // Prioritize the case of compressed strings and calculate that result first.
+    __ Uxtb(temp1, temp4);
+    __ Sub(out, temp1.W(), Operand(temp2.W(), UXTB));
+    __ Tbz(temp3, 0u, &end);  // If actually compressed, we're done.
   }
-  __ Bind(&uncompressed_string_extract_chars);
-  __ And(temp4, temp4, 0xffff);
-  __ Sub(out, temp4.W(), Operand(temp0.W(), UXTH));
-  __ B(&end);
+  __ Uxth(temp4, temp4);
+  __ Sub(out, temp4.W(), Operand(temp2.W(), UXTH));
 
   if (mirror::kUseStringCompression) {
-    vixl::aarch64::Label loop_this_compressed, loop_arg_compressed, find_diff;
+    __ B(&end);
+    __ Bind(&different_compression);
+
+    // Comparison for different compression style.
     const size_t c_char_size = Primitive::ComponentSize(Primitive::kPrimByte);
     DCHECK_EQ(c_char_size, 1u);
-    temp0 = temp0.W();
     temp1 = temp1.W();
-    // Comparison for different compression style.
-    // This part is when THIS is compressed and ARG is not.
-    __ Bind(&different_compression);
-    __ Add(temp0, str, Operand(value_offset));
-    __ Add(temp1, arg, Operand(value_offset));
-    __ Cmp(temp5, Operand(0));
-    __ B(lt, &loop_arg_compressed);
+    temp2 = temp2.W();
+    temp4 = temp4.W();
 
-    __ Bind(&loop_this_compressed);
-    __ Ldrb(temp3, MemOperand(temp0.X(), c_char_size, PostIndex));
-    __ Ldrh(temp5, MemOperand(temp1.X(), char_size, PostIndex));
-    __ Cmp(temp3, Operand(temp5));
-    __ B(ne, &find_diff);
-    __ Subs(temp2, temp2, 1);
-    __ B(gt, &loop_this_compressed);
-    __ B(&end);
+    // `temp1` will hold the compressed data pointer, `temp2` the uncompressed data pointer.
+    // Note that flags have been set by the `str` compression flag extraction to `temp3`
+    // before branching to the `different_compression` label.
+    __ Csel(temp1, str, arg, eq);   // Pointer to the compressed string.
+    __ Csel(temp2, str, arg, ne);   // Pointer to the uncompressed string.
 
-    // This part is when THIS is not compressed and ARG is.
-    __ Bind(&loop_arg_compressed);
-    __ Ldrh(temp3, MemOperand(temp0.X(), char_size, PostIndex));
-    __ Ldrb(temp5, MemOperand(temp1.X(), c_char_size, PostIndex));
-    __ Cmp(temp3, Operand(temp5));
-    __ B(ne, &find_diff);
-    __ Subs(temp2, temp2, 1);
-    __ B(gt, &loop_arg_compressed);
+    // We want to free up temp3, currently holding the `str` compression flag, for
+    // comparison. So, we move it to the bottom bit of the iteration count `temp0`,
+    // which we then need to treat as unsigned. Start by freeing the bit with a LSL
+    // and continue further down with a SUB, which allows
+    // `subs temp0, #2; bhi different_compression_loop` to serve as the loop condition.
+    __ Lsl(temp0, temp0, 1u);
+
+    // Adjust temp1 and temp2 from string pointers to data pointers.
+    __ Add(temp1, temp1, Operand(value_offset));
+    __ Add(temp2, temp2, Operand(value_offset));
+
+    // Complete the move of the compression flag.
+    __ Sub(temp0, temp0, Operand(temp3));
+
+    vixl::aarch64::Label different_compression_loop;
+    vixl::aarch64::Label different_compression_diff;
+
+    __ Bind(&different_compression_loop);
+    __ Ldrb(temp4, MemOperand(temp1.X(), c_char_size, PostIndex));
+    __ Ldrh(temp3, MemOperand(temp2.X(), char_size, PostIndex));
+    __ Subs(temp4, temp4, Operand(temp3));
+    __ B(&different_compression_diff, ne);
+    __ Subs(temp0, temp0, 2);
+    __ B(&different_compression_loop, hi);
     __ B(&end);
 
     // Calculate the difference.
-    __ Bind(&find_diff);
-    __ Sub(out, temp3.W(), Operand(temp5.W(), UXTH));
+    __ Bind(&different_compression_diff);
+    __ Tst(temp0, Operand(1));
+    static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                  "Expecting 0=compressed, 1=uncompressed");
+    __ Cneg(out, temp4, ne);
   }
 
   __ Bind(&end);
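
The different-compression loop above keeps the iteration budget and the leftover compression flag in a single register: the count is shifted up by one bit so the flag can live in bit 0, and the `subs temp0, #2; bhi` step never disturbs that bit. The invariant in plain C++ (a sketch, not ART code):

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    int main() {
      for (uint32_t flag : {0u, 1u}) {
        uint32_t len = 5;                      // Character pairs to compare.
        uint32_t counter = (len << 1) | flag;  // Flag parked in bit 0.
        uint32_t iterations = 0;
        while (counter > 1) {                  // `subs #2; bhi` as a C loop test.
          ++iterations;                        // ... compare one char pair ...
          counter -= 2;                        // Bit 0 is never touched.
        }
        assert(iterations == len);
        assert((counter & 1u) == flag);        // Flag still readable afterwards.
      }
      return 0;
    }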
@@ -1468,7 +1470,7 @@
   Register temp1 = WRegisterFrom(locations->GetTemp(0));
   Register temp2 = WRegisterFrom(locations->GetTemp(1));
 
-  vixl::aarch64::Label loop, preloop;
+  vixl::aarch64::Label loop;
   vixl::aarch64::Label end;
   vixl::aarch64::Label return_true;
   vixl::aarch64::Label return_false;
@@ -1502,49 +1504,46 @@
     __ B(&return_false, ne);
   }
 
-  // Load lengths of this and argument strings.
+  // Load `count` fields of this and argument strings.
   __ Ldr(temp, MemOperand(str.X(), count_offset));
   __ Ldr(temp1, MemOperand(arg.X(), count_offset));
-  // Check if lengths are equal, return false if they're not.
+  // Check if `count` fields are equal, return false if they're not.
   // Also compares the compression style, if differs return false.
   __ Cmp(temp, temp1);
   __ B(&return_false, ne);
-  // Return true if both strings are empty.
-  if (mirror::kUseStringCompression) {
-    // Length needs to be masked out first because 0 is treated as compressed.
-    __ Bic(temp, temp, Operand(static_cast<int32_t>(0x80000000)));
-  }
+  // Return true if both strings are empty. Even with string compression `count == 0` means empty.
+  static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                "Expecting 0=compressed, 1=uncompressed");
   __ Cbz(temp, &return_true);
 
-  // Assertions that must hold in order to compare strings 4 characters at a time.
+  // Assertions that must hold in order to compare strings 8 bytes at a time.
   DCHECK_ALIGNED(value_offset, 8);
   static_assert(IsAligned<8>(kObjectAlignment), "String of odd length is not zero padded");
 
   if (mirror::kUseStringCompression) {
-    // If not compressed, directly to fast compare. Else do preprocess on length.
-    __ Cmp(temp1, Operand(0));
-    __ B(&preloop, gt);
-    // Mask out compression flag and adjust length for compressed string (8-bit)
-    // as if it is a 16-bit data, new_length = (length + 1) / 2
-    __ Add(temp, temp, 1);
-    __ Lsr(temp, temp, 1);
+    // For string compression, calculate the number of bytes to compare (not chars).
+    // This could in theory exceed INT32_MAX, so treat temp as unsigned.
+    __ Lsr(temp, temp, 1u);             // Extract length.
+    __ And(temp1, temp1, Operand(1));   // Extract compression flag.
+    __ Lsl(temp, temp, temp1);          // Calculate number of bytes to compare.
   }
 
+  // Store offset of string value in preparation for comparison loop.
+  __ Mov(temp1, value_offset);
+
   temp1 = temp1.X();
   temp2 = temp2.X();
-  // Loop to compare strings 4 characters at a time starting at the beginning of the string.
-  // Ok to do this because strings are zero-padded to be 8-byte aligned.
-  // Store offset of string value in preparation for comparison loop
-  __ Bind(&preloop);
-  __ Mov(temp1, value_offset);
+  // Loop to compare strings 8 bytes at a time starting at the front of the string.
+  // Ok to do this because strings are zero-padded to kObjectAlignment.
   __ Bind(&loop);
   __ Ldr(out, MemOperand(str.X(), temp1));
   __ Ldr(temp2, MemOperand(arg.X(), temp1));
   __ Add(temp1, temp1, Operand(sizeof(uint64_t)));
   __ Cmp(out, temp2);
   __ B(&return_false, ne);
-  __ Sub(temp, temp, Operand(4), SetFlags);
-  __ B(&loop, gt);
+  // With string compression, we have compared 8 bytes, otherwise 4 chars.
+  __ Sub(temp, temp, Operand(mirror::kUseStringCompression ? 8 : 4), SetFlags);
+  __ B(&loop, hi);
 
   // Return true and exit the function.
   // If loop does not result in returning false, we return true.
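
With the bit-0 encoding, the byte count for the fast comparison loop falls out of three instructions: the length is `count >> 1`, the flag is `count & 1`, and the variable shift doubles the count exactly when the string is uncompressed. As a checkable sketch:

    #include <cstdint>

    constexpr uint32_t BytesToCompare(uint32_t count) {
      // (count >> 1) is the length (Lsr), (count & 1) the flag (And), and the
      // variable shift (Lsl) doubles it only for uncompressed strings.
      return (count >> 1) << (count & 1u);
    }

    static_assert(BytesToCompare((5u << 1) | 1u) == 10u, "uncompressed: 2 bytes/char");
    static_assert(BytesToCompare((5u << 1) | 0u) == 5u, "compressed: 1 byte/char");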
@@ -1900,10 +1899,6 @@
   locations->AddTemp(Location::RequiresRegister());
   locations->AddTemp(Location::RequiresRegister());
   locations->AddTemp(Location::RequiresRegister());
-  // Need temporary register for String compression feature.
-  if (mirror::kUseStringCompression) {
-    locations->AddTemp(Location::RequiresRegister());
-  }
 }
 
 void IntrinsicCodeGeneratorARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
@@ -1931,10 +1926,6 @@
   Register src_ptr = XRegisterFrom(locations->GetTemp(0));
   Register num_chr = XRegisterFrom(locations->GetTemp(1));
   Register tmp1 = XRegisterFrom(locations->GetTemp(2));
-  Register tmp3;
-  if (mirror::kUseStringCompression) {
-    tmp3 = WRegisterFrom(locations->GetTemp(3));
-  }
 
   UseScratchRegisterScope temps(masm);
   Register dst_ptr = temps.AcquireX();
@@ -1957,8 +1948,8 @@
     // Location of count in string.
     const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
     // String's length.
-    __ Ldr(tmp3, MemOperand(srcObj, count_offset));
-    __ Tbnz(tmp3, kWRegSize - 1, &compressed_string_preloop);
+    __ Ldr(tmp2, MemOperand(srcObj, count_offset));
+    __ Tbz(tmp2, 0, &compressed_string_preloop);
   }
   __ Add(src_ptr, src_ptr, Operand(srcBegin, LSL, 1));
 
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index e5240a2..e4bef34 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -1120,7 +1120,6 @@
   // Need temporary registers for String compression's feature.
   if (mirror::kUseStringCompression) {
     locations->AddTemp(Location::RequiresRegister());
-    locations->AddTemp(Location::RequiresRegister());
   }
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
 }
@@ -1136,10 +1135,9 @@
   vixl32::Register temp0 = RegisterFrom(locations->GetTemp(0));
   vixl32::Register temp1 = RegisterFrom(locations->GetTemp(1));
   vixl32::Register temp2 = RegisterFrom(locations->GetTemp(2));
-  vixl32::Register temp3, temp4;
+  vixl32::Register temp3;
   if (mirror::kUseStringCompression) {
     temp3 = RegisterFrom(locations->GetTemp(3));
-    temp4 = RegisterFrom(locations->GetTemp(4));
   }
 
   vixl32::Label loop;
@@ -1167,23 +1165,20 @@
   __ Subs(out, str, arg);
   __ B(eq, &end);
 
-  UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
-  vixl32::Register temp_reg = temps.Acquire();
-
   if (mirror::kUseStringCompression) {
-    // Load lengths of this and argument strings.
+    // Load `count` fields of this and argument strings.
     __ Ldr(temp3, MemOperand(str, count_offset));
-    __ Ldr(temp4, MemOperand(arg, count_offset));
-    // Clean out compression flag from lengths.
-    __ Bic(temp0, temp3, 0x80000000);
-    __ Bic(temp_reg, temp4, 0x80000000);
+    __ Ldr(temp2, MemOperand(arg, count_offset));
+    // Extract lengths from the `count` fields.
+    __ Lsr(temp0, temp3, 1u);
+    __ Lsr(temp1, temp2, 1u);
   } else {
     // Load lengths of this and argument strings.
     __ Ldr(temp0, MemOperand(str, count_offset));
-    __ Ldr(temp_reg, MemOperand(arg, count_offset));
+    __ Ldr(temp1, MemOperand(arg, count_offset));
   }
   // out = length diff.
-  __ Subs(out, temp0, temp_reg);
+  __ Subs(out, temp0, temp1);
   // temp0 = min(len(str), len(arg)).
 
   {
@@ -1192,33 +1187,32 @@
                                CodeBufferCheckScope::kMaximumSize);
 
     __ it(gt);
-    __ mov(gt, temp0, temp_reg);
+    __ mov(gt, temp0, temp1);
   }
 
-  temps.Release(temp_reg);
   // Shorter string is empty?
   __ Cbz(temp0, &end);
 
   if (mirror::kUseStringCompression) {
     // Check if both strings using same compression style to use this comparison loop.
-    __ Eors(temp3, temp3, temp4);
-    __ B(mi, &different_compression);
-  }
-  // Store offset of string value in preparation for comparison loop.
-  __ Mov(temp1, value_offset);
-  if (mirror::kUseStringCompression) {
+    __ Eors(temp2, temp2, temp3);
+    __ Lsrs(temp2, temp2, 1u);
+    __ B(cs, &different_compression);
     // For string compression, calculate the number of bytes to compare (not chars).
     // This could in theory exceed INT32_MAX, so treat temp0 as unsigned.
-    __ Cmp(temp4, 0);
+    __ Lsls(temp3, temp3, 31u);  // Extract purely the compression flag.
 
     AssemblerAccurateScope aas(assembler->GetVIXLAssembler(),
                                2 * kMaxInstructionSizeInBytes,
                                CodeBufferCheckScope::kMaximumSize);
 
-    __ it(ge);
-    __ add(ge, temp0, temp0, temp0);
+    __ it(ne);
+    __ add(ne, temp0, temp0, temp0);
   }
 
+  // Store offset of string value in preparation for comparison loop.
+  __ Mov(temp1, value_offset);
+
   // Assertions that must hold in order to compare multiple characters at a time.
   CHECK_ALIGNED(value_offset, 8);
   static_assert(IsAligned<8>(kObjectAlignment),
@@ -1227,10 +1221,12 @@
   const size_t char_size = Primitive::ComponentSize(Primitive::kPrimChar);
   DCHECK_EQ(char_size, 2u);
 
+  UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
+
   vixl32::Label find_char_diff_2nd_cmp;
   // Unrolled loop comparing 4x16-bit chars per iteration (ok because of string data alignment).
   __ Bind(&loop);
-  temp_reg = temps.Acquire();
+  vixl32::Register temp_reg = temps.Acquire();
   __ Ldr(temp_reg, MemOperand(str, temp1));
   __ Ldr(temp2, MemOperand(arg, temp1));
   __ Cmp(temp_reg, temp2);
@@ -1279,72 +1275,92 @@
   // The comparison is unsigned for string compression, otherwise signed.
   __ Cmp(temp0, Operand(temp1, vixl32::LSR, (mirror::kUseStringCompression ? 3 : 4)));
   __ B((mirror::kUseStringCompression ? ls : le), &end);
+
   // Extract the characters and calculate the difference.
-  vixl32::Label uncompressed_string, continue_process;
   if (mirror::kUseStringCompression) {
-    __ Cmp(temp4, 0);
-    __ B(ge, &uncompressed_string);
-    __ Bic(temp1, temp1, 0x7);
-    __ B(&continue_process);
+    // For compressed strings we need to clear 0x7 from temp1, for uncompressed we need to clear
+    // 0xf. We also need to prepare the character extraction mask `uncompressed ? 0xffffu : 0xffu`.
+    // The compression flag is now in the highest bit of temp3, so let's play some tricks.
+    __ orr(temp3, temp3, 0xffu << 23);                  // uncompressed ? 0xff800000u : 0x7f800000u
+    __ bic(temp1, temp1, Operand(temp3, vixl32::LSR, 31 - 3));  // &= ~(uncompressed ? 0xfu : 0x7u)
+    __ Asr(temp3, temp3, 7u);                           // uncompressed ? 0xffff0000u : 0xff0000u.
+    __ Lsr(temp2, temp2, temp1);                        // Extract second character.
+    __ Lsr(temp3, temp3, 16u);                          // uncompressed ? 0xffffu : 0xffu
+    __ Lsr(out, temp_reg, temp1);                       // Extract first character.
+    __ and_(temp2, temp2, temp3);
+    __ and_(out, out, temp3);
+  } else {
+    __ bic(temp1, temp1, 0xf);
+    __ Lsr(temp2, temp2, temp1);
+    __ Lsr(out, temp_reg, temp1);
+    __ movt(temp2, 0);
+    __ movt(out, 0);
   }
-  __ Bind(&uncompressed_string);
-  __ Bic(temp1, temp1, 0xf);
-  __ Bind(&continue_process);
 
-  __ Lsr(temp2, temp2, temp1);
-  __ Lsr(temp_reg, temp_reg, temp1);
-  vixl32::Label calculate_difference, uncompressed_string_extract_chars;
-  if (mirror::kUseStringCompression) {
-    __ Cmp(temp4, 0);
-    __ B(ge, &uncompressed_string_extract_chars);
-    __ Ubfx(temp2, temp2, 0, 8);
-    __ Ubfx(temp_reg, temp_reg, 0, 8);
-    __ B(&calculate_difference);
-  }
-  __ Bind(&uncompressed_string_extract_chars);
-  __ Movt(temp2, 0);
-  __ Movt(temp_reg, 0);
-  __ Bind(&calculate_difference);
-  __ Sub(out, temp_reg, temp2);
+  __ Sub(out, out, temp2);
   temps.Release(temp_reg);
-  __ B(&end);
 
   if (mirror::kUseStringCompression) {
+    __ B(&end);
+    __ Bind(&different_compression);
+
+    // Comparison for different compression style.
     const size_t c_char_size = Primitive::ComponentSize(Primitive::kPrimByte);
     DCHECK_EQ(c_char_size, 1u);
-    vixl32::Label loop_arg_compressed, loop_this_compressed, find_diff;
-    // Comparison for different compression style.
-    // This part is when THIS is compressed and ARG is not.
-    __ Bind(&different_compression);
-    __ Add(temp2, str, value_offset);
-    __ Add(temp3, arg, value_offset);
-    __ Cmp(temp4, 0);
-    __ B(lt, &loop_arg_compressed);
 
-    __ Bind(&loop_this_compressed);
+    // We want to free up temp3, currently holding `str.count`, for comparison.
+    // So, we move it to the bottom bit of the iteration count `temp0`, which we
+    // then need to treat as unsigned. Start by freeing the bit with an ADD and
+    // continue further down with a LSRS+SBC, which flips the meaning of the flag
+    // but allows `subs temp0, #2; bhi different_compression_loop` to serve as
+    // the loop condition.
+    __ add(temp0, temp0, temp0);              // Unlike LSL, this ADD is always 16-bit.
+    // `temp1` will hold the compressed data pointer, `temp2` the uncompressed data pointer.
+    __ mov(temp1, str);
+    __ mov(temp2, arg);
+    __ Lsrs(temp3, temp3, 1u);                // Continue the move of the compression flag.
+    {
+      AssemblerAccurateScope aas(assembler->GetVIXLAssembler(),
+                                 3 * kMaxInstructionSizeInBytes,
+                                 CodeBufferCheckScope::kMaximumSize);
+      __ itt(cs);                             // Interleave with selection of temp1 and temp2.
+      __ mov(cs, temp1, arg);                 // Preserves flags.
+      __ mov(cs, temp2, str);                 // Preserves flags.
+    }
+    __ sbc(temp0, temp0, 0);                  // Complete the move of the compression flag.
+
+    // Adjust temp1 and temp2 from string pointers to data pointers.
+    __ add(temp1, temp1, value_offset);
+    __ add(temp2, temp2, value_offset);
+
+    vixl32::Label different_compression_loop;
+    vixl32::Label different_compression_diff;
+
+    // Main loop for different compression.
     temp_reg = temps.Acquire();
-    __ Ldrb(temp_reg, MemOperand(temp2, c_char_size, PostIndex));
-    __ Ldrh(temp4, MemOperand(temp3, char_size, PostIndex));
-    __ Cmp(temp_reg, temp4);
-    __ B(ne, &find_diff);
-    __ Subs(temp0, temp0, 1);
-    __ B(gt, &loop_this_compressed);
-    __ B(&end);
-
-    // This part is when THIS is not compressed and ARG is.
-    __ Bind(&loop_arg_compressed);
-    __ Ldrh(temp_reg, MemOperand(temp2, char_size, PostIndex));
-    __ Ldrb(temp4, MemOperand(temp3, c_char_size, PostIndex));
-    __ Cmp(temp_reg, temp4);
-    __ B(ne, &find_diff);
-    __ Subs(temp0, temp0, 1);
-    __ B(gt, &loop_arg_compressed);
+    __ Bind(&different_compression_loop);
+    __ Ldrb(temp_reg, MemOperand(temp1, c_char_size, PostIndex));
+    __ Ldrh(temp3, MemOperand(temp2, char_size, PostIndex));
+    __ cmp(temp_reg, temp3);
+    __ B(ne, &different_compression_diff);
+    __ Subs(temp0, temp0, 2);
+    __ B(hi, &different_compression_loop);
     __ B(&end);
 
     // Calculate the difference.
-    __ Bind(&find_diff);
-    __ Sub(out, temp_reg, temp4);
+    __ Bind(&different_compression_diff);
+    __ Sub(out, temp_reg, temp3);
     temps.Release(temp_reg);
+    // Flip the difference if the `arg` is compressed.
+    // `temp0` contains the inverted `str` compression flag, i.e. the `arg` compression flag.
+    __ Lsrs(temp0, temp0, 1u);
+    static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                  "Expecting 0=compressed, 1=uncompressed");
+
+    AssemblerAccurateScope aas(assembler->GetVIXLAssembler(),
+                               2 * kMaxInstructionSizeInBytes,
+                               CodeBufferCheckScope::kMaximumSize);
+    __ it(cc);
+    __ rsb(cc, out, out, 0);
   }
 
   __ Bind(&end);
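
The flag-in-counter trick above is compact; a C++ sketch of the equivalent logic may help (illustrative names, assuming `count = (length << 1) | flag` with 0 = compressed, as the static_asserts state):

    #include <cstdint>

    // Sketch of the different-compression CompareTo tail. `length_diff` is the
    // precomputed `str.length - arg.length` that `out` already holds on entry.
    int32_t CompareToDifferentCompression(const uint8_t* compressed_data,
                                          const uint16_t* uncompressed_data,
                                          uint32_t min_length,
                                          bool arg_is_compressed,
                                          int32_t length_diff) {
      // The assembly keeps 2 * min_length in temp0 with the `arg` compression
      // flag in bit 0; `subs temp0, #2; bhi` preserves that bit while counting down.
      for (uint32_t i = 0; i != min_length; ++i) {
        int32_t diff = static_cast<int32_t>(compressed_data[i]) -
                       static_cast<int32_t>(uncompressed_data[i]);
        if (diff != 0) {
          // The loop always computes compressed - uncompressed, so flip the sign
          // when `arg` (not `str`) was the compressed operand.
          return arg_is_compressed ? -diff : diff;
        }
      }
      return length_diff;  // Common prefix matched; order by length.
    }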
@@ -1382,7 +1398,7 @@
   vixl32::Register temp1 = RegisterFrom(locations->GetTemp(1));
   vixl32::Register temp2 = RegisterFrom(locations->GetTemp(2));
 
-  vixl32::Label loop, preloop;
+  vixl32::Label loop;
   vixl32::Label end;
   vixl32::Label return_true;
   vixl32::Label return_false;
@@ -1401,6 +1417,10 @@
     __ Cbz(arg, &return_false);
   }
 
+  // Reference equality check, return true if same reference.
+  __ Cmp(str, arg);
+  __ B(eq, &return_true);
+
   if (!optimizations.GetArgumentIsString()) {
     // Instanceof check for the argument by comparing class fields.
     // All string objects must have the same type since String cannot be subclassed.
@@ -1412,48 +1432,47 @@
     __ B(ne, &return_false);
   }
 
-  // Load lengths of this and argument strings.
+  // Load `count` fields of this and argument strings.
   __ Ldr(temp, MemOperand(str, count_offset));
   __ Ldr(temp1, MemOperand(arg, count_offset));
-  // Check if lengths are equal, return false if they're not.
+  // Check if `count` fields are equal, return false if they're not.
   // Also compares the compression style, if differs return false.
   __ Cmp(temp, temp1);
   __ B(ne, &return_false);
-  // Return true if both strings are empty.
-  if (mirror::kUseStringCompression) {
-    // Length needs to be masked out first because 0 is treated as compressed.
-    __ Bic(temp, temp, 0x80000000);
-  }
+  // Return true if both strings are empty. Even with string compression `count == 0` means empty.
+  static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                "Expecting 0=compressed, 1=uncompressed");
   __ Cbz(temp, &return_true);
-  // Reference equality check, return true if same reference.
-  __ Cmp(str, arg);
-  __ B(eq, &return_true);
 
-  // Assertions that must hold in order to compare strings 2 characters at a time.
+  // Assertions that must hold in order to compare strings 4 bytes at a time.
   DCHECK_ALIGNED(value_offset, 4);
   static_assert(IsAligned<4>(kObjectAlignment), "String data must be aligned for fast compare.");
 
   if (mirror::kUseStringCompression) {
-    // If not compressed, directly to fast compare. Else do preprocess on length.
-    __ Cmp(temp1, 0);
-    __ B(gt, &preloop);
-    // Mask out compression flag and adjust length for compressed string (8-bit)
-    // as if it is a 16-bit data, new_length = (length + 1) / 2.
-    __ Add(temp, temp, 1);
-    __ Lsr(temp, temp, 1);
-    __ Bind(&preloop);
+    // For string compression, calculate the number of bytes to compare (not chars).
+    // This could in theory exceed INT32_MAX, so treat temp as unsigned.
+    __ Lsrs(temp, temp, 1u);                        // Extract length and check compression flag.
+    AssemblerAccurateScope aas(assembler->GetVIXLAssembler(),
+                               2 * kMaxInstructionSizeInBytes,
+                               CodeBufferCheckScope::kMaximumSize);
+    __ it(cs);                                      // If uncompressed,
+    __ add(cs, temp, temp, temp);                   //   double the byte count.
   }
-  // Loop to compare strings 2 characters at a time starting at the front of the string.
-  // Ok to do this because strings with an odd length are zero-padded.
+
+  // Store offset of string value in preparation for comparison loop.
   __ Mov(temp1, value_offset);
+
+  // Loop to compare strings 4 bytes at a time starting at the front of the string.
+  // Ok to do this because strings are zero-padded to kObjectAlignment.
   __ Bind(&loop);
   __ Ldr(out, MemOperand(str, temp1));
   __ Ldr(temp2, MemOperand(arg, temp1));
+  __ Add(temp1, temp1, sizeof(uint32_t));
   __ Cmp(out, temp2);
   __ B(ne, &return_false);
-  __ Add(temp1, temp1, sizeof(uint32_t));
-  __ Subs(temp, temp, sizeof(uint32_t) / sizeof(uint16_t));
-  __ B(gt, &loop);
+  // With string compression, we have compared 4 bytes, otherwise 2 chars.
+  __ Subs(temp, temp, mirror::kUseStringCompression ? 4 : 2);
+  __ B(hi, &loop);
 
   // Return true and exit the function.
   // If loop does not result in returning false, we return true.
@@ -2547,9 +2566,9 @@
     temp = temps.Acquire();
     // String's length.
     __ Ldr(temp, MemOperand(srcObj, count_offset));
-    __ Cmp(temp, 0);
+    __ Tst(temp, 1);
     temps.Release(temp);
-    __ B(lt, &compressed_string_preloop);
+    __ B(eq, &compressed_string_preloop);
   }
   __ Add(src_ptr, src_ptr, Operand(srcBegin, vixl32::LSL, 1));
 
@@ -2588,9 +2607,10 @@
   __ Strh(temp, MemOperand(dst_ptr, char_size, PostIndex));
   temps.Release(temp);
   __ B(gt, &remainder);
-  __ B(&done);
 
   if (mirror::kUseStringCompression) {
+    __ B(&done);
+
     const size_t c_char_size = Primitive::ComponentSize(Primitive::kPrimByte);
     DCHECK_EQ(c_char_size, 1u);
     // Copy loop for compressed src, copying 1 character (8-bit) to (16-bit) at a time.
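
The compressed-source copy loop mentioned above just widens each Latin-1 byte to a UTF-16 unit; in C++ terms (illustrative, not the generated code):

    #include <cstddef>
    #include <cstdint>

    // Widen compressed (8-bit) string data into a UTF-16 destination buffer.
    void CopyCompressedChars(const uint8_t* src, uint16_t* dst, size_t num_chars) {
      for (size_t i = 0; i != num_chars; ++i) {
        dst[i] = static_cast<uint16_t>(src[i]);  // Zero-extend 8-bit to 16-bit.
      }
    }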
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index bac98d5..06ab46f 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -1408,21 +1408,22 @@
   // compression style is decided on alloc.
   __ cmpl(ecx, Address(arg, count_offset));
   __ j(kNotEqual, &return_false);
+  // Return true if strings are empty. Even with string compression `count == 0` means empty.
+  static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                "Expecting 0=compressed, 1=uncompressed");
+  __ jecxz(&return_true);
 
   if (mirror::kUseStringCompression) {
     NearLabel string_uncompressed;
-    // Differ cases into both compressed or both uncompressed. Different compression style
-    // is cut above.
-    __ cmpl(ecx, Immediate(0));
-    __ j(kGreaterEqual, &string_uncompressed);
+    // Extract length and differentiate between both compressed or both uncompressed.
+    // Different compression style is cut above.
+    __ shrl(ecx, Immediate(1));
+    __ j(kCarrySet, &string_uncompressed);
     // Divide string length by 2, rounding up, and continue as if uncompressed.
-    // Merge clearing the compression flag (+0x80000000) with +1 for rounding.
-    __ addl(ecx, Immediate(0x80000001));
+    __ addl(ecx, Immediate(1));
     __ shrl(ecx, Immediate(1));
     __ Bind(&string_uncompressed);
   }
-  // Return true if strings are empty.
-  __ jecxz(&return_true);
   // Load starting addresses of string values into ESI/EDI as required for repe_cmpsl instruction.
   __ leal(esi, Address(str, value_offset));
   __ leal(edi, Address(arg, value_offset));
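
The carry-based decoding above works because `shrl` deposits the shifted-out flag bit in CF. An equivalent C++ sketch under the same encoding (helper name illustrative):

    #include <cstdint>

    // Decode a String.count field as used by the equals intrinsic above.
    uint32_t UnitCountAsIfUncompressed(uint32_t count, bool* uncompressed) {
      *uncompressed = (count & 1u) != 0;  // The bit `shrl ecx, 1` shifts into CF.
      uint32_t length = count >> 1;       // What `shrl` leaves in ECX.
      // Compressed strings hold 1 byte per char, so (length + 1) / 2 16-bit units
      // cover the data; this matches the `addl ecx, 1; shrl ecx, 1` sequence.
      return *uncompressed ? length : (length + 1u) / 2u;
    }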
@@ -1535,21 +1536,24 @@
   // Location of count within the String object.
   int32_t count_offset = mirror::String::CountOffset().Int32Value();
 
-  // Load string length, i.e., the count field of the string.
+  // Load the count field of the string containing the length and compression flag.
   __ movl(string_length, Address(string_obj, count_offset));
-  if (mirror::kUseStringCompression) {
-    string_length_flagged = locations->GetTemp(2).AsRegister<Register>();
-    __ movl(string_length_flagged, string_length);
-    // Mask out first bit used as compression flag.
-    __ andl(string_length, Immediate(INT32_MAX));
-  }
 
-  // Do a zero-length check.
+  // Do a zero-length check. Even with string compression `count == 0` means empty.
+  static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                "Expecting 0=compressed, 1=uncompressed");
   // TODO: Support jecxz.
   NearLabel not_found_label;
   __ testl(string_length, string_length);
   __ j(kEqual, &not_found_label);
 
+  if (mirror::kUseStringCompression) {
+    string_length_flagged = locations->GetTemp(2).AsRegister<Register>();
+    __ movl(string_length_flagged, string_length);
+    // Extract the length and shift out the least significant bit used as compression flag.
+    __ shrl(string_length, Immediate(1));
+  }
+
   if (start_at_zero) {
     // Number of chars to scan is the same as the string length.
     __ movl(counter, string_length);
@@ -1570,8 +1574,8 @@
 
     if (mirror::kUseStringCompression) {
       NearLabel modify_counter, offset_uncompressed_label;
-      __ cmpl(string_length_flagged, Immediate(0));
-      __ j(kGreaterEqual, &offset_uncompressed_label);
+      __ testl(string_length_flagged, Immediate(1));
+      __ j(kNotZero, &offset_uncompressed_label);
       // Move to the start of the string: string_obj + value_offset + start_index.
       __ leal(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_1, value_offset));
       __ jmp(&modify_counter);
@@ -1593,8 +1597,8 @@
   if (mirror::kUseStringCompression) {
     NearLabel uncompressed_string_comparison;
     NearLabel comparison_done;
-    __ cmpl(string_length_flagged, Immediate(0));
-    __ j(kGreater, &uncompressed_string_comparison);
+    __ testl(string_length_flagged, Immediate(1));
+    __ j(kNotZero, &uncompressed_string_comparison);
 
     // Check if EAX (search_value) is ASCII.
     __ cmpl(search_value, Immediate(127));
@@ -1787,8 +1791,10 @@
     __ cfi().AdjustCFAOffset(stack_adjust);
 
     NearLabel copy_loop, copy_uncompressed;
-    __ cmpl(Address(obj, count_offset), Immediate(0));
-    __ j(kGreaterEqual, &copy_uncompressed);
+    __ testl(Address(obj, count_offset), Immediate(1));
+    static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                  "Expecting 0=compressed, 1=uncompressed");
+    __ j(kNotZero, &copy_uncompressed);
     // Compute the address of the source string by adding the number of chars from
     // the source beginning to the value offset of a string.
     __ leal(ESI, CodeGeneratorX86::ArrayAddress(obj, srcBegin, TIMES_1, value_offset));
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 01577f7..2ea8670 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -1574,20 +1574,23 @@
   // compression style is decided on alloc.
   __ cmpl(rcx, Address(arg, count_offset));
   __ j(kNotEqual, &return_false);
+  // Return true if both strings are empty. Even with string compression `count == 0` means empty.
+  static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                "Expecting 0=compressed, 1=uncompressed");
+  __ jrcxz(&return_true);
 
   if (mirror::kUseStringCompression) {
     NearLabel string_uncompressed;
-    // Both string are compressed.
-    __ cmpl(rcx, Immediate(0));
-    __ j(kGreaterEqual, &string_uncompressed);
+    // Extract length and differentiate between both compressed or both uncompressed.
+    // Different compression style is cut above.
+    __ shrl(rcx, Immediate(1));
+    __ j(kCarrySet, &string_uncompressed);
     // Divide string length by 2, rounding up, and continue as if uncompressed.
-    // Merge clearing the compression flag with +1 for rounding.
-    __ addl(rcx, Immediate(static_cast<int32_t>(0x80000001)));
+    __ addl(rcx, Immediate(1));
     __ shrl(rcx, Immediate(1));
     __ Bind(&string_uncompressed);
   }
-  // Return true if both strings are empty.
-  __ jrcxz(&return_true);
   // Load starting addresses of string values into RSI/RDI as required for repe_cmpsq instruction.
   __ leal(rsi, Address(str, value_offset));
   __ leal(rdi, Address(arg, value_offset));
@@ -1694,21 +1697,22 @@
   // Location of count within the String object.
   int32_t count_offset = mirror::String::CountOffset().Int32Value();
 
-  // Load string length, i.e., the count field of the string.
+  // Load the count field of the string containing the length and compression flag.
   __ movl(string_length, Address(string_obj, count_offset));
-  if (mirror::kUseStringCompression) {
-    // Use TMP to keep string_length_flagged.
-    __ movl(CpuRegister(TMP), string_length);
-    // Mask out first bit used as compression flag.
-    __ andl(string_length, Immediate(INT32_MAX));
-  }
 
-  // Do a length check.
+  // Do a zero-length check. Even with string compression `count == 0` means empty.
   // TODO: Support jecxz.
   NearLabel not_found_label;
   __ testl(string_length, string_length);
   __ j(kEqual, &not_found_label);
 
+  if (mirror::kUseStringCompression) {
+    // Use TMP to keep string_length_flagged.
+    __ movl(CpuRegister(TMP), string_length);
+    // Extract the length and shift out the least significant bit used as compression flag.
+    __ shrl(string_length, Immediate(1));
+  }
+
   if (start_at_zero) {
     // Number of chars to scan is the same as the string length.
     __ movl(counter, string_length);
@@ -1728,8 +1732,8 @@
 
     if (mirror::kUseStringCompression) {
       NearLabel modify_counter, offset_uncompressed_label;
-      __ cmpl(CpuRegister(TMP), Immediate(0));
-      __ j(kGreaterEqual, &offset_uncompressed_label);
+      __ testl(CpuRegister(TMP), Immediate(1));
+      __ j(kNotZero, &offset_uncompressed_label);
       __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_1, value_offset));
       __ jmp(&modify_counter);
       // Move to the start of the string: string_obj + value_offset + 2 * start_index.
@@ -1747,8 +1751,8 @@
   if (mirror::kUseStringCompression) {
     NearLabel uncompressed_string_comparison;
     NearLabel comparison_done;
-    __ cmpl(CpuRegister(TMP), Immediate(0));
-    __ j(kGreater, &uncompressed_string_comparison);
+    __ testl(CpuRegister(TMP), Immediate(1));
+    __ j(kNotZero, &uncompressed_string_comparison);
     // Check if RAX (search_value) is ASCII.
     __ cmpl(search_value, Immediate(127));
     __ j(kGreater, &not_found_label);
@@ -1931,8 +1935,10 @@
     // Location of count in string.
     const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
 
-    __ cmpl(Address(obj, count_offset), Immediate(0));
-    __ j(kGreaterEqual, &copy_uncompressed);
+    __ testl(Address(obj, count_offset), Immediate(1));
+    static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                  "Expecting 0=compressed, 1=uncompressed");
+    __ j(kNotZero, &copy_uncompressed);
     // Compute the address of the source string by adding the number of chars from
     // the source beginning to the value offset of a string.
     __ leaq(CpuRegister(RSI),
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 52747c0..091b58a 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -525,6 +525,12 @@
     temps_.push_back(location);
   }
 
+  void AddRegisterTemps(size_t count) {
+    for (size_t i = 0; i < count; ++i) {
+      AddTemp(Location::RequiresRegister());
+    }
+  }
+
   Location GetTemp(uint32_t at) const {
     return temps_[at];
   }
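
The new AddRegisterTemps helper simply batches AddTemp calls; a hypothetical call site (not part of this diff) would shrink from repeated lines to one:

    // Before: one registration per temp.
    locations->AddTemp(Location::RequiresRegister());
    locations->AddTemp(Location::RequiresRegister());
    locations->AddTemp(Location::RequiresRegister());
    // After: one call registering the same three register temps.
    locations->AddRegisterTemps(3);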
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index 9155322..680381a 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -735,6 +735,20 @@
   return true;
 }
 
+
+bool HLoopInformation::HasExitEdge() const {
+  // Determine if this loop has at least one exit edge.
+  HBlocksInLoopReversePostOrderIterator it_loop(*this);
+  for (; !it_loop.Done(); it_loop.Advance()) {
+    for (HBasicBlock* successor : it_loop.Current()->GetSuccessors()) {
+      if (!Contains(*successor)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
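
HasExitEdge is a straightforward reverse-post-order scan for a successor outside the loop. A hypothetical caller (illustrative, not a call site in this change) would use it to bail out of transformations that assume the loop terminates:

    // Sketch: guard a loop optimization against infinite loops.
    if (!block->GetLoopInformation()->HasExitEdge()) {
      return;  // No exit edge: control never leaves the loop; don't touch it.
    }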
 bool HBasicBlock::Dominates(HBasicBlock* other) const {
   // Walk up the dominator tree from `other`, to find out if `this`
   // is an ancestor.
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 883ac65..e0c582a 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -769,6 +769,8 @@
 
   bool DominatesAllBackEdges(HBasicBlock* block);
 
+  bool HasExitEdge() const;
+
  private:
   // Internal recursive implementation of `Populate`.
   void PopulateRecursive(HBasicBlock* block);
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 6f84cdc..7a930cc 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -167,24 +167,13 @@
       LOG(INFO) << "TIMINGS " << GetMethodName();
       LOG(INFO) << Dumpable<TimingLogger>(timing_logger_);
     }
-    if (visualizer_enabled_) {
-      MutexLock mu(Thread::Current(), visualizer_dump_mutex_);
-      *visualizer_output_ << visualizer_oss_.str();
-      // The destructor of `visualizer_output_` is normally
-      // responsible for flushing (and closing) the stream, but it
-      // won't be invoked during fast exits in non-debug mode -- see
-      // art::Dex2Oat::~Dex2Oat, which explicitly abandons some
-      // objects (such as the compiler driver) in non-debug mode, to
-      // avoid the cost of destructing them.  Therefore we explicitly
-      // flush the stream here to prevent truncated CFG visualizer
-      // files.
-      visualizer_output_->flush();
-    }
+    DCHECK(visualizer_oss_.str().empty());
   }
 
-  void DumpDisassembly() const {
+  void DumpDisassembly() REQUIRES(!visualizer_dump_mutex_) {
     if (visualizer_enabled_) {
       visualizer_.DumpGraphWithDisassembly();
+      FlushVisualizer();
     }
   }
 
@@ -199,24 +188,34 @@
   }
 
  private:
-  void StartPass(const char* pass_name) {
+  void StartPass(const char* pass_name) REQUIRES(!visualizer_dump_mutex_) {
     VLOG(compiler) << "Starting pass: " << pass_name;
     // Dump graph first, then start timer.
     if (visualizer_enabled_) {
       visualizer_.DumpGraph(pass_name, /* is_after_pass */ false, graph_in_bad_state_);
+      FlushVisualizer();
     }
     if (timing_logger_enabled_) {
       timing_logger_.StartTiming(pass_name);
     }
   }
 
-  void EndPass(const char* pass_name) {
+  void FlushVisualizer() REQUIRES(!visualizer_dump_mutex_) {
+    MutexLock mu(Thread::Current(), visualizer_dump_mutex_);
+    *visualizer_output_ << visualizer_oss_.str();
+    visualizer_output_->flush();
+    visualizer_oss_.str("");
+    visualizer_oss_.clear();
+  }
+
+  void EndPass(const char* pass_name) REQUIRES(!visualizer_dump_mutex_) {
     // Pause timer first, then dump graph.
     if (timing_logger_enabled_) {
       timing_logger_.EndTiming();
     }
     if (visualizer_enabled_) {
       visualizer_.DumpGraph(pass_name, /* is_after_pass */ true, graph_in_bad_state_);
+      FlushVisualizer();
     }
 
     // Validate the HGraph if running in debug mode.
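
Note the `str("")`/`clear()` pair in FlushVisualizer: `str` discards the buffered characters while `clear` resets the stream's state bits, and neither does the other's job. A standalone C++ illustration of the buffer-then-flush pattern:

    #include <iostream>
    #include <sstream>

    int main() {
      std::ostringstream oss;
      oss << "pass 1 graph";
      std::cout << oss.str();  // Flush stage: copy buffered text to the real output.
      oss.str("");             // Drop the buffered characters...
      oss.clear();             // ...and reset error/EOF flags (str() does not).
      oss << "pass 2 graph";   // The buffer starts empty for the next pass.
      std::cout << oss.str() << std::endl;
      return 0;
    }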
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index 6418c50..1180bde 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -517,8 +517,7 @@
       thread_count_(sysconf(_SC_NPROCESSORS_CONF)),
       start_ns_(NanoTime()),
       oat_fd_(-1),
-      input_vdex_fd_(-1),
-      output_vdex_fd_(-1),
+      vdex_fd_(-1),
       zip_fd_(-1),
       image_base_(0U),
       image_classes_zip_filename_(nullptr),
@@ -591,12 +590,8 @@
     ParseUintOption(option, "--zip-fd", &zip_fd_, Usage);
   }
 
-  void ParseInputVdexFd(const StringPiece& option) {
-    ParseUintOption(option, "--input-vdex-fd", &input_vdex_fd_, Usage);
-  }
-
-  void ParseOutputVdexFd(const StringPiece& option) {
-    ParseUintOption(option, "--output-vdex-fd", &output_vdex_fd_, Usage);
+  void ParseVdexFd(const StringPiece& option) {
+    ParseUintOption(option, "--vdex-fd", &vdex_fd_, Usage);
   }
 
   void ParseOatFd(const StringPiece& option) {
@@ -712,9 +707,9 @@
       Usage("--oat-file should not be used with --oat-fd");
     }
 
-    if ((output_vdex_fd_ == -1) != (oat_fd_ == -1)) {
+    if ((vdex_fd_ == -1) != (oat_fd_ == -1)) {
       Usage("VDEX and OAT output must be specified either with one --oat-filename "
-            "or with --oat-fd and --output-vdex-fd file descriptors");
+            "or with --oat-fd and --vdex-fd file descriptors");
     }
 
     if (!parser_options->oat_symbols.empty() && oat_fd_ != -1) {
@@ -725,8 +720,8 @@
       Usage("--oat-symbols should not be used with --host");
     }
 
-    if (output_vdex_fd_ != -1 && !image_filenames_.empty()) {
-      Usage("--output-vdex-fd should not be used with --image");
+    if (vdex_fd_ != -1 && !image_filenames_.empty()) {
+      Usage("--vdex-fd should not be used with --image");
     }
 
     if (oat_fd_ != -1 && !image_filenames_.empty()) {
@@ -1119,10 +1114,8 @@
         ParseZipFd(option);
       } else if (option.starts_with("--zip-location=")) {
         zip_location_ = option.substr(strlen("--zip-location=")).data();
-      } else if (option.starts_with("--input-vdex-fd=")) {
-        ParseInputVdexFd(option);
-      } else if (option.starts_with("--output-vdex-fd=")) {
-        ParseOutputVdexFd(option);
+      } else if (option.starts_with("--vdex-fd=")) {
+        ParseVdexFd(option);
       } else if (option.starts_with("--oat-file=")) {
         oat_filenames_.push_back(option.substr(strlen("--oat-file=")).data());
       } else if (option.starts_with("--oat-symbols=")) {
@@ -1265,7 +1258,7 @@
         }
         oat_files_.push_back(std::move(oat_file));
 
-        DCHECK_EQ(output_vdex_fd_, -1);
+        DCHECK_EQ(vdex_fd_, -1);
         std::string vdex_filename = ReplaceFileExtension(oat_filename, "vdex");
         std::unique_ptr<File> vdex_file(OS::CreateEmptyFile(vdex_filename.c_str()));
         if (vdex_file.get() == nullptr) {
@@ -1291,9 +1284,9 @@
       }
       oat_files_.push_back(std::move(oat_file));
 
-      DCHECK_NE(output_vdex_fd_, -1);
+      DCHECK_NE(vdex_fd_, -1);
       std::string vdex_location = ReplaceFileExtension(oat_location_, "vdex");
-      std::unique_ptr<File> vdex_file(new File(output_vdex_fd_, vdex_location, /* check_usage */ true));
+      std::unique_ptr<File> vdex_file(new File(vdex_fd_, vdex_location, /* check_usage */ true));
       if (vdex_file.get() == nullptr) {
         PLOG(ERROR) << "Failed to create vdex file: " << vdex_location;
         return false;
@@ -2583,8 +2576,7 @@
   std::vector<const char*> oat_filenames_;
   std::vector<const char*> oat_unstripped_;
   int oat_fd_;
-  int input_vdex_fd_;
-  int output_vdex_fd_;
+  int vdex_fd_;
   std::vector<const char*> dex_filenames_;
   std::vector<const char*> dex_locations_;
   int zip_fd_;
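
With the input/output vdex descriptors merged, callers pass a single `--vdex-fd` alongside `--oat-fd`. A hypothetical invocation (fd numbers and locations illustrative):

    dex2oat --zip-fd=4 --zip-location=/data/app/base.apk \
            --oat-fd=5 --oat-location=/data/dalvik-cache/base.odex \
            --vdex-fd=6

Per the validation above, `--vdex-fd` must be present exactly when `--oat-fd` is, and neither may be combined with `--image`.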
diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc
index 01b3f34..5bd6b56 100644
--- a/runtime/arch/arm/entrypoints_init_arm.cc
+++ b/runtime/arch/arm/entrypoints_init_arm.cc
@@ -67,6 +67,21 @@
 // Long long arithmetics - REM_LONG[_2ADDR] and DIV_LONG[_2ADDR]
 extern "C" int64_t __aeabi_ldivmod(int64_t, int64_t);
 
+void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_marking) {
+  qpoints->pReadBarrierMarkReg00 = is_marking ? art_quick_read_barrier_mark_reg00 : nullptr;
+  qpoints->pReadBarrierMarkReg01 = is_marking ? art_quick_read_barrier_mark_reg01 : nullptr;
+  qpoints->pReadBarrierMarkReg02 = is_marking ? art_quick_read_barrier_mark_reg02 : nullptr;
+  qpoints->pReadBarrierMarkReg03 = is_marking ? art_quick_read_barrier_mark_reg03 : nullptr;
+  qpoints->pReadBarrierMarkReg04 = is_marking ? art_quick_read_barrier_mark_reg04 : nullptr;
+  qpoints->pReadBarrierMarkReg05 = is_marking ? art_quick_read_barrier_mark_reg05 : nullptr;
+  qpoints->pReadBarrierMarkReg06 = is_marking ? art_quick_read_barrier_mark_reg06 : nullptr;
+  qpoints->pReadBarrierMarkReg07 = is_marking ? art_quick_read_barrier_mark_reg07 : nullptr;
+  qpoints->pReadBarrierMarkReg08 = is_marking ? art_quick_read_barrier_mark_reg08 : nullptr;
+  qpoints->pReadBarrierMarkReg09 = is_marking ? art_quick_read_barrier_mark_reg09 : nullptr;
+  qpoints->pReadBarrierMarkReg10 = is_marking ? art_quick_read_barrier_mark_reg10 : nullptr;
+  qpoints->pReadBarrierMarkReg11 = is_marking ? art_quick_read_barrier_mark_reg11 : nullptr;
+}
+
 void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) {
   DefaultInitEntryPoints(jpoints, qpoints);
 
@@ -123,18 +138,7 @@
 
   // Read barrier.
   qpoints->pReadBarrierJni = ReadBarrierJni;
-  qpoints->pReadBarrierMarkReg00 = art_quick_read_barrier_mark_reg00;
-  qpoints->pReadBarrierMarkReg01 = art_quick_read_barrier_mark_reg01;
-  qpoints->pReadBarrierMarkReg02 = art_quick_read_barrier_mark_reg02;
-  qpoints->pReadBarrierMarkReg03 = art_quick_read_barrier_mark_reg03;
-  qpoints->pReadBarrierMarkReg04 = art_quick_read_barrier_mark_reg04;
-  qpoints->pReadBarrierMarkReg05 = art_quick_read_barrier_mark_reg05;
-  qpoints->pReadBarrierMarkReg06 = art_quick_read_barrier_mark_reg06;
-  qpoints->pReadBarrierMarkReg07 = art_quick_read_barrier_mark_reg07;
-  qpoints->pReadBarrierMarkReg08 = art_quick_read_barrier_mark_reg08;
-  qpoints->pReadBarrierMarkReg09 = art_quick_read_barrier_mark_reg09;
-  qpoints->pReadBarrierMarkReg10 = art_quick_read_barrier_mark_reg10;
-  qpoints->pReadBarrierMarkReg11 = art_quick_read_barrier_mark_reg11;
+  UpdateReadBarrierEntrypoints(qpoints, /*is_marking*/ false);
   qpoints->pReadBarrierMarkReg12 = nullptr;  // Cannot use register 12 (IP) to pass arguments.
   qpoints->pReadBarrierMarkReg13 = nullptr;  // Cannot use register 13 (SP) to pass arguments.
   qpoints->pReadBarrierMarkReg14 = nullptr;  // Cannot use register 14 (LR) to pass arguments.
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 550f8c7..3a83eaf 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -1769,12 +1769,15 @@
     .cfi_rel_offset r10, 4
     .cfi_rel_offset r11, 8
     .cfi_rel_offset lr, 12
+#if (STRING_COMPRESSION_FEATURE)
+    ldr   r4, [r0, #MIRROR_STRING_COUNT_OFFSET]
+#else
     ldr   r3, [r0, #MIRROR_STRING_COUNT_OFFSET]
+#endif
     add   r0, #MIRROR_STRING_VALUE_OFFSET
 #if (STRING_COMPRESSION_FEATURE)
     /* r4 count (with flag) and r3 holds actual length */
-    mov   r4, r3
-    bic   r3, #2147483648
+    lsr   r3, r4, #1
 #endif
     /* Clamp start to [0..count] */
     cmp   r2, #0
@@ -1789,8 +1792,8 @@
 
     /* Build pointer to start of data to compare and pre-bias */
 #if (STRING_COMPRESSION_FEATURE)
-    cmp   r4, #0
-    blt   .Lstring_indexof_compressed
+    lsrs  r4, r4, #1
+    bcc   .Lstring_indexof_compressed
 #endif
     add   r0, r0, r2, lsl #1
     sub   r0, #2
diff --git a/runtime/arch/arm64/entrypoints_init_arm64.cc b/runtime/arch/arm64/entrypoints_init_arm64.cc
index 3c77672..e7c9fef 100644
--- a/runtime/arch/arm64/entrypoints_init_arm64.cc
+++ b/runtime/arch/arm64/entrypoints_init_arm64.cc
@@ -70,6 +70,47 @@
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg28(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg29(mirror::Object*);
 
+void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_marking) {
+  // ARM64 is the architecture with the largest number of core
+  // registers (32) that supports the read barrier configuration.
+  // Because registers 30 (LR) and 31 (SP/XZR) cannot be used to pass
+  // arguments, only define ReadBarrierMarkRegX entrypoints for the
+  // first 30 registers.  This limitation is not a problem on other
+  // supported architectures (ARM, x86 and x86-64) either, as they
+  // have fewer core registers (resp. 16, 8 and 16).  (We may have to
+  // revise that design choice if read barrier support is added for
+  // MIPS and/or MIPS64.)
+  qpoints->pReadBarrierMarkReg00 = is_marking ? art_quick_read_barrier_mark_reg00 : nullptr;
+  qpoints->pReadBarrierMarkReg01 = is_marking ? art_quick_read_barrier_mark_reg01 : nullptr;
+  qpoints->pReadBarrierMarkReg02 = is_marking ? art_quick_read_barrier_mark_reg02 : nullptr;
+  qpoints->pReadBarrierMarkReg03 = is_marking ? art_quick_read_barrier_mark_reg03 : nullptr;
+  qpoints->pReadBarrierMarkReg04 = is_marking ? art_quick_read_barrier_mark_reg04 : nullptr;
+  qpoints->pReadBarrierMarkReg05 = is_marking ? art_quick_read_barrier_mark_reg05 : nullptr;
+  qpoints->pReadBarrierMarkReg06 = is_marking ? art_quick_read_barrier_mark_reg06 : nullptr;
+  qpoints->pReadBarrierMarkReg07 = is_marking ? art_quick_read_barrier_mark_reg07 : nullptr;
+  qpoints->pReadBarrierMarkReg08 = is_marking ? art_quick_read_barrier_mark_reg08 : nullptr;
+  qpoints->pReadBarrierMarkReg09 = is_marking ? art_quick_read_barrier_mark_reg09 : nullptr;
+  qpoints->pReadBarrierMarkReg10 = is_marking ? art_quick_read_barrier_mark_reg10 : nullptr;
+  qpoints->pReadBarrierMarkReg11 = is_marking ? art_quick_read_barrier_mark_reg11 : nullptr;
+  qpoints->pReadBarrierMarkReg12 = is_marking ? art_quick_read_barrier_mark_reg12 : nullptr;
+  qpoints->pReadBarrierMarkReg13 = is_marking ? art_quick_read_barrier_mark_reg13 : nullptr;
+  qpoints->pReadBarrierMarkReg14 = is_marking ? art_quick_read_barrier_mark_reg14 : nullptr;
+  qpoints->pReadBarrierMarkReg15 = is_marking ? art_quick_read_barrier_mark_reg15 : nullptr;
+  qpoints->pReadBarrierMarkReg17 = is_marking ? art_quick_read_barrier_mark_reg17 : nullptr;
+  qpoints->pReadBarrierMarkReg18 = is_marking ? art_quick_read_barrier_mark_reg18 : nullptr;
+  qpoints->pReadBarrierMarkReg19 = is_marking ? art_quick_read_barrier_mark_reg19 : nullptr;
+  qpoints->pReadBarrierMarkReg20 = is_marking ? art_quick_read_barrier_mark_reg20 : nullptr;
+  qpoints->pReadBarrierMarkReg21 = is_marking ? art_quick_read_barrier_mark_reg21 : nullptr;
+  qpoints->pReadBarrierMarkReg22 = is_marking ? art_quick_read_barrier_mark_reg22 : nullptr;
+  qpoints->pReadBarrierMarkReg23 = is_marking ? art_quick_read_barrier_mark_reg23 : nullptr;
+  qpoints->pReadBarrierMarkReg24 = is_marking ? art_quick_read_barrier_mark_reg24 : nullptr;
+  qpoints->pReadBarrierMarkReg25 = is_marking ? art_quick_read_barrier_mark_reg25 : nullptr;
+  qpoints->pReadBarrierMarkReg26 = is_marking ? art_quick_read_barrier_mark_reg26 : nullptr;
+  qpoints->pReadBarrierMarkReg27 = is_marking ? art_quick_read_barrier_mark_reg27 : nullptr;
+  qpoints->pReadBarrierMarkReg28 = is_marking ? art_quick_read_barrier_mark_reg28 : nullptr;
+  qpoints->pReadBarrierMarkReg29 = is_marking ? art_quick_read_barrier_mark_reg29 : nullptr;
+}
+
 void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) {
   DefaultInitEntryPoints(jpoints, qpoints);
 
@@ -126,45 +167,8 @@
 
   // Read barrier.
   qpoints->pReadBarrierJni = ReadBarrierJni;
-  // ARM64 is the architecture with the largest number of core
-  // registers (32) that supports the read barrier configuration.
-  // Because registers 30 (LR) and 31 (SP/XZR) cannot be used to pass
-  // arguments, only define ReadBarrierMarkRegX entrypoints for the
-  // first 30 registers.  This limitation is not a problem on other
-  // supported architectures (ARM, x86 and x86-64) either, as they
-  // have less core registers (resp. 16, 8 and 16).  (We may have to
-  // revise that design choice if read barrier support is added for
-  // MIPS and/or MIPS64.)
-  qpoints->pReadBarrierMarkReg00 = art_quick_read_barrier_mark_reg00;
-  qpoints->pReadBarrierMarkReg01 = art_quick_read_barrier_mark_reg01;
-  qpoints->pReadBarrierMarkReg02 = art_quick_read_barrier_mark_reg02;
-  qpoints->pReadBarrierMarkReg03 = art_quick_read_barrier_mark_reg03;
-  qpoints->pReadBarrierMarkReg04 = art_quick_read_barrier_mark_reg04;
-  qpoints->pReadBarrierMarkReg05 = art_quick_read_barrier_mark_reg05;
-  qpoints->pReadBarrierMarkReg06 = art_quick_read_barrier_mark_reg06;
-  qpoints->pReadBarrierMarkReg07 = art_quick_read_barrier_mark_reg07;
-  qpoints->pReadBarrierMarkReg08 = art_quick_read_barrier_mark_reg08;
-  qpoints->pReadBarrierMarkReg09 = art_quick_read_barrier_mark_reg09;
-  qpoints->pReadBarrierMarkReg10 = art_quick_read_barrier_mark_reg10;
-  qpoints->pReadBarrierMarkReg11 = art_quick_read_barrier_mark_reg11;
-  qpoints->pReadBarrierMarkReg12 = art_quick_read_barrier_mark_reg12;
-  qpoints->pReadBarrierMarkReg13 = art_quick_read_barrier_mark_reg13;
-  qpoints->pReadBarrierMarkReg14 = art_quick_read_barrier_mark_reg14;
-  qpoints->pReadBarrierMarkReg15 = art_quick_read_barrier_mark_reg15;
   qpoints->pReadBarrierMarkReg16 = nullptr;  // IP0 is used as a temp by the asm stub.
-  qpoints->pReadBarrierMarkReg17 = art_quick_read_barrier_mark_reg17;
-  qpoints->pReadBarrierMarkReg18 = art_quick_read_barrier_mark_reg18;
-  qpoints->pReadBarrierMarkReg19 = art_quick_read_barrier_mark_reg19;
-  qpoints->pReadBarrierMarkReg20 = art_quick_read_barrier_mark_reg20;
-  qpoints->pReadBarrierMarkReg21 = art_quick_read_barrier_mark_reg21;
-  qpoints->pReadBarrierMarkReg22 = art_quick_read_barrier_mark_reg22;
-  qpoints->pReadBarrierMarkReg23 = art_quick_read_barrier_mark_reg23;
-  qpoints->pReadBarrierMarkReg24 = art_quick_read_barrier_mark_reg24;
-  qpoints->pReadBarrierMarkReg25 = art_quick_read_barrier_mark_reg25;
-  qpoints->pReadBarrierMarkReg26 = art_quick_read_barrier_mark_reg26;
-  qpoints->pReadBarrierMarkReg27 = art_quick_read_barrier_mark_reg27;
-  qpoints->pReadBarrierMarkReg28 = art_quick_read_barrier_mark_reg28;
-  qpoints->pReadBarrierMarkReg29 = art_quick_read_barrier_mark_reg29;
+  UpdateReadBarrierEntrypoints(qpoints, /*is_marking*/ false);
   qpoints->pReadBarrierSlow = artReadBarrierSlow;
   qpoints->pReadBarrierForRootSlow = artReadBarrierForRootSlow;
 };
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index d8ebe26..73bca03 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -2403,12 +2403,15 @@
      *    w2:   Starting offset in string data
      */
 ENTRY art_quick_indexof
+#if (STRING_COMPRESSION_FEATURE)
+    ldr   w4, [x0, #MIRROR_STRING_COUNT_OFFSET]
+#else
     ldr   w3, [x0, #MIRROR_STRING_COUNT_OFFSET]
+#endif
     add   x0, x0, #MIRROR_STRING_VALUE_OFFSET
 #if (STRING_COMPRESSION_FEATURE)
     /* w4 holds count (with flag) and w3 holds actual length */
-    mov   w4, w3
-    and   w3, w3, #2147483647
+    lsr   w3, w4, #1
 #endif
     /* Clamp start to [0..count] */
     cmp   w2, #0
@@ -2420,7 +2423,7 @@
     mov   x5, x0
 
 #if (STRING_COMPRESSION_FEATURE)
-    tbnz  w4, #31, .Lstring_indexof_compressed
+    tbz   w4, #0, .Lstring_indexof_compressed
 #endif
     /* Build pointer to start of data to compare and pre-bias */
     add   x0, x0, x2, lsl #1
diff --git a/runtime/arch/mips/entrypoints_init_mips.cc b/runtime/arch/mips/entrypoints_init_mips.cc
index e3230f6..6dca46f 100644
--- a/runtime/arch/mips/entrypoints_init_mips.cc
+++ b/runtime/arch/mips/entrypoints_init_mips.cc
@@ -59,6 +59,10 @@
 extern "C" int64_t __divdi3(int64_t, int64_t);
 extern "C" int64_t __moddi3(int64_t, int64_t);
 
+// No read barrier entrypoints for marking registers.
+void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints ATTRIBUTE_UNUSED,
+                                  bool is_marking ATTRIBUTE_UNUSED) {}
+
 void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) {
   // Note: MIPS has asserts checking for the type of entrypoint. Don't move it
   //       to InitDefaultEntryPoints().
diff --git a/runtime/arch/mips64/entrypoints_init_mips64.cc b/runtime/arch/mips64/entrypoints_init_mips64.cc
index 43b73f1..0e81906 100644
--- a/runtime/arch/mips64/entrypoints_init_mips64.cc
+++ b/runtime/arch/mips64/entrypoints_init_mips64.cc
@@ -59,6 +59,10 @@
 extern "C" int64_t __divdi3(int64_t, int64_t);
 extern "C" int64_t __moddi3(int64_t, int64_t);
 
+// No read barrier entrypoints for marking registers.
+void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints ATTRIBUTE_UNUSED,
+                                  bool is_marking ATTRIBUTE_UNUSED) {}
+
 void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) {
   DefaultInitEntryPoints(jpoints, qpoints);
 
diff --git a/runtime/arch/x86/entrypoints_init_x86.cc b/runtime/arch/x86/entrypoints_init_x86.cc
index 877df8f..94fea69 100644
--- a/runtime/arch/x86/entrypoints_init_x86.cc
+++ b/runtime/arch/x86/entrypoints_init_x86.cc
@@ -44,6 +44,16 @@
 extern "C" mirror::Object* art_quick_read_barrier_slow(mirror::Object*, mirror::Object*, uint32_t);
 extern "C" mirror::Object* art_quick_read_barrier_for_root_slow(GcRoot<mirror::Object>*);
 
+void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_marking) {
+  qpoints->pReadBarrierMarkReg00 = is_marking ? art_quick_read_barrier_mark_reg00 : nullptr;
+  qpoints->pReadBarrierMarkReg01 = is_marking ? art_quick_read_barrier_mark_reg01 : nullptr;
+  qpoints->pReadBarrierMarkReg02 = is_marking ? art_quick_read_barrier_mark_reg02 : nullptr;
+  qpoints->pReadBarrierMarkReg03 = is_marking ? art_quick_read_barrier_mark_reg03 : nullptr;
+  qpoints->pReadBarrierMarkReg05 = is_marking ? art_quick_read_barrier_mark_reg05 : nullptr;
+  qpoints->pReadBarrierMarkReg06 = is_marking ? art_quick_read_barrier_mark_reg06 : nullptr;
+  qpoints->pReadBarrierMarkReg07 = is_marking ? art_quick_read_barrier_mark_reg07 : nullptr;
+}
+
 void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) {
   DefaultInitEntryPoints(jpoints, qpoints);
 
@@ -87,14 +97,8 @@
 
   // Read barrier.
   qpoints->pReadBarrierJni = ReadBarrierJni;
-  qpoints->pReadBarrierMarkReg00 = art_quick_read_barrier_mark_reg00;
-  qpoints->pReadBarrierMarkReg01 = art_quick_read_barrier_mark_reg01;
-  qpoints->pReadBarrierMarkReg02 = art_quick_read_barrier_mark_reg02;
-  qpoints->pReadBarrierMarkReg03 = art_quick_read_barrier_mark_reg03;
+  UpdateReadBarrierEntrypoints(qpoints, /*is_marking*/ false);
   qpoints->pReadBarrierMarkReg04 = nullptr;  // Cannot use register 4 (ESP) to pass arguments.
-  qpoints->pReadBarrierMarkReg05 = art_quick_read_barrier_mark_reg05;
-  qpoints->pReadBarrierMarkReg06 = art_quick_read_barrier_mark_reg06;
-  qpoints->pReadBarrierMarkReg07 = art_quick_read_barrier_mark_reg07;
   // x86 has only 8 core registers.
   qpoints->pReadBarrierMarkReg08 = nullptr;
   qpoints->pReadBarrierMarkReg09 = nullptr;
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 635bfa3..761a510 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -2035,15 +2035,14 @@
     lea MIRROR_STRING_VALUE_OFFSET(%ecx), %edi
 #if (STRING_COMPRESSION_FEATURE)
     /* Differ cases */
-    cmpl    LITERAL(0), %edx
-    jl      .Lstring_compareto_this_is_compressed
-    cmpl    LITERAL(0), %ebx
-    jl      .Lstring_compareto_that_is_compressed
+    shrl    LITERAL(1), %edx
+    jnc     .Lstring_compareto_this_is_compressed
+    shrl    LITERAL(1), %ebx
+    jnc     .Lstring_compareto_that_is_compressed
     jmp     .Lstring_compareto_both_not_compressed
 .Lstring_compareto_this_is_compressed:
-    andl    LITERAL(0x7FFFFFFF), %edx
-    cmpl    LITERAL(0), %ebx
-    jl      .Lstring_compareto_both_compressed
+    shrl    LITERAL(1), %ebx
+    jnc     .Lstring_compareto_both_compressed
     /* If (this->IsCompressed() && that->IsCompressed() == false) */
     mov     %edx, %eax
     subl    %ebx, %eax
@@ -2061,7 +2060,6 @@
     cmovne  %edx, %eax                        // return eax = *(this_cur_char) - *(that_cur_char)
     jmp     .Lstring_compareto_return
 .Lstring_compareto_that_is_compressed:
-    andl    LITERAL(0x7FFFFFFF), %ebx
     mov     %edx, %eax
     subl    %ebx, %eax
     mov     %edx, %ecx
@@ -2078,7 +2076,6 @@
     cmovne  %edx, %eax
     jmp     .Lstring_compareto_return         // return eax = *(this_cur_char) - *(that_cur_char)
 .Lstring_compareto_both_compressed:
-    andl    LITERAL(0x7FFFFFFF), %ebx
     /* Calculate min length and count diff */
     mov     %edx, %ecx
     mov     %edx, %eax
diff --git a/runtime/arch/x86_64/entrypoints_init_x86_64.cc b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
index 59c9dfe..6b66e62 100644
--- a/runtime/arch/x86_64/entrypoints_init_x86_64.cc
+++ b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
@@ -55,6 +55,24 @@
 extern "C" mirror::Object* art_quick_read_barrier_slow(mirror::Object*, mirror::Object*, uint32_t);
 extern "C" mirror::Object* art_quick_read_barrier_for_root_slow(GcRoot<mirror::Object>*);
 
+void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_marking) {
+  qpoints->pReadBarrierMarkReg00 = is_marking ? art_quick_read_barrier_mark_reg00 : nullptr;
+  qpoints->pReadBarrierMarkReg01 = is_marking ? art_quick_read_barrier_mark_reg01 : nullptr;
+  qpoints->pReadBarrierMarkReg02 = is_marking ? art_quick_read_barrier_mark_reg02 : nullptr;
+  qpoints->pReadBarrierMarkReg03 = is_marking ? art_quick_read_barrier_mark_reg03 : nullptr;
+  qpoints->pReadBarrierMarkReg05 = is_marking ? art_quick_read_barrier_mark_reg05 : nullptr;
+  qpoints->pReadBarrierMarkReg06 = is_marking ? art_quick_read_barrier_mark_reg06 : nullptr;
+  qpoints->pReadBarrierMarkReg07 = is_marking ? art_quick_read_barrier_mark_reg07 : nullptr;
+  qpoints->pReadBarrierMarkReg08 = is_marking ? art_quick_read_barrier_mark_reg08 : nullptr;
+  qpoints->pReadBarrierMarkReg09 = is_marking ? art_quick_read_barrier_mark_reg09 : nullptr;
+  qpoints->pReadBarrierMarkReg10 = is_marking ? art_quick_read_barrier_mark_reg10 : nullptr;
+  qpoints->pReadBarrierMarkReg11 = is_marking ? art_quick_read_barrier_mark_reg11 : nullptr;
+  qpoints->pReadBarrierMarkReg12 = is_marking ? art_quick_read_barrier_mark_reg12 : nullptr;
+  qpoints->pReadBarrierMarkReg13 = is_marking ? art_quick_read_barrier_mark_reg13 : nullptr;
+  qpoints->pReadBarrierMarkReg14 = is_marking ? art_quick_read_barrier_mark_reg14 : nullptr;
+  qpoints->pReadBarrierMarkReg15 = is_marking ? art_quick_read_barrier_mark_reg15 : nullptr;
+}
+
 void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) {
 #if defined(__APPLE__)
   UNUSED(jpoints, qpoints);
@@ -101,22 +119,8 @@
 
   // Read barrier.
   qpoints->pReadBarrierJni = ReadBarrierJni;
-  qpoints->pReadBarrierMarkReg00 = art_quick_read_barrier_mark_reg00;
-  qpoints->pReadBarrierMarkReg01 = art_quick_read_barrier_mark_reg01;
-  qpoints->pReadBarrierMarkReg02 = art_quick_read_barrier_mark_reg02;
-  qpoints->pReadBarrierMarkReg03 = art_quick_read_barrier_mark_reg03;
+  UpdateReadBarrierEntrypoints(qpoints, /*is_marking*/ false);
   qpoints->pReadBarrierMarkReg04 = nullptr;  // Cannot use register 4 (RSP) to pass arguments.
-  qpoints->pReadBarrierMarkReg05 = art_quick_read_barrier_mark_reg05;
-  qpoints->pReadBarrierMarkReg06 = art_quick_read_barrier_mark_reg06;
-  qpoints->pReadBarrierMarkReg07 = art_quick_read_barrier_mark_reg07;
-  qpoints->pReadBarrierMarkReg08 = art_quick_read_barrier_mark_reg08;
-  qpoints->pReadBarrierMarkReg09 = art_quick_read_barrier_mark_reg09;
-  qpoints->pReadBarrierMarkReg10 = art_quick_read_barrier_mark_reg10;
-  qpoints->pReadBarrierMarkReg11 = art_quick_read_barrier_mark_reg11;
-  qpoints->pReadBarrierMarkReg12 = art_quick_read_barrier_mark_reg12;
-  qpoints->pReadBarrierMarkReg13 = art_quick_read_barrier_mark_reg13;
-  qpoints->pReadBarrierMarkReg14 = art_quick_read_barrier_mark_reg14;
-  qpoints->pReadBarrierMarkReg15 = art_quick_read_barrier_mark_reg15;
   // x86-64 has only 16 core registers.
   qpoints->pReadBarrierMarkReg16 = nullptr;
   qpoints->pReadBarrierMarkReg17 = nullptr;
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 72a03eb..20ee3f5 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -2142,15 +2142,14 @@
     leal MIRROR_STRING_VALUE_OFFSET(%esi), %esi
 #if (STRING_COMPRESSION_FEATURE)
     /* Differ cases */
-    cmpl LITERAL(0), %r8d
-    jl      .Lstring_compareto_this_is_compressed
-    cmpl    LITERAL(0), %r9d
-    jl      .Lstring_compareto_that_is_compressed
+    shrl    LITERAL(1), %r8d
+    jnc     .Lstring_compareto_this_is_compressed
+    shrl    LITERAL(1), %r9d
+    jnc     .Lstring_compareto_that_is_compressed
     jmp     .Lstring_compareto_both_not_compressed
 .Lstring_compareto_this_is_compressed:
-    andl    LITERAL(0x7FFFFFFF), %r8d
-    cmpl    LITERAL(0), %r9d
-    jl      .Lstring_compareto_both_compressed
+    shrl    LITERAL(1), %r9d
+    jnc     .Lstring_compareto_both_compressed
     /* Comparison this (8-bit) and that (16-bit) */
     mov     %r8d, %eax
     subl    %r9d, %eax
@@ -2169,7 +2168,6 @@
 .Lstring_compareto_keep_length1:
     ret
 .Lstring_compareto_that_is_compressed:
-    andl    LITERAL(0x7FFFFFFF), %r9d
     movl    %r8d, %eax
     subl    %r9d, %eax
     mov     %r8d, %ecx
@@ -2187,7 +2185,6 @@
 .Lstring_compareto_keep_length2:
     ret
 .Lstring_compareto_both_compressed:
-    andl    LITERAL(0x7FFFFFFF), %r9d
     /* Calculate min length and count diff */
     movl    %r8d, %ecx
     movl    %r8d, %eax
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 11d6849..2e72ada 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -246,7 +246,7 @@
     Thread* self = Thread::Current();
     CHECK(thread == self || thread->IsSuspended() || thread->GetState() == kWaitingPerformingGc)
         << thread->GetState() << " thread " << thread << " self " << self;
-    thread->SetIsGcMarking(true);
+    thread->SetIsGcMarkingAndUpdateEntrypoints(true);
     if (use_tlab_ && thread->HasTlab()) {
       if (ConcurrentCopying::kEnableFromSpaceAccountingCheck) {
         // This must come before the revoke.
@@ -746,7 +746,7 @@
     // Disable the thread-local is_gc_marking flag.
     // Note a thread that has just started right before this checkpoint may have already this flag
     // set to false, which is ok.
-    thread->SetIsGcMarking(false);
+    thread->SetIsGcMarkingAndUpdateEntrypoints(false);
     // If thread is a running mutator, then act on behalf of the garbage collector.
     // See the code in ThreadList::RunCheckpoint.
     concurrent_copying_->GetBarrier().Pass(self);
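
The renamed SetIsGcMarkingAndUpdateEntrypoints presumably couples the thread-local flag with the per-architecture UpdateReadBarrierEntrypoints helper added above; its body is outside this diff, so the following is only an assumption-labeled sketch (field names assumed):

    // Sketch (assumed, not shown in this diff): keep the thread-local flag and
    // the per-thread read barrier mark entrypoints in sync.
    void Thread::SetIsGcMarkingAndUpdateEntrypoints(bool is_marking) {
      tls32_.is_gc_marking = is_marking;                        // Assumed field name.
      UpdateReadBarrierEntrypoints(&tlsPtr_.quick_entrypoints,  // Helper added above.
                                   is_marking);
    }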
diff --git a/runtime/interpreter/interpreter_common.cc b/runtime/interpreter/interpreter_common.cc
index 5e4bb41..cb775cd 100644
--- a/runtime/interpreter/interpreter_common.cc
+++ b/runtime/interpreter/interpreter_common.cc
@@ -827,6 +827,20 @@
   return klass;
 }
 
+// Returns true iff. the callsite type for a polymorphic invoke is
+// transformer-like, i.e. that it has a single input argument whose type is
+// dalvik.system.EmulatedStackFrame.
+static inline bool IsCallerTransformer(Handle<mirror::MethodType> callsite_type)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  ObjPtr<mirror::ObjectArray<mirror::Class>> param_types(callsite_type->GetPTypes());
+  if (param_types->GetLength() == 1) {
+    ObjPtr<mirror::Class> param(param_types->GetWithoutChecks(0));
+    return param == WellKnownClasses::ToClass(WellKnownClasses::dalvik_system_EmulatedStackFrame);
+  }
+
+  return false;
+}
+
 template<bool is_range, bool do_access_check>
 inline bool DoInvokePolymorphic(Thread* self,
                                 ShadowFrame& shadow_frame,
@@ -838,6 +852,11 @@
   const uint32_t vRegC = (is_range) ? inst->VRegC_4rcc() : inst->VRegC_45cc();
   const int invoke_method_idx = (is_range) ? inst->VRegB_4rcc() : inst->VRegB_45cc();
 
+  // Initialize |result| to 0 as this is the default return value for
+  // polymorphic invocations of method handle types with void return
+  // and provides a sane return value in error cases.
+  result->SetJ(0);
+
   // Determine if this invocation is MethodHandle.invoke() or
   // MethodHandle.invokeExact().
   bool is_invoke_exact = IsInvokeExact(shadow_frame.GetMethod()->GetDeclaringClass()->GetDexFile(),
@@ -859,7 +878,6 @@
     // Note that the invoke type is kVirtual here because a call to a signature
     // polymorphic method is shaped like a virtual call at the bytecode level.
     ThrowNullPointerExceptionForMethodAccess(invoke_method_idx, InvokeType::kVirtual);
-    result->SetJ(0);
     return false;
   }
 
@@ -880,14 +898,13 @@
   // This implies we couldn't resolve one or more types in this method handle.
   if (UNLIKELY(callsite_type.Get() == nullptr)) {
     CHECK(self->IsExceptionPending());
-    result->SetJ(0);
     return false;
   }
 
   const MethodHandleKind handle_kind = method_handle->GetHandleKind();
   Handle<mirror::MethodType> handle_type(hs.NewHandle(method_handle->GetMethodType()));
   CHECK(handle_type.Get() != nullptr);
-  if (is_invoke_exact) {
+  {
     // We need to check the nominal type of the handle in addition to the
     // real type. The "nominal" type is present when MethodHandle.asType is
     // called any handle, and results in the declared type of the handle
@@ -900,9 +917,17 @@
       check_type.Assign(nominal_type.Ptr());
     }
 
-    if (UNLIKELY(!callsite_type->IsExactMatch(check_type.Ptr()))) {
-      ThrowWrongMethodTypeException(check_type.Ptr(), callsite_type.Get());
-      return false;
+    if (is_invoke_exact) {
+      if (UNLIKELY(!callsite_type->IsExactMatch(check_type.Ptr()))) {
+        ThrowWrongMethodTypeException(check_type.Ptr(), callsite_type.Get());
+        return false;
+      }
+    } else {
+      if (UNLIKELY(!IsCallerTransformer(callsite_type) &&
+                   !callsite_type->IsConvertible(check_type.Ptr()))) {
+        ThrowWrongMethodTypeException(check_type.Ptr(), callsite_type.Get());
+        return false;
+      }
     }
   }
 
@@ -932,7 +957,7 @@
       // TODO: Unfortunately, we have to postpone dynamic receiver based checks
       // because the receiver might be cast or might come from an emulated stack
       // frame, which means that it is unknown at this point. We perform these
-      // checks inside DoCallPolymorphic right before we do the actualy invoke.
+      // checks inside DoCallPolymorphic right before we do the actual invoke.
     } else if (handle_kind == kInvokeDirect) {
       // String constructors are a special case, they are replaced with StringFactory
       // methods.
@@ -965,40 +990,38 @@
       CHECK(called_method != nullptr);
     }
 
+    bool call_success;
     if (handle_kind == kInvokeTransform) {
-      return DoCallTransform<is_range>(called_method,
-                                       callsite_type,
-                                       handle_type,
-                                       self,
-                                       shadow_frame,
-                                       method_handle /* receiver */,
-                                       result,
-                                       arg,
-                                       first_src_reg);
+      call_success = DoCallTransform<is_range>(called_method,
+                                               callsite_type,
+                                               handle_type,
+                                               self,
+                                               shadow_frame,
+                                               method_handle /* receiver */,
+                                               result,
+                                               arg,
+                                               first_src_reg);
     } else {
-      return DoCallPolymorphic<is_range>(called_method,
-                                         callsite_type,
-                                         handle_type,
-                                         self,
-                                         shadow_frame,
-                                         result,
-                                         arg,
-                                         first_src_reg,
-                                         handle_kind);
+      call_success = DoCallPolymorphic<is_range>(called_method,
+                                                 callsite_type,
+                                                 handle_type,
+                                                 self,
+                                                 shadow_frame,
+                                                 result,
+                                                 arg,
+                                                 first_src_reg,
+                                                 handle_kind);
     }
+    if (LIKELY(call_success && ConvertReturnValue(callsite_type, handle_type, result))) {
+      return true;
+    }
+    DCHECK(self->IsExceptionPending());
+    return false;
   } else {
     DCHECK(!is_range);
     ArtField* field = method_handle->GetTargetField();
     Primitive::Type field_type = field->GetTypeAsPrimitiveType();;
 
-    if (!is_invoke_exact) {
-      if (handle_type->GetPTypes()->GetLength() != callsite_type->GetPTypes()->GetLength()) {
-        // Too many arguments to setter or getter.
-        ThrowWrongMethodTypeException(callsite_type.Get(), handle_type.Get());
-        return false;
-      }
-    }
-
     switch (handle_kind) {
       case kInstanceGet: {
         ObjPtr<mirror::Object> obj = shadow_frame.GetVRegReference(first_src_reg);
@@ -1029,7 +1052,6 @@
           return false;
         }
         ObjPtr<mirror::Object> obj = shadow_frame.GetVRegReference(first_src_reg);
-        result->SetL(0);
         return DoFieldPutForInvokePolymorphic(self, shadow_frame, obj, field, field_type, value);
       }
       case kStaticPut: {
@@ -1039,7 +1061,6 @@
           return false;
         }
         ObjPtr<mirror::Object> obj = field->GetDeclaringClass();
-        result->SetL(0);
         return DoFieldPutForInvokePolymorphic(self, shadow_frame, obj, field, field_type, value);
       }
       default:
@@ -1120,20 +1141,6 @@
   }
 }
 
-// Returns true iff. the callsite type for a polymorphic invoke is transformer
-// like, i.e that it has a single input argument whose type is
-// dalvik.system.EmulatedStackFrame.
-static inline bool IsCallerTransformer(Handle<mirror::MethodType> callsite_type)
-    REQUIRES_SHARED(Locks::mutator_lock_) {
-  ObjPtr<mirror::ObjectArray<mirror::Class>> param_types(callsite_type->GetPTypes());
-  if (param_types->GetLength() == 1) {
-    ObjPtr<mirror::Class> param(param_types->GetWithoutChecks(0));
-    return param == WellKnownClasses::ToClass(WellKnownClasses::dalvik_system_EmulatedStackFrame);
-  }
-
-  return false;
-}
-
 template <bool is_range>
 static inline bool DoCallPolymorphic(ArtMethod* called_method,
                                      Handle<mirror::MethodType> callsite_type,
@@ -1245,8 +1252,6 @@
 
   PerformCall(self, code_item, shadow_frame.GetMethod(), first_dest_reg, new_shadow_frame, result);
 
-  // TODO(narayan): Perform return value conversions.
-
   // If the caller of this signature polymorphic method was a transformer,
   // we need to copy the result back out to the emulated stack frame.
   if (is_caller_transformer && !self->IsExceptionPending()) {
diff --git a/runtime/interpreter/unstarted_runtime.cc b/runtime/interpreter/unstarted_runtime.cc
index 6bf7e15..2257fd6 100644
--- a/runtime/interpreter/unstarted_runtime.cc
+++ b/runtime/interpreter/unstarted_runtime.cc
@@ -1097,10 +1097,12 @@
     return;
   }
   DCHECK_GE(start, 0);
-  DCHECK_GE(end, string->GetLength());
+  DCHECK_LE(start, end);
+  DCHECK_LE(end, string->GetLength());
   StackHandleScope<1> hs(self);
   Handle<mirror::CharArray> h_char_array(
       hs.NewHandle(shadow_frame->GetVRegReference(arg_offset + 3)->AsCharArray()));
+  DCHECK_GE(index, 0);
   DCHECK_LE(index, h_char_array->GetLength());
   DCHECK_LE(end - start, h_char_array->GetLength() - index);
   string->GetChars(start, end, h_char_array, index);
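
For a concrete check of the corrected invariants (a sketch mirroring the DCHECKs, not ART code):

    #include <cassert>
    #include <string>
    #include <vector>

    // The corrected bounds for String.getChars(start, end, dst, index), with a
    // concrete example: copying "ell" out of "hello" into dst[0..2].
    int main() {
      std::string src = "hello";
      int start = 1, end = 4, index = 0;
      std::vector<char> dst(3);
      assert(0 <= start && start <= end && end <= static_cast<int>(src.size()));
      assert(0 <= index && end - start <= static_cast<int>(dst.size()) - index);
      for (int i = start; i < end; ++i) dst[index + i - start] = src[i];
      return 0;
    }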
diff --git a/runtime/jni_internal_test.cc b/runtime/jni_internal_test.cc
index e990935..a421c34 100644
--- a/runtime/jni_internal_test.cc
+++ b/runtime/jni_internal_test.cc
@@ -679,12 +679,8 @@
   ASSERT_TRUE(env_->IsInstanceOf(o, c));
   // ...whose fields haven't been initialized because
   // we didn't call a constructor.
-  if (art::mirror::kUseStringCompression) {
-    // Zero-length string is compressed, so the length internally will be -(1 << 31).
-    ASSERT_EQ(-2147483648, env_->GetIntField(o, env_->GetFieldID(c, "count", "I")));
-  } else {
-    ASSERT_EQ(0, env_->GetIntField(o, env_->GetFieldID(c, "count", "I")));
-  }
+  // Even with string compression, an empty string has `count == 0`.
+  ASSERT_EQ(0, env_->GetIntField(o, env_->GetFieldID(c, "count", "I")));
 }
 
 TEST_F(JniInternalTest, GetVersion) {
@@ -895,11 +891,12 @@
   // Make sure we can actually use it.
   jstring s = env_->NewStringUTF("poop");
   if (mirror::kUseStringCompression) {
-    // Negative because s is compressed (first bit is 1)
-    ASSERT_EQ(-2147483644, env_->GetIntField(s, fid2));
+    ASSERT_EQ(mirror::String::GetFlaggedCount(4, /* compressible */ true),
+              env_->GetIntField(s, fid2));
     // Create incompressible string
     jstring s_16 = env_->NewStringUTF("\u0444\u0444");
-    ASSERT_EQ(2, env_->GetIntField(s_16, fid2));
+    ASSERT_EQ(mirror::String::GetFlaggedCount(2, /* compressible */ false),
+              env_->GetIntField(s_16, fid2));
   } else {
     ASSERT_EQ(4, env_->GetIntField(s, fid2));
   }
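
The expected values follow from the new count encoding introduced later in this patch (runtime/mirror/string.h): the low bit holds the compression flag (kCompressed = 0, kUncompressed = 1) and the upper bits hold the length. Worked arithmetic as a sketch (illustration only; FlaggedCount is a hypothetical stand-in for String::GetFlaggedCount with compression enabled):

    #include <cstdint>
    // Illustration only: mirrors String::GetFlaggedCount() when compression is on.
    constexpr int32_t FlaggedCount(int32_t length, bool compressible) {
      return (length << 1) | (compressible ? 0 : 1);  // kCompressed = 0, kUncompressed = 1
    }
    static_assert(FlaggedCount(4, /* compressible */ true) == 8,
                  "compressed \"poop\": (4 << 1) | 0");
    static_assert(FlaggedCount(2, /* compressible */ false) == 5,
                  "incompressible two-char string: (2 << 1) | 1");
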
diff --git a/runtime/method_handles.cc b/runtime/method_handles.cc
index 491d139..3c22d7f 100644
--- a/runtime/method_handles.cc
+++ b/runtime/method_handles.cc
@@ -21,168 +21,70 @@
 #include "jvalue-inl.h"
 #include "reflection.h"
 #include "reflection-inl.h"
+#include "well_known_classes.h"
 
 namespace art {
 
 namespace {
 
-static const char* kBoxedBooleanClass = "Ljava/lang/Boolean;";
-static const char* kBoxedByteClass = "Ljava/lang/Byte;";
-static const char* kBoxedCharacterClass = "Ljava/lang/Character;";
-static const char* kBoxedDoubleClass = "Ljava/lang/Double;";
-static const char* kBoxedFloatClass = "Ljava/lang/Float;";
-static const char* kBoxedIntegerClass = "Ljava/lang/Integer;";
-static const char* kBoxedLongClass = "Ljava/lang/Long;";
-static const char* kBoxedShortClass = "Ljava/lang/Short;";
+#define PRIMITIVES_LIST(V) \
+  V(Primitive::kPrimBoolean, Boolean, Boolean, Z) \
+  V(Primitive::kPrimByte, Byte, Byte, B)          \
+  V(Primitive::kPrimChar, Char, Character, C)     \
+  V(Primitive::kPrimShort, Short, Short, S)       \
+  V(Primitive::kPrimInt, Int, Integer, I)         \
+  V(Primitive::kPrimLong, Long, Long, J)          \
+  V(Primitive::kPrimFloat, Float, Float, F)       \
+  V(Primitive::kPrimDouble, Double, Double, D)
 
 // Assigns |type| to the primitive type associated with |klass|. Returns
 // true iff. |klass| was a boxed type (Integer, Long etc.), false otherwise.
 bool GetUnboxedPrimitiveType(ObjPtr<mirror::Class> klass, Primitive::Type* type)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   ScopedAssertNoThreadSuspension ants(__FUNCTION__);
-  if (klass->DescriptorEquals(kBoxedBooleanClass)) {
-    (*type) = Primitive::kPrimBoolean;
-    return true;
-  } else if (klass->DescriptorEquals(kBoxedByteClass)) {
-    (*type) = Primitive::kPrimByte;
-    return true;
-  } else if (klass->DescriptorEquals(kBoxedCharacterClass)) {
-    (*type) = Primitive::kPrimChar;
-    return true;
-  } else if (klass->DescriptorEquals(kBoxedFloatClass)) {
-    (*type) = Primitive::kPrimFloat;
-    return true;
-  } else if (klass->DescriptorEquals(kBoxedDoubleClass)) {
-    (*type) = Primitive::kPrimDouble;
-    return true;
-  } else if (klass->DescriptorEquals(kBoxedIntegerClass)) {
-    (*type) = Primitive::kPrimInt;
-    return true;
-  } else if (klass->DescriptorEquals(kBoxedLongClass)) {
-    (*type) = Primitive::kPrimLong;
-    return true;
-  } else if (klass->DescriptorEquals(kBoxedShortClass)) {
-    (*type) = Primitive::kPrimShort;
-    return true;
-  } else {
-    return false;
+#define LOOKUP_PRIMITIVE(primitive, _, __, ___)                         \
+  if (klass->DescriptorEquals(Primitive::BoxedDescriptor(primitive))) { \
+    *type = primitive;                                                  \
+    return true;                                                        \
   }
-}
 
-// Returns the class corresponding to the boxed type for the primitive |type|.
-ObjPtr<mirror::Class> GetBoxedPrimitiveClass(Primitive::Type type)
-    REQUIRES_SHARED(Locks::mutator_lock_) {
-  ScopedAssertNoThreadSuspension ants(__FUNCTION__);
-  ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
-  switch (type) {
-    case Primitive::kPrimBoolean:
-      return class_linker->FindSystemClass(Thread::Current(), kBoxedBooleanClass);
-    case Primitive::kPrimByte:
-      return class_linker->FindSystemClass(Thread::Current(), kBoxedByteClass);
-    case Primitive::kPrimChar:
-      return class_linker->FindSystemClass(Thread::Current(), kBoxedCharacterClass);
-    case Primitive::kPrimShort:
-      return class_linker->FindSystemClass(Thread::Current(), kBoxedShortClass);
-    case Primitive::kPrimInt:
-      return class_linker->FindSystemClass(Thread::Current(), kBoxedIntegerClass);
-    case Primitive::kPrimLong:
-      return class_linker->FindSystemClass(Thread::Current(), kBoxedLongClass);
-    case Primitive::kPrimFloat:
-      return class_linker->FindSystemClass(Thread::Current(), kBoxedFloatClass);
-    case Primitive::kPrimDouble:
-      return class_linker->FindSystemClass(Thread::Current(), kBoxedDoubleClass);
-    case Primitive::kPrimNot:
-    case Primitive::kPrimVoid:
-      LOG(FATAL) << "Unreachable";
-      return nullptr;
-  }
-}
-
-// Returns true if |klass| is a boxed primitive type or a sub-class of a boxed primitive type.
-bool IsSubClassOfBoxedPrimitive(const Handle<mirror::Class>& klass)
-    REQUIRES_SHARED(Locks::mutator_lock_) {
-  StackHandleScope<1> hs(Thread::Current());
-  MutableHandle<mirror::Class> h_klass(hs.NewHandle(klass.Get()));
-  do {
-    Primitive::Type type;
-    if (GetUnboxedPrimitiveType(h_klass.Get(), &type)) {
-      return true;
-    }
-    h_klass.Assign(h_klass->GetSuperClass());
-  } while (h_klass.Get() != nullptr);
+  PRIMITIVES_LIST(LOOKUP_PRIMITIVE);
+#undef LOOKUP_PRIMITIVE
   return false;
 }
 
-// Unboxed the value |o| to |unboxed_value| of type |dst_class|.
-// |unboxed_value| must be zero on entry to avoid dangling pointers.
-// Returns true on success, false if an exception is raised.
-bool UnboxPrimitiveForMethodHandles(ObjPtr<mirror::Object> o,
-                                    ObjPtr<mirror::Class> dst_class,
-                                    JValue* unboxed_value)
+ObjPtr<mirror::Class> GetBoxedPrimitiveClass(Primitive::Type type)
     REQUIRES_SHARED(Locks::mutator_lock_) {
-  // Check unboxed_value does not contain a dangling pointer.
-  DCHECK_EQ(unboxed_value->GetJ(), 0);
-  DCHECK(dst_class->IsPrimitive());
-
-  // This is derived from UnboxPrimitive() in reflection.cc, but with
-  // exceptions appropriate to method handles.
-  if (UNLIKELY(dst_class->GetPrimitiveType() == Primitive::kPrimVoid)) {
-    ThrowClassCastException(o->GetClass(), dst_class);
-    return false;
+  ScopedAssertNoThreadSuspension ants(__FUNCTION__);
+  jmethodID m = nullptr;
+  switch (type) {
+#define CASE_PRIMITIVE(primitive, _, java_name, __)              \
+    case primitive:                                              \
+      m = WellKnownClasses::java_lang_ ## java_name ## _valueOf; \
+      break;
+    PRIMITIVES_LIST(CASE_PRIMITIVE);
+#undef CASE_PRIMITIVE
+    case Primitive::Type::kPrimNot:
+    case Primitive::Type::kPrimVoid:
+      return nullptr;
   }
-  if (UNLIKELY(o == nullptr)) {
-    ThrowNullPointerException(
-        StringPrintf("Expected to unbox a '%s' primitive type but was returned null",
-                     dst_class->PrettyDescriptor().c_str()).c_str());
-    return false;
-  }
+  return jni::DecodeArtMethod(m)->GetDeclaringClass();
+}
 
-  JValue boxed_value;
+bool GetUnboxedTypeAndValue(ObjPtr<mirror::Object> o, Primitive::Type* type, JValue* value)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  ScopedAssertNoThreadSuspension ants(__FUNCTION__);
   ObjPtr<mirror::Class> klass = o->GetClass();
-  ObjPtr<mirror::Class> src_class = nullptr;
-  ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
   ArtField* primitive_field = &klass->GetIFieldsPtr()->At(0);
-  if (klass->DescriptorEquals(kBoxedBooleanClass)) {
-    src_class = class_linker->FindPrimitiveClass('Z');
-    boxed_value.SetZ(primitive_field->GetBoolean(o));
-  } else if (klass->DescriptorEquals(kBoxedByteClass)) {
-    src_class = class_linker->FindPrimitiveClass('B');
-    boxed_value.SetB(primitive_field->GetByte(o));
-  } else if (klass->DescriptorEquals(kBoxedCharacterClass)) {
-    src_class = class_linker->FindPrimitiveClass('C');
-    boxed_value.SetC(primitive_field->GetChar(o));
-  } else if (klass->DescriptorEquals(kBoxedFloatClass)) {
-    src_class = class_linker->FindPrimitiveClass('F');
-    boxed_value.SetF(primitive_field->GetFloat(o));
-  } else if (klass->DescriptorEquals(kBoxedDoubleClass)) {
-    src_class = class_linker->FindPrimitiveClass('D');
-    boxed_value.SetD(primitive_field->GetDouble(o));
-  } else if (klass->DescriptorEquals(kBoxedIntegerClass)) {
-    src_class = class_linker->FindPrimitiveClass('I');
-    boxed_value.SetI(primitive_field->GetInt(o));
-  } else if (klass->DescriptorEquals(kBoxedLongClass)) {
-    src_class = class_linker->FindPrimitiveClass('J');
-    boxed_value.SetJ(primitive_field->GetLong(o));
-  } else if (klass->DescriptorEquals(kBoxedShortClass)) {
-    src_class = class_linker->FindPrimitiveClass('S');
-    boxed_value.SetS(primitive_field->GetShort(o));
-  } else {
-    std::string temp;
-    ThrowIllegalArgumentException(
-        StringPrintf("result has type %s, got %s",
-                     dst_class->PrettyDescriptor().c_str(),
-                     PrettyDescriptor(o->GetClass()->GetDescriptor(&temp)).c_str()).c_str());
-    return false;
+#define CASE_PRIMITIVE(primitive, abbrev, _, shorthand)         \
+  if (klass == GetBoxedPrimitiveClass(primitive)) {             \
+    *type = primitive;                                          \
+    value->Set ## shorthand(primitive_field->Get ## abbrev(o)); \
+    return true;                                                \
   }
-
-  if (!ConvertPrimitiveValueNoThrow(src_class->GetPrimitiveType(),
-                                    dst_class->GetPrimitiveType(),
-                                    boxed_value,
-                                    unboxed_value)) {
-    ThrowClassCastException(src_class, dst_class);
-    return false;
-  }
-  return true;
+  PRIMITIVES_LIST(CASE_PRIMITIVE)
+#undef CASE_PRIMITIVE
+  return false;
 }
 
 inline bool IsReferenceType(Primitive::Type type) {
@@ -195,6 +97,78 @@
 
 }  // namespace
 
+bool IsParameterTypeConvertible(ObjPtr<mirror::Class> from, ObjPtr<mirror::Class> to)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  // This function returns true if there's any conceivable conversion
+  // between |from| and |to|. This method is expected to be used
+  // to decide whether a WrongMethodTypeException should be raised. The
+  // decision logic follows the documentation for MethodType.asType().
+  if (from == to) {
+    return true;
+  }
+
+  Primitive::Type from_primitive = from->GetPrimitiveType();
+  Primitive::Type to_primitive = to->GetPrimitiveType();
+  DCHECK(from_primitive != Primitive::Type::kPrimVoid);
+  DCHECK(to_primitive != Primitive::Type::kPrimVoid);
+
+  // If |to| and |from| are references.
+  if (IsReferenceType(from_primitive) && IsReferenceType(to_primitive)) {
+    // Assignability is determined during parameter conversion when
+    // invoking the associated method handle.
+    return true;
+  }
+
+  // If |to| and |from| are primitives and a widening conversion exists.
+  if (Primitive::IsWidenable(from_primitive, to_primitive)) {
+    return true;
+  }
+
+  // If |to| is a reference and |from| is a primitive, then boxing conversion.
+  if (IsReferenceType(to_primitive) && IsPrimitiveType(from_primitive)) {
+    return to->IsAssignableFrom(GetBoxedPrimitiveClass(from_primitive));
+  }
+
+  // If |from| is a reference and |to| is a primitive, then unboxing conversion.
+  if (IsPrimitiveType(to_primitive) && IsReferenceType(from_primitive)) {
+    if (from->DescriptorEquals("Ljava/lang/Object;")) {
+      // Object might be converted into a primitive during unboxing.
+      return true;
+    } else if (Primitive::IsNumericType(to_primitive) &&
+               from->DescriptorEquals("Ljava/lang/Number;")) {
+      // Number might be unboxed into any of the number primitive types.
+      return true;
+    }
+    Primitive::Type unboxed_type;
+    if (GetUnboxedPrimitiveType(from, &unboxed_type)) {
+      if (unboxed_type == to_primitive) {
+        // Straightforward unboxing conversion such as Boolean => boolean.
+        return true;
+      } else {
+        // Check if widening operations for numeric primitives would work,
+        // such as Byte => byte => long.
+        return Primitive::IsWidenable(unboxed_type, to_primitive);
+      }
+    }
+  }
+
+  return false;
+}
+
+bool IsReturnTypeConvertible(ObjPtr<mirror::Class> from, ObjPtr<mirror::Class> to)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  if (to->GetPrimitiveType() == Primitive::Type::kPrimVoid) {
+    // Result will be ignored.
+    return true;
+  } else if (from->GetPrimitiveType() == Primitive::Type::kPrimVoid) {
+    // Returned value will be 0 / null.
+    return true;
+  } else {
+    // Otherwise apply usual parameter conversion rules.
+    return IsParameterTypeConvertible(from, to);
+  }
+}
+
 bool ConvertJValueCommon(
     Handle<mirror::MethodType> callsite_type,
     Handle<mirror::MethodType> callee_type,
@@ -209,14 +183,23 @@
   const Primitive::Type from_type = from->GetPrimitiveType();
   const Primitive::Type to_type = to->GetPrimitiveType();
 
+  // Put incoming value into |src_value| and set return value to 0.
+  // Errors and conversions from void require the return value to be 0.
+  const JValue src_value(*value);
+  value->SetJ(0);
+
+  // A conversion from void sets the result to zero.
+  if (from_type == Primitive::kPrimVoid) {
+    return true;
+  }
+
   // This method must be called only when the types don't match.
   DCHECK(from != to);
 
   if (IsPrimitiveType(from_type) && IsPrimitiveType(to_type)) {
     // The source and target types are both primitives.
-    if (UNLIKELY(!ConvertPrimitiveValueNoThrow(from_type, to_type, *value, value))) {
+    if (UNLIKELY(!ConvertPrimitiveValueNoThrow(from_type, to_type, src_value, value))) {
       ThrowWrongMethodTypeException(callee_type.Get(), callsite_type.Get());
-      value->SetJ(0);
       return false;
     }
     return true;
@@ -229,12 +212,7 @@
     // in mirror::Class::IsAssignable().
     StackHandleScope<2> hs(Thread::Current());
     Handle<mirror::Class> h_to(hs.NewHandle(to));
-    Handle<mirror::Object> h_obj(hs.NewHandle(value->GetL()));
-
-    // |value| will now be the result value, invalidate its existing value
-    // as |h_obj| now owns it.
-    value->SetJ(0);
-
+    Handle<mirror::Object> h_obj(hs.NewHandle(src_value.GetL()));
     if (h_obj.Get() != nullptr && !to->IsAssignableFrom(h_obj->GetClass())) {
       ThrowClassCastException(h_to.Get(), h_obj->GetClass());
       return false;
@@ -243,10 +221,6 @@
     return true;
   } else if (IsReferenceType(to_type)) {
     DCHECK(IsPrimitiveType(from_type));
-    // Playing it safe with StackHandleScope here with regards to
-    // GetUnboxedPrimitiveType() and GetBoxedPrimitiveClass().
-    StackHandleScope<1> hs(Thread::Current());
-    Handle<mirror::Class> h_to(hs.NewHandle(to));
     // The source type is a primitive and the target type is a reference, so we must box.
     // The target type may be a super class of the boxed source type; for example,
     // if the source type is int, its boxed type is java.lang.Integer, and the target
@@ -254,29 +228,26 @@
     Primitive::Type type;
     if (!GetUnboxedPrimitiveType(to, &type)) {
       ObjPtr<mirror::Class> boxed_from_class = GetBoxedPrimitiveClass(from_type);
-      if (boxed_from_class->IsSubClass(h_to.Get())) {
+      if (boxed_from_class->IsSubClass(to)) {
         type = from_type;
       } else {
-        value->SetJ(0);
         ThrowWrongMethodTypeException(callee_type.Get(), callsite_type.Get());
         return false;
       }
     }
 
     if (UNLIKELY(from_type != type)) {
-      value->SetJ(0);
       ThrowWrongMethodTypeException(callee_type.Get(), callsite_type.Get());
       return false;
     }
 
-    if (!ConvertPrimitiveValueNoThrow(from_type, type, *value, value)) {
-      value->SetJ(0);
+    if (!ConvertPrimitiveValueNoThrow(from_type, type, src_value, value)) {
       ThrowWrongMethodTypeException(callee_type.Get(), callsite_type.Get());
       return false;
     }
 
     // Then perform the actual boxing, and then set the reference.
-    ObjPtr<mirror::Object> boxed = BoxPrimitive(type, *value);
+    ObjPtr<mirror::Object> boxed = BoxPrimitive(type, src_value);
     value->SetL(boxed.Ptr());
     return true;
   } else {
@@ -284,33 +255,27 @@
     DCHECK(IsReferenceType(from_type));
     DCHECK(IsPrimitiveType(to_type));
 
-    // Use StackHandleScope to protect |from|, |to|, and the reference
-    // in |value| from heap re-arrangements that could be triggered
-    // ahead of unboxing step.
-    StackHandleScope<3> hs(Thread::Current());
-    Handle<mirror::Class> h_to(hs.NewHandle(to));
-    Handle<mirror::Class> h_from(hs.NewHandle(from));
-    Handle<mirror::Object> h_obj(hs.NewHandle(value->GetL()));
+    ObjPtr<mirror::Object> from_obj(src_value.GetL());
+    if (UNLIKELY(from_obj == nullptr)) {
+      ThrowNullPointerException(
+          StringPrintf("Expected to unbox a '%s' primitive type but was returned null",
+                       from->PrettyDescriptor().c_str()).c_str());
+      return false;
+    }
 
-    // |value| will now be the result value, invalidate its existing value
-    // as |h_obj| now owns it.
-    value->SetJ(0);
-
-    // Check source type is a boxed primitive or has a boxed primitive super-class.
-    ObjPtr<mirror::Class> boxed_to_class = GetBoxedPrimitiveClass(to_type);
-    if (!IsSubClassOfBoxedPrimitive(h_from) && !boxed_to_class->IsSubClass(h_from.Get())) {
+    Primitive::Type unboxed_type;
+    JValue unboxed_value;
+    if (UNLIKELY(!GetUnboxedTypeAndValue(from_obj, &unboxed_type, &unboxed_value))) {
       ThrowWrongMethodTypeException(callee_type.Get(), callsite_type.Get());
       return false;
     }
 
-    if (h_obj.Get() == nullptr) {
-      ThrowNullPointerException(
-        StringPrintf("Expected to unbox a '%s' but instance was null",
-                     h_from->PrettyDescriptor().c_str()).c_str());
+    if (UNLIKELY(!ConvertPrimitiveValueNoThrow(unboxed_type, to_type, unboxed_value, value))) {
+      ThrowClassCastException(from, to);
       return false;
     }
 
-    return UnboxPrimitiveForMethodHandles(h_obj.Get(), h_to.Get(), value);
+    return true;
   }
 }
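
PRIMITIVES_LIST above is an X-macro: the table of primitives is defined once, and each call site supplies a short V(...) expansion that is stamped out per row, which is how the hand-written if/else chains and switch ladders were collapsed. A self-contained sketch of the pattern (illustration only; the list and all names are hypothetical):

    #include <cstdio>
    // Illustration only: an X-macro list; V receives one row per item.
    #define COLOR_LIST(V) \
      V(kRed, "red")      \
      V(kGreen, "green")  \
      V(kBlue, "blue")
    enum Color {
    #define DEFINE_ENUMERATOR(name, unused) name,
      COLOR_LIST(DEFINE_ENUMERATOR)
    #undef DEFINE_ENUMERATOR
    };
    const char* ColorName(Color c) {
      switch (c) {
    #define NAME_CASE(name, str) case name: return str;
        COLOR_LIST(NAME_CASE)
    #undef NAME_CASE
      }
      return "?";  // unreachable for valid enumerators
    }
    int main() { std::printf("%s\n", ColorName(kGreen)); }  // prints "green"
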
 
diff --git a/runtime/method_handles.h b/runtime/method_handles.h
index 0cc69f2..54c772a 100644
--- a/runtime/method_handles.h
+++ b/runtime/method_handles.h
@@ -59,6 +59,16 @@
   return handle_kind <= kLastInvokeKind;
 }
 
+// Returns true if there is a possible conversion from |from| to |to|
+// for a MethodHandle parameter.
+bool IsParameterTypeConvertible(ObjPtr<mirror::Class> from,
+                                ObjPtr<mirror::Class> to);
+
+// Returns true if there is a possible conversion from |from| to |to|
+// for the return type of a MethodHandle.
+bool IsReturnTypeConvertible(ObjPtr<mirror::Class> from,
+                             ObjPtr<mirror::Class> to);
+
 // Performs a conversion from type |from| to a distinct type |to| as
 // part of conversion of |caller_type| to |callee_type|. The value to
 // be converted is in |value|. Returns true on success and updates
diff --git a/runtime/mirror/method_type.cc b/runtime/mirror/method_type.cc
index 9b0f872..5d77a16 100644
--- a/runtime/mirror/method_type.cc
+++ b/runtime/mirror/method_type.cc
@@ -18,6 +18,7 @@
 
 #include "class-inl.h"
 #include "gc_root-inl.h"
+#include "method_handles.h"
 
 namespace art {
 namespace mirror {
@@ -43,25 +44,44 @@
   return mt.Get();
 }
 
-bool MethodType::IsExactMatch(mirror::MethodType* other) REQUIRES_SHARED(Locks::mutator_lock_) {
-  if (GetRType() != other->GetRType()) {
-    return false;
-  }
-
+bool MethodType::IsExactMatch(mirror::MethodType* target) REQUIRES_SHARED(Locks::mutator_lock_) {
   mirror::ObjectArray<Class>* const p_types = GetPTypes();
   const int32_t params_length = p_types->GetLength();
 
-  mirror::ObjectArray<Class>* const other_p_types = other->GetPTypes();
-  if (params_length != other_p_types->GetLength()) {
+  mirror::ObjectArray<Class>* const target_p_types = target->GetPTypes();
+  if (params_length != target_p_types->GetLength()) {
+    return false;
+  }
+  for (int32_t i = 0; i < params_length; ++i) {
+    if (p_types->GetWithoutChecks(i) != target_p_types->GetWithoutChecks(i)) {
+      return false;
+    }
+  }
+  return GetRType() == target->GetRType();
+}
+
+bool MethodType::IsConvertible(mirror::MethodType* target) REQUIRES_SHARED(Locks::mutator_lock_) {
+  mirror::ObjectArray<Class>* const p_types = GetPTypes();
+  const int32_t params_length = p_types->GetLength();
+
+  mirror::ObjectArray<Class>* const target_p_types = target->GetPTypes();
+  if (params_length != target_p_types->GetLength()) {
+    return false;
+  }
+
+  // Perform the return type check before invoking the method handle; otherwise,
+  // side effects from the invocation may be observable before the
+  // WrongMethodTypeException is raised.
+  if (!IsReturnTypeConvertible(target->GetRType(), GetRType())) {
     return false;
   }
 
   for (int32_t i = 0; i < params_length; ++i) {
-    if (p_types->GetWithoutChecks(i) != other_p_types->GetWithoutChecks(i)) {
+    if (!IsParameterTypeConvertible(p_types->GetWithoutChecks(i),
+                                    target_p_types->GetWithoutChecks(i))) {
       return false;
     }
   }
-
   return true;
 }
 
diff --git a/runtime/mirror/method_type.h b/runtime/mirror/method_type.h
index fa700b6..9a98143 100644
--- a/runtime/mirror/method_type.h
+++ b/runtime/mirror/method_type.h
@@ -52,9 +52,13 @@
   static void ResetClass() REQUIRES_SHARED(Locks::mutator_lock_);
   static void VisitRoots(RootVisitor* visitor) REQUIRES_SHARED(Locks::mutator_lock_);
 
-  // Returns true iff. |other| is an exact match for this method type, i.e
+  // Returns true iff. |this| is an exact match for method type |target|, i.e.
   // iff. they have the same return types and parameter types.
-  bool IsExactMatch(mirror::MethodType* other) REQUIRES_SHARED(Locks::mutator_lock_);
+  bool IsExactMatch(mirror::MethodType* target) REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Returns true iff. |this| can be converted to match |target| method type, i.e.
+  // iff. they have convertible return types and parameter types.
+  bool IsConvertible(mirror::MethodType* target) REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Returns the pretty descriptor for this method type, suitable for display in
   // exception messages and the like.
diff --git a/runtime/mirror/string-inl.h b/runtime/mirror/string-inl.h
index d94b39f..6870fda 100644
--- a/runtime/mirror/string-inl.h
+++ b/runtime/mirror/string-inl.h
@@ -106,9 +106,7 @@
     string->SetCount(count_);
     const uint16_t* const src = src_array_->GetData() + offset_;
     const int32_t length = String::GetLengthFromCount(count_);
-    bool compressible = kUseStringCompression && String::GetCompressionFlagFromCount(count_);
-    DCHECK(!compressible || kUseStringCompression);
-    if (compressible) {
+    if (kUseStringCompression && String::IsCompressed(count_)) {
       for (int i = 0; i < length; ++i) {
         string->GetValueCompressed()[i] = static_cast<uint8_t>(src[i]);
       }
@@ -126,7 +124,8 @@
 // Sets string count and value in the allocation code path to ensure it is guarded by a CAS.
 class SetStringCountAndValueVisitorFromString {
  public:
-  SetStringCountAndValueVisitorFromString(int32_t count, Handle<String> src_string,
+  SetStringCountAndValueVisitorFromString(int32_t count,
+                                          Handle<String> src_string,
                                           int32_t offset) :
     count_(count), src_string_(src_string), offset_(offset) {
   }
@@ -137,8 +136,7 @@
     ObjPtr<String> string = ObjPtr<String>::DownCast(obj);
     string->SetCount(count_);
     const int32_t length = String::GetLengthFromCount(count_);
-    bool compressible = kUseStringCompression && String::GetCompressionFlagFromCount(count_);
-    DCHECK(!compressible || kUseStringCompression);
+    bool compressible = kUseStringCompression && String::IsCompressed(count_);
     if (src_string_->IsCompressed()) {
       const uint8_t* const src = src_string_->GetValueCompressed() + offset_;
       memcpy(string->GetValueCompressed(), src, length * sizeof(uint8_t));
@@ -209,8 +207,7 @@
                              gc::AllocatorType allocator_type,
                              const PreFenceVisitor& pre_fence_visitor) {
   constexpr size_t header_size = sizeof(String);
-  const bool compressible = kUseStringCompression &&
-                            String::GetCompressionFlagFromCount(utf16_length_with_flag);
+  const bool compressible = kUseStringCompression && String::IsCompressed(utf16_length_with_flag);
   const size_t block_size = (compressible) ? sizeof(uint8_t) : sizeof(uint16_t);
   size_t length = String::GetLengthFromCount(utf16_length_with_flag);
   static_assert(sizeof(length) <= sizeof(size_t),
@@ -245,7 +242,7 @@
 
 template <bool kIsInstrumented>
 inline String* String::AllocEmptyString(Thread* self, gc::AllocatorType allocator_type) {
-  const int32_t length_with_flag = String::GetFlaggedCount(0);
+  const int32_t length_with_flag = String::GetFlaggedCount(0, /* compressible */ true);
   SetStringCountVisitor visitor(length_with_flag);
   return Alloc<kIsInstrumented>(self, length_with_flag, allocator_type, visitor);
 }
@@ -255,10 +252,9 @@
                                           Handle<ByteArray> array, int32_t offset,
                                           int32_t high_byte, gc::AllocatorType allocator_type) {
   const uint8_t* const src = reinterpret_cast<uint8_t*>(array->GetData()) + offset;
-  const bool compressible = kUseStringCompression && String::AllASCII<uint8_t>(src, byte_length)
-                                            && (high_byte == 0);
-  const int32_t length_with_flag = (compressible) ? String::GetFlaggedCount(byte_length)
-                                                  : byte_length;
+  const bool compressible =
+      kUseStringCompression && String::AllASCII<uint8_t>(src, byte_length) && (high_byte == 0);
+  const int32_t length_with_flag = String::GetFlaggedCount(byte_length, compressible);
   SetStringCountAndBytesVisitor visitor(length_with_flag, array, offset, high_byte << 8);
   String* string = Alloc<kIsInstrumented>(self, length_with_flag, allocator_type, visitor);
   return string;
@@ -272,7 +268,7 @@
   DCHECK_GE(array->GetLength(), count);
   const bool compressible = kUseStringCompression &&
                             String::AllASCII<uint16_t>(array->GetData() + offset, count);
-  const int32_t length_with_flag = (compressible) ? String::GetFlaggedCount(count) : count;
+  const int32_t length_with_flag = String::GetFlaggedCount(count, compressible);
   SetStringCountAndValueVisitorFromCharArray visitor(length_with_flag, array, offset);
   String* new_string = Alloc<kIsInstrumented>(self, length_with_flag, allocator_type, visitor);
   return new_string;
@@ -284,8 +280,7 @@
   const bool compressible = kUseStringCompression &&
       ((string->IsCompressed()) ? true : String::AllASCII<uint16_t>(string->GetValue() + offset,
                                                                     string_length));
-  const int32_t length_with_flag = (compressible) ? String::GetFlaggedCount(string_length)
-                                                  : string_length;
+  const int32_t length_with_flag = String::GetFlaggedCount(string_length, compressible);
   SetStringCountAndValueVisitorFromString visitor(length_with_flag, string, offset);
   String* new_string = Alloc<kIsInstrumented>(self, length_with_flag, allocator_type, visitor);
   return new_string;
@@ -311,7 +306,7 @@
 template<typename MemoryType>
 bool String::AllASCII(const MemoryType* const chars, const int length) {
   for (int i = 0; i < length; ++i) {
-    if (chars[i] > 0x80) {
+    if (chars[i] >= 0x80) {
       return false;
     }
   }
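
The strict `>` misclassified U+0080: ASCII is the range [0x00, 0x7F], so a value equal to 0x80 must also reject compression. A tiny demonstration of the fixed predicate (illustration only; IsAscii is a hypothetical stand-in for the per-character test above):

    #include <cassert>
    #include <cstdint>
    // Illustration only: the off-by-one fixed above. ASCII covers [0x00, 0x7F].
    bool IsAscii(uint16_t c) { return c < 0x80; }
    int main() {
      assert(IsAscii(0x7F));     // DEL, still ASCII
      assert(!IsAscii(0x80));    // U+0080: the old `> 0x80` check wrongly accepted this
      assert(!IsAscii(0x0444));  // Cyrillic, clearly not ASCII
    }
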
diff --git a/runtime/mirror/string.cc b/runtime/mirror/string.cc
index 4336aa1..0ab0bd6 100644
--- a/runtime/mirror/string.cc
+++ b/runtime/mirror/string.cc
@@ -95,8 +95,7 @@
   gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
   const bool compressible = kUseStringCompression &&
       (string->IsCompressed() && string2->IsCompressed());
-  const int32_t length_with_flag = compressible ? String::GetFlaggedCount(length + length2)
-                                                : (length + length2);
+  const int32_t length_with_flag = String::GetFlaggedCount(length + length2, compressible);
 
   SetStringCountVisitor visitor(length_with_flag);
   ObjPtr<String> new_string = Alloc<true>(self, length_with_flag, allocator_type, visitor);
@@ -132,8 +131,7 @@
   gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
   const bool compressible = kUseStringCompression &&
                             String::AllASCII<uint16_t>(utf16_data_in, utf16_length);
-  int32_t length_with_flag = (compressible) ? String::GetFlaggedCount(utf16_length)
-                                            : utf16_length;
+  int32_t length_with_flag = String::GetFlaggedCount(utf16_length, compressible);
   SetStringCountVisitor visitor(length_with_flag);
   ObjPtr<String> string = Alloc<true>(self, length_with_flag, allocator_type, visitor);
   if (UNLIKELY(string == nullptr)) {
@@ -169,8 +167,7 @@
                                       int32_t utf8_length) {
   gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
   const bool compressible = kUseStringCompression && (utf16_length == utf8_length);
-  const int32_t utf16_length_with_flag = (compressible) ? String::GetFlaggedCount(utf16_length)
-                                                        : utf16_length;
+  const int32_t utf16_length_with_flag = String::GetFlaggedCount(utf16_length, compressible);
   SetStringCountVisitor visitor(utf16_length_with_flag);
   ObjPtr<String> string = Alloc<true>(self, utf16_length_with_flag, allocator_type, visitor);
   if (UNLIKELY(string == nullptr)) {
diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h
index 6ce75bc..95b6c3e 100644
--- a/runtime/mirror/string.h
+++ b/runtime/mirror/string.h
@@ -33,6 +33,10 @@
 
 // String Compression
 static constexpr bool kUseStringCompression = false;
+enum class StringCompressionFlag : uint32_t {
+    kCompressed = 0u,
+    kUncompressed = 1u
+};
 
 // C++ mirror of java.lang.String
 class MANAGED String FINAL : public Object {
@@ -78,7 +82,6 @@
   void SetCount(int32_t new_count) REQUIRES_SHARED(Locks::mutator_lock_) {
     // Count is invariant so use non-transactional mode. Also disable check as we may run inside
     // a transaction.
-    DCHECK_LE(0, (new_count & INT32_MAX));
     SetField32<false, false>(OFFSET_OF_OBJECT_MEMBER(String, count_), new_count);
   }
 
@@ -175,7 +178,7 @@
 
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   bool IsCompressed() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return kUseStringCompression && GetCompressionFlagFromCount(GetCount());
+    return kUseStringCompression && IsCompressed(GetCount());
   }
 
   bool IsValueNull() REQUIRES_SHARED(Locks::mutator_lock_);
@@ -183,16 +186,27 @@
   template<typename MemoryType>
   static bool AllASCII(const MemoryType* const chars, const int length);
 
-  ALWAYS_INLINE static bool GetCompressionFlagFromCount(const int32_t count) {
-    return kUseStringCompression && ((count & (1u << 31)) != 0);
+  ALWAYS_INLINE static bool IsCompressed(int32_t count) {
+    return GetCompressionFlagFromCount(count) == StringCompressionFlag::kCompressed;
   }
 
-  ALWAYS_INLINE static int32_t GetLengthFromCount(const int32_t count) {
-    return kUseStringCompression ? (count & INT32_MAX) : count;
+  ALWAYS_INLINE static StringCompressionFlag GetCompressionFlagFromCount(int32_t count) {
+    return kUseStringCompression
+        ? static_cast<StringCompressionFlag>(static_cast<uint32_t>(count) & 1u)
+        : StringCompressionFlag::kUncompressed;
   }
 
-  ALWAYS_INLINE static int32_t GetFlaggedCount(const int32_t count) {
-    return kUseStringCompression ? (count | (1u << 31)) : count;
+  ALWAYS_INLINE static int32_t GetLengthFromCount(int32_t count) {
+    return kUseStringCompression ? static_cast<int32_t>(static_cast<uint32_t>(count) >> 1) : count;
+  }
+
+  ALWAYS_INLINE static int32_t GetFlaggedCount(int32_t length, bool compressible) {
+    return kUseStringCompression
+        ? static_cast<int32_t>((static_cast<uint32_t>(length) << 1) |
+                               (static_cast<uint32_t>(compressible
+                                                          ? StringCompressionFlag::kCompressed
+                                                          : StringCompressionFlag::kUncompressed)))
+        : length;
   }
 
   static Class* GetJavaLangString() REQUIRES_SHARED(Locks::mutator_lock_) {
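
The new encoding stores the compression flag in bit 0 (with kCompressed = 0) and the length in the upper 31 bits, replacing the old sign-bit flag. A handy consequence, relied on by the jni_internal_test change earlier in this patch, is that an empty compressed string has count == 0. A round-trip sketch (illustration only; Encode, DecodeLength, and IsCompressedCount are hypothetical stand-ins for the accessors above):

    #include <cassert>
    #include <cstdint>
    // Illustration only: round-tripping the count encoding defined above.
    int32_t Encode(int32_t length, bool compressed) {
      return static_cast<int32_t>((static_cast<uint32_t>(length) << 1) |
                                  (compressed ? 0u : 1u));
    }
    int32_t DecodeLength(int32_t count) {
      return static_cast<int32_t>(static_cast<uint32_t>(count) >> 1);  // logical shift
    }
    bool IsCompressedCount(int32_t count) { return (count & 1) == 0; }
    int main() {
      assert(Encode(0, /* compressed */ true) == 0);   // empty compressed string
      int32_t c = Encode(5, /* compressed */ false);   // (5 << 1) | 1 == 11
      assert(DecodeLength(c) == 5 && !IsCompressedCount(c));
    }
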
diff --git a/runtime/oat_file_assistant.cc b/runtime/oat_file_assistant.cc
index 0679360..ff00451 100644
--- a/runtime/oat_file_assistant.cc
+++ b/runtime/oat_file_assistant.cc
@@ -595,7 +595,7 @@
 
   std::vector<std::string> args;
   args.push_back("--dex-file=" + dex_location_);
-  args.push_back("--output-vdex-fd=" + std::to_string(vdex_file->Fd()));
+  args.push_back("--vdex-fd=" + std::to_string(vdex_file->Fd()));
   args.push_back("--oat-fd=" + std::to_string(oat_file->Fd()));
   args.push_back("--oat-location=" + oat_file_name);
 
diff --git a/runtime/openjdkjvmti/OpenjdkJvmTi.cc b/runtime/openjdkjvmti/OpenjdkJvmTi.cc
index 9d4b554..6480843 100644
--- a/runtime/openjdkjvmti/OpenjdkJvmTi.cc
+++ b/runtime/openjdkjvmti/OpenjdkJvmTi.cc
@@ -61,20 +61,38 @@
 EventHandler gEventHandler;
 ObjectTagTable gObjectTagTable(&gEventHandler);
 
+#define ENSURE_NON_NULL(n)      \
+  do {                          \
+    if ((n) == nullptr) {       \
+      return ERR(NULL_POINTER); \
+    }                           \
+  } while (false)
+
 class JvmtiFunctions {
  private:
   static bool IsValidEnv(jvmtiEnv* env) {
     return env != nullptr;
   }
 
+#define ENSURE_VALID_ENV(env)          \
+  do {                                 \
+    if (!IsValidEnv(env)) {            \
+      return ERR(INVALID_ENVIRONMENT); \
+    }                                  \
+  } while (false)
+
+#define ENSURE_HAS_CAP(env, cap) \
+  do { \
+    ENSURE_VALID_ENV(env); \
+    if (ArtJvmTiEnv::AsArtJvmTiEnv(env)->capabilities.cap != 1) { \
+      return ERR(MUST_POSSESS_CAPABILITY); \
+    } \
+  } while (false)
+
  public:
   static jvmtiError Allocate(jvmtiEnv* env, jlong size, unsigned char** mem_ptr) {
-    if (!IsValidEnv(env)) {
-      return ERR(INVALID_ENVIRONMENT);
-    }
-    if (mem_ptr == nullptr) {
-      return ERR(NULL_POINTER);
-    }
+    ENSURE_VALID_ENV(env);
+    ENSURE_NON_NULL(mem_ptr);
     if (size < 0) {
       return ERR(ILLEGAL_ARGUMENT);
     } else if (size == 0) {
@@ -86,9 +104,7 @@
   }
 
   static jvmtiError Deallocate(jvmtiEnv* env, unsigned char* mem) {
-    if (!IsValidEnv(env)) {
-      return ERR(INVALID_ENVIRONMENT);
-    }
+    ENSURE_VALID_ENV(env);
     if (mem != nullptr) {
       free(mem);
     }
@@ -158,7 +174,7 @@
   static jvmtiError GetCurrentContendedMonitor(jvmtiEnv* env,
                                                jthread thread,
                                                jobject* monitor_ptr) {
-  return ERR(NOT_IMPLEMENTED);
+    return ERR(NOT_IMPLEMENTED);
   }
 
   static jvmtiError RunAgentThread(jvmtiEnv* env,
@@ -291,14 +307,13 @@
                                        jclass klass,
                                        const jvmtiHeapCallbacks* callbacks,
                                        const void* user_data) {
+    ENSURE_HAS_CAP(env, can_tag_objects);
     HeapUtil heap_util(&gObjectTagTable);
     return heap_util.IterateThroughHeap(env, heap_filter, klass, callbacks, user_data);
   }
 
   static jvmtiError GetTag(jvmtiEnv* env, jobject object, jlong* tag_ptr) {
-    if (object == nullptr || tag_ptr == nullptr) {
-      return ERR(NULL_POINTER);
-    }
+    ENSURE_HAS_CAP(env, can_tag_objects);
 
     JNIEnv* jni_env = GetJniEnv(env);
     if (jni_env == nullptr) {
@@ -315,6 +330,8 @@
   }
 
   static jvmtiError SetTag(jvmtiEnv* env, jobject object, jlong tag) {
+    ENSURE_HAS_CAP(env, can_tag_objects);
+
     if (object == nullptr) {
       return ERR(NULL_POINTER);
     }
@@ -337,6 +354,8 @@
                                        jint* count_ptr,
                                        jobject** object_result_ptr,
                                        jlong** tag_result_ptr) {
+    ENSURE_HAS_CAP(env, can_tag_objects);
+
     JNIEnv* jni_env = GetJniEnv(env);
     if (jni_env == nullptr) {
       return ERR(INTERNAL);
@@ -765,9 +784,7 @@
   static jvmtiError SetEventCallbacks(jvmtiEnv* env,
                                       const jvmtiEventCallbacks* callbacks,
                                       jint size_of_callbacks) {
-    if (env == nullptr) {
-      return ERR(NULL_POINTER);
-    }
+    ENSURE_VALID_ENV(env);
     if (size_of_callbacks < 0) {
       return ERR(ILLEGAL_ARGUMENT);
     }
@@ -794,6 +811,8 @@
                                              jvmtiEvent event_type,
                                              jthread event_thread,
                                              ...) {
+    ENSURE_VALID_ENV(env);
+    // TODO: Check for capabilities.
     art::Thread* art_thread = nullptr;
     if (event_thread != nullptr) {
       // TODO: Need non-aborting call here, to return JVMTI_ERROR_INVALID_THREAD.
@@ -834,20 +853,136 @@
   }
 
   static jvmtiError GetPotentialCapabilities(jvmtiEnv* env, jvmtiCapabilities* capabilities_ptr) {
-    return ERR(NOT_IMPLEMENTED);
+    ENSURE_VALID_ENV(env);
+    ENSURE_NON_NULL(capabilities_ptr);
+    *capabilities_ptr = kPotentialCapabilities;
+    return OK;
   }
 
   static jvmtiError AddCapabilities(jvmtiEnv* env, const jvmtiCapabilities* capabilities_ptr) {
-    return ERR(NOT_IMPLEMENTED);
+    ENSURE_VALID_ENV(env);
+    ENSURE_NON_NULL(capabilities_ptr);
+    ArtJvmTiEnv* art_env = static_cast<ArtJvmTiEnv*>(env);
+    jvmtiError ret = OK;
+#define ADD_CAPABILITY(e) \
+    do { \
+      if (capabilities_ptr->e == 1) { \
+        if (kPotentialCapabilities.e == 1) { \
+          art_env->capabilities.e = 1;\
+        } else { \
+          ret = ERR(NOT_AVAILABLE); \
+        } \
+      } \
+    } while (false)
+
+    ADD_CAPABILITY(can_tag_objects);
+    ADD_CAPABILITY(can_generate_field_modification_events);
+    ADD_CAPABILITY(can_generate_field_access_events);
+    ADD_CAPABILITY(can_get_bytecodes);
+    ADD_CAPABILITY(can_get_synthetic_attribute);
+    ADD_CAPABILITY(can_get_owned_monitor_info);
+    ADD_CAPABILITY(can_get_current_contended_monitor);
+    ADD_CAPABILITY(can_get_monitor_info);
+    ADD_CAPABILITY(can_pop_frame);
+    ADD_CAPABILITY(can_redefine_classes);
+    ADD_CAPABILITY(can_signal_thread);
+    ADD_CAPABILITY(can_get_source_file_name);
+    ADD_CAPABILITY(can_get_line_numbers);
+    ADD_CAPABILITY(can_get_source_debug_extension);
+    ADD_CAPABILITY(can_access_local_variables);
+    ADD_CAPABILITY(can_maintain_original_method_order);
+    ADD_CAPABILITY(can_generate_single_step_events);
+    ADD_CAPABILITY(can_generate_exception_events);
+    ADD_CAPABILITY(can_generate_frame_pop_events);
+    ADD_CAPABILITY(can_generate_breakpoint_events);
+    ADD_CAPABILITY(can_suspend);
+    ADD_CAPABILITY(can_redefine_any_class);
+    ADD_CAPABILITY(can_get_current_thread_cpu_time);
+    ADD_CAPABILITY(can_get_thread_cpu_time);
+    ADD_CAPABILITY(can_generate_method_entry_events);
+    ADD_CAPABILITY(can_generate_method_exit_events);
+    ADD_CAPABILITY(can_generate_all_class_hook_events);
+    ADD_CAPABILITY(can_generate_compiled_method_load_events);
+    ADD_CAPABILITY(can_generate_monitor_events);
+    ADD_CAPABILITY(can_generate_vm_object_alloc_events);
+    ADD_CAPABILITY(can_generate_native_method_bind_events);
+    ADD_CAPABILITY(can_generate_garbage_collection_events);
+    ADD_CAPABILITY(can_generate_object_free_events);
+    ADD_CAPABILITY(can_force_early_return);
+    ADD_CAPABILITY(can_get_owned_monitor_stack_depth_info);
+    ADD_CAPABILITY(can_get_constant_pool);
+    ADD_CAPABILITY(can_set_native_method_prefix);
+    ADD_CAPABILITY(can_retransform_classes);
+    ADD_CAPABILITY(can_retransform_any_class);
+    ADD_CAPABILITY(can_generate_resource_exhaustion_heap_events);
+    ADD_CAPABILITY(can_generate_resource_exhaustion_threads_events);
+#undef ADD_CAPABILITY
+    return ret;
   }
 
   static jvmtiError RelinquishCapabilities(jvmtiEnv* env,
                                            const jvmtiCapabilities* capabilities_ptr) {
-    return ERR(NOT_IMPLEMENTED);
+    ENSURE_VALID_ENV(env);
+    ENSURE_NON_NULL(capabilities_ptr);
+    ArtJvmTiEnv* art_env = reinterpret_cast<ArtJvmTiEnv*>(env);
+#define DEL_CAPABILITY(e) \
+    do { \
+      if (capabilities_ptr->e == 1) { \
+        art_env->capabilities.e = 0;\
+      } \
+    } while (false)
+
+    DEL_CAPABILITY(can_tag_objects);
+    DEL_CAPABILITY(can_generate_field_modification_events);
+    DEL_CAPABILITY(can_generate_field_access_events);
+    DEL_CAPABILITY(can_get_bytecodes);
+    DEL_CAPABILITY(can_get_synthetic_attribute);
+    DEL_CAPABILITY(can_get_owned_monitor_info);
+    DEL_CAPABILITY(can_get_current_contended_monitor);
+    DEL_CAPABILITY(can_get_monitor_info);
+    DEL_CAPABILITY(can_pop_frame);
+    DEL_CAPABILITY(can_redefine_classes);
+    DEL_CAPABILITY(can_signal_thread);
+    DEL_CAPABILITY(can_get_source_file_name);
+    DEL_CAPABILITY(can_get_line_numbers);
+    DEL_CAPABILITY(can_get_source_debug_extension);
+    DEL_CAPABILITY(can_access_local_variables);
+    DEL_CAPABILITY(can_maintain_original_method_order);
+    DEL_CAPABILITY(can_generate_single_step_events);
+    DEL_CAPABILITY(can_generate_exception_events);
+    DEL_CAPABILITY(can_generate_frame_pop_events);
+    DEL_CAPABILITY(can_generate_breakpoint_events);
+    DEL_CAPABILITY(can_suspend);
+    DEL_CAPABILITY(can_redefine_any_class);
+    DEL_CAPABILITY(can_get_current_thread_cpu_time);
+    DEL_CAPABILITY(can_get_thread_cpu_time);
+    DEL_CAPABILITY(can_generate_method_entry_events);
+    DEL_CAPABILITY(can_generate_method_exit_events);
+    DEL_CAPABILITY(can_generate_all_class_hook_events);
+    DEL_CAPABILITY(can_generate_compiled_method_load_events);
+    DEL_CAPABILITY(can_generate_monitor_events);
+    DEL_CAPABILITY(can_generate_vm_object_alloc_events);
+    DEL_CAPABILITY(can_generate_native_method_bind_events);
+    DEL_CAPABILITY(can_generate_garbage_collection_events);
+    DEL_CAPABILITY(can_generate_object_free_events);
+    DEL_CAPABILITY(can_force_early_return);
+    DEL_CAPABILITY(can_get_owned_monitor_stack_depth_info);
+    DEL_CAPABILITY(can_get_constant_pool);
+    DEL_CAPABILITY(can_set_native_method_prefix);
+    DEL_CAPABILITY(can_retransform_classes);
+    DEL_CAPABILITY(can_retransform_any_class);
+    DEL_CAPABILITY(can_generate_resource_exhaustion_heap_events);
+    DEL_CAPABILITY(can_generate_resource_exhaustion_threads_events);
+#undef DEL_CAPABILITY
+    return OK;
   }
 
   static jvmtiError GetCapabilities(jvmtiEnv* env, jvmtiCapabilities* capabilities_ptr) {
-    return ERR(NOT_IMPLEMENTED);
+    ENSURE_VALID_ENV(env);
+    ENSURE_NON_NULL(capabilities_ptr);
+    ArtJvmTiEnv* artenv = reinterpret_cast<ArtJvmTiEnv*>(env);
+    *capabilities_ptr = artenv->capabilities;
+    return OK;
   }
 
   static jvmtiError GetCurrentThreadCpuTimerInfo(jvmtiEnv* env, jvmtiTimerInfo* info_ptr) {
@@ -903,44 +1038,31 @@
   }
 
   static jvmtiError DisposeEnvironment(jvmtiEnv* env) {
-    if (!IsValidEnv(env)) {
-      return ERR(INVALID_ENVIRONMENT);
-    }
+    ENSURE_VALID_ENV(env);
     delete env;
     return OK;
   }
 
   static jvmtiError SetEnvironmentLocalStorage(jvmtiEnv* env, const void* data) {
-    if (!IsValidEnv(env)) {
-      return ERR(INVALID_ENVIRONMENT);
-    }
+    ENSURE_VALID_ENV(env);
     reinterpret_cast<ArtJvmTiEnv*>(env)->local_data = const_cast<void*>(data);
     return OK;
   }
 
   static jvmtiError GetEnvironmentLocalStorage(jvmtiEnv* env, void** data_ptr) {
-    if (!IsValidEnv(env)) {
-      return ERR(INVALID_ENVIRONMENT);
-    }
+    ENSURE_VALID_ENV(env);
     *data_ptr = reinterpret_cast<ArtJvmTiEnv*>(env)->local_data;
     return OK;
   }
 
   static jvmtiError GetVersionNumber(jvmtiEnv* env, jint* version_ptr) {
-    if (!IsValidEnv(env)) {
-      return ERR(INVALID_ENVIRONMENT);
-    }
+    ENSURE_VALID_ENV(env);
     *version_ptr = JVMTI_VERSION;
     return OK;
   }
 
   static jvmtiError GetErrorName(jvmtiEnv* env, jvmtiError error,  char** name_ptr) {
-    if (!IsValidEnv(env)) {
-      return ERR(INVALID_ENVIRONMENT);
-    }
-    if (name_ptr == nullptr) {
-      return ERR(NULL_POINTER);
-    }
+    ENSURE_NON_NULL(name_ptr);
     switch (error) {
 #define ERROR_CASE(e) case (JVMTI_ERROR_ ## e) : do { \
           *name_ptr = const_cast<char*>("JVMTI_ERROR_"#e); \
diff --git a/runtime/openjdkjvmti/art_jvmti.h b/runtime/openjdkjvmti/art_jvmti.h
index a321124..48b29a3 100644
--- a/runtime/openjdkjvmti/art_jvmti.h
+++ b/runtime/openjdkjvmti/art_jvmti.h
@@ -52,11 +52,13 @@
 struct ArtJvmTiEnv : public jvmtiEnv {
   art::JavaVMExt* art_vm;
   void* local_data;
+  jvmtiCapabilities capabilities;
 
   EventMasks event_masks;
   std::unique_ptr<jvmtiEventCallbacks> event_callbacks;
 
-  explicit ArtJvmTiEnv(art::JavaVMExt* runtime) : art_vm(runtime), local_data(nullptr) {
+  explicit ArtJvmTiEnv(art::JavaVMExt* runtime)
+      : art_vm(runtime), local_data(nullptr), capabilities() {
     functions = &gJvmtiInterface;
   }
 
@@ -121,6 +123,50 @@
   return ret;
 }
 
+const jvmtiCapabilities kPotentialCapabilities = {
+    .can_tag_objects                                 = 1,
+    .can_generate_field_modification_events          = 0,
+    .can_generate_field_access_events                = 0,
+    .can_get_bytecodes                               = 0,
+    .can_get_synthetic_attribute                     = 0,
+    .can_get_owned_monitor_info                      = 0,
+    .can_get_current_contended_monitor               = 0,
+    .can_get_monitor_info                            = 0,
+    .can_pop_frame                                   = 0,
+    .can_redefine_classes                            = 0,
+    .can_signal_thread                               = 0,
+    .can_get_source_file_name                        = 0,
+    .can_get_line_numbers                            = 0,
+    .can_get_source_debug_extension                  = 0,
+    .can_access_local_variables                      = 0,
+    .can_maintain_original_method_order              = 0,
+    .can_generate_single_step_events                 = 0,
+    .can_generate_exception_events                   = 0,
+    .can_generate_frame_pop_events                   = 0,
+    .can_generate_breakpoint_events                  = 0,
+    .can_suspend                                     = 0,
+    .can_redefine_any_class                          = 0,
+    .can_get_current_thread_cpu_time                 = 0,
+    .can_get_thread_cpu_time                         = 0,
+    .can_generate_method_entry_events                = 0,
+    .can_generate_method_exit_events                 = 0,
+    .can_generate_all_class_hook_events              = 0,
+    .can_generate_compiled_method_load_events        = 0,
+    .can_generate_monitor_events                     = 0,
+    .can_generate_vm_object_alloc_events             = 0,
+    .can_generate_native_method_bind_events          = 0,
+    .can_generate_garbage_collection_events          = 0,
+    .can_generate_object_free_events                 = 0,
+    .can_force_early_return                          = 0,
+    .can_get_owned_monitor_stack_depth_info          = 0,
+    .can_get_constant_pool                           = 0,
+    .can_set_native_method_prefix                    = 0,
+    .can_retransform_classes                         = 0,
+    .can_retransform_any_class                       = 0,
+    .can_generate_resource_exhaustion_heap_events    = 0,
+    .can_generate_resource_exhaustion_threads_events = 0,
+};
+
 }  // namespace openjdkjvmti
 
 #endif  // ART_RUNTIME_OPENJDKJVMTI_ART_JVMTI_H_
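
With this capability bookkeeping in place, the heap functions above refuse to run unless the agent has acquired can_tag_objects, the only capability kPotentialCapabilities currently offers. A minimal sketch of an agent requesting it (illustration only; these are standard JVMTI calls, with error handling elided):

    #include <cstring>
    #include <jvmti.h>
    // Illustration only: acquire can_tag_objects before using GetTag/SetTag, etc.
    extern "C" JNIEXPORT jint JNICALL Agent_OnLoad(JavaVM* vm, char* options, void* reserved) {
      jvmtiEnv* jvmti = nullptr;
      if (vm->GetEnv(reinterpret_cast<void**>(&jvmti), JVMTI_VERSION_1_2) != JNI_OK) {
        return JNI_ERR;
      }
      jvmtiCapabilities caps;
      std::memset(&caps, 0, sizeof(caps));
      caps.can_tag_objects = 1;  // the one potential capability enabled in this patch
      if (jvmti->AddCapabilities(&caps) != JVMTI_ERROR_NONE) {
        return JNI_ERR;  // anything else currently yields ERR(NOT_AVAILABLE)
      }
      return JNI_OK;
    }
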
diff --git a/runtime/primitive.cc b/runtime/primitive.cc
index d29a060..2380284 100644
--- a/runtime/primitive.cc
+++ b/runtime/primitive.cc
@@ -31,11 +31,35 @@
   "PrimVoid",
 };
 
+static const char* kBoxedDescriptors[] = {
+  "Ljava/lang/Object;",
+  "Ljava/lang/Boolean;",
+  "Ljava/lang/Byte;",
+  "Ljava/lang/Character;",
+  "Ljava/lang/Short;",
+  "Ljava/lang/Integer;",
+  "Ljava/lang/Long;",
+  "Ljava/lang/Float;",
+  "Ljava/lang/Double;",
+  "Ljava/lang/Void;",
+};
+
+#define COUNT_OF(x) (sizeof(x) / sizeof(x[0]))
+
 const char* Primitive::PrettyDescriptor(Primitive::Type type) {
+  static_assert(COUNT_OF(kTypeNames) == static_cast<size_t>(Primitive::kPrimLast) + 1,
+                "Missing element");
   CHECK(Primitive::kPrimNot <= type && type <= Primitive::kPrimVoid) << static_cast<int>(type);
   return kTypeNames[type];
 }
 
+const char* Primitive::BoxedDescriptor(Primitive::Type type) {
+  static_assert(COUNT_OF(kBoxedDescriptors) == static_cast<size_t>(Primitive::kPrimLast) + 1,
+                "Missing element");
+  CHECK(Primitive::kPrimNot <= type && type <= Primitive::kPrimVoid) << static_cast<int>(type);
+  return kBoxedDescriptors[type];
+}
+
 std::ostream& operator<<(std::ostream& os, const Primitive::Type& type) {
   int32_t int_type = static_cast<int32_t>(type);
   if (type >= Primitive::kPrimNot && type <= Primitive::kPrimVoid) {
diff --git a/runtime/primitive.h b/runtime/primitive.h
index 18f45ff..a0edaee 100644
--- a/runtime/primitive.h
+++ b/runtime/primitive.h
@@ -138,6 +138,9 @@
 
   static const char* PrettyDescriptor(Type type);
 
+  // Returns the descriptor corresponding to the boxed type of |type|.
+  static const char* BoxedDescriptor(Type type);
+
   static bool IsFloatingPointType(Type type) {
     return type == kPrimFloat || type == kPrimDouble;
   }
@@ -158,6 +161,35 @@
     }
   }
 
+  // Returns true if |type| is a numeric type.
+  static constexpr bool IsNumericType(Type type) {
+    switch (type) {
+      case Primitive::Type::kPrimNot: return false;
+      case Primitive::Type::kPrimBoolean: return false;
+      case Primitive::Type::kPrimByte: return true;
+      case Primitive::Type::kPrimChar: return false;
+      case Primitive::Type::kPrimShort: return true;
+      case Primitive::Type::kPrimInt: return true;
+      case Primitive::Type::kPrimLong: return true;
+      case Primitive::Type::kPrimFloat: return true;
+      case Primitive::Type::kPrimDouble: return true;
+      case Primitive::Type::kPrimVoid: return false;
+    }
+  }
+
+  // Returns true if it is possible to widen type |from| to type |to|. Both |from| and
+  // |to| should be numeric primitive types.
+  static bool IsWidenable(Type from, Type to) {
+    static_assert(Primitive::Type::kPrimByte < Primitive::Type::kPrimShort, "Bad ordering");
+    static_assert(Primitive::Type::kPrimShort < Primitive::Type::kPrimInt, "Bad ordering");
+    static_assert(Primitive::Type::kPrimInt < Primitive::Type::kPrimLong, "Bad ordering");
+    static_assert(Primitive::Type::kPrimLong < Primitive::Type::kPrimFloat, "Bad ordering");
+    static_assert(Primitive::Type::kPrimFloat < Primitive::Type::kPrimDouble, "Bad ordering");
+    // Widening is only applicable between numeric types, like byte
+    // and int. Non-numeric types, such as boolean, cannot be widened.
+    return IsNumericType(from) && IsNumericType(to) && from <= to;
+  }
+
   static bool IsIntOrLongType(Type type) {
     return type == kPrimInt || type == kPrimLong;
   }
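
IsWidenable leans on the declaration order of Primitive::Type, which the static_asserts pin down: among numeric types, any later type is at least as wide, so a single `<=` comparison suffices. A standalone sketch with a simplified enum mirroring that ordering (illustration only; note that boolean and char are deliberately excluded, as in the patch):

    #include <cassert>
    // Illustration only: simplified mirror of the ordering trick above.
    enum SimpleType { kBoolean, kByte, kChar, kShort, kInt, kLong, kFloat, kDouble };
    bool IsNumeric(SimpleType t) { return t != kBoolean && t != kChar; }
    bool IsWidenable(SimpleType from, SimpleType to) {
      return IsNumeric(from) && IsNumeric(to) && from <= to;
    }
    int main() {
      assert(IsWidenable(kByte, kDouble));   // byte -> double widens
      assert(!IsWidenable(kDouble, kInt));   // narrowing is rejected
      assert(!IsWidenable(kBoolean, kInt));  // boolean is not numeric
      assert(!IsWidenable(kChar, kInt));     // char is excluded here, as in the patch
    }
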
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 8ce9661..23c077c 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -116,6 +116,13 @@
 }
 
 void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints);
+void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_marking);
+
+void Thread::SetIsGcMarkingAndUpdateEntrypoints(bool is_marking) {
+  CHECK(kUseReadBarrier);
+  tls32_.is_gc_marking = is_marking;
+  UpdateReadBarrierEntrypoints(&tlsPtr_.quick_entrypoints, is_marking);
+}
 
 void Thread::InitTlsEntryPoints() {
   // Insert a placeholder so we can easily tell if we call an unimplemented entry point.
diff --git a/runtime/thread.h b/runtime/thread.h
index f3001be..faa77e1 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -845,10 +845,7 @@
     return tls32_.is_gc_marking;
   }
 
-  void SetIsGcMarking(bool is_marking) {
-    CHECK(kUseReadBarrier);
-    tls32_.is_gc_marking = is_marking;
-  }
+  void SetIsGcMarkingAndUpdateEntrypoints(bool is_marking);
 
   bool GetWeakRefAccessEnabled() const {
     CHECK(kUseReadBarrier);
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index eba6666..8a3bb15 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -1242,7 +1242,7 @@
     // Initialize according to the state of the CC collector.
     bool is_gc_marking =
         Runtime::Current()->GetHeap()->ConcurrentCopyingCollector()->IsMarking();
-    self->SetIsGcMarking(is_gc_marking);
+    self->SetIsGcMarkingAndUpdateEntrypoints(is_gc_marking);
     bool weak_ref_access_enabled =
         Runtime::Current()->GetHeap()->ConcurrentCopyingCollector()->IsWeakRefAccessEnabled();
     self->SetWeakRefAccessEnabled(weak_ref_access_enabled);
diff --git a/test/021-string2/src/Main.java b/test/021-string2/src/Main.java
index a848fba..51351e1 100644
--- a/test/021-string2/src/Main.java
+++ b/test/021-string2/src/Main.java
@@ -431,6 +431,22 @@
                 "\u0440\u0440\u0440\u0440\u0440\u0440z\u0440",
                 "\u0440\u0440\u0440\u0440\u0440\u0440\u0440z\u0440",
                 "\u0440\u0440\u0440\u0440\u0440\u0440\u0440\u0440z\u0440",
+                "\u0000",
+                "\u0000\u0000",
+                "\u0000\u0000\u0000",
+                "\u0000\u0000\u0000\u0000",
+                "\u0000\u0000\u0000\u0000\u0000",
+                "\u0000\u0000\u0000\u0000\u0000\u0000",
+                "\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
+                "\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
+                "\u0000z\u0000",
+                "\u0000\u0000z\u0000",
+                "\u0000\u0000\u0000z\u0000",
+                "\u0000\u0000\u0000\u0000z\u0000",
+                "\u0000\u0000\u0000\u0000\u0000z\u0000",
+                "\u0000\u0000\u0000\u0000\u0000\u0000z\u0000",
+                "\u0000\u0000\u0000\u0000\u0000\u0000\u0000z\u0000",
+                "\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000z\u0000",
         };
         String[] suffixes = {
                 "",
@@ -458,30 +474,40 @@
                     String full = p + c + s;
                     int expX = (c.isEmpty() || c.charAt(0) != 'x') ? -1 : p.length();
                     int exp0440 = (c.isEmpty() || c.charAt(0) != '\u0440') ? -1 : p.length();
+                    int exp0000 = (c.isEmpty() || c.charAt(0) != '\u0000') ? -1 : p.length();
                     Assert.assertEquals(expX, $noinline$indexOf(full, 'x'));
                     Assert.assertEquals(exp0440, $noinline$indexOf(full, '\u0440'));
+                    Assert.assertEquals(exp0000, $noinline$indexOf(full, '\u0000'));
                     Assert.assertEquals(expX, $noinline$indexOf(full, 'x', -1));
                     Assert.assertEquals(exp0440, $noinline$indexOf(full, '\u0440', -1));
+                    Assert.assertEquals(exp0000, $noinline$indexOf(full, '\u0000', -1));
                     Assert.assertEquals(-1, $noinline$indexOf(full, 'x', full.length() + 1));
                     Assert.assertEquals(-1, $noinline$indexOf(full, '\u0440', full.length() + 1));
+                    Assert.assertEquals(-1, $noinline$indexOf(full, '\u0000', full.length() + 1));
                     for (int from = 0; from != full.length(); ++from) {
                         final int eX;
                         final int e0440;
+                        final int e0000;
                         if (from <= p.length()) {
                             eX = expX;
                             e0440 = exp0440;
+                            e0000 = exp0000;
                         } else if (from >= p.length() + c.length()) {
                             eX = -1;
                             e0440 = -1;
+                            e0000 = -1;
                         } else if (full.charAt(from) == 'z') {
                             eX = (full.charAt(from + 1) != 'x') ? -1 : from + 1;
                             e0440 = (full.charAt(from + 1) != '\u0440') ? -1 : from + 1;
+                            e0000 = (full.charAt(from + 1) != '\u0000') ? -1 : from + 1;
                         } else {
                             eX = (full.charAt(from) != 'x') ? -1 : from;
                             e0440 = (full.charAt(from) != '\u0440') ? -1 : from;
+                            e0000 = (full.charAt(from) != '\u0000') ? -1 : from;
                         }
                         Assert.assertEquals(eX, $noinline$indexOf(full, 'x', from));
                         Assert.assertEquals(e0440, $noinline$indexOf(full, '\u0440', from));
+                        Assert.assertEquals(e0000, $noinline$indexOf(full, '\u0000', from));
                     }
                 }
             }
diff --git a/test/478-checker-inline-noreturn/expected.txt b/test/478-checker-inline-noreturn/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/478-checker-inline-noreturn/expected.txt
diff --git a/test/478-checker-inline-noreturn/info.txt b/test/478-checker-inline-noreturn/info.txt
new file mode 100644
index 0000000..64f42ed
--- /dev/null
+++ b/test/478-checker-inline-noreturn/info.txt
@@ -0,0 +1,3 @@
+Tests inlining a function with a no-exit loop into a loop. LinearOrder
+computation fails because of the incorrect HLoopInformation that results
+from inlining a loop without an exit.
diff --git a/test/478-checker-inline-noreturn/src/Main.java b/test/478-checker-inline-noreturn/src/Main.java
new file mode 100644
index 0000000..7aaeac0
--- /dev/null
+++ b/test/478-checker-inline-noreturn/src/Main.java
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*
+ * A test that checks that the inliner does not inline functions that contain
+ * a loop with no exit.  This is because the incremental update to
+ * HLoopInformation done by the inliner does not work with the LinearOrder
+ * computation if the inlined function does not always return.
+ */
+
+public class Main {
+
+  public static void assertIntEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  public static int $opt$noinline$Function(int x, int y) {
+    int result;
+    if (x <= y) {
+      result = 42;
+    } else {
+      while (true);
+    }
+    return result;
+  }
+
+  /// CHECK-START: int Main.callerLoop(int, int) inliner (before)
+  /// CHECK:         InvokeStaticOrDirect method_name:Main.$opt$noinline$Function  loop:{{B\d+}}
+
+  /// CHECK-START: int Main.callerLoop(int, int) inliner (after)
+  /// CHECK:         InvokeStaticOrDirect method_name:Main.$opt$noinline$Function  loop:{{B\d+}}
+
+  public static int callerLoop(int max_x, int max_y) {
+    int total = 0;
+    for (int x = 0; x < max_x; ++x) {
+      total += $opt$noinline$Function(x, max_y);
+    }
+    return total;
+  }
+
+  public static void main(String[] args) {
+    assertIntEquals(42, callerLoop(1, 1));
+  }
+}
diff --git a/test/552-checker-sharpening/src/Main.java b/test/552-checker-sharpening/src/Main.java
index 95ecfb5..9e475ab 100644
--- a/test/552-checker-sharpening/src/Main.java
+++ b/test/552-checker-sharpening/src/Main.java
@@ -303,10 +303,6 @@
   /// CHECK-START-MIPS: java.lang.String Main.$noinline$getNonBootImageString() sharpening (after)
   /// CHECK:                LoadString load_kind:BssEntry
 
-  /// CHECK-START-MIPS: java.lang.String Main.$noinline$getNonBootImageString() pc_relative_fixups_mips (after)
-  /// CHECK-DAG:            MipsComputeBaseMethodAddress
-  /// CHECK-DAG:            LoadString load_kind:BssEntry
-
   public static String $noinline$getNonBootImageString() {
     // Prevent inlining to avoid the string comparison being optimized away.
     if (doThrow) { throw new Error(); }
diff --git a/test/902-hello-transformation/transform.cc b/test/902-hello-transformation/transform.cc
index 5b0d219..3369dd4 100644
--- a/test/902-hello-transformation/transform.cc
+++ b/test/902-hello-transformation/transform.cc
@@ -23,6 +23,7 @@
 #include "base/logging.h"
 #include "jni.h"
 #include "openjdkjvmti/jvmti.h"
+#include "ti-agent/common_helper.h"
 #include "ti-agent/common_load.h"
 #include "utils.h"
 
@@ -132,15 +133,13 @@
 jint OnLoad(JavaVM* vm,
             char* options,
             void* reserved ATTRIBUTE_UNUSED) {
-  jvmtiCapabilities caps;
   RuntimeIsJvm = (strcmp("jvm", options) == 0);
   if (vm->GetEnv(reinterpret_cast<void**>(&jvmti_env), JVMTI_VERSION_1_0)) {
     printf("Unable to get jvmti env!\n");
     return 1;
   }
+  SetAllCapabilities(jvmti_env);
   if (IsJVM()) {
-    jvmti_env->GetPotentialCapabilities(&caps);
-    jvmti_env->AddCapabilities(&caps);
     jvmtiEventCallbacks cbs;
     memset(&cbs, 0, sizeof(cbs));
     cbs.ClassFileLoadHook = transformationHook;
diff --git a/test/903-hello-tagging/tagging.cc b/test/903-hello-tagging/tagging.cc
index bed4e5d..1557d45 100644
--- a/test/903-hello-tagging/tagging.cc
+++ b/test/903-hello-tagging/tagging.cc
@@ -28,6 +28,7 @@
 #include "art_method-inl.h"
 #include "base/logging.h"
 #include "openjdkjvmti/jvmti.h"
+#include "ti-agent/common_helper.h"
 #include "ti-agent/common_load.h"
 #include "utils.h"
 
@@ -145,6 +146,7 @@
     printf("Unable to get jvmti env!\n");
     return 1;
   }
+  SetAllCapabilities(jvmti_env);
   return 0;
 }
 
diff --git a/test/904-object-allocation/tracking.cc b/test/904-object-allocation/tracking.cc
index 57bfed5..9261a9f 100644
--- a/test/904-object-allocation/tracking.cc
+++ b/test/904-object-allocation/tracking.cc
@@ -26,6 +26,7 @@
 #include "openjdkjvmti/jvmti.h"
 #include "ScopedLocalRef.h"
 #include "ScopedUtfChars.h"
+#include "ti-agent/common_helper.h"
 #include "ti-agent/common_load.h"
 #include "utils.h"
 
@@ -95,6 +96,7 @@
     return 1;
   }
   jvmti_env->SetEventNotificationMode(JVMTI_ENABLE, JVMTI_EVENT_VM_OBJECT_ALLOC, nullptr);
+  SetAllCapabilities(jvmti_env);
   return 0;
 }
 
diff --git a/test/905-object-free/tracking_free.cc b/test/905-object-free/tracking_free.cc
index b41a914..fc43acc 100644
--- a/test/905-object-free/tracking_free.cc
+++ b/test/905-object-free/tracking_free.cc
@@ -26,6 +26,7 @@
 #include "openjdkjvmti/jvmti.h"
 #include "ScopedLocalRef.h"
 #include "ScopedUtfChars.h"
+#include "ti-agent/common_helper.h"
 #include "ti-agent/common_load.h"
 #include "utils.h"
 
@@ -87,6 +88,7 @@
     printf("Unable to get jvmti env!\n");
     return 1;
   }
+  SetAllCapabilities(jvmti_env);
   return 0;
 }
 
diff --git a/test/906-iterate-heap/iterate_heap.cc b/test/906-iterate-heap/iterate_heap.cc
index ab1d8d8..8dac89d 100644
--- a/test/906-iterate-heap/iterate_heap.cc
+++ b/test/906-iterate-heap/iterate_heap.cc
@@ -25,6 +25,7 @@
 #include "jni.h"
 #include "openjdkjvmti/jvmti.h"
 #include "ScopedPrimitiveArray.h"
+#include "ti-agent/common_helper.h"
 #include "ti-agent/common_load.h"
 
 namespace art {
@@ -180,6 +181,7 @@
     printf("Unable to get jvmti env!\n");
     return 1;
   }
+  SetAllCapabilities(jvmti_env);
   return 0;
 }
 
diff --git a/test/907-get-loaded-classes/get_loaded_classes.cc b/test/907-get-loaded-classes/get_loaded_classes.cc
index ce929a6..afbb774 100644
--- a/test/907-get-loaded-classes/get_loaded_classes.cc
+++ b/test/907-get-loaded-classes/get_loaded_classes.cc
@@ -72,6 +72,7 @@
     printf("Unable to get jvmti env!\n");
     return 1;
   }
+  SetAllCapabilities(jvmti_env);
   return 0;
 }
 
diff --git a/test/908-gc-start-finish/gc_callbacks.cc b/test/908-gc-start-finish/gc_callbacks.cc
index d546513..771d1ad 100644
--- a/test/908-gc-start-finish/gc_callbacks.cc
+++ b/test/908-gc-start-finish/gc_callbacks.cc
@@ -22,6 +22,7 @@
 #include "base/macros.h"
 #include "jni.h"
 #include "openjdkjvmti/jvmti.h"
+#include "ti-agent/common_helper.h"
 #include "ti-agent/common_load.h"
 
 namespace art {
@@ -98,6 +99,7 @@
     printf("Unable to get jvmti env!\n");
     return 1;
   }
+  SetAllCapabilities(jvmti_env);
   return 0;
 }
 
diff --git a/test/910-methods/methods.cc b/test/910-methods/methods.cc
index 005cba6..8f0850b 100644
--- a/test/910-methods/methods.cc
+++ b/test/910-methods/methods.cc
@@ -109,6 +109,7 @@
     printf("Unable to get jvmti env!\n");
     return 1;
   }
+  SetAllCapabilities(jvmti_env);
   return 0;
 }
 
diff --git a/test/911-get-stack-trace/stack_trace.cc b/test/911-get-stack-trace/stack_trace.cc
index a30416d..e7d9380 100644
--- a/test/911-get-stack-trace/stack_trace.cc
+++ b/test/911-get-stack-trace/stack_trace.cc
@@ -87,6 +87,7 @@
     printf("Unable to get jvmti env!\n");
     return 1;
   }
+  SetAllCapabilities(jvmti_env);
   return 0;
 }
 
diff --git a/test/912-classes/classes.cc b/test/912-classes/classes.cc
index fbf3259..838a92a 100644
--- a/test/912-classes/classes.cc
+++ b/test/912-classes/classes.cc
@@ -69,6 +69,7 @@
     printf("Unable to get jvmti env!\n");
     return 1;
   }
+  SetAllCapabilities(jvmti_env);
   return 0;
 }
 
diff --git a/test/913-heaps/expected.txt b/test/913-heaps/expected.txt
index dc6e67d..d1ddbae 100644
--- a/test/913-heaps/expected.txt
+++ b/test/913-heaps/expected.txt
@@ -3,96 +3,90 @@
 root@root --(stack-local)--> 1@1000 [size=16, length=-1]
 root@root --(stack-local)--> 3000@0 [size=132, length=-1]
 root@root --(thread)--> 3000@0 [size=132, length=-1]
-1@1000 --(class)--> 1000@0 [size=123, length=-1]
-1@1000 --(field@8)--> 2@1000 [size=16, length=-1]
-1@1000 --(field@12)--> 3@1001 [size=24, length=-1]
 0@0 --(array-element@0)--> 1@1000 [size=16, length=-1]
+1001@0 --(superclass)--> 1000@0 [size=123, length=-1]
+1002@0 --(interface)--> 2001@0 [size=132, length=-1]
+1002@0 --(superclass)--> 1001@0 [size=123, length=-1]
+1@1000 --(class)--> 1000@0 [size=123, length=-1]
+1@1000 --(field@12)--> 3@1001 [size=24, length=-1]
+1@1000 --(field@8)--> 2@1000 [size=16, length=-1]
+2001@0 --(interface)--> 2000@0 [size=132, length=-1]
 2@1000 --(class)--> 1000@0 [size=123, length=-1]
 3@1001 --(class)--> 1001@0 [size=123, length=-1]
 3@1001 --(field@16)--> 4@1000 [size=16, length=-1]
 3@1001 --(field@20)--> 5@1002 [size=32, length=-1]
-1001@0 --(superclass)--> 1000@0 [size=123, length=-1]
 4@1000 --(class)--> 1000@0 [size=123, length=-1]
 5@1002 --(class)--> 1002@0 [size=123, length=-1]
 5@1002 --(field@24)--> 6@1000 [size=16, length=-1]
 5@1002 --(field@28)--> 1@1000 [size=16, length=-1]
-1002@0 --(superclass)--> 1001@0 [size=123, length=-1]
-1002@0 --(interface)--> 2001@0 [size=132, length=-1]
 6@1000 --(class)--> 1000@0 [size=123, length=-1]
-2001@0 --(interface)--> 2000@0 [size=132, length=-1]
 ---
 root@root --(stack-local)--> 1@1000 [size=16, length=-1]
-root@root --(stack-local)--> 1@1000 [size=16, length=-1]
-root@root --(stack-local)--> 1@1000 [size=16, length=-1]
 root@root --(stack-local)--> 2@1000 [size=16, length=-1]
 root@root --(stack-local)--> 3000@0 [size=132, length=-1]
 root@root --(thread)--> 2@1000 [size=16, length=-1]
 root@root --(thread)--> 3000@0 [size=132, length=-1]
-2@1000 --(class)--> 1000@0 [size=123, length=-1]
+0@0 --(array-element@0)--> 1@1000 [size=16, length=-1]
+1001@0 --(superclass)--> 1000@0 [size=123, length=-1]
+1002@0 --(interface)--> 2001@0 [size=132, length=-1]
+1002@0 --(superclass)--> 1001@0 [size=123, length=-1]
 1@1000 --(class)--> 1000@0 [size=123, length=-1]
-1@1000 --(field@8)--> 2@1000 [size=16, length=-1]
 1@1000 --(field@12)--> 3@1001 [size=24, length=-1]
+1@1000 --(field@8)--> 2@1000 [size=16, length=-1]
+2001@0 --(interface)--> 2000@0 [size=132, length=-1]
+2@1000 --(class)--> 1000@0 [size=123, length=-1]
 3@1001 --(class)--> 1001@0 [size=123, length=-1]
 3@1001 --(field@16)--> 4@1000 [size=16, length=-1]
 3@1001 --(field@20)--> 5@1002 [size=32, length=-1]
-0@0 --(array-element@0)--> 1@1000 [size=16, length=-1]
-1001@0 --(superclass)--> 1000@0 [size=123, length=-1]
 4@1000 --(class)--> 1000@0 [size=123, length=-1]
 5@1002 --(class)--> 1002@0 [size=123, length=-1]
 5@1002 --(field@24)--> 6@1000 [size=16, length=-1]
 5@1002 --(field@28)--> 1@1000 [size=16, length=-1]
-1002@0 --(superclass)--> 1001@0 [size=123, length=-1]
-1002@0 --(interface)--> 2001@0 [size=132, length=-1]
 6@1000 --(class)--> 1000@0 [size=123, length=-1]
-2001@0 --(interface)--> 2000@0 [size=132, length=-1]
 ---
 root@root --(jni-global)--> 1@1000 [size=16, length=-1]
 root@root --(jni-local)--> 1@1000 [size=16, length=-1]
 root@root --(stack-local)--> 1@1000 [size=16, length=-1]
-root@root --(stack-local)--> 1@1000 [size=16, length=-1]
 root@root --(thread)--> 1@1000 [size=16, length=-1]
 root@root --(thread)--> 3000@0 [size=132, length=-1]
+1001@0 --(superclass)--> 1000@0 [size=123, length=-1]
+1002@0 --(interface)--> 2001@0 [size=132, length=-1]
+1002@0 --(superclass)--> 1001@0 [size=123, length=-1]
 1@1000 --(class)--> 1000@0 [size=123, length=-1]
-1@1000 --(field@8)--> 2@1000 [size=16, length=-1]
 1@1000 --(field@12)--> 3@1001 [size=24, length=-1]
+1@1000 --(field@8)--> 2@1000 [size=16, length=-1]
+2001@0 --(interface)--> 2000@0 [size=132, length=-1]
 2@1000 --(class)--> 1000@0 [size=123, length=-1]
 3@1001 --(class)--> 1001@0 [size=123, length=-1]
 3@1001 --(field@16)--> 4@1000 [size=16, length=-1]
 3@1001 --(field@20)--> 5@1002 [size=32, length=-1]
-1001@0 --(superclass)--> 1000@0 [size=123, length=-1]
 4@1000 --(class)--> 1000@0 [size=123, length=-1]
 5@1002 --(class)--> 1002@0 [size=123, length=-1]
 5@1002 --(field@24)--> 6@1000 [size=16, length=-1]
 5@1002 --(field@28)--> 1@1000 [size=16, length=-1]
-1002@0 --(superclass)--> 1001@0 [size=123, length=-1]
-1002@0 --(interface)--> 2001@0 [size=132, length=-1]
 6@1000 --(class)--> 1000@0 [size=123, length=-1]
-2001@0 --(interface)--> 2000@0 [size=132, length=-1]
 ---
 root@root --(jni-global)--> 1@1000 [size=16, length=-1]
 root@root --(jni-local)--> 1@1000 [size=16, length=-1]
 root@root --(stack-local)--> 1@1000 [size=16, length=-1]
-root@root --(stack-local)--> 1@1000 [size=16, length=-1]
-root@root --(stack-local)--> 1@1000 [size=16, length=-1]
-root@root --(stack-local)--> 1@1000 [size=16, length=-1]
 root@root --(stack-local)--> 2@1000 [size=16, length=-1]
 root@root --(thread)--> 1@1000 [size=16, length=-1]
 root@root --(thread)--> 2@1000 [size=16, length=-1]
 root@root --(thread)--> 3000@0 [size=132, length=-1]
+1001@0 --(superclass)--> 1000@0 [size=123, length=-1]
+1002@0 --(interface)--> 2001@0 [size=132, length=-1]
+1002@0 --(superclass)--> 1001@0 [size=123, length=-1]
 1@1000 --(class)--> 1000@0 [size=123, length=-1]
-1@1000 --(field@8)--> 2@1000 [size=16, length=-1]
 1@1000 --(field@12)--> 3@1001 [size=24, length=-1]
+1@1000 --(field@8)--> 2@1000 [size=16, length=-1]
+2001@0 --(interface)--> 2000@0 [size=132, length=-1]
 2@1000 --(class)--> 1000@0 [size=123, length=-1]
 3@1001 --(class)--> 1001@0 [size=123, length=-1]
 3@1001 --(field@16)--> 4@1000 [size=16, length=-1]
 3@1001 --(field@20)--> 5@1002 [size=32, length=-1]
-1001@0 --(superclass)--> 1000@0 [size=123, length=-1]
 4@1000 --(class)--> 1000@0 [size=123, length=-1]
 5@1002 --(class)--> 1002@0 [size=123, length=-1]
 5@1002 --(field@24)--> 6@1000 [size=16, length=-1]
 5@1002 --(field@28)--> 1@1000 [size=16, length=-1]
-1002@0 --(superclass)--> 1001@0 [size=123, length=-1]
-1002@0 --(interface)--> 2001@0 [size=132, length=-1]
 6@1000 --(class)--> 1000@0 [size=123, length=-1]
-2001@0 --(interface)--> 2000@0 [size=132, length=-1]
 ---
diff --git a/test/913-heaps/heaps.cc b/test/913-heaps/heaps.cc
index d74026c..4087abd 100644
--- a/test/913-heaps/heaps.cc
+++ b/test/913-heaps/heaps.cc
@@ -25,8 +25,12 @@
 #include "base/logging.h"
 #include "base/macros.h"
 #include "base/stringprintf.h"
+#include "jit/jit.h"
 #include "jni.h"
 #include "openjdkjvmti/jvmti.h"
+#include "runtime.h"
+#include "thread-inl.h"
+
 #include "ti-agent/common_helper.h"
 #include "ti-agent/common_load.h"
 
@@ -275,8 +279,16 @@
     printf("Unable to get jvmti env!\n");
     return 1;
   }
+  SetAllCapabilities(jvmti_env);
   return 0;
 }
 
+extern "C" JNIEXPORT void JNICALL Java_Main_waitForJitCompilation(JNIEnv*, jclass) {
+  jit::Jit* jit = Runtime::Current()->GetJit();
+  if (jit != nullptr) {
+    jit->WaitForCompilationToFinish(Thread::Current());
+  }
+}
+
 }  // namespace Test913Heaps
 }  // namespace art
diff --git a/test/913-heaps/src/Main.java b/test/913-heaps/src/Main.java
index f463429..fc00ada 100644
--- a/test/913-heaps/src/Main.java
+++ b/test/913-heaps/src/Main.java
@@ -16,6 +16,8 @@
 
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
 
 public class Main {
   public static void main(String[] args) throws Exception {
@@ -56,7 +58,6 @@
     Runtime.getRuntime().gc();
     Runtime.getRuntime().gc();
 
-    tagClasses();
     setTag(Thread.currentThread(), 3000);
 
     {
@@ -77,88 +78,103 @@
   }
 
   private static void doFollowReferencesTestNonRoot(ArrayList<Object> tmpStorage) {
-    A a = createTree();
+    Verifier v = new Verifier();
+    tagClasses(v);
+    A a = createTree(v);
     tmpStorage.add(a);
-    doFollowReferencesTestImpl(null, Integer.MAX_VALUE, -1, null);
-    doFollowReferencesTestImpl(a, Integer.MAX_VALUE, -1, null);
+    v.add("0@0", "1@1000");  // tmpStorage[0] --(array-element)--> a.
+
+    doFollowReferencesTestImpl(null, Integer.MAX_VALUE, -1, null, v, null);
+    doFollowReferencesTestImpl(a.foo, Integer.MAX_VALUE, -1, null, v, "2@1000");
+
     tmpStorage.clear();
   }
 
   private static void doFollowReferencesTestRoot() {
-    A a = createTree();
-    doFollowReferencesTestImpl(null, Integer.MAX_VALUE, -1, a);
-    doFollowReferencesTestImpl(a, Integer.MAX_VALUE, -1, a);
+    Verifier v = new Verifier();
+    tagClasses(v);
+    A a = createTree(v);
+
+    doFollowReferencesTestImpl(null, Integer.MAX_VALUE, -1, a, v, null);
+    doFollowReferencesTestImpl(a.foo, Integer.MAX_VALUE, -1, a, v, "2@1000");
   }
 
   private static void doFollowReferencesTestImpl(A root, int stopAfter, int followSet,
-      Object asRoot) {
+      Object asRoot, Verifier v, String additionalEnabled) {
+    waitForJitCompilation();  // Wait to avoid JIT influence (e.g., JNI globals).
+
     String[] lines =
-        followReferences(0, null, root == null ? null : root.foo, stopAfter, followSet, asRoot);
-    // Note: sort the roots, as stack locals visit order isn't defined, so may depend on compiled
-    //       code. Do not sort non-roots, as the order here needs to be verified (elements are
-    //       finished before a reference is followed). The test setup (and root visit order)
-    //       luckily ensures that this is deterministic.
+        followReferences(0, null, root, stopAfter, followSet, asRoot);
 
-    int i = 0;
-    ArrayList<String> rootLines = new ArrayList<>();
-    while (i < lines.length) {
-      if (lines[i].startsWith("root")) {
-        rootLines.add(lines[i]);
-      } else {
-        break;
-      }
-      i++;
-    }
-    Collections.sort(rootLines);
-    for (String l : rootLines) {
-      System.out.println(l);
-    }
-
-    // Print the non-root lines in order.
-    while (i < lines.length) {
-      System.out.println(lines[i]);
-      i++;
-    }
-
-    System.out.println("---");
+    v.process(lines, additionalEnabled);
 
     // TODO: Test filters.
   }
 
-  private static void tagClasses() {
+  private static void tagClasses(Verifier v) {
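+    // Reference strings use the "tag@classTag" format of the expected output;
+    // classes themselves are printed with class tag 0, so A.class is "1000@0".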
     setTag(A.class, 1000);
+
     setTag(B.class, 1001);
+    v.add("1001@0", "1000@0");  // B.class --(superclass)--> A.class.
+
     setTag(C.class, 1002);
+    v.add("1002@0", "1001@0");  // C.class --(superclass)--> B.class.
+    v.add("1002@0", "2001@0");  // C.class --(interface)--> I2.class.
+
     setTag(I1.class, 2000);
+
     setTag(I2.class, 2001);
+    v.add("2001@0", "2000@0");  // I2.class --(interface)--> I1.class.
   }
 
-  private static A createTree() {
-    A root = new A();
-    setTag(root, 1);
+  private static A createTree(Verifier v) {
+    A aInst = new A();
+    setTag(aInst, 1);
+    String aInstStr = "1@1000";
+    String aClassStr = "1000@0";
+    v.add(aInstStr, aClassStr);  // A --(class)--> A.class.
 
-    A foo = new A();
-    setTag(foo, 2);
-    root.foo = foo;
+    A a2Inst = new A();
+    setTag(a2Inst, 2);
+    aInst.foo = a2Inst;
+    String a2InstStr = "2@1000";
+    v.add(a2InstStr, aClassStr);  // A2 --(class)--> A.class.
+    v.add(aInstStr, a2InstStr);   // A --(field)--> A2.
 
-    B foo2 = new B();
-    setTag(foo2, 3);
-    root.foo2 = foo2;
+    B bInst = new B();
+    setTag(bInst, 3);
+    aInst.foo2 = bInst;
+    String bInstStr = "3@1001";
+    String bClassStr = "1001@0";
+    v.add(bInstStr, bClassStr);  // B --(class)--> B.class.
+    v.add(aInstStr, bInstStr);   // A --(field)--> B.
 
-    A bar = new A();
-    setTag(bar, 4);
-    foo2.bar = bar;
+    A a3Inst = new A();
+    setTag(a3Inst, 4);
+    bInst.bar = a3Inst;
+    String a3InstStr = "4@1000";
+    v.add(a3InstStr, aClassStr);  // A3 --(class)--> A.class.
+    v.add(bInstStr, a3InstStr);   // B --(field)--> A3.
 
-    C bar2 = new C();
-    setTag(bar2, 5);
-    foo2.bar2 = bar2;
+    C cInst = new C();
+    setTag(cInst, 5);
+    bInst.bar2 = cInst;
+    String cInstStr = "5@1000";
+    String cClassStr = "1002@0";
+    v.add(cInstStr, cClassStr);  // C -->(class) --> C.class.
+    v.add(bInstStr, cInstStr);   // B -->(field) --> C.
 
-    A baz = new A();
-    setTag(baz, 6);
-    bar2.baz = baz;
-    bar2.baz2 = root;
+    A a4Inst = new A();
+    setTag(a4Inst, 6);
+    cInst.baz = a4Inst;
+    String a4InstStr = "6@1000";
+    v.add(a4InstStr, aClassStr);  // A4 --(class)--> A.class.
+    v.add(cInstStr, a4InstStr);   // C --(field)--> A4.
 
-    return root;
+    cInst.baz2 = aInst;
+    v.add(cInstStr, aInstStr);  // C --(field)--> A.
+
+    return aInst;
   }
 
   public static class A {
@@ -202,6 +218,165 @@
     }
   }
 
+  public static class Verifier {
+    public static class Node {
+      public String referrer;
+
+      public HashSet<String> referrees = new HashSet<>();
+
+      public Node(String r) {
+        referrer = r;
+      }
+
+      public boolean isRoot() {
+        return referrer.startsWith("root@");
+      }
+    }
+
+    HashMap<String, Node> nodes = new HashMap<>();
+
+    public Verifier() {
+    }
+
+    public void add(String referrer, String referree) {
+      if (!nodes.containsKey(referrer)) {
+        nodes.put(referrer, new Node(referrer));
+      }
+      if (referree != null) {
+        nodes.get(referrer).referrees.add(referree);
+      }
+    }
+
+    public void process(String[] lines, String additionalEnabledReferrer) {
+      // This method isn't optimal. The loops could be merged. However, it's more readable if
+      // the different parts are separated.
+
+      ArrayList<String> rootLines = new ArrayList<>();
+      ArrayList<String> nonRootLines = new ArrayList<>();
+
+      // Check for consecutive chunks of referrers. Also ensure roots come first.
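+      // E.g., the referrer sequence "A A B B A" is rejected: once B starts, A is
+      // marked completed, so the final A line is flagged as non-contiguous.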
+      {
+        String currentHead = null;
+        boolean rootsDone = false;
+        HashSet<String> completedReferrers = new HashSet<>();
+        for (String l : lines) {
+          String referrer = getReferrer(l);
+
+          if (isRoot(referrer)) {
+            if (rootsDone) {
+              System.out.println("ERROR: Late root " + l);
+              print(lines);
+              return;
+            }
+            rootLines.add(l);
+            continue;
+          }
+
+          rootsDone = true;
+
+          if (currentHead == null) {
+            currentHead = referrer;
+          } else {
+            if (!currentHead.equals(referrer)) {
+              completedReferrers.add(currentHead);
+              currentHead = referrer;
+              if (completedReferrers.contains(referrer)) {
+                System.out.println("Non-contiguous referrer " + l);
+                print(lines);
+                return;
+              }
+            }
+          }
+          nonRootLines.add(l);
+        }
+      }
+
+      // Sort (root order is not specified) and print the roots.
+      // TODO: What about extra roots? JNI and the interpreter seem to introduce those (though it
+      //       isn't clear why a debuggable-AoT test doesn't have the same, at least for locals).
+      //       For now, swallow duplicates, and resolve once we have the metadata for the roots.
+      {
+        Collections.sort(rootLines);
+        String lastRoot = null;
+        for (String l : rootLines) {
+          if (lastRoot != null && lastRoot.equals(l)) {
+            continue;
+          }
+          lastRoot = l;
+          System.out.println(l);
+        }
+      }
+
+      // Iterate through the lines, keeping track of which referrers are visited, to ensure the
+      // order is acceptable.
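+      // An object may only act as a referrer after it has been reported as a
+      // referree (or is reachable directly from a root): elements are finished
+      // before their references are followed.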
+      HashSet<String> enabled = new HashSet<>();
+      if (additionalEnabledReferrer != null) {
+        enabled.add(additionalEnabledReferrer);
+      }
+      // Always add "0@0".
+      enabled.add("0@0");
+
+      for (String l : lines) {
+        String referrer = getReferrer(l);
+        String referree = getReferree(l);
+        if (isRoot(referrer)) {
+          // For a root src, just enable the referree.
+          enabled.add(referree);
+        } else {
+          // Check that the referrer is enabled (may be visited).
+          if (!enabled.contains(referrer)) {
+            System.out.println("Referrer " + referrer + " not enabled: " + l);
+            print(lines);
+            return;
+          }
+          enabled.add(referree);
+        }
+      }
+
+      // Now just sort the non-root lines and output them.
+      Collections.sort(nonRootLines);
+      for (String l : nonRootLines) {
+        System.out.println(l);
+      }
+
+      System.out.println("---");
+    }
+
+    public static boolean isRoot(String ref) {
+      return ref.startsWith("root@");
+    }
+
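+    // Lines look like "1@1000 --(field@8)--> 2@1000 [size=16, length=-1]": the
+    // referrer is the token before the first space, the referree the token
+    // following "--> ".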
+    private static String getReferrer(String line) {
+      int i = line.indexOf(" --");
+      if (i <= 0) {
+        throw new IllegalArgumentException(line);
+      }
+      int j = line.indexOf(' ');
+      if (i != j) {
+        throw new IllegalArgumentException(line);
+      }
+      return line.substring(0, i);
+    }
+
+    private static String getReferree(String line) {
+      int i = line.indexOf("--> ");
+      if (i <= 0) {
+        throw new IllegalArgumentException(line);
+      }
+      int j = line.indexOf(' ', i + 4);
+      if (j < 0) {
+        throw new IllegalArgumentException(line);
+      }
+      return line.substring(i + 4, j);
+    }
+
+    private static void print(String[] lines) {
+      for (String l : lines) {
+        System.out.println(l);
+      }
+    }
+  }
+
   private static native void setupGcCallback();
   private static native void enableGcTracking(boolean enable);
   private static native int getGcStarts();
@@ -213,4 +388,6 @@
 
   private static native String[] followReferences(int heapFilter, Class<?> klassFilter,
       Object initialObject, int stopAfter, int followSet, Object jniRef);
+
+  private static native void waitForJitCompilation();
 }
diff --git a/test/956-methodhandles/expected.txt b/test/956-methodhandles/expected.txt
index 9ca448c..0a5caa1 100644
--- a/test/956-methodhandles/expected.txt
+++ b/test/956-methodhandles/expected.txt
@@ -5,3 +5,5 @@
 privateRyan_D
 Received exception: Expected (java.lang.String, java.lang.String)java.lang.String but was (java.lang.String, java.lang.Object)void
 String constructors done.
+testReferenceReturnValueConversions done.
+testPrimitiveReturnValueConversions done.
diff --git a/test/956-methodhandles/src/Main.java b/test/956-methodhandles/src/Main.java
index d0c658f..aab9f50 100644
--- a/test/956-methodhandles/src/Main.java
+++ b/test/956-methodhandles/src/Main.java
@@ -69,6 +69,7 @@
     testAsType();
     testConstructors();
     testStringConstructors();
+    testReturnValueConversions();
   }
 
   public static void testfindSpecial_invokeSuperBehaviour() throws Throwable {
@@ -685,6 +686,204 @@
 
     System.out.println("String constructors done.");
   }
+
+  private static void testReferenceReturnValueConversions() throws Throwable {
+    MethodHandle mh = MethodHandles.lookup().findStatic(
+        Float.class, "valueOf", MethodType.methodType(Float.class, String.class));
+
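+    // invokeExact requires the call-site cast to match the handle's return type
+    // (Float) exactly; invoke falls back to asType() conversions such as
+    // unboxing, widening, and up-casts.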
+    // No conversion
+    Float f = (Float) mh.invokeExact("1.375");
+    if (f.floatValue() != 1.375) {
+      fail();
+    }
+    f = (Float) mh.invoke("1.875");
+    if (f.floatValue() != 1.875) {
+      fail();
+    }
+
+    // Bad conversion
+    try {
+      int i = (int) mh.invokeExact("7.77");
+      fail();
+    } catch (WrongMethodTypeException e) {}
+
+    try {
+      int i = (int) mh.invoke("7.77");
+      fail();
+    } catch (WrongMethodTypeException e) {}
+
+    // Assignment to super-class.
+    Number n = (Number) mh.invoke("1.11");
+    try {
+      Number o = (Number) mh.invokeExact("1.11");
+      fail();
+    } catch (WrongMethodTypeException e) {}
+
+    // Assignment to widened boxed primitive class.
+    try {
+      Double u = (Double) mh.invoke("1.11");
+      fail();
+    } catch (ClassCastException e) {}
+
+    try {
+      Double v = (Double) mh.invokeExact("1.11");
+      fail();
+    } catch (WrongMethodTypeException e) {}
+
+    // Unboxed
+    float p = (float) mh.invoke("1.11");
+    if (p != 1.11f) {
+      fail();
+    }
+
+    // Unboxed and widened
+    double d = (double) mh.invoke("2.5");
+    if (d != 2.5) {
+      fail();
+    }
+
+    // Interface
+    Comparable<Float> c = (Comparable<Float>) mh.invoke("2.125");
+    if (c.compareTo(new Float(2.125f)) != 0) {
+      fail();
+    }
+
+    System.out.println("testReferenceReturnValueConversions done.");
+  }
+
+  private static void testPrimitiveReturnValueConversions() throws Throwable {
+    MethodHandle mh = MethodHandles.lookup().findStatic(
+        Math.class, "min", MethodType.methodType(int.class, int.class, int.class));
+
+    final int SMALL = -8972;
+    final int LARGE = 7932529;
+
+    // No conversion
+    if ((int) mh.invokeExact(LARGE, SMALL) != SMALL) {
+      fail();
+    } else if ((int) mh.invoke(LARGE, SMALL) != SMALL) {
+      fail();
+    } else if ((int) mh.invokeExact(SMALL, LARGE) != SMALL) {
+      fail();
+    } else if ((int) mh.invoke(SMALL, LARGE) != SMALL) {
+      fail();
+    }
+
+    // int -> long
+    try {
+      long unused = (long) mh.invokeExact(LARGE, SMALL);  // Inexact return type: must throw.
+      fail();
+    } catch (WrongMethodTypeException e) {}
+
+    if ((long) mh.invoke(LARGE, SMALL) != (long) SMALL) {
+      fail();
+    }
+
+    // int -> short
+    try {
+      short unused = (short) mh.invokeExact(LARGE, SMALL);  // Inexact return type: must throw.
+      fail();
+    } catch (WrongMethodTypeException e) {}
+
+    try {
+      if ((short) mh.invoke(LARGE, SMALL) != (short) SMALL) {
+        fail();
+      }
+    } catch (WrongMethodTypeException e) {}
+
+    // int -> Integer
+    try {
+      Integer unused = (Integer) mh.invokeExact(LARGE, SMALL);  // invokeExact does not box: must throw.
+      fail();
+    } catch (WrongMethodTypeException e) {}
+
+    if (!((Integer) mh.invoke(LARGE, SMALL)).equals(new Integer(SMALL))) {
+      fail();
+    }
+
+    // int -> Long
+    try {
+      Long l = (Long) mh.invokeExact(LARGE, SMALL);
+      fail();
+    } catch (WrongMethodTypeException e) {}
+
+    try {
+      Long l = (Long) mh.invoke(LARGE, SMALL);
+      fail();
+    } catch (WrongMethodTypeException e) {}
+
+    // int -> Short
+    try {
+      Short s = (Short) mh.invokeExact(LARGE, SMALL);
+      fail();
+    } catch (WrongMethodTypeException e) {}
+
+    try {
+      Short s = (Short) mh.invoke(LARGE, SMALL);
+      fail();
+    } catch (WrongMethodTypeException e) {}
+
+    // int -> Process
+    try {
+      Process p = (Process) mh.invokeExact(LARGE, SMALL);
+      fail();
+    } catch (WrongMethodTypeException e) {}
+
+    try {
+      Process p = (Process) mh.invoke(LARGE, SMALL);
+      fail();
+    } catch (WrongMethodTypeException e) {}
+
+    // void -> Object
+    mh = MethodHandles.lookup().findStatic(System.class, "gc", MethodType.methodType(void.class));
+    Object o = (Object) mh.invoke();
+    if (o != null) fail();
+
+    // void -> long
+    long l = (long) mh.invoke();
+    if (l != 0) fail();
+
+    // boolean -> Boolean
+    mh = MethodHandles.lookup().findStatic(Boolean.class, "parseBoolean",
+                                           MethodType.methodType(boolean.class, String.class));
+    Boolean z = (Boolean) mh.invoke("True");
+    if (!z.booleanValue()) fail();
+
+    // boolean -> int
+    try {
+        int dummy = (int) mh.invoke("True");
+        fail();
+    } catch (WrongMethodTypeException e) {}
+
+    // boolean -> Integer
+    try {
+        Integer dummy = (Integer) mh.invoke("True");
+        fail();
+    } catch (WrongMethodTypeException e) {}
+
+    // Boolean -> boolean
+    mh = MethodHandles.lookup().findStatic(Boolean.class, "valueOf",
+                                           MethodType.methodType(Boolean.class, boolean.class));
+    boolean w = (boolean) mh.invoke(false);
+    if (w) fail();
+
+    // Boolean -> int
+    try {
+      int dummy = (int) mh.invoke(false);
+      fail();
+    } catch (WrongMethodTypeException e) {}
+
+    // Boolean -> Integer
+    try {
+        Integer dummy = (Integer) mh.invoke("True");
+        fail();
+    } catch (WrongMethodTypeException e) {}
+
+    System.out.println("testPrimitiveReturnValueConversions done.");
+  }
+
+  public static void testReturnValueConversions() throws Throwable {
+    testReferenceReturnValueConversions();
+    testPrimitiveReturnValueConversions();
+  }
 }
-
-
diff --git a/test/957-methodhandle-transforms/src/Main.java b/test/957-methodhandle-transforms/src/Main.java
index 3c6f119..5806509 100644
--- a/test/957-methodhandle-transforms/src/Main.java
+++ b/test/957-methodhandle-transforms/src/Main.java
@@ -31,6 +31,8 @@
     testIdentity();
     testConstant();
     testBindTo();
+    testFilterReturnValue();
+    testPermuteArguments();
   }
 
   public static void testThrowException() throws Throwable {
@@ -708,6 +710,184 @@
     }
   }
 
+  public static String filterReturnValue_target(int a) {
+    return "ReturnValue" + a;
+  }
+
+  public static boolean filterReturnValue_filter(String value) {
+    return value.indexOf("42") != -1;
+  }
+
+  public static int filterReturnValue_intTarget(String a) {
+    return Integer.parseInt(a);
+  }
+
+  public static int filterReturnValue_intFilter(int b) {
+    return b + 1;
+  }
+
+  public static void filterReturnValue_voidTarget() {
+  }
+
+  public static int filterReturnValue_voidFilter() {
+    return 42;
+  }
+
+  public static void testFilterReturnValue() throws Throwable {
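+    // filterReturnValue(target, filter) calls target, then feeds its return
+    // value through filter; the adapter's return type is the filter's.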
+    // A target that returns a reference.
+    {
+      final MethodHandle target = MethodHandles.lookup().findStatic(Main.class,
+          "filterReturnValue_target", MethodType.methodType(String.class, int.class));
+      final MethodHandle filter = MethodHandles.lookup().findStatic(Main.class,
+          "filterReturnValue_filter", MethodType.methodType(boolean.class, String.class));
+
+      MethodHandle adapter = MethodHandles.filterReturnValue(target, filter);
+
+      boolean value = (boolean) adapter.invoke((int) 42);
+      if (!value) {
+        System.out.println("Unexpected value: " + value);
+      }
+      value = (boolean) adapter.invoke((int) 43);
+      if (value) {
+        System.out.println("Unexpected value: " + value);
+      }
+    }
+
+    // A target that returns a primitive.
+    {
+      final MethodHandle target = MethodHandles.lookup().findStatic(Main.class,
+          "filterReturnValue_intTarget", MethodType.methodType(int.class, String.class));
+      final MethodHandle filter = MethodHandles.lookup().findStatic(Main.class,
+          "filterReturnValue_intFilter", MethodType.methodType(int.class, int.class));
+
+      MethodHandle adapter = MethodHandles.filterReturnValue(target, filter);
+
+      int value = (int) adapter.invoke("56");
+      if (value != 57) {
+        System.out.println("Unexpected value: " + value);
+      }
+    }
+
+    // A target that returns void.
+    {
+      final MethodHandle target = MethodHandles.lookup().findStatic(Main.class,
+          "filterReturnValue_voidTarget", MethodType.methodType(void.class));
+      final MethodHandle filter = MethodHandles.lookup().findStatic(Main.class,
+          "filterReturnValue_voidFilter", MethodType.methodType(int.class));
+
+      MethodHandle adapter = MethodHandles.filterReturnValue(target, filter);
+
+      int value = (int) adapter.invoke();
+      if (value != 42) {
+        System.out.println("Unexpected value: " + value);
+      }
+    }
+  }
+
+  public static void permuteArguments_callee(boolean a, byte b, char c,
+      short d, int e, long f, float g, double h) {
+    if (a == true && b == (byte) 'b' && c == 'c' && d == (short) 56 &&
+        e == 78 && f == (long) 97 && g == 98.0f && f == 97.0) {
+      return;
+    }
+
+    System.out.println("Unexpected arguments: " + a + ", " + b + ", " + c
+        + ", " + d + ", " + e + ", " + f + ", " + g + ", " + h);
+  }
+
+  public static void permuteArguments_boxingCallee(boolean a, Integer b) {
+    if (a && b.intValue() == 42) {
+      return;
+    }
+
+    System.out.println("Unexpected arguments: " + a + ", " + b);
+  }
+
+  public static void testPermuteArguments() throws Throwable {
+    {
+      final MethodHandle target = MethodHandles.lookup().findStatic(
+          Main.class, "permuteArguments_callee",
+          MethodType.methodType(void.class, new Class<?>[] {
+            boolean.class, byte.class, char.class, short.class, int.class,
+            long.class, float.class, double.class }));
+
+      final MethodType newType = MethodType.methodType(void.class, new Class<?>[] {
+        double.class, float.class, long.class, int.class, short.class, char.class,
+        byte.class, boolean.class });
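+      // reorder[i] selects the incoming (newType) argument that feeds the
+      // target's i-th parameter, so { 7, 6, ..., 0 } reverses the argument list.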
+
+      final MethodHandle permutation = MethodHandles.permuteArguments(
+          target, newType, new int[] { 7, 6, 5, 4, 3, 2, 1, 0 });
+
+      permutation.invoke((double) 97.0, (float) 98.0f, (long) 97, 78,
+          (short) 56, 'c', (byte) 'b', (boolean) true);
+
+      // The permutation array was not of the right length.
+      try {
+        MethodHandles.permuteArguments(target, newType,
+            new int[] { 7 });
+        fail();
+      } catch (IllegalArgumentException expected) {
+      }
+
+      // The permutation array has an element that's out of bounds
+      // (there's no argument with idx == 8).
+      try {
+        MethodHandles.permuteArguments(target, newType,
+            new int[] { 8, 6, 5, 4, 3, 2, 1, 0 });
+        fail();
+      } catch (IllegalArgumentException expected) {
+      }
+
+      // The permutation array maps to an incorrect type.
+      try {
+        MethodHandles.permuteArguments(target, newType,
+            new int[] { 7, 7, 5, 4, 3, 2, 1, 0 });
+        fail();
+      } catch (IllegalArgumentException expected) {
+      }
+    }
+
+    // Tests for reference arguments as well as permutations that
+    // repeat arguments.
+    {
+      final MethodHandle target = MethodHandles.lookup().findVirtual(
+          String.class, "concat", MethodType.methodType(String.class, String.class));
+
+      final MethodType newType = MethodType.methodType(String.class, String.class,
+          String.class);
+
+      assertEquals("foobar", (String) target.invoke("foo", "bar"));
+
+      MethodHandle permutation = MethodHandles.permuteArguments(target,
+          newType, new int[] { 1, 0 });
+      assertEquals("barfoo", (String) permutation.invoke("foo", "bar"));
+
+      permutation = MethodHandles.permuteArguments(target, newType, new int[] { 0, 0 });
+      assertEquals("foofoo", (String) permutation.invoke("foo", "bar"));
+
+      permutation = MethodHandles.permuteArguments(target, newType, new int[] { 1, 1 });
+      assertEquals("barbar", (String) permutation.invoke("foo", "bar"));
+    }
+
+    // Tests for boxing and unboxing.
+    {
+      final MethodHandle target = MethodHandles.lookup().findStatic(
+          Main.class, "permuteArguments_boxingCallee",
+          MethodType.methodType(void.class, new Class<?>[] { boolean.class, Integer.class }));
+
+      final MethodType newType = MethodType.methodType(void.class,
+          new Class<?>[] { Integer.class, boolean.class });
+
+      MethodHandle permutation = MethodHandles.permuteArguments(target,
+          newType, new int[] { 1, 0 });
+
+      permutation.invoke(42, true);
+      permutation.invoke(42, Boolean.TRUE);
+      permutation.invoke(Integer.valueOf(42), true);
+      permutation.invoke(Integer.valueOf(42), Boolean.TRUE);
+    }
+  }
+
   public static void fail() {
     System.out.println("FAIL");
     Thread.dumpStack();
@@ -725,5 +905,3 @@
     throw new AssertionError("assertEquals s1: " + s1 + ", s2: " + s2);
   }
 }
-
-
diff --git a/test/959-invoke-polymorphic-accessors/src/Main.java b/test/959-invoke-polymorphic-accessors/src/Main.java
index 824a436..b7ecf8e 100644
--- a/test/959-invoke-polymorphic-accessors/src/Main.java
+++ b/test/959-invoke-polymorphic-accessors/src/Main.java
@@ -780,16 +780,28 @@
             } catch (WrongMethodTypeException e) {}
         }
 
+        /*package*/ static Number getDoubleAsNumber() {
+            return new Double(1.4e77);
+        }
+        /*package*/ static Number getFloatAsNumber() {
+            return new Float(7.77);
+        }
+        /*package*/ static Object getFloatAsObject() {
+            return new Float(-7.77);
+        }
+
         private static void testMemberSetter() throws Throwable {
             ValueHolder valueHolder = new ValueHolder();
             MethodHandles.Lookup lookup = MethodHandles.lookup();
             MethodHandle h0 = lookup.findSetter(ValueHolder.class, "m_f", float.class);
             h0.invoke(valueHolder, 0.22f);
             h0.invoke(valueHolder, new Float(1.11f));
-            Number floatNumber = new Float(0.88f);
+            Number floatNumber = getFloatAsNumber();
             h0.invoke(valueHolder, floatNumber);
             assertTrue(valueHolder.m_f == floatNumber.floatValue());
-
+            Object objNumber = getFloatAsObject();
+            h0.invoke(valueHolder, objNumber);
+            assertTrue(valueHolder.m_f == ((Float) objNumber).floatValue());
             try {
               h0.invoke(valueHolder, (Float)null);
               unreachable();
@@ -799,12 +811,17 @@
             h0.invoke(valueHolder, (short)2);
             h0.invoke(valueHolder, 3);
             h0.invoke(valueHolder, 4l);
+
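+            // The setter returns void; invoke() converts the absent result to
+            // the zero value (or null) of the requested return type.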
+            assertTrue(null == (Object) h0.invoke(valueHolder, 33));
+            assertTrue(0.0f == (float) h0.invoke(valueHolder, 33));
+            assertTrue(0L == (long) h0.invoke(valueHolder, 33));
+
             try {
                 h0.invoke(valueHolder, 0.33);
                 unreachable();
             } catch (WrongMethodTypeException e) {}
             try {
-                Number doubleNumber = new Double(0.89);
+                Number doubleNumber = getDoubleAsNumber();
                 h0.invoke(valueHolder, doubleNumber);
                 unreachable();
             } catch (ClassCastException e) {}
@@ -847,12 +864,17 @@
             h0.invoke((short)2);
             h0.invoke(3);
             h0.invoke(4l);
+
+            assertTrue(null == (Object) h0.invoke(33));
+            assertTrue(0.0f == (float) h0.invoke(33));
+            assertTrue(0L == (long) h0.invoke(33));
+
             try {
                 h0.invoke(0.33);
                 unreachable();
             } catch (WrongMethodTypeException e) {}
             try {
-                Number doubleNumber = new Double(0.89);
+                Number doubleNumber = getDoubleAsNumber();
                 h0.invoke(doubleNumber);
                 unreachable();
             } catch (ClassCastException e) {}
diff --git a/test/Android.arm_vixl.mk b/test/Android.arm_vixl.mk
index e562812..a167847 100644
--- a/test/Android.arm_vixl.mk
+++ b/test/Android.arm_vixl.mk
@@ -16,109 +16,49 @@
 
 # Known broken tests for the ARM VIXL backend.
 TEST_ART_BROKEN_OPTIMIZING_ARM_VIXL_RUN_TESTS := \
-  002-sleep \
   003-omnibus-opcodes \
-  004-InterfaceTest \
-  004-JniTest \
   004-NativeAllocations \
   004-ThreadStress \
-  004-checker-UnsafeTest18 \
-  005-annotations \
-  009-instanceof \
   012-math \
   015-switch \
   021-string2 \
-  022-interface \
-  023-many-interfaces \
-  024-illegal-access \
-  025-access-controller \
   028-array-write \
-  031-class-attributes \
-  035-enum \
   036-finalizer \
   037-inherit \
   042-new-instance \
   044-proxy \
-  045-reflect-array \
-  046-reflect \
-  047-returns \
-  048-reflect-v8 \
   050-sync-test \
   051-thread \
-  052-verifier-fun \
-  053-wait-some \
-  054-uncaught \
-  055-enum-performance \
-  058-enum-order \
-  061-out-of-memory \
-  062-character-encodings \
-  063-process-manager \
-  064-field-access \
-  065-mismatched-implements \
-  066-mismatched-super \
-  067-preemptive-unpark \
   068-classloader \
-  069-field-type \
-  071-dexfile \
   074-gc-thrash \
-  075-verification-error \
   079-phantom \
   080-oom-throw \
   082-inline-execute \
   083-compiler-regressions \
-  086-null-super \
-  087-gc-after-link \
   088-monitor-verification \
-  091-override-package-private-method \
-  093-serialization \
   096-array-copy-concurrent-gc \
-  098-ddmc \
   099-vmdebug \
   103-string-append \
-  104-growth-limit \
   106-exceptions2 \
   107-int-math2 \
-  108-check-cast \
-  109-suspend-check \
-  113-multidex \
   114-ParallelGC \
   120-hashcode \
   121-modifiers \
   122-npe \
   123-compiler-regressions-mt \
   123-inline-execute2 \
-  127-checker-secondarydex \
   129-ThreadGetId \
   132-daemon-locks-shutdown \
-  133-static-invoke-super \
-  134-reg-promotion \
-  135-MirandaDispatch \
-  136-daemon-jni-shutdown \
   137-cfi \
   138-duplicate-classes-check2 \
-  140-field-packing \
   141-class-unload \
-  142-classloader2 \
   144-static-field-sigquit \
-  146-bad-interface \
-  150-loadlibrary \
-  151-OpenFileLimit \
   201-built-in-except-detail-messages \
-  304-method-tracing \
   412-new-array \
-  416-optimizing-arith-not \
   417-optimizing-arith-div \
-  422-instanceof \
   422-type-conversion \
-  423-invoke-interface \
-  424-checkcast \
-  425-invoke-super \
   426-monitor \
   428-optimizing-arith-rem \
-  430-live-register-slow-path \
-  431-type-propagation \
-  432-optimizing-cmp \
-  434-invoke-direct \
   436-rem-float \
   437-inline \
   439-npe \
@@ -126,40 +66,15 @@
   444-checker-nce \
   445-checker-licm \
   447-checker-inliner3 \
-  448-multiple-returns \
   449-checker-bce \
   450-checker-types \
-  452-multiple-returns2 \
-  453-not-byte \
   458-checker-instruct-simplification \
   458-long-to-fpu \
-  460-multiple-returns3 \
-  463-checker-boolean-simplifier \
-  467-regalloc-pair \
-  468-checker-bool-simplif-regression \
-  475-regression-inliner-ids \
-  477-checker-bound-type \
-  478-checker-clinit-check-pruning \
-  483-dce-block \
   485-checker-dce-switch \
-  486-checker-must-do-null-check \
   488-checker-inline-recursive-calls \
-  490-checker-inline \
-  492-checker-inline-invoke-interface \
-  493-checker-inline-invoke-interface \
-  494-checker-instanceof-tests \
-  495-checker-checkcast-tests \
-  496-checker-inlining-class-loader \
-  497-inlining-and-class-loader \
-  500-instanceof \
-  501-regression-packed-switch \
-  504-regression-baseline-entry \
   508-checker-disassembly \
   510-checker-try-catch \
   515-dce-dominator \
-  517-checker-builder-fallthrough \
-  518-null-array-get \
-  519-bound-load-class \
   520-equivalent-phi \
   522-checker-regression-monitor-exit \
   523-checker-can-throw-regression \
@@ -167,43 +82,24 @@
   525-checker-arrays-fields2 \
   526-checker-caller-callee-regs \
   527-checker-array-access-split \
-  528-long-hint \
-  529-checker-unresolved \
   530-checker-loops1 \
   530-checker-loops2 \
   530-checker-lse \
-  530-checker-regression-reftyp-final \
-  530-instanceof-checkcast \
-  534-checker-bce-deoptimization \
+  530-checker-lse2 \
   535-regression-const-val \
   536-checker-intrinsic-optimization \
-  536-checker-needs-access-check \
-  537-checker-inline-and-unverified \
   538-checker-embed-constants \
-  540-checker-rtp-bug \
-  541-regression-inlined-deopt \
-  542-unresolved-access-check \
   543-checker-dce-trycatch \
-  543-env-long-ref \
-  545-tracing-and-jit \
   546-regression-simplify-catch \
   550-checker-multiply-accumulate \
-  550-checker-regression-wide-store \
-  552-checker-sharpening \
-  551-invoke-super \
   552-checker-primitive-typeprop \
-  552-invoke-non-existent-super \
-  553-invoke-super \
-  554-checker-rtp-checkcast \
+  552-checker-sharpening \
   555-UnsafeGetLong-regression \
-  556-invoke-super \
   558-switch \
   560-packed-switch \
   561-divrem \
-  562-bce-preheader \
-  563-checker-fakestring \
+  562-checker-no-intermediate \
   564-checker-negbitwise \
-  566-polymorphic-inlining \
   570-checker-osr \
   570-checker-select \
   573-checker-checkcast-regression \
@@ -211,39 +107,11 @@
   575-checker-string-init-alias \
   580-checker-round \
   584-checker-div-bool \
-  586-checker-null-array-get \
-  587-inline-class-error \
   588-checker-irreducib-lifetime-hole \
-  589-super-imt \
-  592-checker-regression-bool-input \
   594-checker-array-alias \
-  594-invoke-super \
-  594-load-string-regression \
   597-deopt-new-string \
-  600-verifier-fails \
-  601-method-access \
   602-deoptimizeable \
-  603-checker-instanceof \
-  605-new-string-from-bytes \
-  608-checker-unresolved-lse \
-  609-checker-inline-interface \
-  610-arraycopy \
-  612-jit-dex-cache \
   700-LoadArgRegs \
   701-easy-div-rem \
   702-LargeBranchOffset \
   800-smali \
-  802-deoptimization \
-  960-default-smali \
-  963-default-range-smali \
-  965-default-verify \
-  966-default-conflict \
-  967-default-ame \
-  969-iface-super \
-  971-iface-super \
-  972-default-imt-collision \
-  972-iface-super-multidex \
-  973-default-multidex \
-  974-verify-interface-super \
-  975-iface-private \
-  979-invoke-polymorphic-accessors
diff --git a/test/ti-agent/common_helper.h b/test/ti-agent/common_helper.h
index 9aeb98c..84997f3 100644
--- a/test/ti-agent/common_helper.h
+++ b/test/ti-agent/common_helper.h
@@ -53,6 +53,12 @@
   return ret.release();
 }
 
+static void SetAllCapabilities(jvmtiEnv* env) {
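+  // Request every capability the VM can potentially grant; the jvmti test
+  // agents call this from OnLoad right after retrieving the jvmtiEnv.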
+  jvmtiCapabilities caps;
+  env->GetPotentialCapabilities(&caps);
+  env->AddCapabilities(&caps);
+}
+
 }  // namespace art
 
 #endif  // ART_TEST_TI_AGENT_COMMON_HELPER_H_