Optimize stack maps: add fast path for no inline info.

Consumers of CodeInfo can skip significant chunks of work
if they can quickly determine that a method has no inlining.

Store this fact as a flag bit at the start of the code info.
This changes the binary format and adds <0.1% to the oat size.

I added the extra flag field as the simplest solution for now,
although I would like to use it for more things in the future
(e.g. storing the special cases of empty/deduped tables in it).
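
A minimal consumer-side sketch of the intended fast path (the wrapper
function name is hypothetical; the CodeInfo and OatQuickMethodHeader
calls are the same ones used in the entrypoint_utils.cc hunk below):

  // Check the flag bit first; only decode CodeInfo if inlining exists.
  static ArtMethod* FindInlinedCaller(ArtMethod* outer_method,
                                      const OatQuickMethodHeader* current_code,
                                      uintptr_t caller_pc) {
    if (!CodeInfo::HasInlineInfo(current_code->GetOptimizedCodeInfoPtr())) {
      return nullptr;  // Fast path: no InlineInfo table in this method.
    }
    CodeInfo code_info(current_code, CodeInfo::DecodeFlags::InlineInfoOnly);
    uintptr_t native_pc_offset = current_code->NativeQuickPcOffset(caller_pc);
    StackMap stack_map = code_info.GetStackMapForNativePcOffset(native_pc_offset);
    DCHECK(stack_map.IsValid());
    BitTableRange<InlineInfo> inline_infos = code_info.GetInlineInfosOf(stack_map);
    return inline_infos.empty()
        ? nullptr
        : GetResolvedMethod(outer_method, code_info, inline_infos);
  }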

This improves app startup by 0.4% (maps,speed).
PMD on golem seems to get around 15% faster.

Bug: 133257467
Test: ./art/test.py -b --host --64
Change-Id: Ia498a31bafc74b51cc95b8c70cf1da4b0e3d894e
diff --git a/compiler/optimizing/stack_map_stream.cc b/compiler/optimizing/stack_map_stream.cc
index 8c36643..e21e21c 100644
--- a/compiler/optimizing/stack_map_stream.cc
+++ b/compiler/optimizing/stack_map_stream.cc
@@ -184,6 +184,7 @@
   in_inline_info_ = true;
   DCHECK_EQ(expected_num_dex_registers_, current_dex_registers_.size());
 
+  flags_ |= CodeInfo::kHasInlineInfo;
   expected_num_dex_registers_ += num_dex_registers;
 
   BitTableBuilder<InlineInfo>::Entry entry;
@@ -305,6 +306,7 @@
 
   ScopedArenaVector<uint8_t> buffer(allocator_->Adapter(kArenaAllocStackMapStream));
   BitMemoryWriter<ScopedArenaVector<uint8_t>> out(&buffer);
+  out.WriteVarint(flags_);
   out.WriteVarint(packed_frame_size_);
   out.WriteVarint(core_spill_mask_);
   out.WriteVarint(fp_spill_mask_);
diff --git a/compiler/optimizing/stack_map_stream.h b/compiler/optimizing/stack_map_stream.h
index 01c6bf9..20dd32e 100644
--- a/compiler/optimizing/stack_map_stream.h
+++ b/compiler/optimizing/stack_map_stream.h
@@ -99,6 +99,7 @@
 
   ScopedArenaAllocator* allocator_;
   const InstructionSet instruction_set_;
+  uint32_t flags_ = 0;
   uint32_t packed_frame_size_ = 0;
   uint32_t core_spill_mask_ = 0;
   uint32_t fp_spill_mask_ = 0;
diff --git a/runtime/entrypoints/entrypoint_utils.cc b/runtime/entrypoints/entrypoint_utils.cc
index ee2ab56..71196d4 100644
--- a/runtime/entrypoints/entrypoint_utils.cc
+++ b/runtime/entrypoints/entrypoint_utils.cc
@@ -203,13 +203,15 @@
       const OatQuickMethodHeader* current_code = outer_method->GetOatQuickMethodHeader(caller_pc);
       DCHECK(current_code != nullptr);
       DCHECK(current_code->IsOptimized());
-      uintptr_t native_pc_offset = current_code->NativeQuickPcOffset(caller_pc);
-      CodeInfo code_info(current_code, CodeInfo::DecodeFlags::InlineInfoOnly);
-      StackMap stack_map = code_info.GetStackMapForNativePcOffset(native_pc_offset);
-      DCHECK(stack_map.IsValid());
-      BitTableRange<InlineInfo> inline_infos = code_info.GetInlineInfosOf(stack_map);
-      if (!inline_infos.empty()) {
-        caller = GetResolvedMethod(outer_method, code_info, inline_infos);
+      if (CodeInfo::HasInlineInfo(current_code->GetOptimizedCodeInfoPtr())) {
+        uintptr_t native_pc_offset = current_code->NativeQuickPcOffset(caller_pc);
+        CodeInfo code_info(current_code, CodeInfo::DecodeFlags::InlineInfoOnly);
+        StackMap stack_map = code_info.GetStackMapForNativePcOffset(native_pc_offset);
+        DCHECK(stack_map.IsValid());
+        BitTableRange<InlineInfo> inline_infos = code_info.GetInlineInfosOf(stack_map);
+        if (!inline_infos.empty()) {
+          caller = GetResolvedMethod(outer_method, code_info, inline_infos);
+        }
       }
     }
     if (kIsDebugBuild && do_caller_check) {
diff --git a/runtime/oat.h b/runtime/oat.h
index 15059a8..f4b5a6e 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,8 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr std::array<uint8_t, 4> kOatMagic { { 'o', 'a', 't', '\n' } };
-  // Last oat version changed reason: Remove unused trampoline entrypoints.
-  static constexpr std::array<uint8_t, 4> kOatVersion { { '1', '7', '0', '\0' } };
+  // Last oat version changed reason: Optimize stack maps: add fast path for no inline info.
+  static constexpr std::array<uint8_t, 4> kOatVersion { { '1', '7', '1', '\0' } };
 
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
   static constexpr const char* kDebuggableKey = "debuggable";
diff --git a/runtime/stack.cc b/runtime/stack.cc
index 6466efd..ec89d3f 100644
--- a/runtime/stack.cc
+++ b/runtime/stack.cc
@@ -816,8 +816,8 @@
         if ((walk_kind_ == StackWalkKind::kIncludeInlinedFrames)
             && (cur_oat_quick_method_header_ != nullptr)
             && cur_oat_quick_method_header_->IsOptimized()
-            // JNI methods cannot have any inlined frames.
-            && !method->IsNative()) {
+            && !method->IsNative()  // JNI methods cannot have any inlined frames.
+            && CodeInfo::HasInlineInfo(cur_oat_quick_method_header_->GetOptimizedCodeInfoPtr())) {
           DCHECK_NE(cur_quick_frame_pc_, 0u);
           current_code_info_ = CodeInfo(cur_oat_quick_method_header_,
                                         CodeInfo::DecodeFlags::InlineInfoOnly);
diff --git a/runtime/stack_map.cc b/runtime/stack_map.cc
index 6585a3b..eef7378 100644
--- a/runtime/stack_map.cc
+++ b/runtime/stack_map.cc
@@ -47,16 +47,20 @@
 
 void CodeInfo::Decode(const uint8_t* data, DecodeFlags flags) {
   BitMemoryReader reader(data);
-  uint32_t header[4];
+  uint32_t header[5];
   reader.ReadVarints(header);
-  packed_frame_size_ = header[0];
-  core_spill_mask_ = header[1];
-  fp_spill_mask_ = header[2];
-  number_of_dex_registers_ = header[3];
+  flags_ = header[0];
+  packed_frame_size_ = header[1];
+  core_spill_mask_ = header[2];
+  fp_spill_mask_ = header[3];
+  number_of_dex_registers_ = header[4];
   ForEachBitTableField([this, &reader](auto member_pointer) {
     DecodeTable(this->*member_pointer, reader);
   }, flags);
   size_in_bits_ = reader.NumberOfReadBits();
+  if (flags == AllTables) {
+    DCHECK_EQ(HasInlineInfo(data), HasInlineInfo());
+  }
 }
 
 size_t CodeInfo::Deduper::Dedupe(const uint8_t* code_info_data) {
@@ -230,6 +234,7 @@
                     bool verbose,
                     InstructionSet instruction_set) const {
   vios->Stream() << "CodeInfo BitSize=" << size_in_bits_
+    << " Flags:" << flags_
     << " FrameSize:" << packed_frame_size_ * kStackAlignment
     << " CoreSpillMask:" << std::hex << core_spill_mask_
     << " FpSpillMask:" << std::hex << fp_spill_mask_
diff --git a/runtime/stack_map.h b/runtime/stack_map.h
index a2f0019..a971467 100644
--- a/runtime/stack_map.h
+++ b/runtime/stack_map.h
@@ -438,11 +438,15 @@
   // Accumulate code info size statistics into the given Stats tree.
   static void CollectSizeStats(const uint8_t* code_info, /*out*/ Stats* parent);
 
-  ALWAYS_INLINE static QuickMethodFrameInfo DecodeFrameInfo(const uint8_t* data) {
-    BitMemoryReader reader(data);
-    uint32_t args[3];  // packed_frame_size, core_spill_mask, fp_spill_mask.
-    reader.ReadVarints(args);
-    return QuickMethodFrameInfo(args[0] * kStackAlignment, args[1], args[2]);
+  ALWAYS_INLINE static bool HasInlineInfo(const uint8_t* code_info_data) {
+    return (*code_info_data & kHasInlineInfo) != 0;
+  }
+
+  ALWAYS_INLINE static QuickMethodFrameInfo DecodeFrameInfo(const uint8_t* code_info_data) {
+    BitMemoryReader reader(code_info_data);
+    uint32_t header[4];  // flags, packed_frame_size, core_spill_mask, fp_spill_mask.
+    reader.ReadVarints(header);
+    return QuickMethodFrameInfo(header[1] * kStackAlignment, header[2], header[3]);
   }
 
  private:
@@ -460,6 +464,7 @@
   // Invokes the callback with member pointer of each header field.
   template<typename Callback>
   ALWAYS_INLINE static void ForEachHeaderField(Callback callback) {
+    callback(&CodeInfo::flags_);
     callback(&CodeInfo::packed_frame_size_);
     callback(&CodeInfo::core_spill_mask_);
     callback(&CodeInfo::fp_spill_mask_);
@@ -485,6 +490,11 @@
     callback(&CodeInfo::dex_register_catalog_);
   }
 
+  enum Flags {
+    kHasInlineInfo = 1 << 0,
+  };
+
+  uint32_t flags_ = 0;
   uint32_t packed_frame_size_ = 0;  // Frame size in kStackAlignment units.
   uint32_t core_spill_mask_ = 0;
   uint32_t fp_spill_mask_ = 0;