Optimize stack map decoding.

We usually read several consecutive varints.
Add a helper method optimized for that use case
(ideally reading 8 varints from a single load).

This improves app startup by 0.4% (maps, speed).
PMD on golem seems to get around 5% faster.
CodeInfo::Decode on its own is 25% faster.

Bug: 133257467
Test: ./art/test.py -b --host --64
Change-Id: Iaf7e8469ed6397b1d1d4102e409b5731f7229557
diff --git a/runtime/stack_map.cc b/runtime/stack_map.cc
index 62dec15..6585a3b 100644
--- a/runtime/stack_map.cc
+++ b/runtime/stack_map.cc
@@ -35,7 +35,7 @@
 template<typename Accessor>
 ALWAYS_INLINE static bool DecodeTable(BitTable<Accessor>& table, BitMemoryReader& reader) {
   bool is_deduped = reader.ReadBit();
-  if (is_deduped) {
+  if (UNLIKELY(is_deduped)) {
     ssize_t bit_offset = reader.NumberOfReadBits() - reader.ReadVarint();
     BitMemoryReader reader2(reader.data(), bit_offset);  // The offset is negative.
     table.Decode(reader2);
@@ -47,9 +47,12 @@
 
 void CodeInfo::Decode(const uint8_t* data, DecodeFlags flags) {
   BitMemoryReader reader(data);
-  ForEachHeaderField([this, &reader](auto member_pointer) {
-    this->*member_pointer = reader.ReadVarint();
-  });
+  uint32_t header[4];
+  reader.ReadVarints(header);
+  packed_frame_size_ = header[0];
+  core_spill_mask_ = header[1];
+  fp_spill_mask_ = header[2];
+  number_of_dex_registers_ = header[3];
   ForEachBitTableField([this, &reader](auto member_pointer) {
     DecodeTable(this->*member_pointer, reader);
   }, flags);