Optimize stack map decoding.
We usually read several consecutive varints.
Add a helper method optimized for that use case
(ideally reading 8 varints from a single load).
This improves app startup by 0.4% (maps, speed).
PMD on golem seems to get around 5% faster.
CodeInfo::Decode on its own is 25% faster.
Bug: 133257467
Test: ./art/test.py -b --host --64
Change-Id: Iaf7e8469ed6397b1d1d4102e409b5731f7229557
diff --git a/runtime/stack_map.cc b/runtime/stack_map.cc
index 62dec15..6585a3b 100644
--- a/runtime/stack_map.cc
+++ b/runtime/stack_map.cc
@@ -35,7 +35,7 @@
template<typename Accessor>
ALWAYS_INLINE static bool DecodeTable(BitTable<Accessor>& table, BitMemoryReader& reader) {
bool is_deduped = reader.ReadBit();
- if (is_deduped) {
+ if (UNLIKELY(is_deduped)) {
ssize_t bit_offset = reader.NumberOfReadBits() - reader.ReadVarint();
BitMemoryReader reader2(reader.data(), bit_offset); // The offset is negative.
table.Decode(reader2);
@@ -47,9 +47,12 @@
void CodeInfo::Decode(const uint8_t* data, DecodeFlags flags) {
BitMemoryReader reader(data);
- ForEachHeaderField([this, &reader](auto member_pointer) {
- this->*member_pointer = reader.ReadVarint();
- });
+ uint32_t header[4];
+ reader.ReadVarints(header);
+ packed_frame_size_ = header[0];
+ core_spill_mask_ = header[1];
+ fp_spill_mask_ = header[2];
+ number_of_dex_registers_ = header[3];
ForEachBitTableField([this, &reader](auto member_pointer) {
DecodeTable(this->*member_pointer, reader);
}, flags);