dex2oat: Pack likely-dirty objects together when generating the boot image

This introduces a new algorithm into image writer which "bins" objects
by how likely they are to be dirtied at runtime. Objects in the same bin
are placed contiguously in memory (i.e. into the same page). We try to
tune the bin selection based on how clean or how dirty the object will
likely be at runtime.

As-is, this saves about 150KB per-process (private-dirty pages) and 700KB in
zygote (shared-dirty).

There is still about 800KB of objects that are clean but located in
dirty pages, so with more analysis we can tune the bin selection and get
even more memory savings.

(cherry picked from commit 3f735bd4f9d09a0f9b2b01321e4c6917879dcae6)

Bug: 17611661
Change-Id: Ia1455e4c56ffd0a36ae2a723d35b7e06502980f7
diff --git a/compiler/image_writer.h b/compiler/image_writer.h
index 2fec0aa..8c84b68 100644
--- a/compiler/image_writer.h
+++ b/compiler/image_writer.h
@@ -23,6 +23,7 @@
 #include <memory>
 #include <set>
 #include <string>
+#include <ostream>
 
 #include "base/macros.h"
 #include "driver/compiler_driver.h"
@@ -32,6 +33,8 @@
 #include "mirror/dex_cache.h"
 #include "os.h"
 #include "safe_map.h"
+#include "gc/space/space.h"
+#include "utils.h"
 
 namespace art {
 
@@ -41,14 +44,15 @@
   ImageWriter(const CompilerDriver& compiler_driver, uintptr_t image_begin,
               bool compile_pic)
       : compiler_driver_(compiler_driver), image_begin_(reinterpret_cast<uint8_t*>(image_begin)),
-        image_end_(0), image_roots_address_(0), oat_file_(nullptr),
+        image_end_(0), image_objects_offset_begin_(0), image_roots_address_(0), oat_file_(nullptr),
         oat_data_begin_(nullptr), interpreter_to_interpreter_bridge_offset_(0),
         interpreter_to_compiled_code_bridge_offset_(0), jni_dlsym_lookup_offset_(0),
         portable_imt_conflict_trampoline_offset_(0), portable_resolution_trampoline_offset_(0),
         portable_to_interpreter_bridge_offset_(0), quick_generic_jni_trampoline_offset_(0),
         quick_imt_conflict_trampoline_offset_(0), quick_resolution_trampoline_offset_(0),
         quick_to_interpreter_bridge_offset_(0), compile_pic_(compile_pic),
-        target_ptr_size_(InstructionSetPointerSize(compiler_driver_.GetInstructionSet())) {
+        target_ptr_size_(InstructionSetPointerSize(compiler_driver_.GetInstructionSet())),
+        bin_slot_sizes_(), bin_slot_count_() {
     CHECK_NE(image_begin, 0U);
   }
 
@@ -87,14 +91,71 @@
   // Mark the objects defined in this space in the given live bitmap.
   void RecordImageAllocations() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  // Classify different kinds of bins that objects end up getting packed into during image writing.
+  enum Bin {
+    // Likely-clean:
+    kBinString,                        // [String] Almost always immutable (except for obj header).
+    kBinArtMethodsManagedInitialized,  // [ArtMethod] Not-native, and initialized. Unlikely to dirty
+    // Unknown mix of clean/dirty:
+    kBinRegular,
+    // Likely-dirty:
+    // All classes get their own bins since their fields often dirty
+    kBinClassInitializedFinalStatics,  // Class initializers have been run, no non-final statics
+    kBinClassInitialized,         // Class initializers have been run
+    kBinClassVerified,            // Class verified, but initializers haven't been run
+    kBinArtMethodNative,          // Art method that is actually native
+    kBinArtMethodNotInitialized,  // Art method with a declaring class that wasn't initialized
+    // Don't care about other art methods since they don't dirty
+    // Add more bins here if we add more segregation code.
+    kBinSize,
+  };
+
+  friend std::ostream& operator<<(std::ostream& stream, const Bin& bin);
+
+  static constexpr size_t kBinBits = MinimumBitsToStore(kBinSize - 1);
+  // uint32 = typeof(lockword_)
+  static constexpr size_t kBinShift = BitSizeOf<uint32_t>() - kBinBits;
+  // 111000.....0
+  static constexpr size_t kBinMask = ((static_cast<size_t>(1) << kBinBits) - 1) << kBinShift;
+
+  // We use the lock word to store the bin # and bin index of the object in the image.
+  //
+  // The struct size must be exactly sizeof(LockWord), currently 32-bits, since this will end up
+  // stored in the lock word bit-for-bit when object forwarding addresses are being calculated.
+  struct BinSlot {
+    explicit BinSlot(uint32_t lockword);
+    BinSlot(Bin bin, uint32_t index);
+
+    // The bin an object belongs to, i.e. regular, class/verified, class/initialized, etc.
+    Bin GetBin() const;
+    // The offset in bytes from the beginning of the bin. Aligned to object size.
+    uint32_t GetIndex() const;
+    // Pack into a single uint32_t, for storing into a lock word.
+    explicit operator uint32_t() const { return lockword_; }
+    // Comparison operator for map support
+    bool operator<(const BinSlot& other) const  { return lockword_ < other.lockword_; }
+
+  private:
+    // Must be the same size as LockWord, any larger and we would truncate the data.
+    const uint32_t lockword_;
+  };
+
   // We use the lock word to store the offset of the object in the image.
-  void AssignImageOffset(mirror::Object* object) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void SetImageOffset(mirror::Object* object, size_t offset)
+  void AssignImageOffset(mirror::Object* object, BinSlot bin_slot)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void SetImageOffset(mirror::Object* object, BinSlot bin_slot, size_t offset)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   bool IsImageOffsetAssigned(mirror::Object* object) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   size_t GetImageOffset(mirror::Object* object) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  void AssignImageBinSlot(mirror::Object* object) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void SetImageBinSlot(mirror::Object* object, BinSlot bin_slot)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  bool IsImageBinSlotAssigned(mirror::Object* object) const
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  BinSlot GetImageBinSlot(mirror::Object* object) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   static void* GetImageAddressCallback(void* writer, mirror::Object* obj)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     return reinterpret_cast<ImageWriter*>(writer)->GetImageAddress(obj);
@@ -157,7 +218,9 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   mirror::ObjectArray<mirror::Object>* CreateImageRoots() const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void CalculateObjectOffsets(mirror::Object* obj)
+  void CalculateObjectBinSlots(mirror::Object* obj)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void UnbinObjectsIntoOffset(mirror::Object* obj)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void WalkInstanceFields(mirror::Object* obj, mirror::Class* klass)
@@ -166,6 +229,8 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   static void WalkFieldsCallback(mirror::Object* obj, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  static void UnbinObjectsIntoOffsetCallback(mirror::Object* obj, void* arg)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Creates the contiguous image in memory and adjusts pointers.
   void CopyAndFixupObjects() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -186,6 +251,9 @@
   // Patches references in OatFile to expect runtime addresses.
   void SetOatChecksumFromElfFile(File* elf_file);
 
+  // Calculate the sum total of the bin slot sizes in [0, up_to). Defaults to all bins.
+  size_t GetBinSizeSum(Bin up_to = kBinSize) const;
+
   const CompilerDriver& compiler_driver_;
 
   // Beginning target image address for the output image.
@@ -194,6 +262,9 @@
   // Offset to the free space in image_.
   size_t image_end_;
 
+  // Offset from image_begin_ to where the first object is in image_.
+  size_t image_objects_offset_begin_;
+
   // The image roots address in the image.
   uint32_t image_roots_address_;
 
@@ -206,6 +277,9 @@
   // Saved hashes (objects are inside of the image so that they don't move).
   std::vector<std::pair<mirror::Object*, uint32_t>> saved_hashes_;
 
+  // Saved hashes (objects are bin slots to inside of the image, not yet allocated an address).
+  std::map<BinSlot, uint32_t> saved_hashes_map_;
+
   // Beginning target oat address for the pointers from the output image to its oat file.
   const uint8_t* oat_data_begin_;
 
@@ -228,6 +302,10 @@
   // Size of pointers on the target architecture.
   size_t target_ptr_size_;
 
+  // Bin slot tracking for dirty object packing
+  size_t bin_slot_sizes_[kBinSize];  // Number of bytes in a bin
+  size_t bin_slot_count_[kBinSize];  // Number of objects in a bin
+
   friend class FixupVisitor;
   friend class FixupClassVisitor;
   DISALLOW_COPY_AND_ASSIGN(ImageWriter);