Quick compiler compile-time/memory use improvement

This CL delivers a surprisingly large reduction in compile time, as
well as a significant reduction in memory usage, by conditionally
skipping a CFG construction feature that was introduced to support
LLVM bitcode generation.

In short, bitcode requires all potential exception edges to be
explicitly present in the CFG.  The Quick compiler (based on the
old JIT) can ignore, at least for the purposes of dataflow
analysis, potential throw points that do not have a corresponding
catch block.

To support LLVM, we create a target basic block for every
potentially throwing instruction to give us a destination for
the exception edge.  Then, following the check elimination pass,
we remove blocks whose edges have gone away.

However, if we're not using LLVM, we can skip creating those
exception-handling blocks in the first place.  The savings are
significant.
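
A minimal sketch of the new gating, as implemented in
ProcessCanThrow() in mir_graph.cc (see the hunk below):

  bool is_throw = (insn->dalvikInsn.opcode == Instruction::THROW);
  bool build_all_edges =
      (cu_->disable_opt & (1 << kSuppressExceptionEdges)) ||  // Portable/LLVM path
      is_throw ||        // explicit throws keep their edge
      in_try_block;      // a real catch handler may regain control
  if (!build_all_edges) {
    // No handler in this method can regain control here, so for
    // dataflow purposes we keep parsing in the current block.
    return cur_block;
  }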

Single-threaded compilation time on the host appears to drop by
roughly 10%.  We create about 60% fewer basic blocks (and,
importantly, creating fewer basic block nodes has a multiplying
effect on memory use because each block carries its own set of
dataflow bitmaps).
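
The multiplying effect comes from per-block dataflow state; roughly
(field names below are illustrative, not the exact ART declarations):

  struct BasicBlockDataFlow {    // sketch only
    ArenaBitVector* use_v;       // registers used in the block
    ArenaBitVector* def_v;       // registers defined in the block
    ArenaBitVector* live_in_v;   // liveness at block entry
    // ... plus further per-block vectors and maps
  };

Every basic block we never create also skips all of these
register-sized bit vector allocations from the arena.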

Some basic block reduction stats:

boot: 2325802 before, 844846 after.
Phonesky: 485472 before, 156014 after.
PlusOne: 806232 before, 243156 after.
Thinkfree: 864498 before, 264858 after.

Another nice side effect of this change is that it gives the basic
block optimization pass a generally larger scope.

For arena memory usage in the boot class path (compiled on the host):

            Average         Max
Before:      50,863  88,017,820
After:       41,964   4,914,208

The huge reduction in max arena memory usage comes from collapsing
one large initialization method.  Specifically, with complete
exception edges, org.ccil.cowan.tagsoup.Scheme requires 13,802
basic blocks; with exception edges collapsed, it requires 4.

This change also caused two latent bugs to surface.

1) The dex parsing code did not expect that a switch target could
reside in the middle of the very basic block that the switch itself
terminates (see the first sketch following item 2).

2) The x86 backend introduced a 5-operand LIR instruction for indexed
memory operations, but there was no corresponding change to the
use/def mask creation code.  Thus, the 5th operand was never included
in the use/def mask, which allowed the instruction scheduler to
incorrectly move a use above a definition.  We didn't see this before
because the affected x86 instructions were only used for aget/aput,
and prior to this CL those Dalvik opcodes forced a basic block break
because of the implied exception edge, which prevented the code
motion (see the second sketch below).
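
Sketches of the two fixes (both appear in the diff below):

For 1), ProcessCanSwitch() now returns the current block so that any
block split performed while processing the switch stays visible to
the parsing loop in mir_graph.cc:

  // Before: a split of cur_block inside ProcessCanSwitch() was lost.
  ProcessCanSwitch(cur_block, insn, current_offset_, width, flags);
  // After: the caller keeps tracking the (possibly updated) block.
  cur_block = ProcessCanSwitch(cur_block, insn, current_offset_, width, flags);

For 2), the use/def mask setup code in mir_to_lir-inl.h learns about
the fifth operand:

  if (flags & REG_USE4) {
    SetupRegMask(&lir->u.m.use_mask, lir->operands[4]);
  }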

Finally, this CL also includes some minor tuning of register use
weighting.
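
The new weighting (see the mir_dataflow.cc hunk below) replaces the
old power-of-two scaling with a simple linear weight:

  // Old: count += 1 << min(3, nesting_depth * 4)   -> 1 or 8
  // New: count += max(1, min(3, nesting_depth) * 100)
  uint32_t depth  = std::min(3U, static_cast<uint32_t>(bb->nesting_depth));
  uint32_t weight = std::max(1U, depth * 100);
  use_counts_.Put(s_reg, use_counts_.Get(s_reg) + weight);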

Change-Id: I3f2cab7136dba2bded71e9e33b452b95e8fffc0e
diff --git a/compiler/dex/frontend.cc b/compiler/dex/frontend.cc
index b8cd67e..057177b 100644
--- a/compiler/dex/frontend.cc
+++ b/compiler/dex/frontend.cc
@@ -84,6 +84,7 @@
   // (1 << kBBOpt) |
   // (1 << kMatch) |
   // (1 << kPromoteCompilerTemps) |
+  // (1 << kSuppressExceptionEdges) |
   0;
 
 static uint32_t kCompilerDebugFlags = 0 |     // Enable debug/testing modes
@@ -212,7 +213,9 @@
 
   if (compiler_backend == kPortable) {
     // Fused long branches not currently useful in bitcode.
-    cu.disable_opt |= (1 << kBranchFusing);
+    cu.disable_opt |=
+        (1 << kBranchFusing) |
+        (1 << kSuppressExceptionEdges);
   }
 
   if (cu.instruction_set == kMips) {
diff --git a/compiler/dex/frontend.h b/compiler/dex/frontend.h
index 43f6855..b9b4178 100644
--- a/compiler/dex/frontend.h
+++ b/compiler/dex/frontend.h
@@ -56,6 +56,7 @@
   kMatch,
   kPromoteCompilerTemps,
   kBranchFusing,
+  kSuppressExceptionEdges,
 };
 
 // Force code generation paths for testing.
diff --git a/compiler/dex/local_value_numbering.cc b/compiler/dex/local_value_numbering.cc
index 35d2923..75883b7 100644
--- a/compiler/dex/local_value_numbering.cc
+++ b/compiler/dex/local_value_numbering.cc
@@ -380,7 +380,9 @@
           }
           mir->optimization_flags |= MIR_IGNORE_RANGE_CHECK;
         }
-        mir->meta.throw_insn->optimization_flags |= mir->optimization_flags;
+        if (mir->meta.throw_insn != NULL) {
+          mir->meta.throw_insn->optimization_flags |= mir->optimization_flags;
+        }
         // Use side effect to note range check completed.
         (void)LookupValue(ARRAY_REF, array, index, NO_VALUE);
         // Establish value number for loaded register. Note use of memory version.
@@ -419,7 +421,9 @@
           }
           mir->optimization_flags |= MIR_IGNORE_RANGE_CHECK;
         }
-        mir->meta.throw_insn->optimization_flags |= mir->optimization_flags;
+        if (mir->meta.throw_insn != NULL) {
+          mir->meta.throw_insn->optimization_flags |= mir->optimization_flags;
+        }
         // Use side effect to note range check completed.
         (void)LookupValue(ARRAY_REF, array, index, NO_VALUE);
         // Rev the memory version
@@ -443,7 +447,9 @@
         } else {
           null_checked_.insert(base);
         }
-        mir->meta.throw_insn->optimization_flags |= mir->optimization_flags;
+        if (mir->meta.throw_insn != NULL) {
+          mir->meta.throw_insn->optimization_flags |= mir->optimization_flags;
+        }
         uint16_t field_ref = mir->dalvikInsn.vC;
         uint16_t memory_version = GetMemoryVersion(base, field_ref);
         if (opcode == Instruction::IGET_WIDE) {
@@ -473,7 +479,9 @@
         } else {
           null_checked_.insert(base);
         }
-        mir->meta.throw_insn->optimization_flags |= mir->optimization_flags;
+        if (mir->meta.throw_insn != NULL) {
+          mir->meta.throw_insn->optimization_flags |= mir->optimization_flags;
+        }
         uint16_t field_ref = mir->dalvikInsn.vC;
         AdvanceMemoryVersion(base, field_ref);
       }
diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc
index 11e19dc..d359ee2 100644
--- a/compiler/dex/mir_dataflow.cc
+++ b/compiler/dex/mir_dataflow.cc
@@ -1243,12 +1243,13 @@
     if (mir->ssa_rep == NULL) {
       continue;
     }
-    // Each level of nesting adds *16 to count, up to 3 levels deep.
-    uint32_t weight = std::min(3U, static_cast<uint32_t>(bb->nesting_depth) * 4);
+    // Each level of nesting adds *100 to count, up to 3 levels deep.
+    uint32_t depth = std::min(3U, static_cast<uint32_t>(bb->nesting_depth));
+    uint32_t weight = std::max(1U, depth * 100);
     for (int i = 0; i < mir->ssa_rep->num_uses; i++) {
       int s_reg = mir->ssa_rep->uses[i];
       raw_use_counts_.Increment(s_reg);
-      use_counts_.Put(s_reg, use_counts_.Get(s_reg) + (1 << weight));
+      use_counts_.Put(s_reg, use_counts_.Get(s_reg) + weight);
     }
     if (!(cu_->disable_opt & (1 << kPromoteCompilerTemps))) {
       int df_attributes = oat_data_flow_attributes_[mir->dalvikInsn.opcode];
@@ -1267,7 +1268,7 @@
         }
         if (uses_method_star) {
           raw_use_counts_.Increment(method_sreg_);
-          use_counts_.Put(method_sreg_, use_counts_.Get(method_sreg_) + (1 << weight));
+          use_counts_.Put(method_sreg_, use_counts_.Get(method_sreg_) + weight);
         }
       }
     }
diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc
index cf758fc..deaf2ff 100644
--- a/compiler/dex/mir_graph.cc
+++ b/compiler/dex/mir_graph.cc
@@ -365,8 +365,8 @@
 }
 
 /* Process instructions with the kSwitch flag */
-void MIRGraph::ProcessCanSwitch(BasicBlock* cur_block, MIR* insn, DexOffset cur_offset, int width,
-                                int flags) {
+BasicBlock* MIRGraph::ProcessCanSwitch(BasicBlock* cur_block, MIR* insn, DexOffset cur_offset,
+                                       int width, int flags) {
   const uint16_t* switch_data =
       reinterpret_cast<const uint16_t*>(GetCurrentInsns() + cur_offset + insn->dalvikInsn.vB);
   int size;
@@ -437,6 +437,7 @@
                                             /* create */ true, /* immed_pred_block_p */ NULL);
   cur_block->fall_through = fallthrough_block->id;
   fallthrough_block->predecessors->Insert(cur_block->id);
+  return cur_block;
 }
 
 /* Process instructions with the kThrow flag */
@@ -444,6 +445,9 @@
                                       int width, int flags, ArenaBitVector* try_block_addr,
                                       const uint16_t* code_ptr, const uint16_t* code_end) {
   bool in_try_block = try_block_addr->IsBitSet(cur_offset);
+  bool is_throw = (insn->dalvikInsn.opcode == Instruction::THROW);
+  bool build_all_edges =
+      (cu_->disable_opt & (1 << kSuppressExceptionEdges)) || is_throw || in_try_block;
 
   /* In try block */
   if (in_try_block) {
@@ -473,7 +477,7 @@
       cur_block->successor_blocks->Insert(successor_block_info);
       catch_block->predecessors->Insert(cur_block->id);
     }
-  } else {
+  } else if (build_all_edges) {
     BasicBlock *eh_block = NewMemBB(kExceptionHandling, num_blocks_++);
     cur_block->taken = eh_block->id;
     block_list_.Insert(eh_block);
@@ -481,7 +485,7 @@
     eh_block->predecessors->Insert(cur_block->id);
   }
 
-  if (insn->dalvikInsn.opcode == Instruction::THROW) {
+  if (is_throw) {
     cur_block->explicit_throw = true;
     if (code_ptr < code_end) {
       // Force creation of new block following THROW via side-effect
@@ -494,6 +498,16 @@
     }
   }
 
+  if (!build_all_edges) {
+    /*
+     * Even though there is an exception edge here, control cannot return to this
+     * method.  Thus, for the purposes of dataflow analysis and optimization, we can
+     * ignore the edge.  Doing this reduces compile time, and increases the scope
+     * of the basic-block level optimization pass.
+     */
+    return cur_block;
+  }
+
   /*
    * Split the potentially-throwing instruction into two parts.
    * The first half will be a pseudo-op that captures the exception
@@ -695,7 +709,7 @@
       cur_block = ProcessCanThrow(cur_block, insn, current_offset_, width, flags, try_block_addr_,
                                   code_ptr, code_end);
     } else if (flags & Instruction::kSwitch) {
-      ProcessCanSwitch(cur_block, insn, current_offset_, width, flags);
+      cur_block = ProcessCanSwitch(cur_block, insn, current_offset_, width, flags);
     }
     current_offset_ += width;
     BasicBlock *next_block = FindBlock(current_offset_, /* split */ false, /* create */
@@ -1100,6 +1114,7 @@
 void MIRGraph::DumpMIRGraph() {
   BasicBlock* bb;
   const char* block_type_names[] = {
+    "Null Block",
     "Entry Block",
     "Code Block",
     "Exit Block",
diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h
index a69dde0..8c20728 100644
--- a/compiler/dex/mir_graph.h
+++ b/compiler/dex/mir_graph.h
@@ -698,8 +698,8 @@
   void ProcessTryCatchBlocks();
   BasicBlock* ProcessCanBranch(BasicBlock* cur_block, MIR* insn, DexOffset cur_offset, int width,
                                int flags, const uint16_t* code_ptr, const uint16_t* code_end);
-  void ProcessCanSwitch(BasicBlock* cur_block, MIR* insn, DexOffset cur_offset, int width,
-                        int flags);
+  BasicBlock* ProcessCanSwitch(BasicBlock* cur_block, MIR* insn, DexOffset cur_offset, int width,
+                               int flags);
   BasicBlock* ProcessCanThrow(BasicBlock* cur_block, MIR* insn, DexOffset cur_offset, int width,
                               int flags, ArenaBitVector* try_block_addr, const uint16_t* code_ptr,
                               const uint16_t* code_end);
diff --git a/compiler/dex/quick/mir_to_lir-inl.h b/compiler/dex/quick/mir_to_lir-inl.h
index 1a30b7a..f567b5c 100644
--- a/compiler/dex/quick/mir_to_lir-inl.h
+++ b/compiler/dex/quick/mir_to_lir-inl.h
@@ -198,6 +198,10 @@
     SetupRegMask(&lir->u.m.use_mask, lir->operands[3]);
   }
 
+  if (flags & REG_USE4) {
+    SetupRegMask(&lir->u.m.use_mask, lir->operands[4]);
+  }
+
   if (flags & SETS_CCODES) {
     lir->u.m.def_mask |= ENCODE_CCODE;
   }