Quick compiler compile-time/memory use improvement

This CL delivers a surprisingly large reduction in compile time, as
well as a significant reduction in memory usage, by conditionally
skipping a CFG construction feature that was introduced to support
LLVM bitcode generation.

In short, bitcode requires all potential exception edges to be
explicitly present in the CFG.  The Quick compiler (based on the
old JIT) can ignore, at least for the purposes of dataflow
analysis, potential throw points that do not have a corresponding
catch block.

To support LLVM, we create a target basic block for every
potentially throwing instruction to give us a destination for
the exception edge.  Then, following the check elimination pass,
we remove blocks whose edges have gone away.

However, if we're not using LLVM, we can skip creating those
exception-handling blocks in the first place.  The savings are
significant.
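
A minimal sketch of the new gating, as implemented in
ProcessCanThrow() in mir_graph.cc (see the hunk below):

  bool is_throw = (insn->dalvikInsn.opcode == Instruction::THROW);
  bool build_all_edges =
      (cu_->disable_opt & (1 << kSuppressExceptionEdges)) ||  // Portable/LLVM path
      is_throw ||        // explicit throws keep their edge
      in_try_block;      // a real catch handler may regain control
  if (!build_all_edges) {
    // No handler in this method can regain control here, so for
    // dataflow purposes we keep parsing in the current block.
    return cur_block;
  }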

Single-threaded compilation time on the host appears to drop by
roughly 10%.  We create about 60% fewer basic blocks (and,
importantly, creating fewer basic block nodes has a multiplying
effect on memory use because each block carries its own set of
dataflow bitmaps).
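
The multiplying effect comes from per-block dataflow state; roughly
(field names below are illustrative, not the exact ART declarations):

  struct BasicBlockDataFlow {    // sketch only
    ArenaBitVector* use_v;       // registers used in the block
    ArenaBitVector* def_v;       // registers defined in the block
    ArenaBitVector* live_in_v;   // liveness at block entry
    // ... plus further per-block vectors and maps
  };

Every basic block we never create also skips all of these
register-sized bit vector allocations from the arena.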

Some basic block reduction stats:

boot: 2325802 before, 844846 after.
Phonesky: 485472 before, 156014 after.
PlusOne: 806232 before, 243156 after.
Thinkfree: 864498 before, 264858 after.

Another nice side effect of this change is that it gives the basic
block optimization pass a generally larger scope.

For arena memory usage in the boot class path (compiled on the host):

            Average         Max
Before:      50,863  88,017,820
After:       41,964   4,914,208

The huge reduction in max arena memory usage comes from collapsing
one large initialization method.  Specifically, with complete
exception edges, org.ccil.cowan.tagsoup.Scheme requires 13,802
basic blocks; with exception edges collapsed, it requires 4.

This change also caused two latent bugs to surface.

1) The dex parsing code did not expect that a switch target could
reside in the middle of the very basic block that the switch itself
terminates (see the first sketch following item 2).

2) The x86 backend introduced a 5-operand LIR instruction for indexed
memory operations, but there was no corresponding change to the
use/def mask creation code.  Thus, the 5th operand was never included
in the use/def mask, which allowed the instruction scheduler to
incorrectly move a use above a definition.  We didn't see this before
because the affected x86 instructions were only used for aget/aput,
and prior to this CL those Dalvik opcodes forced a basic block break
because of the implied exception edge, which prevented the code
motion (see the second sketch below).
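
Sketches of the two fixes (both appear in the diff below):

For 1), ProcessCanSwitch() now returns the current block so that any
block split performed while processing the switch stays visible to
the parsing loop in mir_graph.cc:

  // Before: a split of cur_block inside ProcessCanSwitch() was lost.
  ProcessCanSwitch(cur_block, insn, current_offset_, width, flags);
  // After: the caller keeps tracking the (possibly updated) block.
  cur_block = ProcessCanSwitch(cur_block, insn, current_offset_, width, flags);

For 2), the use/def mask setup code in mir_to_lir-inl.h learns about
the fifth operand:

  if (flags & REG_USE4) {
    SetupRegMask(&lir->u.m.use_mask, lir->operands[4]);
  }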

Finally, this CL also includes some minor tuning of register use
weighting.
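
The new weighting (see the mir_dataflow.cc hunk below) replaces the
old power-of-two scaling with a simple linear weight:

  // Old: count += 1 << min(3, nesting_depth * 4)   -> 1 or 8
  // New: count += max(1, min(3, nesting_depth) * 100)
  uint32_t depth  = std::min(3U, static_cast<uint32_t>(bb->nesting_depth));
  uint32_t weight = std::max(1U, depth * 100);
  use_counts_.Put(s_reg, use_counts_.Get(s_reg) + weight);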

Change-Id: I3f2cab7136dba2bded71e9e33b452b95e8fffc0e
diff --git a/compiler/dex/frontend.cc b/compiler/dex/frontend.cc
index b8cd67e..057177b 100644
--- a/compiler/dex/frontend.cc
+++ b/compiler/dex/frontend.cc
@@ -84,6 +84,7 @@
   // (1 << kBBOpt) |
   // (1 << kMatch) |
   // (1 << kPromoteCompilerTemps) |
+  // (1 << kSuppressExceptionEdges) |
   0;
 
 static uint32_t kCompilerDebugFlags = 0 |     // Enable debug/testing modes
@@ -212,7 +213,9 @@
 
   if (compiler_backend == kPortable) {
     // Fused long branches not currently useful in bitcode.
-    cu.disable_opt |= (1 << kBranchFusing);
+    cu.disable_opt |=
+        (1 << kBranchFusing) |
+        (1 << kSuppressExceptionEdges);
   }
 
   if (cu.instruction_set == kMips) {
diff --git a/compiler/dex/frontend.h b/compiler/dex/frontend.h
index 43f6855..b9b4178 100644
--- a/compiler/dex/frontend.h
+++ b/compiler/dex/frontend.h
@@ -56,6 +56,7 @@
   kMatch,
   kPromoteCompilerTemps,
   kBranchFusing,
+  kSuppressExceptionEdges,
 };
 
 // Force code generation paths for testing.
diff --git a/compiler/dex/local_value_numbering.cc b/compiler/dex/local_value_numbering.cc
index 35d2923..75883b7 100644
--- a/compiler/dex/local_value_numbering.cc
+++ b/compiler/dex/local_value_numbering.cc
@@ -380,7 +380,9 @@
           }
           mir->optimization_flags |= MIR_IGNORE_RANGE_CHECK;
         }
-        mir->meta.throw_insn->optimization_flags |= mir->optimization_flags;
+        if (mir->meta.throw_insn != NULL) {
+          mir->meta.throw_insn->optimization_flags |= mir->optimization_flags;
+        }
         // Use side effect to note range check completed.
         (void)LookupValue(ARRAY_REF, array, index, NO_VALUE);
         // Establish value number for loaded register. Note use of memory version.
@@ -419,7 +421,9 @@
           }
           mir->optimization_flags |= MIR_IGNORE_RANGE_CHECK;
         }
-        mir->meta.throw_insn->optimization_flags |= mir->optimization_flags;
+        if (mir->meta.throw_insn != NULL) {
+          mir->meta.throw_insn->optimization_flags |= mir->optimization_flags;
+        }
         // Use side effect to note range check completed.
         (void)LookupValue(ARRAY_REF, array, index, NO_VALUE);
         // Rev the memory version
@@ -443,7 +447,9 @@
         } else {
           null_checked_.insert(base);
         }
-        mir->meta.throw_insn->optimization_flags |= mir->optimization_flags;
+        if (mir->meta.throw_insn != NULL) {
+          mir->meta.throw_insn->optimization_flags |= mir->optimization_flags;
+        }
         uint16_t field_ref = mir->dalvikInsn.vC;
         uint16_t memory_version = GetMemoryVersion(base, field_ref);
         if (opcode == Instruction::IGET_WIDE) {
@@ -473,7 +479,9 @@
         } else {
           null_checked_.insert(base);
         }
-        mir->meta.throw_insn->optimization_flags |= mir->optimization_flags;
+        if (mir->meta.throw_insn != NULL) {
+          mir->meta.throw_insn->optimization_flags |= mir->optimization_flags;
+        }
         uint16_t field_ref = mir->dalvikInsn.vC;
         AdvanceMemoryVersion(base, field_ref);
       }
diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc
index 11e19dc..d359ee2 100644
--- a/compiler/dex/mir_dataflow.cc
+++ b/compiler/dex/mir_dataflow.cc
@@ -1243,12 +1243,13 @@
     if (mir->ssa_rep == NULL) {
       continue;
     }
-    // Each level of nesting adds *16 to count, up to 3 levels deep.
-    uint32_t weight = std::min(3U, static_cast<uint32_t>(bb->nesting_depth) * 4);
+    // Each level of nesting adds *100 to count, up to 3 levels deep.
+    uint32_t depth = std::min(3U, static_cast<uint32_t>(bb->nesting_depth));
+    uint32_t weight = std::max(1U, depth * 100);
     for (int i = 0; i < mir->ssa_rep->num_uses; i++) {
       int s_reg = mir->ssa_rep->uses[i];
       raw_use_counts_.Increment(s_reg);
-      use_counts_.Put(s_reg, use_counts_.Get(s_reg) + (1 << weight));
+      use_counts_.Put(s_reg, use_counts_.Get(s_reg) + weight);
     }
     if (!(cu_->disable_opt & (1 << kPromoteCompilerTemps))) {
       int df_attributes = oat_data_flow_attributes_[mir->dalvikInsn.opcode];
@@ -1267,7 +1268,7 @@
         }
         if (uses_method_star) {
           raw_use_counts_.Increment(method_sreg_);
-          use_counts_.Put(method_sreg_, use_counts_.Get(method_sreg_) + (1 << weight));
+          use_counts_.Put(method_sreg_, use_counts_.Get(method_sreg_) + weight);
         }
       }
     }
diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc
index cf758fc..deaf2ff 100644
--- a/compiler/dex/mir_graph.cc
+++ b/compiler/dex/mir_graph.cc
@@ -365,8 +365,8 @@
 }
 
 /* Process instructions with the kSwitch flag */
-void MIRGraph::ProcessCanSwitch(BasicBlock* cur_block, MIR* insn, DexOffset cur_offset, int width,
-                                int flags) {
+BasicBlock* MIRGraph::ProcessCanSwitch(BasicBlock* cur_block, MIR* insn, DexOffset cur_offset,
+                                       int width, int flags) {
   const uint16_t* switch_data =
       reinterpret_cast<const uint16_t*>(GetCurrentInsns() + cur_offset + insn->dalvikInsn.vB);
   int size;
@@ -437,6 +437,7 @@
                                             /* create */ true, /* immed_pred_block_p */ NULL);
   cur_block->fall_through = fallthrough_block->id;
   fallthrough_block->predecessors->Insert(cur_block->id);
+  return cur_block;
 }
 
 /* Process instructions with the kThrow flag */
@@ -444,6 +445,9 @@
                                       int width, int flags, ArenaBitVector* try_block_addr,
                                       const uint16_t* code_ptr, const uint16_t* code_end) {
   bool in_try_block = try_block_addr->IsBitSet(cur_offset);
+  bool is_throw = (insn->dalvikInsn.opcode == Instruction::THROW);
+  bool build_all_edges =
+      (cu_->disable_opt & (1 << kSuppressExceptionEdges)) || is_throw || in_try_block;
 
   /* In try block */
   if (in_try_block) {
@@ -473,7 +477,7 @@
       cur_block->successor_blocks->Insert(successor_block_info);
       catch_block->predecessors->Insert(cur_block->id);
     }
-  } else {
+  } else if (build_all_edges) {
     BasicBlock *eh_block = NewMemBB(kExceptionHandling, num_blocks_++);
     cur_block->taken = eh_block->id;
     block_list_.Insert(eh_block);
@@ -481,7 +485,7 @@
     eh_block->predecessors->Insert(cur_block->id);
   }
 
-  if (insn->dalvikInsn.opcode == Instruction::THROW) {
+  if (is_throw) {
     cur_block->explicit_throw = true;
     if (code_ptr < code_end) {
       // Force creation of new block following THROW via side-effect
@@ -494,6 +498,16 @@
     }
   }
 
+  if (!build_all_edges) {
+    /*
+     * Even though there is an exception edge here, control cannot return to this
+     * method.  Thus, for the purposes of dataflow analysis and optimization, we can
+     * ignore the edge.  Doing this reduces compile time, and increases the scope
+     * of the basic-block level optimization pass.
+     */
+    return cur_block;
+  }
+
   /*
    * Split the potentially-throwing instruction into two parts.
    * The first half will be a pseudo-op that captures the exception
@@ -695,7 +709,7 @@
       cur_block = ProcessCanThrow(cur_block, insn, current_offset_, width, flags, try_block_addr_,
                                   code_ptr, code_end);
     } else if (flags & Instruction::kSwitch) {
-      ProcessCanSwitch(cur_block, insn, current_offset_, width, flags);
+      cur_block = ProcessCanSwitch(cur_block, insn, current_offset_, width, flags);
     }
     current_offset_ += width;
     BasicBlock *next_block = FindBlock(current_offset_, /* split */ false, /* create */
@@ -1100,6 +1114,7 @@
 void MIRGraph::DumpMIRGraph() {
   BasicBlock* bb;
   const char* block_type_names[] = {
+    "Null Block",
     "Entry Block",
     "Code Block",
     "Exit Block",
diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h
index a69dde0..8c20728 100644
--- a/compiler/dex/mir_graph.h
+++ b/compiler/dex/mir_graph.h
@@ -698,8 +698,8 @@
   void ProcessTryCatchBlocks();
   BasicBlock* ProcessCanBranch(BasicBlock* cur_block, MIR* insn, DexOffset cur_offset, int width,
                                int flags, const uint16_t* code_ptr, const uint16_t* code_end);
-  void ProcessCanSwitch(BasicBlock* cur_block, MIR* insn, DexOffset cur_offset, int width,
-                        int flags);
+  BasicBlock* ProcessCanSwitch(BasicBlock* cur_block, MIR* insn, DexOffset cur_offset, int width,
+                               int flags);
   BasicBlock* ProcessCanThrow(BasicBlock* cur_block, MIR* insn, DexOffset cur_offset, int width,
                               int flags, ArenaBitVector* try_block_addr, const uint16_t* code_ptr,
                               const uint16_t* code_end);
diff --git a/compiler/dex/quick/mir_to_lir-inl.h b/compiler/dex/quick/mir_to_lir-inl.h
index 1a30b7a..f567b5c 100644
--- a/compiler/dex/quick/mir_to_lir-inl.h
+++ b/compiler/dex/quick/mir_to_lir-inl.h
@@ -198,6 +198,10 @@
     SetupRegMask(&lir->u.m.use_mask, lir->operands[3]);
   }
 
+  if (flags & REG_USE4) {
+    SetupRegMask(&lir->u.m.use_mask, lir->operands[4]);
+  }
+
   if (flags & SETS_CCODES) {
     lir->u.m.def_mask |= ENCODE_CCODE;
   }