ART vectorizer.

Rationale:
Make SIMD great again with a retargetable and easily extendable vectorizer.

Provides a full x86/x86_64 and a proof-of-concept ARM implementation. Sample
improvement (without any perf tuning yet) for Linpack on x86 is about 20% to 50%.

Test: test-art-host, test-art-target (angler)
Bug: 34083438, 30933338

Change-Id: Ifb77a0f25f690a87cd65bf3d5e9f6be7ea71d6c1
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 0b798fc..16f7691 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -27,7 +27,8 @@
 
 /**
  * Loop optimizations. Builds a loop hierarchy and applies optimizations to
- * the detected nested loops, such as removal of dead induction and empty loops.
+ * the detected nested loops, such as removal of dead induction and empty loops
+ * and inner loop vectorization.
  */
 class HLoopOptimization : public HOptimization {
  public:
@@ -50,34 +51,105 @@
           inner(nullptr),
           previous(nullptr),
           next(nullptr) {}
-    HLoopInformation* const loop_info;
+    HLoopInformation* loop_info;
     LoopNode* outer;
     LoopNode* inner;
     LoopNode* previous;
     LoopNode* next;
   };
 
-  void LocalRun();
+  /*
+   * Vectorization restrictions (bit mask).
+   */
+  enum VectorRestrictions {
+    kNone     = 0,   // no restrictions
+    kNoMul    = 1,   // no multiplication
+    kNoDiv    = 2,   // no division
+    kNoShift  = 4,   // no shift
+    kNoShr    = 8,   // no arithmetic shift right
+    kNoHiBits = 16,  // "wider" operations cannot bring in higher order bits
+  };
 
+  /*
+   * Vectorization mode during synthesis
+   * (sequential peeling/cleanup loop or vector loop).
+   */
+  enum VectorMode {
+    kSequential,
+    kVector
+  };
+
+  /*
+   * Representation of a unit-stride array reference.
+   */
+  struct ArrayReference {
+    ArrayReference(HInstruction* b, HInstruction* o, Primitive::Type t, bool l)
+        : base(b), offset(o), type(t), lhs(l) { }
+    bool operator<(const ArrayReference& other) const {
+      return
+          (base < other.base) ||
+          (base == other.base &&
+           (offset < other.offset || (offset == other.offset &&
+                                      (type < other.type ||
+                                       (type == other.type && lhs < other.lhs)))));
+    }
+    HInstruction* base;    // base address
+    HInstruction* offset;  // offset + i
+    Primitive::Type type;  // component type
+    bool lhs;              // def/use
+  };
+
+  // Loop setup and traversal.
+  void LocalRun();
   void AddLoop(HLoopInformation* loop_info);
   void RemoveLoop(LoopNode* node);
-
   void TraverseLoopsInnerToOuter(LoopNode* node);
 
-  // Simplification.
+  // Optimization.
   void SimplifyInduction(LoopNode* node);
   void SimplifyBlocks(LoopNode* node);
-  bool SimplifyInnerLoop(LoopNode* node);
+  void OptimizeInnerLoop(LoopNode* node);
+
+  // Vectorization analysis and synthesis.
+  bool CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count);
+  void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count);
+  void GenerateNewLoop(LoopNode* node,
+                       HBasicBlock* block,
+                       HBasicBlock* new_preheader,
+                       HInstruction* lo,
+                       HInstruction* hi,
+                       HInstruction* step);
+  bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code);
+  bool VectorizeUse(LoopNode* node,
+                    HInstruction* instruction,
+                    bool generate_code,
+                    Primitive::Type type,
+                    uint64_t restrictions);
+  bool TrySetVectorType(Primitive::Type type, /*out*/ uint64_t* restrictions);
+  bool TrySetVectorLength(uint32_t length);
+  void GenerateVecInv(HInstruction* org, Primitive::Type type);
+  void GenerateVecSub(HInstruction* org, HInstruction* off);
+  void GenerateVecMem(HInstruction* org,
+                      HInstruction* opa,
+                      HInstruction* opb,
+                      Primitive::Type type);
+  void GenerateVecOp(HInstruction* org, HInstruction* opa, HInstruction* opb, Primitive::Type type);
 
   // Helpers.
-  bool IsPhiInduction(HPhi* phi);
-  bool IsEmptyHeader(HBasicBlock* block);
+  bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
+  bool TrySetSimpleLoopHeader(HBasicBlock* block);
   bool IsEmptyBody(HBasicBlock* block);
   bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
                            HInstruction* instruction,
                            bool collect_loop_uses,
                            /*out*/ int32_t* use_count);
+  bool IsUsedOutsideLoop(HLoopInformation* loop_info,
+                         HInstruction* instruction);
   bool TryReplaceWithLastValue(HInstruction* instruction, HBasicBlock* block);
+  bool TryAssignLastValue(HLoopInformation* loop_info,
+                          HInstruction* instruction,
+                          HBasicBlock* block,
+                          bool collect_loop_uses);
   void RemoveDeadInstructions(const HInstructionList& list);
 
   // Compiler driver (to query ISA features).
@@ -90,6 +162,9 @@
   // through this allocator is immediately released when the loop optimizer is done.
   ArenaAllocator* loop_allocator_;
 
+  // Global heap memory allocator. Used to build HIR.
+  ArenaAllocator* global_allocator_;
+
   // Entries into the loop hierarchy representation. The hierarchy resides
   // in phase-local heap memory.
   LoopNode* top_loop_;
@@ -102,11 +177,33 @@
   // Counter that tracks how many induction cycles have been simplified. Useful
   // to trigger incremental updates of induction variable analysis of outer loops
   // when the induction of inner loops has changed.
-  int32_t induction_simplication_count_;
+  uint32_t induction_simplication_count_;
 
   // Flag that tracks if any simplifications have occurred.
   bool simplified_;
 
+  // Number of "lanes" for selected packed type.
+  uint32_t vector_length_;
+
+  // Set of array references in the vector loop.
+  // Contents reside in phase-local heap memory.
+  ArenaSet<ArrayReference>* vector_refs_;
+
+  // Mapping used during vectorization synthesis for both the scalar peeling/cleanup
+  // loop (simd_ is false) and the actual vector loop (simd_ is true). The data
+  // structure maps original instructions into the new instructions.
+  // Contents reside in phase-local heap memory.
+  ArenaSafeMap<HInstruction*, HInstruction*>* vector_map_;
+
+  // Temporary vectorization bookkeeping.
+  HBasicBlock* vector_preheader_;  // preheader of the new loop
+  HBasicBlock* vector_header_;  // header of the new loop
+  HBasicBlock* vector_body_;  // body of the new loop
+  HInstruction* vector_runtime_test_a_;
+  HInstruction* vector_runtime_test_b_;  // defines a != b runtime test
+  HPhi* vector_phi_;  // the Phi representing the normalized loop index
+  VectorMode vector_mode_;  // selects synthesis mode
+
   friend class LoopOptimizationTest;
 
   DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);