ART vectorizer.
Rationale:
Make SIMD great again with a retargetable and easily extendable vectorizer.
Provides a full x86/x86_64 and a proof-of-concept ARM implementation. Sample
improvement (without any perf tuning yet) for Linpack on x86 is about 20% to 50%.
Test: test-art-host, test-art-target (angler)
Bug: 34083438, 30933338
Change-Id: Ifb77a0f25f690a87cd65bf3d5e9f6be7ea71d6c1
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 0b798fc..16f7691 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -27,7 +27,8 @@
/**
* Loop optimizations. Builds a loop hierarchy and applies optimizations to
- * the detected nested loops, such as removal of dead induction and empty loops.
+ * the detected nested loops, such as removal of dead induction and empty loops
+ * and inner loop vectorization.
*/
class HLoopOptimization : public HOptimization {
public:
@@ -50,34 +51,105 @@
inner(nullptr),
previous(nullptr),
next(nullptr) {}
- HLoopInformation* const loop_info;
+ HLoopInformation* loop_info;
LoopNode* outer;
LoopNode* inner;
LoopNode* previous;
LoopNode* next;
};
- void LocalRun();
+ /*
+ * Vectorization restrictions (bit mask).
+ */
+ enum VectorRestrictions {
+ kNone = 0, // no restrictions
+ kNoMul = 1, // no multiplication
+ kNoDiv = 2, // no division
+ kNoShift = 4, // no shift
+ kNoShr = 8, // no arithmetic shift right
+ kNoHiBits = 16, // "wider" operations cannot bring in higher order bits
+ };
+ /*
+ * Vectorization mode during synthesis
+ * (sequential peeling/cleanup loop or vector loop).
+ */
+ enum VectorMode {
+ kSequential,
+ kVector
+ };
+
+ /*
+ * Representation of a unit-stride array reference.
+ */
+ struct ArrayReference {
+ ArrayReference(HInstruction* b, HInstruction* o, Primitive::Type t, bool l)
+ : base(b), offset(o), type(t), lhs(l) { }
+ bool operator<(const ArrayReference& other) const {
+ return
+ (base < other.base) ||
+ (base == other.base &&
+ (offset < other.offset || (offset == other.offset &&
+ (type < other.type ||
+ (type == other.type && lhs < other.lhs)))));
+ }
+ HInstruction* base; // base address
+ HInstruction* offset; // offset + i
+ Primitive::Type type; // component type
+ bool lhs; // def/use
+ };
+
+ // Loop setup and traversal.
+ void LocalRun();
void AddLoop(HLoopInformation* loop_info);
void RemoveLoop(LoopNode* node);
-
void TraverseLoopsInnerToOuter(LoopNode* node);
- // Simplification.
+ // Optimization.
void SimplifyInduction(LoopNode* node);
void SimplifyBlocks(LoopNode* node);
- bool SimplifyInnerLoop(LoopNode* node);
+ void OptimizeInnerLoop(LoopNode* node);
+
+ // Vectorization analysis and synthesis.
+ bool CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count);
+ void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count);
+ void GenerateNewLoop(LoopNode* node,
+ HBasicBlock* block,
+ HBasicBlock* new_preheader,
+ HInstruction* lo,
+ HInstruction* hi,
+ HInstruction* step);
+ bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code);
+ bool VectorizeUse(LoopNode* node,
+ HInstruction* instruction,
+ bool generate_code,
+ Primitive::Type type,
+ uint64_t restrictions);
+ bool TrySetVectorType(Primitive::Type type, /*out*/ uint64_t* restrictions);
+ bool TrySetVectorLength(uint32_t length);
+ void GenerateVecInv(HInstruction* org, Primitive::Type type);
+ void GenerateVecSub(HInstruction* org, HInstruction* off);
+ void GenerateVecMem(HInstruction* org,
+ HInstruction* opa,
+ HInstruction* opb,
+ Primitive::Type type);
+ void GenerateVecOp(HInstruction* org, HInstruction* opa, HInstruction* opb, Primitive::Type type);
// Helpers.
- bool IsPhiInduction(HPhi* phi);
- bool IsEmptyHeader(HBasicBlock* block);
+ bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
+ bool TrySetSimpleLoopHeader(HBasicBlock* block);
bool IsEmptyBody(HBasicBlock* block);
bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
HInstruction* instruction,
bool collect_loop_uses,
/*out*/ int32_t* use_count);
+ bool IsUsedOutsideLoop(HLoopInformation* loop_info,
+ HInstruction* instruction);
bool TryReplaceWithLastValue(HInstruction* instruction, HBasicBlock* block);
+ bool TryAssignLastValue(HLoopInformation* loop_info,
+ HInstruction* instruction,
+ HBasicBlock* block,
+ bool collect_loop_uses);
void RemoveDeadInstructions(const HInstructionList& list);
// Compiler driver (to query ISA features).
@@ -90,6 +162,9 @@
// through this allocator is immediately released when the loop optimizer is done.
ArenaAllocator* loop_allocator_;
+ // Global heap memory allocator. Used to build HIR.
+ ArenaAllocator* global_allocator_;
+
// Entries into the loop hierarchy representation. The hierarchy resides
// in phase-local heap memory.
LoopNode* top_loop_;
@@ -102,11 +177,33 @@
// Counter that tracks how many induction cycles have been simplified. Useful
// to trigger incremental updates of induction variable analysis of outer loops
// when the induction of inner loops has changed.
- int32_t induction_simplication_count_;
+ uint32_t induction_simplication_count_;
// Flag that tracks if any simplifications have occurred.
bool simplified_;
+ // Number of "lanes" for selected packed type.
+ uint32_t vector_length_;
+
+ // Set of array references in the vector loop.
+ // Contents reside in phase-local heap memory.
+ ArenaSet<ArrayReference>* vector_refs_;
+
+ // Mapping used during vectorization synthesis for both the scalar peeling/cleanup
+ // loop (simd_ is false) and the actual vector loop (simd_ is true). The data
+ // structure maps original instructions into the new instructions.
+ // Contents reside in phase-local heap memory.
+ ArenaSafeMap<HInstruction*, HInstruction*>* vector_map_;
+
+ // Temporary vectorization bookkeeping.
+ HBasicBlock* vector_preheader_; // preheader of the new loop
+ HBasicBlock* vector_header_; // header of the new loop
+ HBasicBlock* vector_body_; // body of the new loop
+ HInstruction* vector_runtime_test_a_;
+ HInstruction* vector_runtime_test_b_; // defines a != b runtime test
+ HPhi* vector_phi_; // the Phi representing the normalized loop index
+ VectorMode vector_mode_; // selects synthesis mode
+
friend class LoopOptimizationTest;
DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);