Implement Sum-of-Abs-Differences idiom recognition.

Rationale:
Currently just on ARM64 (x86 lacks proper support),
using the SAD idiom yields great speedup on loops
that compute the sum-of-abs-difference operation.
Also includes some refinements around type conversions.

Speedup ExoPlayerAudio (golem run):
1.3x on ARM64
1.1x on x86

Test: test-art-host test-art-target

Bug: 64091002

Change-Id: Ia2b711d2bc23609a2ed50493dfe6719eedfe0130
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index c5e75a7..1488b70 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -461,8 +461,8 @@
 };
 
 // Performs halving add on every component in the two vectors, viz.
-// rounded [ x1, .. , xn ] hradd [ y1, .. , yn ] = [ (x1 + y1 + 1) >> 1, .. , (xn + yn + 1) >> 1 ]
-// or      [ x1, .. , xn ] hadd  [ y1, .. , yn ] = [ (x1 + y1)     >> 1, .. , (xn + yn )    >> 1 ]
+// rounded   [ x1, .. , xn ] hradd [ y1, .. , yn ] = [ (x1 + y1 + 1) >> 1, .. , (xn + yn + 1) >> 1 ]
+// truncated [ x1, .. , xn ] hadd  [ y1, .. , yn ] = [ (x1 + y1)     >> 1, .. , (xn + yn )    >> 1 ]
 // for signed operands x, y (sign extension) or unsigned operands x, y (zero extension).
 class HVecHalvingAdd FINAL : public HVecBinaryOperation {
  public:
@@ -810,8 +810,8 @@
 //
 
 // Assigns the given scalar elements to a vector,
-// viz. set( array(x1, .., xn) ) = [ x1, .. ,           xn ] if n == m,
-//      set( array(x1, .., xm) ) = [ x1, .. , xm, 0, .., 0 ] if m <  n.
+// viz. set( array(x1, .. , xn) ) = [ x1, .. ,            xn ] if n == m,
+//      set( array(x1, .. , xm) ) = [ x1, .. , xm, 0, .. , 0 ] if m <  n.
 class HVecSetScalars FINAL : public HVecOperation {
  public:
   HVecSetScalars(ArenaAllocator* arena,
@@ -842,9 +842,8 @@
   DISALLOW_COPY_AND_ASSIGN(HVecSetScalars);
 };
 
-// Multiplies every component in the two vectors, adds the result vector to the accumulator vector.
-// viz. [ acc1, .., accn ] + [ x1, .. , xn ] * [ y1, .. , yn ] =
-//     [ acc1 + x1 * y1, .. , accn + xn * yn ].
+// Multiplies every component in the two vectors, adds the result vector to the accumulator vector,
+// viz. [ a1, .. , an ] + [ x1, .. , xn ] * [ y1, .. , yn ] = [ a1 + x1 * y1, .. , an + xn * yn ].
 class HVecMultiplyAccumulate FINAL : public HVecOperation {
  public:
   HVecMultiplyAccumulate(ArenaAllocator* arena,
@@ -866,15 +865,11 @@
     DCHECK(HasConsistentPackedTypes(accumulator, packed_type));
     DCHECK(HasConsistentPackedTypes(mul_left, packed_type));
     DCHECK(HasConsistentPackedTypes(mul_right, packed_type));
-    SetRawInputAt(kInputAccumulatorIndex, accumulator);
-    SetRawInputAt(kInputMulLeftIndex, mul_left);
-    SetRawInputAt(kInputMulRightIndex, mul_right);
+    SetRawInputAt(0, accumulator);
+    SetRawInputAt(1, mul_left);
+    SetRawInputAt(2, mul_right);
   }
 
-  static constexpr int kInputAccumulatorIndex = 0;
-  static constexpr int kInputMulLeftIndex = 1;
-  static constexpr int kInputMulRightIndex = 2;
-
   bool CanBeMoved() const OVERRIDE { return true; }
 
   bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
@@ -894,6 +889,42 @@
   DISALLOW_COPY_AND_ASSIGN(HVecMultiplyAccumulate);
 };
 
+// Takes the absolute difference of two vectors, and adds the results to
+// same-precision or wider-precision components in the accumulator,
+// viz. SAD([ a1, .. , am ], [ x1, .. , xn ], [ y1, .. , yn ] =
+//          [ a1 + sum abs(xi-yi), .. , am + sum abs(xj-yj) ],
+//      for m <= n and non-overlapping sums.
+class HVecSADAccumulate FINAL : public HVecOperation {
+ public:
+  HVecSADAccumulate(ArenaAllocator* arena,
+                    HInstruction* accumulator,
+                    HInstruction* sad_left,
+                    HInstruction* sad_right,
+                    Primitive::Type packed_type,
+                    size_t vector_length,
+                    uint32_t dex_pc = kNoDexPc)
+      : HVecOperation(arena,
+                      packed_type,
+                      SideEffects::None(),
+                      /* number_of_inputs */ 3,
+                      vector_length,
+                      dex_pc) {
+    DCHECK(HasConsistentPackedTypes(accumulator, packed_type));
+    DCHECK(sad_left->IsVecOperation());
+    DCHECK(sad_right->IsVecOperation());
+    DCHECK_EQ(sad_left->AsVecOperation()->GetPackedType(),
+              sad_right->AsVecOperation()->GetPackedType());
+    SetRawInputAt(0, accumulator);
+    SetRawInputAt(1, sad_left);
+    SetRawInputAt(2, sad_right);
+  }
+
+  DECLARE_INSTRUCTION(VecSADAccumulate);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HVecSADAccumulate);
+};
+
 // Loads a vector from memory, viz. load(mem, 1)
 // yield the vector [ mem(1), .. , mem(n) ].
 class HVecLoad FINAL : public HVecMemoryOperation {