Implement halving add idiom (with checker tests).

Rationale:
First of several idioms that map to very efficient SIMD instructions.
Note that the is-zero-ext and is-sign-ext are general-purpose utilities
that will be widely used in the vectorizer to detect low precision
idioms, so expect that code to be shared with many CLs to come.

Test: test-art-host, test-art-target
Change-Id: If7dc2926c72a2e4b5cea15c44ef68cf5503e9be9
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 8e88c1e..5a95abd 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -63,12 +63,122 @@
   return false;
 }
 
+// Detect a sign extension from the given type. Returns the promoted operand on success.
+static bool IsSignExtensionAndGet(HInstruction* instruction,
+                                  Primitive::Type type,
+                                  /*out*/ HInstruction** operand) {
+  // Accept any already wider constant that would be handled properly by sign
+  // extension when represented in the *width* of the given narrower data type
+  // (the fact that char normally zero extends does not matter here).
+  int64_t value = 0;
+  if (IsInt64AndGet(instruction, &value)) {
+    switch (type) {
+      case Primitive::kPrimByte:
+        if (std::numeric_limits<int8_t>::min() <= value &&
+            std::numeric_limits<int8_t>::max() >= value) {
+          *operand = instruction;
+          return true;
+        }
+        return false;
+      case Primitive::kPrimChar:
+      case Primitive::kPrimShort:
+        if (std::numeric_limits<int16_t>::min() <= value &&
+            std::numeric_limits<int16_t>::max() <= value) {
+          *operand = instruction;
+          return true;
+        }
+        return false;
+      default:
+        return false;
+    }
+  }
+  // An implicit widening conversion of a signed integer to an integral type sign-extends
+  // the two's-complement representation of the integer value to fill the wider format.
+  if (instruction->GetType() == type && (instruction->IsArrayGet() ||
+                                         instruction->IsStaticFieldGet() ||
+                                         instruction->IsInstanceFieldGet())) {
+    switch (type) {
+      case Primitive::kPrimByte:
+      case Primitive::kPrimShort:
+        *operand = instruction;
+        return true;
+      default:
+        return false;
+    }
+  }
+  // TODO: perhaps explicit conversions later too?
+  //       (this may return something different from instruction)
+  return false;
+}
+
+// Detect a zero extension from the given type. Returns the promoted operand on success.
+static bool IsZeroExtensionAndGet(HInstruction* instruction,
+                                  Primitive::Type type,
+                                  /*out*/ HInstruction** operand) {
+  // Accept any already wider constant that would be handled properly by zero
+  // extension when represented in the *width* of the given narrower data type
+  // (the fact that byte/short normally sign extend does not matter here).
+  int64_t value = 0;
+  if (IsInt64AndGet(instruction, &value)) {
+    switch (type) {
+      case Primitive::kPrimByte:
+        if (std::numeric_limits<uint8_t>::min() <= value &&
+            std::numeric_limits<uint8_t>::max() >= value) {
+          *operand = instruction;
+          return true;
+        }
+        return false;
+      case Primitive::kPrimChar:
+      case Primitive::kPrimShort:
+        if (std::numeric_limits<uint16_t>::min() <= value &&
+            std::numeric_limits<uint16_t>::max() <= value) {
+          *operand = instruction;
+          return true;
+        }
+        return false;
+      default:
+        return false;
+    }
+  }
+  // An implicit widening conversion of a char to an integral type zero-extends
+  // the representation of the char value to fill the wider format.
+  if (instruction->GetType() == type && (instruction->IsArrayGet() ||
+                                         instruction->IsStaticFieldGet() ||
+                                         instruction->IsInstanceFieldGet())) {
+    if (type == Primitive::kPrimChar) {
+      *operand = instruction;
+      return true;
+    }
+  }
+  // A sign (or zero) extension followed by an explicit removal of just the
+  // higher sign bits is equivalent to a zero extension of the underlying operand.
+  if (instruction->IsAnd()) {
+    int64_t mask = 0;
+    HInstruction* a = instruction->InputAt(0);
+    HInstruction* b = instruction->InputAt(1);
+    // In (a & b) find (mask & b) or (a & mask) with sign or zero extension on the non-mask.
+    if ((IsInt64AndGet(a, /*out*/ &mask) && (IsSignExtensionAndGet(b, type, /*out*/ operand) ||
+                                             IsZeroExtensionAndGet(b, type, /*out*/ operand))) ||
+        (IsInt64AndGet(b, /*out*/ &mask) && (IsSignExtensionAndGet(a, type, /*out*/ operand) ||
+                                             IsZeroExtensionAndGet(a, type, /*out*/ operand)))) {
+      switch ((*operand)->GetType()) {
+        case Primitive::kPrimByte:  return mask == std::numeric_limits<uint8_t>::max();
+        case Primitive::kPrimChar:
+        case Primitive::kPrimShort: return mask == std::numeric_limits<uint16_t>::max();
+        default: return false;
+      }
+    }
+  }
+  // TODO: perhaps explicit conversions later too?
+  return false;
+}
+
 // Test vector restrictions.
 static bool HasVectorRestrictions(uint64_t restrictions, uint64_t tested) {
   return (restrictions & tested) != 0;
 }
 
-// Inserts an instruction.
+// Insert an instruction.
 static HInstruction* Insert(HBasicBlock* block, HInstruction* instruction) {
   DCHECK(block != nullptr);
   DCHECK(instruction != nullptr);
@@ -713,6 +823,10 @@
       return true;
     }
   } else if (instruction->IsShl() || instruction->IsShr() || instruction->IsUShr()) {
+    // Recognize vectorization idioms.
+    if (VectorizeHalvingAddIdiom(node, instruction, generate_code, type, restrictions)) {
+      return true;
+    }
     // Deal with vector restrictions.
     if ((HasVectorRestrictions(restrictions, kNoShift)) ||
         (instruction->IsShr() && HasVectorRestrictions(restrictions, kNoShr))) {
@@ -806,11 +920,11 @@
         switch (type) {
           case Primitive::kPrimBoolean:
           case Primitive::kPrimByte:
-            *restrictions |= kNoMul | kNoDiv | kNoShift | kNoAbs;
+            *restrictions |= kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd;
             return TrySetVectorLength(16);
           case Primitive::kPrimChar:
           case Primitive::kPrimShort:
-            *restrictions |= kNoDiv | kNoAbs;
+            *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd;
             return TrySetVectorLength(8);
           case Primitive::kPrimInt:
             *restrictions |= kNoDiv;
@@ -1039,6 +1153,90 @@
 #undef GENERATE_VEC
 
 //
+// Vectorization idioms.
+//
+
+// Method recognizes the following idioms:
+//   rounding halving add (a + b + 1) >> 1 for unsigned/signed operands a, b
+//   regular  halving add (a + b)     >> 1 for unsigned/signed operands a, b
+// Provided that the operands are promoted to a wider form to do the arithmetic and
+// then cast back to narrower form, the idioms can be mapped into efficient SIMD
+// implementation that operates directly in narrower form (plus one extra bit).
+// TODO: current version recognizes implicit byte/short/char widening only;
+//       explicit widening from int to long could be added later.
+bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node,
+                                                 HInstruction* instruction,
+                                                 bool generate_code,
+                                                 Primitive::Type type,
+                                                 uint64_t restrictions) {
+  // Test for top level arithmetic shift right x >> 1 or logical shift right x >>> 1
+  // (note whether the sign bit in higher precision is shifted in has no effect
+  // on the narrow precision computed by the idiom).
+  int64_t value = 0;
+  if ((instruction->IsShr() ||
+       instruction->IsUShr()) &&
+      IsInt64AndGet(instruction->InputAt(1), &value) && value == 1) {
+    //
+    // TODO: make following code less sensitive to associativity and commutativity differences.
+    //
+    HInstruction* x = instruction->InputAt(0);
+    // Test for an optional rounding part (x + 1) >> 1.
+    bool is_rounded = false;
+    if (x->IsAdd() && IsInt64AndGet(x->InputAt(1), &value) && value == 1) {
+      x = x->InputAt(0);
+      is_rounded = true;
+    }
+    // Test for a core addition (a + b) >> 1 (possibly rounded), either unsigned or signed.
+    if (x->IsAdd()) {
+      HInstruction* a = x->InputAt(0);
+      HInstruction* b = x->InputAt(1);
+      HInstruction* r = nullptr;
+      HInstruction* s = nullptr;
+      bool is_unsigned = false;
+      if (IsZeroExtensionAndGet(a, type, &r) && IsZeroExtensionAndGet(b, type, &s)) {
+        is_unsigned = true;
+      } else if (IsSignExtensionAndGet(a, type, &r) && IsSignExtensionAndGet(b, type, &s)) {
+        is_unsigned = false;
+      } else {
+        return false;
+      }
+      // Deal with vector restrictions.
+      if ((!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) ||
+          (!is_rounded && HasVectorRestrictions(restrictions, kNoUnroundedHAdd))) {
+        return false;
+      }
+      // Accept recognized halving add for vectorizable operands. Vectorized code uses the
+      // shorthand idiomatic operation. Sequential code uses the original scalar expressions.
+      DCHECK(r != nullptr && s != nullptr);
+      if (VectorizeUse(node, r, generate_code, type, restrictions) &&
+          VectorizeUse(node, s, generate_code, type, restrictions)) {
+        if (generate_code) {
+          if (vector_mode_ == kVector) {
+            vector_map_->Put(instruction, new (global_allocator_) HVecHalvingAdd(
+                global_allocator_,
+                vector_map_->Get(r),
+                vector_map_->Get(s),
+                type,
+                vector_length_,
+                is_unsigned,
+                is_rounded));
+          } else {
+            VectorizeUse(node, instruction->InputAt(0), generate_code, type, restrictions);
+            VectorizeUse(node, instruction->InputAt(1), generate_code, type, restrictions);
+            GenerateVecOp(instruction,
+                          vector_map_->Get(instruction->InputAt(0)),
+                          vector_map_->Get(instruction->InputAt(1)),
+                          type);
+          }
+        }
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+//
 // Helpers.
 //