Add AVX support for packed add/sub instructions on x86

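AVX provides non-destructive three-operand forms of the packed add/sub
instructions, so the destination register no longer has to alias the
first source. This change adds HVecAvxAdd/HVecAvxSub IR nodes (x86 and
x86_64 only), generates them in the loop vectorizer when the target's
instruction set features report AVX2, and emits the corresponding
VEX-encoded instructions in the vector code generators.

For example (register choices illustrative only), a packed 32-bit add
under SSE is the destructive two-operand form, whereas the AVX form
emitted here leaves both sources intact:

    paddd  xmm1, xmm2        ; SSE: xmm1 = xmm1 + xmm2
    vpaddd xmm1, xmm2, xmm3  ; AVX: xmm1 = xmm2 + xmm3
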
Test: ./test.py --host, test-art-host-gtest

Change-Id: I48d05e6f6befd54657d962119a543b27a8a51d71
Signed-off-by: Shalini Salomi Bodapati <shalini.salomi.bodapati@intel.com>
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 0ee0035..c8964dd 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -473,6 +473,72 @@
   }
 }
 
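+// Sets up locations for an operation in the non-destructive three-operand AVX
+// form: two source vectors and a separate destination, all in XMM registers.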
+static void CreateVecTerOpLocations(ArenaAllocator* allocator, HVecOperation* instruction) {
+  LocationSummary* locations = new (allocator) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kBool:
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+    case DataType::Type::kInt32:
+    case DataType::Type::kInt64:
+    case DataType::Type::kFloat32:
+    case DataType::Type::kFloat64:
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresFpuRegister());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+void LocationsBuilderX86::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+  CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+}
+
+void InstructionCodeGeneratorX86::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ vpaddb(dst, src1, src2);
+      break;
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ vpaddw(dst, src1, src2);
+      break;
+    case DataType::Type::kInt32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ vpaddd(dst, src1, src2);
+      break;
+    case DataType::Type::kInt64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ vpaddq(dst, src1, src2);
+      break;
+    case DataType::Type::kFloat32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ vaddps(dst, src1, src2);
+      break;
+    case DataType::Type::kFloat64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ vaddpd(dst, src1, src2);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderX86::VisitVecSaturationAdd(HVecSaturationAdd* instruction) {
   CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
 }
@@ -574,6 +640,48 @@
   }
 }
 
+void LocationsBuilderX86::VisitVecAvxSub(HVecAvxSub* instruction) {
+  CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+}
+
+void InstructionCodeGeneratorX86::VisitVecAvxSub(HVecAvxSub* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ vpsubb(dst, src1, src2);
+      break;
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ vpsubw(dst, src1, src2);
+      break;
+    case DataType::Type::kInt32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ vpsubd(dst, src1, src2);
+      break;
+    case DataType::Type::kInt64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ vpsubq(dst, src1, src2);
+      break;
+    case DataType::Type::kFloat32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ vsubps(dst, src1, src2);
+      break;
+    case DataType::Type::kFloat64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ vsubpd(dst, src1, src2);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderX86::VisitVecSaturationSub(HVecSaturationSub* instruction) {
   CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
 }
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index 9c28827..c147659 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -414,6 +414,30 @@
   }
 }
 
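+// Sets up locations for an operation in the non-destructive three-operand AVX
+// form: two source vectors and a separate destination, all in XMM registers.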
+static void CreateVecTerOpLocations(ArenaAllocator* allocator, HVecOperation* instruction) {
+  LocationSummary* locations = new (allocator) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kBool:
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+    case DataType::Type::kInt32:
+    case DataType::Type::kInt64:
+    case DataType::Type::kFloat32:
+    case DataType::Type::kFloat64:
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresFpuRegister());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderX86_64::VisitVecAdd(HVecAdd* instruction) {
   CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
 }
@@ -456,6 +480,48 @@
   }
 }
 
+void LocationsBuilderX86_64::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+  CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ vpaddb(dst, src1, src2);
+      break;
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ vpaddw(dst, src1, src2);
+      break;
+    case DataType::Type::kInt32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ vpaddd(dst, src1, src2);
+      break;
+    case DataType::Type::kInt64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ vpaddq(dst, src1, src2);
+      break;
+    case DataType::Type::kFloat32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ vaddps(dst, src1, src2);
+      break;
+    case DataType::Type::kFloat64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ vaddpd(dst, src1, src2);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderX86_64::VisitVecSaturationAdd(HVecSaturationAdd* instruction) {
   CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
 }
@@ -557,6 +623,48 @@
   }
 }
 
+void LocationsBuilderX86_64::VisitVecAvxSub(HVecAvxSub* instruction) {
+  CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecAvxSub(HVecAvxSub* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ vpsubb(dst, src1, src2);
+      break;
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ vpsubw(dst, src1, src2);
+      break;
+    case DataType::Type::kInt32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ vpsubd(dst, src1, src2);
+      break;
+    case DataType::Type::kInt64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ vpsubq(dst, src1, src2);
+      break;
+    case DataType::Type::kFloat32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ vsubps(dst, src1, src2);
+      break;
+    case DataType::Type::kFloat64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ vsubpd(dst, src1, src2);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderX86_64::VisitVecSaturationSub(HVecSaturationSub* instruction) {
   CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
 }
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 6c76ab8..c6e7560 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -351,8 +351,11 @@
 
 // Translates vector operation to reduction kind.
 static HVecReduce::ReductionKind GetReductionKind(HVecOperation* reduction) {
   if (reduction->IsVecAdd() ||
       reduction->IsVecSub() ||
+#if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
+      reduction->IsVecAvxAdd() || reduction->IsVecAvxSub() ||
+#endif
       reduction->IsVecSADAccumulate() ||
       reduction->IsVecDotProd()) {
     return HVecReduce::kSum;
@@ -1940,10 +1943,34 @@
         new (global_allocator_) HVecCnv(global_allocator_, opa, type, vector_length_, dex_pc),
         new (global_allocator_) HTypeConversion(org_type, opa, dex_pc));
     case HInstruction::kAdd:
+#if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
+      if ((compiler_options_->GetInstructionSet() == InstructionSet::kX86 ||
+           compiler_options_->GetInstructionSet() == InstructionSet::kX86_64) &&
+          compiler_options_->GetInstructionSetFeatures()
+              ->AsX86InstructionSetFeatures()->HasAVX2()) {
+        GENERATE_VEC(
+          new (global_allocator_) HVecAvxAdd(
+              global_allocator_, opa, opb, type, vector_length_, dex_pc),
+          new (global_allocator_) HAdd(org_type, opa, opb, dex_pc));
+        UNREACHABLE();  // GENERATE_VEC ends with a "break".
+      }
+#endif
       GENERATE_VEC(
         new (global_allocator_) HVecAdd(global_allocator_, opa, opb, type, vector_length_, dex_pc),
         new (global_allocator_) HAdd(org_type, opa, opb, dex_pc));
     case HInstruction::kSub:
+#if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
+      if ((compiler_options_->GetInstructionSet() == InstructionSet::kX86 ||
+           compiler_options_->GetInstructionSet() == InstructionSet::kX86_64) &&
+          compiler_options_->GetInstructionSetFeatures()
+              ->AsX86InstructionSetFeatures()->HasAVX2()) {
+        GENERATE_VEC(
+          new (global_allocator_) HVecAvxSub(
+              global_allocator_, opa, opb, type, vector_length_, dex_pc),
+          new (global_allocator_) HSub(org_type, opa, opb, dex_pc));
+        UNREACHABLE();  // GENERATE_VEC ends with a "break".
+      }
+#endif
       GENERATE_VEC(
         new (global_allocator_) HVecSub(global_allocator_, opa, opb, type, vector_length_, dex_pc),
         new (global_allocator_) HSub(org_type, opa, opb, dex_pc));
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index cb53ae3..57ed71d 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1521,8 +1521,10 @@
 
 #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
 #define FOR_EACH_CONCRETE_INSTRUCTION_X86_COMMON(M)                     \
-  M(X86AndNot, Instruction)                                                \
-  M(X86MaskOrResetLeastSetBit, Instruction)
+  M(X86AndNot, Instruction)                                             \
+  M(X86MaskOrResetLeastSetBit, Instruction)                             \
+  M(VecAvxAdd, VecOperation)                                            \
+  M(VecAvxSub, VecOperation)
 #else
 #define FOR_EACH_CONCRETE_INSTRUCTION_X86_COMMON(M)
 #endif
@@ -7853,6 +7855,7 @@
 #endif
 #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
 #include "nodes_x86.h"
+#include "nodes_vector_x86.h"
 #endif
 
 namespace art {
diff --git a/compiler/optimizing/nodes_vector_x86.h b/compiler/optimizing/nodes_vector_x86.h
new file mode 100644
index 0000000..a8f576f
--- /dev/null
+++ b/compiler/optimizing/nodes_vector_x86.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_
+#define ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_
+
+#include "nodes_vector.h"
+
+namespace art {
+
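+// Adds every component in the two vectors using the non-destructive
+// three-operand AVX form,
+// viz. [ x1, .. , xn ] + [ y1, .. , yn ] = [ x1 + y1, .. , xn + yn ].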
+class HVecAvxAdd final : public HVecOperation {
+ public:
+  HVecAvxAdd(ArenaAllocator* allocator,
+             HInstruction* src1,
+             HInstruction* src2,
+             DataType::Type packed_type,
+             size_t vector_length,
+             uint32_t dex_pc)
+      : HVecOperation(kVecAvxAdd,
+                      allocator,
+                      packed_type,
+                      SideEffects::None(),
+                      /* number_of_inputs */ 2,
+                      vector_length,
+                      dex_pc) {
+    DCHECK(HasConsistentPackedTypes(src1, packed_type));
+    DCHECK(HasConsistentPackedTypes(src2, packed_type));
+    SetRawInputAt(0, src1);
+    SetRawInputAt(1, src2);
+  }
+
+  bool CanBeMoved() const override { return true; }
+
+  DECLARE_INSTRUCTION(VecAvxAdd);
+
+ protected:
+  DEFAULT_COPY_CONSTRUCTOR(VecAvxAdd);
+};
+
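+// Subtracts every component in the two vectors using the non-destructive
+// three-operand AVX form,
+// viz. [ x1, .. , xn ] - [ y1, .. , yn ] = [ x1 - y1, .. , xn - yn ].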
+class HVecAvxSub final : public HVecOperation {
+ public:
+  HVecAvxSub(ArenaAllocator* allocator,
+             HInstruction* src1,
+             HInstruction* src2,
+             DataType::Type packed_type,
+             size_t vector_length,
+             uint32_t dex_pc)
+      : HVecOperation(kVecAvxSub,
+                      allocator,
+                      packed_type,
+                      SideEffects::None(),
+                      /* number_of_inputs */ 2,
+                      vector_length,
+                      dex_pc) {
+    DCHECK(HasConsistentPackedTypes(src1, packed_type));
+    DCHECK(HasConsistentPackedTypes(src2, packed_type));
+    SetRawInputAt(0, src1);
+    SetRawInputAt(1, src2);
+  }
+
+  bool CanBeMoved() const override { return true; }
+
+  DECLARE_INSTRUCTION(VecAvxSub);
+
+ protected:
+  DEFAULT_COPY_CONSTRUCTOR(VecAvxSub);
+};
+
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_