Add AVX support for packed add/sub instructions on x86
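
AVX provides VEX-encoded, non-destructive three-operand forms of the
packed add/sub instructions: vpaddd xmm0, xmm1, xmm2 computes
xmm0 = xmm1 + xmm2, while the SSE form paddd xmm1, xmm2 overwrites its
first source. The new HVecAvxAdd/HVecAvxSub nodes model this by
allocating a separate output register for the result instead of reusing
the first input, and the loop vectorizer selects them only when the
target reports AVX2 support.
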
Test: ./test.py --host, test-art-host-gtest
Change-Id: I48d05e6f6befd54657d962119a543b27a8a51d71
Signed-off-by: Shalini Salomi Bodapati <shalini.salomi.bodapati@intel.com>
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 0ee0035..c8964dd 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -473,6 +473,70 @@
}
}
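+// Sets up locations for a non-destructive three-operand (AVX) vector
+// operation: two FPU/SIMD source registers and a separate destination,
+// rather than the out == first-input convention used for the SSE forms.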
+static void CreateVecTerOpLocations(ArenaAllocator* allocator, HVecOperation* instruction) {
+ LocationSummary* locations = new (allocator) LocationSummary(instruction);
+ switch (instruction->GetPackedType()) {
+ case DataType::Type::kBool:
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ case DataType::Type::kInt32:
+ case DataType::Type::kInt64:
+ case DataType::Type::kFloat32:
+ case DataType::Type::kFloat64:
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetOut(Location::RequiresFpuRegister());
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
+void LocationsBuilderX86::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+ CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+}
+
+void InstructionCodeGeneratorX86::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
+ XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
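+  // Each case below operates on a 128-bit XMM vector (VEX.128 encoding).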
+ switch (instruction->GetPackedType()) {
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ __ vpaddb(dst, src1, src2);
+ break;
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ __ vpaddw(dst, src1, src2);
+ break;
+ case DataType::Type::kInt32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ vpaddd(dst, src1, src2);
+ break;
+ case DataType::Type::kInt64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ vpaddq(dst, src1, src2);
+ break;
+ case DataType::Type::kFloat32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ vaddps(dst, src1, src2);
+ break;
+ case DataType::Type::kFloat64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ vaddpd(dst, src1, src2);
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
void LocationsBuilderX86::VisitVecSaturationAdd(HVecSaturationAdd* instruction) {
CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
}
@@ -574,6 +638,48 @@
}
}
+void LocationsBuilderX86::VisitVecAvxSub(HVecAvxSub* instruction) {
+ CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+}
+
+void InstructionCodeGeneratorX86::VisitVecAvxSub(HVecAvxSub* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
+ XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+ switch (instruction->GetPackedType()) {
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ __ vpsubb(dst, src1, src2);
+ break;
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ __ vpsubw(dst, src1, src2);
+ break;
+ case DataType::Type::kInt32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ vpsubd(dst, src1, src2);
+ break;
+ case DataType::Type::kInt64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ vpsubq(dst, src1, src2);
+ break;
+ case DataType::Type::kFloat32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ vsubps(dst, src1, src2);
+ break;
+ case DataType::Type::kFloat64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ vsubpd(dst, src1, src2);
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
void LocationsBuilderX86::VisitVecSaturationSub(HVecSaturationSub* instruction) {
CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
}
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index 9c28827..c147659 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -414,6 +414,28 @@
}
}
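+// Sets up locations for a non-destructive three-operand (AVX) vector
+// operation: two FPU/SIMD source registers and a separate destination,
+// rather than the out == first-input convention used for the SSE forms.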
+static void CreateVecTerOpLocations(ArenaAllocator* allocator, HVecOperation* instruction) {
+ LocationSummary* locations = new (allocator) LocationSummary(instruction);
+ switch (instruction->GetPackedType()) {
+ case DataType::Type::kBool:
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ case DataType::Type::kInt32:
+ case DataType::Type::kInt64:
+ case DataType::Type::kFloat32:
+ case DataType::Type::kFloat64:
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetOut(Location::RequiresFpuRegister());
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
void LocationsBuilderX86_64::VisitVecAdd(HVecAdd* instruction) {
CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
}
@@ -456,6 +478,48 @@
}
}
+void LocationsBuilderX86_64::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+ CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
+ XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
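+  // Each case below operates on a 128-bit XMM vector (VEX.128 encoding).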
+ switch (instruction->GetPackedType()) {
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ __ vpaddb(dst, src1, src2);
+ break;
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ __ vpaddw(dst, src1, src2);
+ break;
+ case DataType::Type::kInt32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ vpaddd(dst, src1, src2);
+ break;
+ case DataType::Type::kInt64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ vpaddq(dst, src1, src2);
+ break;
+ case DataType::Type::kFloat32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ vaddps(dst, src1, src2);
+ break;
+ case DataType::Type::kFloat64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ vaddpd(dst, src1, src2);
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
void LocationsBuilderX86_64::VisitVecSaturationAdd(HVecSaturationAdd* instruction) {
CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
}
@@ -557,6 +621,48 @@
}
}
+void LocationsBuilderX86_64::VisitVecAvxSub(HVecAvxSub* instruction) {
+ CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecAvxSub(HVecAvxSub* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
+ XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+ switch (instruction->GetPackedType()) {
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ __ vpsubb(dst, src1, src2);
+ break;
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ __ vpsubw(dst, src1, src2);
+ break;
+ case DataType::Type::kInt32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ vpsubd(dst, src1, src2);
+ break;
+ case DataType::Type::kInt64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ vpsubq(dst, src1, src2);
+ break;
+ case DataType::Type::kFloat32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ vsubps(dst, src1, src2);
+ break;
+ case DataType::Type::kFloat64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ vsubpd(dst, src1, src2);
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
void LocationsBuilderX86_64::VisitVecSaturationSub(HVecSaturationSub* instruction) {
CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
}
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 6c76ab8..c6e7560 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -351,8 +351,11 @@
// Translates vector operation to reduction kind.
static HVecReduce::ReductionKind GetReductionKind(HVecOperation* reduction) {
if (reduction->IsVecAdd() ||
reduction->IsVecSub() ||
+ #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
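+      // The AVX add/sub variants reduce to a sum just like HVecAdd/HVecSub.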
+ reduction->IsVecAvxSub() || reduction->IsVecAvxAdd() ||
+ #endif
reduction->IsVecSADAccumulate() ||
reduction->IsVecDotProd()) {
return HVecReduce::kSum;
@@ -1940,10 +1943,34 @@
new (global_allocator_) HVecCnv(global_allocator_, opa, type, vector_length_, dex_pc),
new (global_allocator_) HTypeConversion(org_type, opa, dex_pc));
case HInstruction::kAdd:
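+        // On x86/x86_64 targets with AVX2, emit the non-destructive
+        // three-operand AVX node instead of the two-operand HVecAdd.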
+ #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
+ if ((compiler_options_->GetInstructionSet() == InstructionSet::kX86 ||
+ compiler_options_->GetInstructionSet() == InstructionSet::kX86_64) &&
+ compiler_options_->GetInstructionSetFeatures()->AsX86InstructionSetFeatures()
+ ->HasAVX2()) {
+ GENERATE_VEC(
+ new (global_allocator_) HVecAvxAdd(
+ global_allocator_, opa, opb, type, vector_length_, dex_pc),
+ new (global_allocator_) HAdd(org_type, opa, opb, dex_pc));
+ UNREACHABLE(); // GENERATE_VEC ends with a "break".
+ }
+ #endif
GENERATE_VEC(
new (global_allocator_) HVecAdd(global_allocator_, opa, opb, type, vector_length_, dex_pc),
new (global_allocator_) HAdd(org_type, opa, opb, dex_pc));
case HInstruction::kSub:
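+        // Same AVX2 gating as kAdd above, using HVecAvxSub.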
+ #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
+ if ((compiler_options_->GetInstructionSet() == InstructionSet::kX86 ||
+ compiler_options_->GetInstructionSet() == InstructionSet::kX86_64) &&
+ compiler_options_->GetInstructionSetFeatures()->AsX86InstructionSetFeatures()
+ ->HasAVX2()) {
+ GENERATE_VEC(
+ new (global_allocator_) HVecAvxSub(
+ global_allocator_, opa, opb, type, vector_length_, dex_pc),
+ new (global_allocator_) HSub(org_type, opa, opb, dex_pc));
+ UNREACHABLE(); // GENERATE_VEC ends with a "break".
+ }
+ #endif
GENERATE_VEC(
new (global_allocator_) HVecSub(global_allocator_, opa, opb, type, vector_length_, dex_pc),
new (global_allocator_) HSub(org_type, opa, opb, dex_pc));
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index cb53ae3..57ed71d 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1521,8 +1521,10 @@
#if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
#define FOR_EACH_CONCRETE_INSTRUCTION_X86_COMMON(M) \
- M(X86AndNot, Instruction) \
- M(X86MaskOrResetLeastSetBit, Instruction)
+ M(X86AndNot, Instruction) \
+ M(X86MaskOrResetLeastSetBit, Instruction) \
+ M(VecAvxSub, VecOperation) \
+ M(VecAvxAdd, VecOperation)
#else
#define FOR_EACH_CONCRETE_INSTRUCTION_X86_COMMON(M)
#endif
@@ -7853,6 +7855,7 @@
#endif
#if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
#include "nodes_x86.h"
+#include "nodes_vector_x86.h"
#endif
namespace art {
diff --git a/compiler/optimizing/nodes_vector_x86.h b/compiler/optimizing/nodes_vector_x86.h
new file mode 100644
index 0000000..a8f576f
--- /dev/null
+++ b/compiler/optimizing/nodes_vector_x86.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_
+#define ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_
+
+#include "nodes_vector.h"
+
+namespace art {
+
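+// Three-operand vector addition for AVX targets: dst = src1 + src2,
+// leaving both sources unmodified.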
+class HVecAvxAdd final : public HVecOperation {
+ public:
+ HVecAvxAdd(ArenaAllocator* allocator,
+ HInstruction* src1,
+ HInstruction* src2,
+ DataType::Type packed_type,
+ size_t vector_length,
+ uint32_t dex_pc)
+ : HVecOperation(kVecAvxAdd,
+ allocator,
+ packed_type,
+ SideEffects::None(),
+ /* number_of_inputs */ 2,
+ vector_length,
+ dex_pc) {
+ DCHECK(HasConsistentPackedTypes(src1, packed_type));
+ DCHECK(HasConsistentPackedTypes(src2, packed_type));
+ SetRawInputAt(0, src1);
+ SetRawInputAt(1, src2);
+ }
+
+ bool CanBeMoved() const override { return true; }
+
+ DECLARE_INSTRUCTION(VecAvxAdd);
+
+ protected:
+ DEFAULT_COPY_CONSTRUCTOR(VecAvxAdd);
+};
+
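+// Three-operand vector subtraction for AVX targets: dst = src1 - src2,
+// leaving both sources unmodified.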
+class HVecAvxSub final : public HVecOperation {
+ public:
+ HVecAvxSub(ArenaAllocator* allocator,
+ HInstruction* src1,
+ HInstruction* src2,
+ DataType::Type packed_type,
+ size_t vector_length,
+ uint32_t dex_pc)
+ : HVecOperation(kVecAvxSub,
+ allocator,
+ packed_type,
+ SideEffects::None(),
+ /* number_of_inputs */ 2,
+ vector_length,
+ dex_pc) {
+ DCHECK(HasConsistentPackedTypes(src1, packed_type));
+ DCHECK(HasConsistentPackedTypes(src2, packed_type));
+ SetRawInputAt(0, src1);
+ SetRawInputAt(1, src2);
+ }
+
+ bool CanBeMoved() const override { return true; }
+
+ DECLARE_INSTRUCTION(VecAvxSub);
+
+ protected:
+ DEFAULT_COPY_CONSTRUCTOR(VecAvxSub);
+};
+
+} // namespace art
+
+#endif // ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_