Emit vector multiply-accumulate instructions for x86.

This patch adds a new CPU variant named kabylake and performs
instruction simplification to generate VectorMultiplyAccumulate.
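
For reference, the new emitters follow the 231 operand form: the accumulator
comes first, the two multiplicands second and third. Below is a minimal
sketch of driving them directly; it is illustrative only and not part of this
patch (EmitFmaExample, the register picks, and the include are assumptions):

  // Illustrative sketch, assuming "utils/x86/assembler_x86.h" is included
  // and namespace art is in scope; EmitFmaExample is not part of this patch.
  void EmitFmaExample(x86::X86Assembler* assembler) {
    // acc = acc + (mul_left * mul_right), packed single-precision.
    assembler->vfmadd231ps(x86::XMM0, x86::XMM1, x86::XMM2);
    // acc = (mul_left * mul_right) - acc, packed double-precision.
    assembler->vfmsub231pd(x86::XMM3, x86::XMM4, x86::XMM5);
  }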

Test: ./test.py --host --64

Change-Id: Ie6cc882dadf1322dd4d3ae49bfdb600b0c447765
Signed-off-by: Gupta Kumar, Sanjiv <sanjiv.kumar.gupta@intel.com>
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 86f9010..c2ce03b 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -525,6 +525,58 @@
   EmitOperand(dst, src);
 }
 
+void X86Assembler::vfmadd231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/);
+  uint8_t byte_one = EmitVexByte1(false, false, false, 2);
+  uint8_t byte_two = EmitVexByte2(false, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1);
+  EmitUint8(byte_zero);
+  EmitUint8(byte_one);
+  EmitUint8(byte_two);
+  // Opcode field for VFMADD231PS: acc = acc + (mul_left * mul_right).
+  EmitUint8(0xB8);
+  EmitXmmRegisterOperand(acc, mul_right);
+}
+
+void X86Assembler::vfmsub231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/);
+  uint8_t byte_one = EmitVexByte1(false, false, false, 2);
+  uint8_t byte_two = EmitVexByte2(false, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1);
+  EmitUint8(byte_zero);
+  EmitUint8(byte_one);
+  EmitUint8(byte_two);
+  // Opcode field for VFMSUB231PS: acc = (mul_left * mul_right) - acc.
+  EmitUint8(0xBA);
+  EmitXmmRegisterOperand(acc, mul_right);
+}
+
+void X86Assembler::vfmadd231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/);
+  uint8_t byte_one = EmitVexByte1(false, false, false, 2);
+  uint8_t byte_two = EmitVexByte2(true, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1);
+  EmitUint8(byte_zero);
+  EmitUint8(byte_one);
+  EmitUint8(byte_two);
+  // Opcode field for VFMADD231PD: acc = acc + (mul_left * mul_right).
+  EmitUint8(0xB8);
+  EmitXmmRegisterOperand(acc, mul_right);
+}
+
+void X86Assembler::vfmsub231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/);
+  uint8_t byte_one = EmitVexByte1(false, false, false, 2);
+  uint8_t byte_two = EmitVexByte2(true, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1);
+  EmitUint8(byte_zero);
+  EmitUint8(byte_one);
+  EmitUint8(byte_two);
+  // Opcode field for VFMSUB231PD: acc = (mul_left * mul_right) - acc.
+  EmitUint8(0xBA);
+  EmitXmmRegisterOperand(acc, mul_right);
+}
+
 
 void X86Assembler::addps(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -2898,6 +2950,99 @@
 }
 
 
+uint8_t X86Assembler::EmitVexByteZero(bool is_two_byte) {
+  // Byte zero of the VEX prefix selects its form:
+  // 0xC4 introduces the three-byte encoding, 0xC5 the two-byte one.
+  uint8_t vex_zero = 0xC4;
+  if (is_two_byte) {
+    vex_zero = 0xC5;
+  }
+  return vex_zero;
+}
+
+uint8_t X86Assembler::EmitVexByte1(bool r, bool x, bool b, int mmmmm) {
+  // VEX Byte 1.
+  uint8_t vex_prefix = 0;
+  if (!r) {
+    vex_prefix |= 0x80;  // VEX.R (inverted).
+  }
+  if (!x) {
+    vex_prefix |= 0x40;  // VEX.X (inverted).
+  }
+  if (!b) {
+    vex_prefix |= 0x20;  // VEX.B (inverted).
+  }
+
+  // VEX.mmmmm.
+  switch (mmmmm) {
+    case 1:
+      // Implied leading 0F opcode byte.
+      vex_prefix |= 0x01;
+      break;
+    case 2:
+      // Implied leading 0F 38 opcode bytes.
+      vex_prefix |= 0x02;
+      break;
+    case 3:
+      // Implied leading 0F 3A opcode bytes.
+      vex_prefix |= 0x03;
+      break;
+    default:
+      LOG(FATAL) << "unknown opcode bytes";
+  }
+  return vex_prefix;
+}
+
+uint8_t X86Assembler::EmitVexByte2(bool w, int l, X86ManagedRegister operand, int pp) {
+  // VEX Byte 2.
+  uint8_t vex_prefix = 0;
+  if (w) {
+    vex_prefix |= 0x80;  // VEX.W.
+  }
+
+  // VEX.vvvv encodes the extra operand register in one's complement form.
+  if (operand.IsXmmRegister()) {
+    XmmRegister vvvv = operand.AsXmmRegister();
+    int inverted_reg = 15 - static_cast<int>(vvvv);
+    uint8_t reg = static_cast<uint8_t>(inverted_reg);
+    vex_prefix |= ((reg & 0x0F) << 3);
+  } else if (operand.IsCpuRegister()) {
+    Register vvvv = operand.AsCpuRegister();
+    int inverted_reg = 15 - static_cast<int>(vvvv);
+    uint8_t reg = static_cast<uint8_t>(inverted_reg);
+    vex_prefix |= ((reg & 0x0F) << 3);
+  }
+
+  // VEX.L: 0 for 128-bit vectors, 1 for 256-bit vectors.
+  if (l == 256) {
+    vex_prefix |= 0x04;
+  }
+
+  // VEX.pp.
+  switch (pp) {
+    case 0:
+      // SIMD Prefix - None.
+      vex_prefix |= 0x00;
+      break;
+    case 1:
+      // SIMD Prefix - 66.
+      vex_prefix |= 0x01;
+      break;
+    case 2:
+      // SIMD Prefix - F3.
+      vex_prefix |= 0x02;
+      break;
+    case 3:
+      // SIMD Prefix - F2.
+      vex_prefix |= 0x03;
+      break;
+    default:
+      LOG(FATAL) << "unknown SIMD Prefix";
+  }
+
+  return vex_prefix;
+}
+
 void X86Assembler::EmitGenericShift(int reg_or_opcode,
                                     const Operand& operand,
                                     const Immediate& imm) {