Emit vector multiply and accumulate instructions for x86.

This patch adds a new CPU variant named kabylake and performs
instruction simplification to generate VectorMultiplyAccumulate.

Test: ./test.py --host --64

Change-Id: Ie6cc882dadf1322dd4d3ae49bfdb600b0c447765
Signed-off-by: Gupta Kumar, Sanjiv <sanjiv.kumar.gupta@intel.com>
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index e42c4c9..8c9ce82 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -397,6 +397,12 @@
   void divss(XmmRegister dst, XmmRegister src);
   void divss(XmmRegister dst, const Address& src);
 
+  // FMA multiply-accumulate instructions (fused multiply-add/subtract, packed ps/pd).
+  void vfmadd231ps(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+  void vfmadd231pd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+  void vfmsub231ps(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+  void vfmsub231pd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
   void addps(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
   void subps(XmmRegister dst, XmmRegister src);
   void mulps(XmmRegister dst, XmmRegister src);
@@ -834,6 +840,11 @@
   void EmitLabelLink(Label* label);
   void EmitLabelLink(NearLabel* label);
 
+  // Emit the individual bytes of a 3-byte VEX prefix (byte 0 also takes is_two_byte).
+  uint8_t EmitVexByteZero(bool is_two_byte);
+  uint8_t EmitVexByte1(bool r, bool x, bool b, int mmmmm);
+  uint8_t EmitVexByte2(bool w, int l, X86ManagedRegister vvv, int pp);
+
   void EmitGenericShift(int rm, const Operand& operand, const Immediate& imm);
   void EmitGenericShift(int rm, const Operand& operand, Register shifter);