Implement all vector instructions for X86

Add X86 code generation for the vector operations.  Add X86
disassembler support for the new instructions.

Change-Id: I72b48f5efa3a516a16bb1dd4bdb5c9270a8db53a
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index adfed0c..430bc7d 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -151,7 +151,7 @@
   rRET           = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 16,
 #endif
 
-  // xmm registers, single precision view
+  // xmm registers, single precision view.
   fr0  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 0,
   fr1  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 1,
   fr2  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 2,
@@ -161,7 +161,7 @@
   fr6  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 6,
   fr7  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 7,
 
-  // xmm registers, double precision alises
+  // xmm registers, double precision aliases.
   dr0  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 0,
   dr1  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 1,
   dr2  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 2,
@@ -171,15 +171,15 @@
   dr6  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 6,
   dr7  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 7,
 
-  // xmm registers, quad precision alises
-  qr0  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 0,
-  qr1  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 1,
-  qr2  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 2,
-  qr3  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 3,
-  qr4  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 4,
-  qr5  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 5,
-  qr6  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 6,
-  qr7  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 7,
+  // xmm register aliases, 128-bit view.
+  xr0  = RegStorage::k128BitSolo | 0,
+  xr1  = RegStorage::k128BitSolo | 1,
+  xr2  = RegStorage::k128BitSolo | 2,
+  xr3  = RegStorage::k128BitSolo | 3,
+  xr4  = RegStorage::k128BitSolo | 4,
+  xr5  = RegStorage::k128BitSolo | 5,
+  xr6  = RegStorage::k128BitSolo | 6,
+  xr7  = RegStorage::k128BitSolo | 7,
 
   // TODO: as needed, add 256, 512 and 1024-bit xmm views.
 };
@@ -221,14 +221,14 @@
 constexpr RegStorage rs_dr6(RegStorage::kValid | dr6);
 constexpr RegStorage rs_dr7(RegStorage::kValid | dr7);
 
-constexpr RegStorage rs_qr0(RegStorage::kValid | qr0);
-constexpr RegStorage rs_qr1(RegStorage::kValid | qr1);
-constexpr RegStorage rs_qr2(RegStorage::kValid | qr2);
-constexpr RegStorage rs_qr3(RegStorage::kValid | qr3);
-constexpr RegStorage rs_qr4(RegStorage::kValid | qr4);
-constexpr RegStorage rs_qr5(RegStorage::kValid | qr5);
-constexpr RegStorage rs_qr6(RegStorage::kValid | qr6);
-constexpr RegStorage rs_qr7(RegStorage::kValid | qr7);
+constexpr RegStorage rs_xr0(RegStorage::kValid | xr0);
+constexpr RegStorage rs_xr1(RegStorage::kValid | xr1);
+constexpr RegStorage rs_xr2(RegStorage::kValid | xr2);
+constexpr RegStorage rs_xr3(RegStorage::kValid | xr3);
+constexpr RegStorage rs_xr4(RegStorage::kValid | xr4);
+constexpr RegStorage rs_xr5(RegStorage::kValid | xr5);
+constexpr RegStorage rs_xr6(RegStorage::kValid | xr6);
+constexpr RegStorage rs_xr7(RegStorage::kValid | xr7);
 
 extern X86NativeRegisterPool rX86_ARG0;
 extern X86NativeRegisterPool rX86_ARG1;
@@ -418,9 +418,39 @@
   Binary0fOpCode(kX86Divsd),    // double divide
   Binary0fOpCode(kX86Divss),    // float divide
   Binary0fOpCode(kX86Punpckldq),  // Interleave low-order double words
-  kX86PsrlqRI,                  // right shift of floating point registers
-  kX86PsllqRI,                  // left shift of floating point registers
-  kX86SqrtsdRR,                 // sqrt of floating point register
+  Binary0fOpCode(kX86Sqrtsd),   // square root
+  Binary0fOpCode(kX86Pmulld),   // parallel integer multiply 32 bits x 4
+  Binary0fOpCode(kX86Pmullw),   // parallel integer multiply 16 bits x 8
+  Binary0fOpCode(kX86Mulps),    // parallel FP multiply 32 bits x 4
+  Binary0fOpCode(kX86Mulpd),    // parallel FP multiply 64 bits x 2
+  Binary0fOpCode(kX86Paddb),    // parallel integer addition 8 bits x 16
+  Binary0fOpCode(kX86Paddw),    // parallel integer addition 16 bits x 8
+  Binary0fOpCode(kX86Paddd),    // parallel integer addition 32 bits x 4
+  Binary0fOpCode(kX86Addps),    // parallel FP addition 32 bits x 4
+  Binary0fOpCode(kX86Addpd),    // parallel FP addition 64 bits x 2
+  Binary0fOpCode(kX86Psubb),    // parallel integer subtraction 8 bits x 16
+  Binary0fOpCode(kX86Psubw),    // parallel integer subtraction 16 bits x 8
+  Binary0fOpCode(kX86Psubd),    // parallel integer subtraction 32 bits x 4
+  Binary0fOpCode(kX86Subps),    // parallel FP subtraction 32 bits x 4
+  Binary0fOpCode(kX86Subpd),    // parallel FP subtraction 64 bits x 2
+  Binary0fOpCode(kX86Pand),     // parallel AND 128 bits x 1
+  Binary0fOpCode(kX86Por),      // parallel OR 128 bits x 1
+  Binary0fOpCode(kX86Pxor),     // parallel XOR 128 bits x 1
+  Binary0fOpCode(kX86Phaddw),   // parallel horizontal addition 16 bits x 8
+  Binary0fOpCode(kX86Phaddd),   // parallel horizontal addition 32 bits x 4
+  kX86PextrbRRI,                // Extract 8 bits from XMM into GPR
+  kX86PextrwRRI,                // Extract 16 bits from XMM into GPR
+  kX86PextrdRRI,                // Extract 32 bits from XMM into GPR
+  kX86PshuflwRRI,               // Shuffle 16 bits in lower 64 bits of XMM.
+  kX86PshufdRRI,                // Shuffle 32 bits in XMM.
+  kX86PsrawRI,                  // signed right shift of floating point registers 16 bits x 8
+  kX86PsradRI,                  // signed right shift of floating point registers 32 bits x 4
+  kX86PsrlwRI,                  // logical right shift of floating point registers 16 bits x 8
+  kX86PsrldRI,                  // logical right shift of floating point registers 32 bits x 4
+  kX86PsrlqRI,                  // logical right shift of floating point registers 64 bits x 2
+  kX86PsllwRI,                  // left shift of floating point registers 16 bits x 8
+  kX86PslldRI,                  // left shift of floating point registers 32 bits x 4
+  kX86PsllqRI,                  // left shift of floating point registers 64 bits x 2
   kX86Fild32M,                  // push 32-bit integer on x87 stack
   kX86Fild64M,                  // push 64-bit integer on x87 stack
   kX86Fstp32M,                  // pop top x87 fp stack and do 32-bit store