ART: Add non-temporal store support

Added non-temporal store support, driven by a hint
(MIR_STORE_NON_TEMPORAL) from the middle-end (ME).
Also implemented the memory barrier extended instruction
(kMirOpMemBarrier); for non-temporal stores it emits an sfence,
which explicitly serializes all previous store-to-memory instructions.

Change-Id: I8205a92083f9725253d8ce893671a133a0b6849d
Signed-off-by: Jean Christophe Beyler <jean.christophe.beyler@intel.com>
Signed-off-by: Chao-ying Fu <chao-ying.fu@intel.com>
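
The backend consumes two pieces of ME output: the MIR_STORE_NON_TEMPORAL
optimization flag on a store MIR, and a kMirOpMemBarrier extended MIR whose
vA selects the barrier kind. The pass that produces them is not part of this
patch; a minimal sketch of such a producer, assuming ART's MIRGraph/BasicBlock
helpers (the helper name MarkNonTemporalRegion is hypothetical):

    // Hypothetical ME-side pass: tag a store with the non-temporal hint and
    // close the region with a store/store barrier.
    static void MarkNonTemporalRegion(MIRGraph* mir_graph, BasicBlock* bb,
                                      MIR* store_mir) {
      // Hint for the backend: this store may bypass the cache (movnti).
      store_mir->optimization_flags |= MIR_STORE_NON_TEMPORAL;
      // kMirOpMemBarrier with vA == kNTStoreStore lowers to an sfence on x86
      // (see the target_x86.cc hunks below).
      MIR* barrier = mir_graph->NewMIR();
      barrier->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpMemBarrier);
      barrier->dalvikInsn.vA = kNTStoreStore;
      bb->InsertMIRAfter(store_mir, barrier);
    }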
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 6173163..46f5dd3 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -188,6 +188,8 @@
 
   { kX86Mov32MR, kMemReg,    IS_STORE | IS_TERTIARY_OP | REG_USE02,      { 0,             0, 0x89, 0, 0, 0, 0, 0, false }, "Mov32MR", "[!0r+!1d],!2r" },
   { kX86Mov32AR, kArrayReg,  IS_STORE | IS_QUIN_OP     | REG_USE014,     { 0,             0, 0x89, 0, 0, 0, 0, 0, false }, "Mov32AR", "[!0r+!1r<<!2d+!3d],!4r" },
+  { kX86Movnti32MR, kMemReg,    IS_STORE | IS_TERTIARY_OP | REG_USE02,   { 0,             0, 0x0F, 0xC3, 0, 0, 0, 0, false }, "Movnti32MR", "[!0r+!1d],!2r" },
+  { kX86Movnti32AR, kArrayReg,  IS_STORE | IS_QUIN_OP     | REG_USE014,  { 0,             0, 0x0F, 0xC3, 0, 0, 0, 0, false }, "Movnti32AR", "[!0r+!1r<<!2d+!3d],!4r" },
   { kX86Mov32TR, kThreadReg, IS_STORE | IS_BINARY_OP   | REG_USE1,       { THREAD_PREFIX, 0, 0x89, 0, 0, 0, 0, 0, false }, "Mov32TR", "fs:[!0d],!1r" },
   { kX86Mov32RR, kRegReg,               IS_BINARY_OP   | REG_DEF0_USE1,  { 0,             0, 0x8B, 0, 0, 0, 0, 0, false }, "Mov32RR", "!0r,!1r" },
   { kX86Mov32RM, kRegMem,    IS_LOAD  | IS_TERTIARY_OP | REG_DEF0_USE1,  { 0,             0, 0x8B, 0, 0, 0, 0, 0, false }, "Mov32RM", "!0r,[!1r+!2d]" },
@@ -203,6 +205,8 @@
 
   { kX86Mov64MR, kMemReg,    IS_STORE | IS_TERTIARY_OP | REG_USE02,      { REX_W,             0, 0x89, 0, 0, 0, 0, 0, false }, "Mov64MR", "[!0r+!1d],!2r" },
   { kX86Mov64AR, kArrayReg,  IS_STORE | IS_QUIN_OP     | REG_USE014,     { REX_W,             0, 0x89, 0, 0, 0, 0, 0, false }, "Mov64AR", "[!0r+!1r<<!2d+!3d],!4r" },
+  { kX86Movnti64MR, kMemReg,    IS_STORE | IS_TERTIARY_OP | REG_USE02,   { REX_W,             0, 0x0F, 0xC3, 0, 0, 0, 0, false }, "Movnti64MR", "[!0r+!1d],!2r" },
+  { kX86Movnti64AR, kArrayReg,  IS_STORE | IS_QUIN_OP     | REG_USE014,  { REX_W,             0, 0x0F, 0xC3, 0, 0, 0, 0, false }, "Movnti64AR", "[!0r+!1r<<!2d+!3d],!4r" },
   { kX86Mov64TR, kThreadReg, IS_STORE | IS_BINARY_OP   | REG_USE1,       { THREAD_PREFIX, REX_W, 0x89, 0, 0, 0, 0, 0, false }, "Mov64TR", "fs:[!0d],!1r" },
   { kX86Mov64RR, kRegReg,               IS_BINARY_OP   | REG_DEF0_USE1,  { REX_W,             0, 0x8B, 0, 0, 0, 0, 0, false }, "Mov64RR", "!0r,!1r" },
   { kX86Mov64RM, kRegMem,    IS_LOAD  | IS_TERTIARY_OP | REG_DEF0_USE1,  { REX_W,             0, 0x8B, 0, 0, 0, 0, 0, false }, "Mov64RM", "!0r,[!1r+!2d]" },
@@ -486,7 +490,9 @@
 
   // TODO: load/store?
   // Encode the modrm opcode as an extra opcode byte to avoid computation during assembly.
+  { kX86Lfence, kReg,                 NO_OPERAND,     { 0, 0, 0x0F, 0xAE, 0, 5, 0, 0, false }, "Lfence", "" },
   { kX86Mfence, kReg,                 NO_OPERAND,     { 0, 0, 0x0F, 0xAE, 0, 6, 0, 0, false }, "Mfence", "" },
+  { kX86Sfence, kReg,                 NO_OPERAND,     { 0, 0, 0x0F, 0xAE, 0, 7, 0, 0, false }, "Sfence", "" },
 
   EXT_0F_ENCODING_MAP(Imul16,  0x66, 0xAF, REG_USE0 | REG_DEF0 | SETS_CCODES),
   EXT_0F_ENCODING_MAP(Imul32,  0x00, 0xAF, REG_USE0 | REG_DEF0 | SETS_CCODES),
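
For reference, the Intel SDM encodings the new entries assemble to — note the
table's two-byte-opcode convention (0x0F in the opcode slot, the second byte
in extra_opcode1, as the existing Mfence entry already shows):

    movnti [mem], r32  ->        0F C3 /r
    movnti [mem], r64  ->  REX.W 0F C3 /r
    lfence             ->        0F AE E8   (0F AE /5)
    mfence             ->        0F AE F0   (0F AE /6)
    sfence             ->        0F AE F8   (0F AE /7)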
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 9c50121..aeeaea2 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -408,7 +408,7 @@
   LIR* LoadBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale, int displacement,
                            RegStorage r_dest, OpSize size);
   LIR* StoreBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale, int displacement,
-                            RegStorage r_src, OpSize size);
+                            RegStorage r_src, OpSize size, int opt_flags = 0);
 
   RegStorage GetCoreArgMappingToPhysicalReg(int core_arg_num);
 
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index cc51538..d08ea7c 100755
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -2324,9 +2324,9 @@
   if ((size == kSignedByte || size == kUnsignedByte) && !IsByteRegister(rl_src.reg)) {
     RegStorage temp = AllocTemp();
     OpRegCopy(temp, rl_src.reg);
-    StoreBaseIndexedDisp(rl_array.reg, rl_index.reg, scale, data_offset, temp, size);
+    StoreBaseIndexedDisp(rl_array.reg, rl_index.reg, scale, data_offset, temp, size, opt_flags);
   } else {
-    StoreBaseIndexedDisp(rl_array.reg, rl_index.reg, scale, data_offset, rl_src.reg, size);
+    StoreBaseIndexedDisp(rl_array.reg, rl_index.reg, scale, data_offset, rl_src.reg, size, opt_flags);
   }
   if (card_mark) {
     // Free rl_index if its a temp. Ensures there are 2 free regs for card mark.
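
The opt_flags value threaded through here comes from the MIR
(mir->optimization_flags), so an array put tagged by the ME carries
MIR_STORE_NON_TEMPORAL all the way into StoreBaseIndexedDisp. A hedged usage
sketch (the surrounding GenArrayPut already has these names in scope):

    // 32-bit array store with the non-temporal hint set: the utility layer
    // below will select kX86Movnti32AR instead of kX86Mov32AR.
    StoreBaseIndexedDisp(rl_array.reg, rl_index.reg, scale, data_offset,
                         rl_src.reg, k32, MIR_STORE_NON_TEMPORAL);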
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index 604f4bf..c43a1ff 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -597,6 +597,9 @@
       mem_barrier = NewLIR0(kX86Mfence);
       ret = true;
     }
+  } else if (barrier_kind == kNTStoreStore) {
+    mem_barrier = NewLIR0(kX86Sfence);
+    ret = true;
   }
 
   // Now ensure that a scheduling barrier is in place.
@@ -1530,6 +1533,9 @@
     case kMirOpPackedSet:
       GenSetVector(bb, mir);
       break;
+    case kMirOpMemBarrier:
+      GenMemBarrier(static_cast<MemBarrierKind>(mir->dalvikInsn.vA));
+      break;
     default:
       break;
   }
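
With this dispatch in place, a kMirOpMemBarrier whose vA is kNTStoreStore
lowers to a single sfence. A minimal usage sketch from the backend's point of
view (GenMemBarrier is the existing entry point):

    GenMemBarrier(kAnyAny);        // pre-existing path: full barrier (mfence)
    GenMemBarrier(kNTStoreStore);  // new path: sfence, which orders all prior
                                   // stores, including non-temporal ones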
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index f159beb..037dfed 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -779,15 +779,20 @@
 }
 
 LIR* X86Mir2Lir::StoreBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale,
-                                      int displacement, RegStorage r_src, OpSize size) {
+                                      int displacement, RegStorage r_src, OpSize size,
+                                      int opt_flags) {
   LIR *store = NULL;
   LIR *store2 = NULL;
   bool is_array = r_index.Valid();
   bool pair = r_src.IsPair();
   bool is64bit = (size == k64) || (size == kDouble);
+  bool consider_non_temporal = false;
+
   X86OpCode opcode = kX86Nop;
   switch (size) {
     case k64:
+      consider_non_temporal = true;
+      // Fall through!
     case kDouble:
       if (r_src.IsFloat()) {
         opcode = is_array ? kX86MovsdAR : kX86MovsdMR;
@@ -804,6 +809,7 @@
         opcode = is_array ? kX86Mov64AR  : kX86Mov64MR;
         CHECK_EQ(is_array, false);
         CHECK_EQ(r_src.IsFloat(), false);
+        consider_non_temporal = true;
         break;
       }  // else fall-through to k32 case
     case k32:
@@ -815,6 +821,7 @@
         DCHECK(r_src.IsSingle());
       }
       DCHECK_EQ((displacement & 0x3), 0);
+      consider_non_temporal = true;
       break;
     case kUnsignedHalf:
     case kSignedHalf:
@@ -829,6 +836,28 @@
       LOG(FATAL) << "Bad case in StoreBaseIndexedDispBody";
   }
 
+  // Handle the non-temporal store hint here.
+  if (consider_non_temporal && ((opt_flags & MIR_STORE_NON_TEMPORAL) != 0)) {
+    switch (opcode) {
+      // We currently only handle 32/64 bit moves here.
+      case kX86Mov64AR:
+        opcode = kX86Movnti64AR;
+        break;
+      case kX86Mov64MR:
+        opcode = kX86Movnti64MR;
+        break;
+      case kX86Mov32AR:
+        opcode = kX86Movnti32AR;
+        break;
+      case kX86Mov32MR:
+        opcode = kX86Movnti32MR;
+        break;
+      default:
+        // Do nothing here.
+        break;
+    }
+  }
+
   if (!is_array) {
     if (!pair) {
       store = NewLIR3(opcode, r_base.GetReg(), displacement + LOWORD_OFFSET, r_src.GetReg());
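
The promotion above only fires for plain 32/64-bit GPR stores; narrower
stores and XMM stores keep their temporal opcodes even when the hint is set.
Illustrative cases, per the two switches:

    // k32, is_array, hint set:      kX86Mov32AR -> kX86Movnti32AR
    // k64, !is_array, hint set:     kX86Mov64MR -> kX86Movnti64MR
    // kDouble (XMM src), hint set:  kX86MovsdMR (unchanged; not handled)
    // kSignedByte, hint set:        unchanged (consider_non_temporal is false)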
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index 9620cd1..15fe7e2 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -440,12 +440,12 @@
   kX86Mov16MR, kX86Mov16AR, kX86Mov16TR,
   kX86Mov16RR, kX86Mov16RM, kX86Mov16RA, kX86Mov16RT,
   kX86Mov16RI, kX86Mov16MI, kX86Mov16AI, kX86Mov16TI,
-  kX86Mov32MR, kX86Mov32AR, kX86Mov32TR,
+  kX86Mov32MR, kX86Mov32AR, kX86Movnti32MR, kX86Movnti32AR, kX86Mov32TR,
   kX86Mov32RR, kX86Mov32RM, kX86Mov32RA, kX86Mov32RT,
   kX86Mov32RI, kX86Mov32MI, kX86Mov32AI, kX86Mov32TI,
   kX86Lea32RM,
   kX86Lea32RA,
-  kX86Mov64MR, kX86Mov64AR, kX86Mov64TR,
+  kX86Mov64MR, kX86Mov64AR, kX86Movnti64MR, kX86Movnti64AR, kX86Mov64TR,
   kX86Mov64RR, kX86Mov64RM, kX86Mov64RA, kX86Mov64RT,
   kX86Mov64RI32, kX86Mov64RI64, kX86Mov64MI, kX86Mov64AI, kX86Mov64TI,
   kX86Lea64RM,
@@ -620,7 +620,12 @@
   kX86MovdrxRR, kX86MovdrxMR, kX86MovdrxAR,  // move into reg from xmm
   kX86MovsxdRR, kX86MovsxdRM, kX86MovsxdRA,  // move 32 bit to 64 bit with sign extension
   kX86Set8R, kX86Set8M, kX86Set8A,  // set byte depending on condition operand
-  kX86Mfence,                   // memory barrier
+  kX86Lfence,                   // memory barrier to serialize all previous
+                                // load-from-memory instructions
+  kX86Mfence,                   // memory barrier to serialize all previous
+                                // load-from-memory and store-to-memory instructions
+  kX86Sfence,                   // memory barrier to serialize all previous
+                                // store-to-memory instructions
   Binary0fOpCode(kX86Imul16),   // 16bit multiply
   Binary0fOpCode(kX86Imul32),   // 32bit multiply
   Binary0fOpCode(kX86Imul64),   // 64bit multiply