Add small thread-local cache for use by the interpreter.

A small (one-page) cache that can be used on the hottest paths
in the interpreter and that does not require synchronisation.
This CL adds the code but does not use it for anything yet.

Test: test-art-host-gtest
Change-Id: I41d4e7a86a0f62f7a4efc165b8934232b4e766c7
diff --git a/libartbase/base/macros.h b/libartbase/base/macros.h
index 33866bb..315f4d2 100644
--- a/libartbase/base/macros.h
+++ b/libartbase/base/macros.h
@@ -48,6 +48,7 @@
 #define OFFSETOF_MEMBERPTR(t, f) \
   (reinterpret_cast<uintptr_t>(&(reinterpret_cast<t*>(16)->*f)) - static_cast<uintptr_t>(16))  // NOLINT
 
+#define ALIGNED(x) __attribute__ ((__aligned__(x)))  // Minimum alignment of x bytes.
 #define PACKED(x) __attribute__ ((__aligned__(x), __packed__))
 
 // Stringify the argument.
diff --git a/runtime/Android.bp b/runtime/Android.bp
index 15ccb70..f4b8697 100644
--- a/runtime/Android.bp
+++ b/runtime/Android.bp
@@ -93,6 +93,7 @@
         "instrumentation.cc",
         "intern_table.cc",
         "interpreter/interpreter.cc",
+        "interpreter/interpreter_cache.cc",
         "interpreter/interpreter_common.cc",
         "interpreter/interpreter_intrinsics.cc",
         "interpreter/interpreter_switch_impl.cc",
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index e65c194..00c9360 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -96,6 +96,10 @@
 #define THREAD_LOCAL_ALLOC_STACK_END_OFFSET (THREAD_ROSALLOC_RUNS_OFFSET + 17 * __SIZEOF_POINTER__)
 ADD_TEST_EQ(THREAD_LOCAL_ALLOC_STACK_END_OFFSET,
             art::Thread::ThreadLocalAllocStackEndOffset<POINTER_SIZE>().Int32Value())
+// Offset of field Thread::interpreter_cache_.
+#define THREAD_INTERPRETER_CACHE_OFFSET (144 + 312 * __SIZEOF_POINTER__)
+ADD_TEST_EQ(THREAD_INTERPRETER_CACHE_OFFSET,
+            art::Thread::InterpreterCacheOffset<POINTER_SIZE>().Int32Value())
 
 // Offsets within ShadowFrame.
 #define SHADOWFRAME_LINK_OFFSET 0
diff --git a/runtime/generated/asm_support_gen.h b/runtime/generated/asm_support_gen.h
index 464c2b7..ae31a54 100644
--- a/runtime/generated/asm_support_gen.h
+++ b/runtime/generated/asm_support_gen.h
@@ -164,6 +164,8 @@
 DEFINE_CHECK_EQ(static_cast<int32_t>(THREAD_EMPTY_CHECKPOINT_REQUEST), (static_cast<int32_t>((art::kEmptyCheckpointRequest))))
 #define THREAD_SUSPEND_OR_CHECKPOINT_REQUEST 7
 DEFINE_CHECK_EQ(static_cast<int32_t>(THREAD_SUSPEND_OR_CHECKPOINT_REQUEST), (static_cast<int32_t>((art::kSuspendRequest | art::kCheckpointRequest | art::kEmptyCheckpointRequest))))
+#define THREAD_INTERPRETER_CACHE_SIZE_LOG2 8
+DEFINE_CHECK_EQ(static_cast<int32_t>(THREAD_INTERPRETER_CACHE_SIZE_LOG2), (static_cast<int32_t>((art::Thread::InterpreterCacheSizeLog2()))))
 #define JIT_CHECK_OSR (-1)
 DEFINE_CHECK_EQ(static_cast<int16_t>(JIT_CHECK_OSR), (static_cast<int16_t>((art::jit::kJitCheckForOSR))))
 #define JIT_HOTNESS_DISABLE (-2)
diff --git a/runtime/interpreter/interpreter_cache.cc b/runtime/interpreter/interpreter_cache.cc
new file mode 100644
index 0000000..e43fe31
--- /dev/null
+++ b/runtime/interpreter/interpreter_cache.cc
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "interpreter_cache.h"
+#include "thread-inl.h"
+
+namespace art {
+
+void InterpreterCache::Clear(Thread* owning_thread) {
+  DCHECK(owning_thread->GetInterpreterCache() == this);  // Must be the owner's own cache.
+  DCHECK(owning_thread == Thread::Current() || owning_thread->IsSuspended());
+  data_.fill(Entry{});  // Reset every slot to a value-initialized entry.
+}
+
+bool InterpreterCache::IsCalledFromOwningThread() {
+  return Thread::Current()->GetInterpreterCache() == this;  // Is this the caller's cache?
+}
+
+}  // namespace art
diff --git a/runtime/interpreter/interpreter_cache.h b/runtime/interpreter/interpreter_cache.h
new file mode 100644
index 0000000..c25222e
--- /dev/null
+++ b/runtime/interpreter/interpreter_cache.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_INTERPRETER_INTERPRETER_CACHE_H_
+#define ART_RUNTIME_INTERPRETER_INTERPRETER_CACHE_H_
+
+#include <array>
+#include <atomic>
+
+#include "base/bit_utils.h"
+#include "base/macros.h"
+
+namespace art {
+
+class Instruction;
+class Thread;
+
+// Small fast thread-local cache for the interpreter.
+// The key for the cache is the dex instruction pointer.
+// The interpretation of the value depends on the opcode.
+// Presence of an entry might imply some performance pre-conditions.
+// All operations must be done from the owning thread,
+// or at a point when the owning thread is suspended.
+//
+// Aligned to 16 bytes to make it easier to get the address of the cache
+// from assembly (it ensures that the offset is a valid immediate value).
+class ALIGNED(16) InterpreterCache {
+  // Aligned since we load the whole entry in a single assembly instruction.
+  typedef std::pair<const Instruction*, size_t> Entry ALIGNED(2 * sizeof(size_t));
+
+ public:
+  // 2x size increase/decrease corresponds to ~0.5% interpreter performance change.
+  // Value of 256 has around 75% cache hit rate.
+  static constexpr size_t kSize = 256;
+
+  InterpreterCache() {
+    // We cannot use the Clear() method since the constructor will not
+    // be called from the owning thread.
+    data_.fill(Entry{});
+  }
+
+  // Clear the whole cache. It requires the owning thread for DCHECKs.
+  void Clear(Thread* owning_thread);
+
+  ALWAYS_INLINE bool Get(const Instruction* key, /* out */ size_t* value) {
+    DCHECK(IsCalledFromOwningThread());
+    Entry& entry = data_[IndexOf(key)];  // The single slot this key can occupy.
+    if (LIKELY(entry.first == key)) {  // Hit: the slot currently holds this exact key.
+      *value = entry.second;
+      return true;
+    }
+    return false;  // Miss: |*value| is left unmodified.
+  }
+
+  ALWAYS_INLINE void Set(const Instruction* key, size_t value) {
+    DCHECK(IsCalledFromOwningThread());
+    data_[IndexOf(key)] = Entry{key, value};  // Overwrites whatever the slot held before.
+  }
+
+ private:
+  bool IsCalledFromOwningThread();
+
+  static ALWAYS_INLINE size_t IndexOf(const Instruction* key) {
+    static_assert(IsPowerOfTwo(kSize), "Size must be power of two");
+    size_t index = (reinterpret_cast<uintptr_t>(key) >> 2) & (kSize - 1);  // Drop low 2 bits.
+    DCHECK_LT(index, kSize);
+    return index;
+  }
+
+  std::array<Entry, kSize> data_;  // One slot per index; all slots start as Entry{}.
+};
+
+}  // namespace art
+
+#endif  // ART_RUNTIME_INTERPRETER_INTERPRETER_CACHE_H_
diff --git a/runtime/native/dalvik_system_DexFile.cc b/runtime/native/dalvik_system_DexFile.cc
index 71fabd0..0d1fe44 100644
--- a/runtime/native/dalvik_system_DexFile.cc
+++ b/runtime/native/dalvik_system_DexFile.cc
@@ -323,6 +323,9 @@
   }
   Runtime* const runtime = Runtime::Current();
   bool all_deleted = true;
+  // We need to clear the caches since they may contain pointers to the dex instructions.
+  // A different dex file may later be loaded at the same memory location by chance.
+  Thread::ClearAllInterpreterCaches();
   {
     ScopedObjectAccess soa(env);
     ObjPtr<mirror::Object> dex_files_object = soa.Decode<mirror::Object>(cookie);
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 8a8f537..497b146 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -4076,4 +4076,13 @@
   UpdateReadBarrierEntrypoints(&tlsPtr_.quick_entrypoints, /* is_active*/ true);
 }
 
+void Thread::ClearAllInterpreterCaches() {
+  static struct ClearInterpreterCacheClosure : Closure {  // No state; one shared instance.
+    virtual void Run(Thread* thread) {
+      thread->GetInterpreterCache()->Clear(thread);
+    }
+  } closure;
+  Runtime::Current()->GetThreadList()->RunCheckpoint(&closure);  // Presumably once per thread.
+}
+
 }  // namespace art
diff --git a/runtime/thread.h b/runtime/thread.h
index d169a62..3c85b80 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -38,6 +38,7 @@
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "handle_scope.h"
 #include "instrumentation.h"
+#include "interpreter/interpreter_cache.h"
 #include "jvalue.h"
 #include "managed_stack.h"
 #include "offsets.h"
@@ -1299,6 +1300,29 @@
                                        jobject thread_group)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
+  ALWAYS_INLINE InterpreterCache* GetInterpreterCache() {
+    return &interpreter_cache_;  // Address of the cache embedded in this Thread; never null.
+  }
+
+  // Clear all thread-local interpreter caches.
+  //
+  // Since the caches are keyed by memory pointer to dex instructions, this must be
+  // called when any dex code is unloaded (before different code gets loaded at the
+  // same memory location).
+  //
+  // If the presence of a cache entry implies some pre-conditions, this must also be
+  // called if the pre-conditions might no longer hold true.
+  static void ClearAllInterpreterCaches();
+
+  template<PointerSize pointer_size>
+  static ThreadOffset<pointer_size> InterpreterCacheOffset() {  // Offset of the cache field.
+    return ThreadOffset<pointer_size>(OFFSETOF_MEMBER(Thread, interpreter_cache_));
+  }
+
+  static int InterpreterCacheSizeLog2() {
+    return WhichPowerOf2(InterpreterCache::kSize);  // log2 of the entry count (kSize).
+  }
+
  private:
   explicit Thread(bool daemon);
   ~Thread() REQUIRES(!Locks::mutator_lock_, !Locks::thread_suspend_count_lock_);
@@ -1788,6 +1812,11 @@
   // be false for threads where '!can_call_into_java_'.
   bool can_be_suspended_by_user_code_;
 
+  // Small thread-local cache to be used from the interpreter.
+  // It is keyed by dex instruction pointer.
+  // The value is opcode-dependent (e.g. field offset).
+  InterpreterCache interpreter_cache_;
+
   friend class Dbg;  // For SetStateUnsafe.
   friend class gc::collector::SemiSpace;  // For getting stack traces.
   friend class Runtime;  // For CreatePeer.
diff --git a/tools/cpp-define-generator/constant_thread.def b/tools/cpp-define-generator/constant_thread.def
index 1364b55..7e1df6b 100644
--- a/tools/cpp-define-generator/constant_thread.def
+++ b/tools/cpp-define-generator/constant_thread.def
@@ -27,5 +27,4 @@
 DEFINE_THREAD_CONSTANT(CHECKPOINT_REQUEST, int32_t, art::kCheckpointRequest)
 DEFINE_THREAD_CONSTANT(EMPTY_CHECKPOINT_REQUEST, int32_t, art::kEmptyCheckpointRequest)
 DEFINE_THREAD_CONSTANT(SUSPEND_OR_CHECKPOINT_REQUEST,  int32_t, art::kSuspendRequest | art::kCheckpointRequest | art::kEmptyCheckpointRequest)
-
-#undef DEFINE_THREAD_CONSTANT
+DEFINE_THREAD_CONSTANT(INTERPRETER_CACHE_SIZE_LOG2, int32_t, art::Thread::InterpreterCacheSizeLog2())