jni: Fast path for @FastNative annotated Java methods

Adds a faster path for Java methods annotated with
dalvik.annotation.optimization.FastNative.

Intended to replace usage of fast JNI (registering with "!(FOO)BAR" descriptors).

Performance Microbenchmark Results (Angler):
* Regular JNI cost in nanoseconds: 115
* Fast JNI cost in nanoseconds: 60
* @FastNative cost in nanoseconds: 36

Summary: JNI transitions are up to 67% faster than fast JNI (60 ns -> 36 ns, i.e. 40% lower cost)

Change-Id: Ic23823ae0f232270c068ec999fd89aa993894b0e
diff --git a/runtime/art_method.cc b/runtime/art_method.cc
index 60975d4..d812590 100644
--- a/runtime/art_method.cc
+++ b/runtime/art_method.cc
@@ -334,6 +334,23 @@
   return GetDeclaringClass()->IsInterface();
 }
 
+bool ArtMethod::IsAnnotatedWithFastNative() {
+  Thread* self = Thread::Current();
+  ScopedObjectAccess soa(self);
+  StackHandleScope<1> shs(self);
+
+  const DexFile& dex_file = GetDeclaringClass()->GetDexFile();
+
+  mirror::Class* fast_native_annotation =
+      soa.Decode<mirror::Class*>(WellKnownClasses::dalvik_annotation_optimization_FastNative);
+  Handle<mirror::Class> fast_native_handle(shs.NewHandle(fast_native_annotation));
+
+  // Looks up the annotation with build visibility (not the runtime default). Note: this resolves
+  // the method annotations' classes as a side-effect, which seems allowed by the spec since it
+  // says we can preload any classes referenced by another class's constant pool table.
+  return dex_file.IsMethodAnnotationPresent(this, fast_native_handle, DexFile::kDexVisibilityBuild);
+}
+
 bool ArtMethod::EqualParameters(Handle<mirror::ObjectArray<mirror::Class>> params) {
   auto* dex_cache = GetDexCache();
   auto* dex_file = dex_cache->GetDexFile();
diff --git a/runtime/art_method.h b/runtime/art_method.h
index acf06fd..a90ef23 100644
--- a/runtime/art_method.h
+++ b/runtime/art_method.h
@@ -375,6 +375,10 @@
     return (GetAccessFlags() & kAccMustCountLocks) != 0;
   }
 
+  // Checks to see if the method was annotated with @dalvik.annotation.optimization.FastNative
+  // -- Independent of kAccFastNative access flags.
+  bool IsAnnotatedWithFastNative();
+
   // Returns true if this method could be overridden by a default method.
   bool IsOverridableByDefaultMethod() SHARED_REQUIRES(Locks::mutator_lock_);
 
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index d4cee44..e318f56 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -87,7 +87,7 @@
             art::Thread::SelfOffset<POINTER_SIZE>().Int32Value())
 
 // Offset of field Thread::tlsPtr_.thread_local_objects.
-#define THREAD_LOCAL_OBJECTS_OFFSET (THREAD_CARD_TABLE_OFFSET + 197 * __SIZEOF_POINTER__)
+#define THREAD_LOCAL_OBJECTS_OFFSET (THREAD_CARD_TABLE_OFFSET + 199 * __SIZEOF_POINTER__)
 ADD_TEST_EQ(THREAD_LOCAL_OBJECTS_OFFSET,
             art::Thread::ThreadLocalObjectsOffset<POINTER_SIZE>().Int32Value())
 // Offset of field Thread::tlsPtr_.thread_local_pos.
diff --git a/runtime/dex_file.cc b/runtime/dex_file.cc
index a6eb5f6..90c678c 100644
--- a/runtime/dex_file.cc
+++ b/runtime/dex_file.cc
@@ -1403,7 +1403,9 @@
   return GetSignatureValue(method_class, annotation_set);
 }
 
-bool DexFile::IsMethodAnnotationPresent(ArtMethod* method, Handle<mirror::Class> annotation_class)
+bool DexFile::IsMethodAnnotationPresent(ArtMethod* method,
+                                        Handle<mirror::Class> annotation_class,
+                                        uint32_t visibility /* = kDexVisibilityRuntime */)
     const {
   const AnnotationSetItem* annotation_set = FindAnnotationSetForMethod(method);
   if (annotation_set == nullptr) {
@@ -1411,8 +1413,10 @@
   }
   StackHandleScope<1> hs(Thread::Current());
   Handle<mirror::Class> method_class(hs.NewHandle(method->GetDeclaringClass()));
-  const AnnotationItem* annotation_item = GetAnnotationItemFromAnnotationSet(
-      method_class, annotation_set, kDexVisibilityRuntime, annotation_class);
+  const AnnotationItem* annotation_item = GetAnnotationItemFromAnnotationSet(method_class,
+                                                                             annotation_set,
+                                                                             visibility,
+                                                                             annotation_class);
   return annotation_item != nullptr;
 }
 
diff --git a/runtime/dex_file.h b/runtime/dex_file.h
index 2eca495..59339ef 100644
--- a/runtime/dex_file.h
+++ b/runtime/dex_file.h
@@ -960,7 +960,9 @@
       SHARED_REQUIRES(Locks::mutator_lock_);
   mirror::ObjectArray<mirror::String>* GetSignatureAnnotationForMethod(ArtMethod* method) const
       SHARED_REQUIRES(Locks::mutator_lock_);
-  bool IsMethodAnnotationPresent(ArtMethod* method, Handle<mirror::Class> annotation_class) const
+  bool IsMethodAnnotationPresent(ArtMethod* method,
+                                 Handle<mirror::Class> annotation_class,
+                                 uint32_t visibility = kDexVisibilityRuntime) const
       SHARED_REQUIRES(Locks::mutator_lock_);
 
   const AnnotationSetItem* FindAnnotationSetForClass(Handle<mirror::Class> klass) const
diff --git a/runtime/entrypoints/quick/quick_default_init_entrypoints.h b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
index f98de95..2a206c2 100644
--- a/runtime/entrypoints/quick/quick_default_init_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
@@ -73,11 +73,13 @@
 
   // JNI
   qpoints->pJniMethodStart = JniMethodStart;
+  qpoints->pJniMethodFastStart = JniMethodFastStart;
   qpoints->pJniMethodStartSynchronized = JniMethodStartSynchronized;
   qpoints->pJniMethodEnd = JniMethodEnd;
   qpoints->pJniMethodEndSynchronized = JniMethodEndSynchronized;
   qpoints->pJniMethodEndWithReference = JniMethodEndWithReference;
   qpoints->pJniMethodEndWithReferenceSynchronized = JniMethodEndWithReferenceSynchronized;
+  qpoints->pJniMethodFastEnd = JniMethodFastEnd;
   qpoints->pQuickGenericJniTrampoline = art_quick_generic_jni_trampoline;
 
   // Locks
diff --git a/runtime/entrypoints/quick/quick_entrypoints.h b/runtime/entrypoints/quick/quick_entrypoints.h
index f5b68fa..08e0d6e 100644
--- a/runtime/entrypoints/quick/quick_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_entrypoints.h
@@ -52,10 +52,13 @@
 // JNI entrypoints.
 // TODO: NO_THREAD_SAFETY_ANALYSIS due to different control paths depending on fast JNI.
 extern uint32_t JniMethodStart(Thread* self) NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
+extern uint32_t JniMethodFastStart(Thread* self) NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
 extern uint32_t JniMethodStartSynchronized(jobject to_lock, Thread* self)
     NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
 extern void JniMethodEnd(uint32_t saved_local_ref_cookie, Thread* self)
     NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
+extern void JniMethodFastEnd(uint32_t saved_local_ref_cookie, Thread* self)
+    NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
 extern void JniMethodEndSynchronized(uint32_t saved_local_ref_cookie, jobject locked,
                                      Thread* self)
     NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
diff --git a/runtime/entrypoints/quick/quick_entrypoints_list.h b/runtime/entrypoints/quick/quick_entrypoints_list.h
index 07f0394..74c928a 100644
--- a/runtime/entrypoints/quick/quick_entrypoints_list.h
+++ b/runtime/entrypoints/quick/quick_entrypoints_list.h
@@ -72,8 +72,10 @@
   V(HandleFillArrayData, void, void*, void*) \
 \
   V(JniMethodStart, uint32_t, Thread*) \
+  V(JniMethodFastStart, uint32_t, Thread*) \
   V(JniMethodStartSynchronized, uint32_t, jobject, Thread*) \
   V(JniMethodEnd, void, uint32_t, Thread*) \
+  V(JniMethodFastEnd, void, uint32_t, Thread*) \
   V(JniMethodEndSynchronized, void, uint32_t, jobject, Thread*) \
   V(JniMethodEndWithReference, mirror::Object*, jobject, uint32_t, Thread*) \
   V(JniMethodEndWithReferenceSynchronized, mirror::Object*, jobject, uint32_t, jobject, Thread*) \
@@ -195,7 +197,8 @@
   V(ReadBarrierMarkReg28, mirror::Object*, mirror::Object*) \
   V(ReadBarrierMarkReg29, mirror::Object*, mirror::Object*) \
   V(ReadBarrierSlow, mirror::Object*, mirror::Object*, mirror::Object*, uint32_t) \
-  V(ReadBarrierForRootSlow, mirror::Object*, GcRoot<mirror::Object>*)
+  V(ReadBarrierForRootSlow, mirror::Object*, GcRoot<mirror::Object>*) \
+\
 
 #endif  // ART_RUNTIME_ENTRYPOINTS_QUICK_QUICK_ENTRYPOINTS_LIST_H_
 #undef ART_RUNTIME_ENTRYPOINTS_QUICK_QUICK_ENTRYPOINTS_LIST_H_   // #define is only for lint.
diff --git a/runtime/entrypoints/quick/quick_jni_entrypoints.cc b/runtime/entrypoints/quick/quick_jni_entrypoints.cc
index 58f256a..c06824c 100644
--- a/runtime/entrypoints/quick/quick_jni_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_jni_entrypoints.cc
@@ -29,6 +29,21 @@
   handle_on_stack->Assign(to_ref);
 }
 
+// Called on entry to fast JNI: pushes a new local reference segment only (no state transition).
+extern uint32_t JniMethodFastStart(Thread* self) {
+  JNIEnvExt* env = self->GetJniEnv();
+  DCHECK(env != nullptr);
+  uint32_t saved_local_ref_cookie = env->local_ref_cookie;  // Previous cookie; returned below.
+  env->local_ref_cookie = env->locals.GetSegmentState();
+
+  if (kIsDebugBuild) {  // Debug builds: verify the top quick frame is a @FastNative method.
+    ArtMethod* native_method = *self->GetManagedStack()->GetTopQuickFrame();
+    CHECK(native_method->IsAnnotatedWithFastNative()) << PrettyMethod(native_method);
+  }
+
+  return saved_local_ref_cookie;
+}
+
 // Called on entry to JNI, transition out of Runnable and release share of mutator_lock_.
 extern uint32_t JniMethodStart(Thread* self) {
   JNIEnvExt* env = self->GetJniEnv();
@@ -73,11 +88,32 @@
   self->PopHandleScope();
 }
 
+// TODO: These should probably be templatized or macro-ized.
+// Otherwise there's just too much repetitive boilerplate.
+
 extern void JniMethodEnd(uint32_t saved_local_ref_cookie, Thread* self) {
   GoToRunnable(self);
   PopLocalReferences(saved_local_ref_cookie, self);
 }
 
+extern void JniMethodFastEnd(uint32_t saved_local_ref_cookie, Thread* self) {
+  // Inlined fast-JNI version of GoToRunnable(self): only a suspend check, no state transition.
+
+  if (kIsDebugBuild) {  // Debug builds: verify the top quick frame is a @FastNative method.
+    ArtMethod* native_method = *self->GetManagedStack()->GetTopQuickFrame();
+    CHECK(native_method->IsAnnotatedWithFastNative()) << PrettyMethod(native_method);
+  }
+
+  if (UNLIKELY(self->TestAllFlags())) {
+    // In fast JNI mode we never transitioned out of runnable. Perform a suspend check if there
+    // is a flag raised.
+    DCHECK(Locks::mutator_lock_->IsSharedHeld(self));
+    self->CheckSuspend();
+  }
+
+  PopLocalReferences(saved_local_ref_cookie, self);
+}
+
 extern void JniMethodEndSynchronized(uint32_t saved_local_ref_cookie, jobject locked,
                                      Thread* self) {
   GoToRunnable(self);
@@ -85,6 +121,10 @@
   PopLocalReferences(saved_local_ref_cookie, self);
 }
 
+// TODO: JniMethodFastEndWithReference
+// (Probably don't need to have a synchronized variant since
+// it already has to do atomic operations)
+
 // Common result handling for EndWithReference.
 static mirror::Object* JniMethodEndWithReferenceHandleResult(jobject result,
                                                              uint32_t saved_local_ref_cookie,
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index e3203dc..004cdc4 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -211,11 +211,14 @@
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pAputObjectWithBoundCheck, pAputObject, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pAputObject, pHandleFillArrayData, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pHandleFillArrayData, pJniMethodStart, sizeof(void*));
-    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodStart, pJniMethodStartSynchronized,
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodStart, pJniMethodFastStart,
+                         sizeof(void*));
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodFastStart, pJniMethodStartSynchronized,
                          sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodStartSynchronized, pJniMethodEnd,
                          sizeof(void*));
-    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodEnd, pJniMethodEndSynchronized, sizeof(void*));
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodEnd, pJniMethodFastEnd, sizeof(void*));
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodFastEnd, pJniMethodEndSynchronized, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodEndSynchronized, pJniMethodEndWithReference,
                          sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodEndWithReference,
diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc
index e1a4e2a..c322475 100644
--- a/runtime/jni_internal.cc
+++ b/runtime/jni_internal.cc
@@ -2205,6 +2205,7 @@
 
       VLOG(jni) << "[Registering JNI native method " << PrettyMethod(m) << "]";
 
+      is_fast = is_fast || m->IsFastNative();  // Merge with @FastNative state.
       m->RegisterNative(fnPtr, is_fast);
     }
     return JNI_OK;
diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc
index c7e4f8b..2a040a3 100644
--- a/runtime/parsed_options.cc
+++ b/runtime/parsed_options.cc
@@ -487,7 +487,7 @@
   args.SetIfMissing(M::ParallelGCThreads, gc::Heap::kDefaultEnableParallelGC ?
       static_cast<unsigned int>(sysconf(_SC_NPROCESSORS_CONF) - 1u) : 0u);
 
-  // -Xverbose:
+  // -verbose:
   {
     LogVerbosity *log_verbosity = args.Get(M::Verbose);
     if (log_verbosity != nullptr) {
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 3326736..b35a614 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -2599,6 +2599,9 @@
   QUICK_ENTRY_POINT_INFO(pReadBarrierMarkReg29)
   QUICK_ENTRY_POINT_INFO(pReadBarrierSlow)
   QUICK_ENTRY_POINT_INFO(pReadBarrierForRootSlow)
+
+  QUICK_ENTRY_POINT_INFO(pJniMethodFastStart)
+  QUICK_ENTRY_POINT_INFO(pJniMethodFastEnd)
 #undef QUICK_ENTRY_POINT_INFO
 
   os << offset;
diff --git a/runtime/well_known_classes.cc b/runtime/well_known_classes.cc
index 48deb35..ddce344 100644
--- a/runtime/well_known_classes.cc
+++ b/runtime/well_known_classes.cc
@@ -30,6 +30,7 @@
 namespace art {
 
 jclass WellKnownClasses::com_android_dex_Dex;
+jclass WellKnownClasses::dalvik_annotation_optimization_FastNative;
 jclass WellKnownClasses::dalvik_system_DexFile;
 jclass WellKnownClasses::dalvik_system_DexPathList;
 jclass WellKnownClasses::dalvik_system_DexPathList__Element;
@@ -215,6 +216,7 @@
 
 void WellKnownClasses::Init(JNIEnv* env) {
   com_android_dex_Dex = CacheClass(env, "com/android/dex/Dex");
+  dalvik_annotation_optimization_FastNative = CacheClass(env, "dalvik/annotation/optimization/FastNative");
   dalvik_system_DexFile = CacheClass(env, "dalvik/system/DexFile");
   dalvik_system_DexPathList = CacheClass(env, "dalvik/system/DexPathList");
   dalvik_system_DexPathList__Element = CacheClass(env, "dalvik/system/DexPathList$Element");
diff --git a/runtime/well_known_classes.h b/runtime/well_known_classes.h
index c9faf69..b8e05b8 100644
--- a/runtime/well_known_classes.h
+++ b/runtime/well_known_classes.h
@@ -41,6 +41,7 @@
       SHARED_REQUIRES(Locks::mutator_lock_);
 
   static jclass com_android_dex_Dex;
+  static jclass dalvik_annotation_optimization_FastNative;
   static jclass dalvik_system_DexFile;
   static jclass dalvik_system_DexPathList;
   static jclass dalvik_system_DexPathList__Element;