am b89e81dc: fs_config: align with new explicit fs_config target_out parameter

* commit 'b89e81dcb9bfa707912d9e370949b250367b0998':
  fs_config: align with new explicit fs_config target_out parameter
diff --git a/ext4_utils/Android.mk b/ext4_utils/Android.mk
index 31a4b71..8953b45 100644
--- a/ext4_utils/Android.mk
+++ b/ext4_utils/Android.mk
@@ -22,6 +22,8 @@
 include $(CLEAR_VARS)
 LOCAL_SRC_FILES := $(libext4_utils_src_files)
 LOCAL_MODULE := libext4_utils_host
+# Various instances of dereferencing a type-punned pointer in extent.c
+LOCAL_CFLAGS += -fno-strict-aliasing
 LOCAL_STATIC_LIBRARIES := \
     libsparse_host \
     libz
@@ -63,6 +65,8 @@
 LOCAL_SRC_FILES := $(libext4_utils_src_files)
 LOCAL_MODULE := libext4_utils
 LOCAL_C_INCLUDES += system/core/logwrapper/include
+# Various instances of dereferencing a type-punned pointer in extent.c
+LOCAL_CFLAGS += -fno-strict-aliasing
 LOCAL_SHARED_LIBRARIES := \
     libcutils \
     libext2_uuid \
@@ -77,6 +81,8 @@
 LOCAL_SRC_FILES := $(libext4_utils_src_files) \
     ext4_crypt_init_extensions.cpp
 LOCAL_MODULE := libext4_utils_static
+# Various instances of dereferencing a type-punned pointer in extent.c
+LOCAL_CFLAGS += -fno-strict-aliasing
 LOCAL_STATIC_LIBRARIES := \
     libsparse_static
 include $(BUILD_STATIC_LIBRARY)
diff --git a/ext4_utils/ext4_crypt_init_extensions.cpp b/ext4_utils/ext4_crypt_init_extensions.cpp
index 3fb04b9..e1b69e1 100644
--- a/ext4_utils/ext4_crypt_init_extensions.cpp
+++ b/ext4_utils/ext4_crypt_init_extensions.cpp
@@ -142,10 +142,15 @@
     if (!dir || strncmp(dir, "/data/", 6) || strchr(dir + 6, '/')) {
         return 0;
     }
+    // ext4enc:TODO exclude /data/user with a horrible special case.
+    if (!strcmp(dir, "/data/user")) {
+        return 0;
+    }
 
     UnencryptedProperties props("/data");
     std::string policy = props.Get<std::string>(properties::ref);
     if (policy.empty()) {
+        // ext4enc:TODO why is this OK?
         return 0;
     }
 
diff --git a/ext4_utils/ext4_sb.h b/ext4_utils/ext4_sb.h
index 832fa33..159580d 100644
--- a/ext4_utils/ext4_sb.h
+++ b/ext4_utils/ext4_sb.h
@@ -25,6 +25,8 @@
 extern "C" {
 #endif
 
+#include <stdbool.h>
+
 struct fs_info {
 	int64_t len;	/* If set to 0, ask the block device for the size,
 			 * if less than 0, reserve that much space at the
@@ -41,6 +43,7 @@
 	uint32_t bg_desc_reserve_blocks;
 	const char *label;
 	uint8_t no_journal;
+	bool block_device;	/* target fd is a block device? */
 };
 
 int ext4_parse_sb(struct ext4_super_block *sb, struct fs_info *info);
diff --git a/ext4_utils/ext4_utils.c b/ext4_utils/ext4_utils.c
index 3b22b81..29cbc72 100644
--- a/ext4_utils/ext4_utils.c
+++ b/ext4_utils/ext4_utils.c
@@ -168,10 +168,31 @@
 		critical_error("failed to write all of superblock");
 }
 
+/* Write the real superblocks straight to fd: every backup copy, then the primary. */
+static void block_device_write_sb(int fd)
+{
+	u32 i;
+
+	for (i = 1; i < aux_info.groups; i++) {
+		if (ext4_bg_has_super_block(i)) {
+			/* 64-bit math: the byte offset must not wrap if these fields are 32-bit. */
+			unsigned long long block = aux_info.first_data_block +
+				(unsigned long long)i * info.blocks_per_group;
+			write_sb(fd, block * info.block_size, aux_info.backup_sb[i]);
+		}
+	}
+
+	/* The primary superblock is written last, at byte offset 1024. */
+	write_sb(fd, 1024, aux_info.sb);
+}
+
 /* Write the filesystem image to a file */
 void write_ext4_image(int fd, int gz, int sparse, int crc)
 {
 	sparse_file_write(ext4_sparse_file, fd, gz, sparse, crc);
+
+	if (info.block_device)
+		block_device_write_sb(fd);
 }
 
 /* Compute the rest of the parameters of the filesystem from the basic info */
@@ -203,7 +224,27 @@
 		aux_info.len_blocks -= last_group_size;
 	}
 
-	aux_info.sb = calloc(info.block_size, 1);
+	/* A zero-filled superblock that is written to the block device
+	 * first, so that the file system is marked as invalid.
+	 */
+	aux_info.sb_zero = calloc(1, info.block_size);
+	if (!aux_info.sb_zero)
+		critical_error_errno("calloc");
+
+	/* The write_data* functions expect only block aligned calls.
+	 * This is not an issue, except when we write out the super
+	 * block on a system with a block size > 1K.  So, we need to
+	 * deal with that here.
+	 */
+	aux_info.sb_block = calloc(1, info.block_size);
+	if (!aux_info.sb_block)
+		critical_error_errno("calloc");
+
+	if (info.block_size > 1024)
+		aux_info.sb = (struct ext4_super_block *)((char *)aux_info.sb_block + 1024);
+	else
+		aux_info.sb = aux_info.sb_block;
+
 	/* Alloc an array to hold the pointers to the backup superblocks */
 	aux_info.backup_sb = calloc(aux_info.groups, sizeof(char *));
 
@@ -224,7 +265,8 @@
 		if (aux_info.backup_sb[i])
 			free(aux_info.backup_sb[i]);
 	}
-	free(aux_info.sb);
+	free(aux_info.sb_block);
+	free(aux_info.sb_zero);
 	free(aux_info.bg_desc);
 }
 
@@ -321,11 +363,11 @@
 		if (ext4_bg_has_super_block(i)) {
 			if (i != 0) {
 				aux_info.backup_sb[i] = calloc(info.block_size, 1);
-				memcpy(aux_info.backup_sb[i], sb, info.block_size);
+				memcpy(aux_info.backup_sb[i], sb, sizeof(struct ext4_super_block));
 				/* Update the block group nr of this backup superblock */
 				aux_info.backup_sb[i]->s_block_group_nr = i;
-				sparse_file_add_data(ext4_sparse_file, aux_info.backup_sb[i],
-						info.block_size, group_start_block);
+				ext4_queue_sb(group_start_block, info.block_device ?
+						aux_info.sb_zero : aux_info.backup_sb[i]);
 			}
 			sparse_file_add_data(ext4_sparse_file, aux_info.bg_desc,
 				aux_info.bg_desc_blocks * info.block_size,
@@ -341,22 +383,23 @@
 		aux_info.bg_desc[i].bg_free_inodes_count = sb->s_inodes_per_group;
 		aux_info.bg_desc[i].bg_used_dirs_count = 0;
 	}
+
+	/* Queue the primary superblock to be written out. For a block device,
+	 * queue a zero-filled block instead; the real superblock is written to
+	 * the device only after all other blocks have been written.
+	 *
+	 * The file system on the block device is therefore not valid until the
+	 * real superblocks are written, which reduces the likelihood of leaving
+	 * behind a partially created file system.
+	 */
+	ext4_queue_sb(aux_info.first_data_block, info.block_device ?
+				aux_info.sb_zero : aux_info.sb_block);
 }
 
-void ext4_queue_sb(void)
+
+void ext4_queue_sb(u64 start_block, struct ext4_super_block *sb)
 {
-	/* The write_data* functions expect only block aligned calls.
-	 * This is not an issue, except when we write out the super
-	 * block on a system with a block size > 1K.  So, we need to
-	 * deal with that here.
-	 */
-	if (info.block_size > 1024) {
-		u8 *buf = calloc(info.block_size, 1);
-		memcpy(buf + 1024, (u8*)aux_info.sb, 1024);
-		sparse_file_add_data(ext4_sparse_file, buf, info.block_size, 0);
-	} else {
-		sparse_file_add_data(ext4_sparse_file, aux_info.sb, 1024, 1);
-	}
+	sparse_file_add_data(ext4_sparse_file, sb, info.block_size, start_block);
 }
 
 void ext4_parse_sb_info(struct ext4_super_block *sb)
diff --git a/ext4_utils/ext4_utils.h b/ext4_utils/ext4_utils.h
index 8a4ad8f..0159dbe 100644
--- a/ext4_utils/ext4_utils.h
+++ b/ext4_utils/ext4_utils.h
@@ -99,6 +99,8 @@
 
 struct fs_aux_info {
 	struct ext4_super_block *sb;
+	struct ext4_super_block *sb_block;
+	struct ext4_super_block *sb_zero;
 	struct ext4_super_block **backup_sb;
 	struct ext2_group_desc *bg_desc;
 	struct block_group_info *bgs;
@@ -142,7 +144,7 @@
 void ext4_create_resize_inode(void);
 void ext4_create_journal_inode(void);
 void ext4_update_free(void);
-void ext4_queue_sb(void);
+void ext4_queue_sb(u64 start_block, struct ext4_super_block *sb);
 u64 get_block_device_size(int fd);
 int is_block_device_fd(int fd);
 u64 get_file_size(int fd);
diff --git a/ext4_utils/make_ext4fs.c b/ext4_utils/make_ext4fs.c
index 669d080..b4ebbce 100644
--- a/ext4_utils/make_ext4fs.c
+++ b/ext4_utils/make_ext4fs.c
@@ -503,6 +503,13 @@
 	if (setjmp(setjmp_env))
 		return EXIT_FAILURE; /* Handle a call to longjmp() */
 
+	info.block_device = is_block_device_fd(fd);
+
+	if (info.block_device && (sparse || gzip || crc)) {
+		fprintf(stderr, "No sparse/gzip/crc allowed for block device\n");
+		return EXIT_FAILURE;
+	}
+
 	if (_mountpoint == NULL) {
 		mountpoint = strdup("");
 	} else {
@@ -633,8 +640,6 @@
 
 	ext4_update_free();
 
-	ext4_queue_sb();
-
 	if (block_list_file) {
 		size_t dirlen = directory ? strlen(directory) : 0;
 		struct block_allocation* p = get_saved_allocation_chain();
diff --git a/ext4_utils/unencrypted_properties.cpp b/ext4_utils/unencrypted_properties.cpp
index d873e91..ed36e20 100644
--- a/ext4_utils/unencrypted_properties.cpp
+++ b/ext4_utils/unencrypted_properties.cpp
@@ -84,6 +84,7 @@
 
 bool UnencryptedProperties::Remove(const char* name)
 {
+    if (!OK()) return false;
     if (remove((folder_ + "/" + name).c_str())
         && errno != ENOENT) {
         return false;
diff --git a/libpagemap/pm_map.c b/libpagemap/pm_map.c
index a65d315..c6a1798 100644
--- a/libpagemap/pm_map.c
+++ b/libpagemap/pm_map.c
@@ -46,7 +46,8 @@
     for (i = 0; i < len; i++) {
         usage.vss += map->proc->ker->pagesize;
 
-        if (!PM_PAGEMAP_PRESENT(pagemap[i]))
+        if (!PM_PAGEMAP_PRESENT(pagemap[i]) &&
+	    !PM_PAGEMAP_SWAPPED(pagemap[i]))
             continue;
 
         if (!PM_PAGEMAP_SWAPPED(pagemap[i])) {
diff --git a/procrank/procrank.c b/procrank/procrank.c
index de26cd1..1728467 100644
--- a/procrank/procrank.c
+++ b/procrank/procrank.c
@@ -324,7 +324,7 @@
     }
 
     if (has_swap) {
-        printf("%6" PRIu64 "K  ", total_swap);
+        printf("%6" PRIu64 "K  ", total_swap / 1024);
     }
 
     printf("TOTAL\n");
diff --git a/simpleperf/Android.mk b/simpleperf/Android.mk
index f37c4b0..b923738 100644
--- a/simpleperf/Android.mk
+++ b/simpleperf/Android.mk
@@ -17,6 +17,11 @@
 LOCAL_PATH := $(call my-dir)
 
 simpleperf_common_cppflags := -std=c++11 -Wall -Wextra -Werror -Wunused
+simpleperf_host_common_cppflags := $(simpleperf_common_cppflags) \
+                                   -DUSE_BIONIC_PERF_EVENT_H -I bionic \
+
+simpleperf_host_darwin_cppflags := $(simpleperf_host_common_cppflags) \
+                                   -I $(LOCAL_PATH)/darwin_support \
 
 simpleperf_common_shared_libraries := \
   libbase \
@@ -24,24 +29,38 @@
 
 LLVM_ROOT_PATH := external/llvm
 
-libsimpleperf_src_files := \
+# libsimpleperf
+# =========================================================
+libsimpleperf_common_src_files := \
+  callchain.cpp \
   cmd_dumprecord.cpp \
   cmd_help.cpp \
-  cmd_list.cpp \
-  cmd_record.cpp \
-  cmd_stat.cpp \
+  cmd_report.cpp \
   command.cpp \
-  environment.cpp \
+  dso.cpp \
   event_attr.cpp \
-  event_fd.cpp \
-  event_selection_set.cpp \
   event_type.cpp \
   read_elf.cpp \
   record.cpp \
-  record_file.cpp \
+  record_file_reader.cpp \
+  sample_tree.cpp \
   utils.cpp \
+
+libsimpleperf_src_files := \
+  $(libsimpleperf_common_src_files) \
+  cmd_list.cpp \
+  cmd_record.cpp \
+  cmd_stat.cpp \
+  environment.cpp \
+  event_fd.cpp \
+  event_selection_set.cpp \
+  record_file_writer.cpp \
   workload.cpp \
 
+libsimpleperf_darwin_src_files := \
+  $(libsimpleperf_common_src_files) \
+  environment_fake.cpp \
+
 include $(CLEAR_VARS)
 LOCAL_CLANG := true
 LOCAL_CPPFLAGS := $(simpleperf_common_cppflags)
@@ -58,7 +77,7 @@
 ifeq ($(HOST_OS),linux)
 include $(CLEAR_VARS)
 LOCAL_CLANG := true
-LOCAL_CPPFLAGS := $(simpleperf_common_cppflags)
+LOCAL_CPPFLAGS := $(simpleperf_host_common_cppflags)
 LOCAL_SRC_FILES := $(libsimpleperf_src_files)
 LOCAL_SHARED_LIBRARIES := $(simpleperf_common_shared_libraries)
 LOCAL_LDLIBS := -lrt
@@ -70,6 +89,22 @@
 include $(BUILD_HOST_STATIC_LIBRARY)
 endif
 
+ifeq ($(HOST_OS),darwin)
+include $(CLEAR_VARS)
+LOCAL_CLANG := true
+LOCAL_CPPFLAGS := $(simpleperf_host_darwin_cppflags)
+LOCAL_SRC_FILES := $(libsimpleperf_darwin_src_files)
+LOCAL_SHARED_LIBRARIES := $(simpleperf_common_shared_libraries)
+LOCAL_MODULE := libsimpleperf
+LOCAL_MODULE_TAGS := optional
+LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
+include $(LLVM_ROOT_PATH)/llvm.mk
+include $(LLVM_HOST_BUILD_MK)
+include $(BUILD_HOST_SHARED_LIBRARY)
+endif
+
+# simpleperf
+# =========================================================
 include $(CLEAR_VARS)
 LOCAL_CLANG := true
 LOCAL_CPPFLAGS := $(simpleperf_common_cppflags)
@@ -85,7 +120,7 @@
 ifeq ($(HOST_OS),linux)
 include $(CLEAR_VARS)
 LOCAL_CLANG := true
-LOCAL_CPPFLAGS := $(simpleperf_common_cppflags)
+LOCAL_CPPFLAGS := $(simpleperf_host_common_cppflags)
 LOCAL_SRC_FILES := main.cpp
 LOCAL_WHOLE_STATIC_LIBRARIES := libsimpleperf
 LOCAL_SHARED_LIBRARIES := $(simpleperf_common_shared_libraries)
@@ -96,16 +131,37 @@
 include $(BUILD_HOST_EXECUTABLE)
 endif
 
+ifeq ($(HOST_OS),darwin)
+include $(CLEAR_VARS)
+LOCAL_CLANG := true
+LOCAL_CPPFLAGS := $(simpleperf_host_darwin_cppflags)
+LOCAL_SRC_FILES := main.cpp
+LOCAL_SHARED_LIBRARIES := libsimpleperf $(simpleperf_common_shared_libraries)
+LOCAL_MODULE := simpleperf
+LOCAL_MODULE_TAGS := optional
+LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
+include $(BUILD_HOST_EXECUTABLE)
+endif
+
+# simpleperf_unit_test
+# =========================================================
+simpleperf_unit_test_common_src_files := \
+  command_test.cpp \
+  gtest_main.cpp \
+  record_test.cpp \
+  sample_tree_test.cpp \
+
 simpleperf_unit_test_src_files := \
+  $(simpleperf_unit_test_common_src_files) \
   cmd_dumprecord_test.cpp \
   cmd_list_test.cpp \
   cmd_record_test.cpp \
+  cmd_report_test.cpp \
   cmd_stat_test.cpp \
-  command_test.cpp \
+  cpu_offline_test.cpp \
   environment_test.cpp \
-  gtest_main.cpp \
+  read_elf_test.cpp \
   record_file_test.cpp \
-  record_test.cpp \
   workload_test.cpp \
 
 include $(CLEAR_VARS)
@@ -122,7 +178,7 @@
 ifeq ($(HOST_OS),linux)
 include $(CLEAR_VARS)
 LOCAL_CLANG := true
-LOCAL_CPPFLAGS := $(simpleperf_common_cppflags)
+LOCAL_CPPFLAGS := $(simpleperf_host_common_cppflags)
 LOCAL_SRC_FILES := $(simpleperf_unit_test_src_files)
 LOCAL_WHOLE_STATIC_LIBRARIES := libsimpleperf
 LOCAL_SHARED_LIBRARIES := $(simpleperf_common_shared_libraries)
@@ -131,3 +187,15 @@
 LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
 include $(BUILD_HOST_NATIVE_TEST)
 endif
+
+ifeq ($(HOST_OS),darwin)
+include $(CLEAR_VARS)
+LOCAL_CLANG := true
+LOCAL_CPPFLAGS := $(simpleperf_host_darwin_cppflags)
+LOCAL_SRC_FILES := $(simpleperf_unit_test_common_src_files)
+LOCAL_SHARED_LIBRARIES := libsimpleperf $(simpleperf_common_shared_libraries)
+LOCAL_MODULE := simpleperf_unit_test
+LOCAL_MODULE_TAGS := optional
+LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
+include $(BUILD_HOST_NATIVE_TEST)
+endif
diff --git a/simpleperf/callchain.cpp b/simpleperf/callchain.cpp
new file mode 100644
index 0000000..c914d1e
--- /dev/null
+++ b/simpleperf/callchain.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "callchain.h"
+
+#include <queue>
+#include <base/logging.h>
+#include "sample_tree.h"
+
+static bool MatchSampleByName(const SampleEntry* sample1, const SampleEntry* sample2) {
+  return (sample1->symbol->name == sample2->symbol->name);
+}
+
+static size_t GetMatchingLengthInNode(const CallChainNode* node,
+                                      const std::vector<SampleEntry*>& chain, size_t chain_start) {
+  size_t i, j;
+  for (i = 0, j = chain_start; i < node->chain.size() && j < chain.size(); ++i, ++j) {
+    if (!MatchSampleByName(node->chain[i], chain[j])) {
+      break;
+    }
+  }
+  return i;
+}
+
+static CallChainNode* FindMatchingNode(const std::vector<std::unique_ptr<CallChainNode>>& nodes,
+                                       const SampleEntry* sample) {
+  for (auto& node : nodes) {
+    if (MatchSampleByName(node->chain.front(), sample)) {
+      return node.get();
+    }
+  }
+  return nullptr;
+}
+
+static std::unique_ptr<CallChainNode> AllocateNode(const std::vector<SampleEntry*>& chain,
+                                                   size_t chain_start, uint64_t period,
+                                                   uint64_t children_period) {
+  std::unique_ptr<CallChainNode> node(new CallChainNode);
+  for (size_t i = chain_start; i < chain.size(); ++i) {
+    node->chain.push_back(chain[i]);
+  }
+  node->period = period;
+  node->children_period = children_period;
+  return node;
+}
+
+static void SplitNode(CallChainNode* parent, size_t parent_length) {
+  std::unique_ptr<CallChainNode> child =
+      AllocateNode(parent->chain, parent_length, parent->period, parent->children_period);
+  child->children = std::move(parent->children);
+  parent->period = 0;
+  parent->children_period = child->period + child->children_period;
+  parent->chain.resize(parent_length);
+  parent->children.clear();
+  parent->children.push_back(std::move(child));
+}
+
+void CallChainRoot::AddCallChain(const std::vector<SampleEntry*>& callchain, uint64_t period) {
+  children_period += period;
+  CallChainNode* p = FindMatchingNode(children, callchain[0]);
+  if (p == nullptr) {
+    std::unique_ptr<CallChainNode> new_node = AllocateNode(callchain, 0, period, 0);
+    children.push_back(std::move(new_node));
+    return;
+  }
+  size_t callchain_pos = 0;
+  while (true) {
+    size_t match_length = GetMatchingLengthInNode(p, callchain, callchain_pos);
+    CHECK_GT(match_length, 0u);
+    callchain_pos += match_length;
+    bool find_child = true;
+    if (match_length < p->chain.size()) {
+      SplitNode(p, match_length);
+      find_child = false;  // No need to find matching node in p->children.
+    }
+    if (callchain_pos == callchain.size()) {
+      p->period += period;
+      return;
+    }
+    p->children_period += period;
+    if (find_child) {
+      CallChainNode* np = FindMatchingNode(p->children, callchain[callchain_pos]);
+      if (np != nullptr) {
+        p = np;
+        continue;
+      }
+    }
+    std::unique_ptr<CallChainNode> new_node = AllocateNode(callchain, callchain_pos, period, 0);
+    p->children.push_back(std::move(new_node));
+    break;
+  }
+}
+
+static bool CompareNodeByPeriod(const std::unique_ptr<CallChainNode>& n1,
+                                const std::unique_ptr<CallChainNode>& n2) {
+  uint64_t period1 = n1->period + n1->children_period;
+  uint64_t period2 = n2->period + n2->children_period;
+  return period1 > period2;
+}
+
+void CallChainRoot::SortByPeriod() {
+  std::queue<std::vector<std::unique_ptr<CallChainNode>>*> queue;
+  queue.push(&children);
+  while (!queue.empty()) {
+    std::vector<std::unique_ptr<CallChainNode>>* v = queue.front();
+    queue.pop();
+    std::sort(v->begin(), v->end(), CompareNodeByPeriod);
+    for (auto& node : *v) {
+      queue.push(&node->children);
+    }
+  }
+}
diff --git a/simpleperf/callchain.h b/simpleperf/callchain.h
new file mode 100644
index 0000000..4b8a9d4
--- /dev/null
+++ b/simpleperf/callchain.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SIMPLE_PERF_CALLCHAIN_H_
+#define SIMPLE_PERF_CALLCHAIN_H_
+
+#include <memory>
+#include <vector>
+
+struct SampleEntry;
+
+struct CallChainNode {
+  uint64_t period;
+  uint64_t children_period;
+  std::vector<SampleEntry*> chain;
+  std::vector<std::unique_ptr<CallChainNode>> children;
+};
+
+struct CallChainRoot {
+  uint64_t children_period;
+  std::vector<std::unique_ptr<CallChainNode>> children;
+
+  CallChainRoot() : children_period(0) {
+  }
+
+  void AddCallChain(const std::vector<SampleEntry*>& callchain, uint64_t period);
+  void SortByPeriod();
+};
+
+#endif  // SIMPLE_PERF_CALLCHAIN_H_
diff --git a/simpleperf/cmd_dumprecord.cpp b/simpleperf/cmd_dumprecord.cpp
index 57eec1f..28175d9 100644
--- a/simpleperf/cmd_dumprecord.cpp
+++ b/simpleperf/cmd_dumprecord.cpp
@@ -22,17 +22,23 @@
 
 #include <base/logging.h>
 #include <base/stringprintf.h>
+#include <base/strings.h>
 
 #include "command.h"
 #include "event_attr.h"
 #include "record.h"
 #include "record_file.h"
+#include "utils.h"
 
 using namespace PerfFileFormat;
 
-class DumpRecordCommandImpl {
+class DumpRecordCommand : public Command {
  public:
-  DumpRecordCommandImpl() : record_filename_("perf.data") {
+  DumpRecordCommand()  // Usage text must name the subcommand as it is registered: "dump".
+      : Command("dump", "dump perf record file",
+                "Usage: simpleperf dump [options] [perf_record_file]\n"
+                "    Dump different parts of a perf record file. Default file is perf.data.\n"),
+        record_filename_("perf.data") {
   }
 
   bool Run(const std::vector<std::string>& args);
@@ -46,11 +52,9 @@
 
   std::string record_filename_;
   std::unique_ptr<RecordFileReader> record_file_reader_;
-
-  std::vector<int> features_;
 };
 
-bool DumpRecordCommandImpl::Run(const std::vector<std::string>& args) {
+bool DumpRecordCommand::Run(const std::vector<std::string>& args) {
   if (!ParseOptions(args)) {
     return false;
   }
@@ -66,16 +70,19 @@
   return true;
 }
 
-bool DumpRecordCommandImpl::ParseOptions(const std::vector<std::string>& args) {
-  if (args.size() == 2) {
-    record_filename_ = args[1];
+bool DumpRecordCommand::ParseOptions(const std::vector<std::string>& args) {
+  if (args.size() == 1) {
+    record_filename_ = args[0];
+  } else if (args.size() > 1) {
+    ReportUnknownOption(args, 1);
+    return false;
   }
   return true;
 }
 
 static const std::string GetFeatureName(int feature);
 
-void DumpRecordCommandImpl::DumpFileHeader() {
+void DumpRecordCommand::DumpFileHeader() {
   const FileHeader* header = record_file_reader_->FileHeader();
   printf("magic: ");
   for (size_t i = 0; i < 8; ++i) {
@@ -98,15 +105,15 @@
   printf("event_types[file section]: offset %" PRId64 ", size %" PRId64 "\n",
          header->event_types.offset, header->event_types.size);
 
-  features_.clear();
+  std::vector<int> features;
   for (size_t i = 0; i < FEAT_MAX_NUM; ++i) {
     size_t j = i / 8;
     size_t k = i % 8;
     if ((header->features[j] & (1 << k)) != 0) {
-      features_.push_back(i);
+      features.push_back(i);
     }
   }
-  for (auto& feature : features_) {
+  for (auto& feature : features) {
     printf("feature: %s\n", GetFeatureName(feature).c_str());
   }
 }
@@ -127,7 +134,7 @@
       {FEAT_EVENT_DESC, "event_desc"},
       {FEAT_CPU_TOPOLOGY, "cpu_topology"},
       {FEAT_NUMA_TOPOLOGY, "numa_topology"},
-      {FEAT_BRANCH_STACK, "branck_stack"},
+      {FEAT_BRANCH_STACK, "branch_stack"},
       {FEAT_PMU_MAPPINGS, "pmu_mappings"},
       {FEAT_GROUP_DESC, "group_desc"},
   };
@@ -138,7 +145,7 @@
   return android::base::StringPrintf("unknown_feature(%d)", feature);
 }
 
-void DumpRecordCommandImpl::DumpAttrSection() {
+void DumpRecordCommand::DumpAttrSection() {
   std::vector<const FileAttr*> attrs = record_file_reader_->AttrSection();
   for (size_t i = 0; i < attrs.size(); ++i) {
     auto& attr = attrs[i];
@@ -157,19 +164,18 @@
   }
 }
 
-void DumpRecordCommandImpl::DumpDataSection() {
+void DumpRecordCommand::DumpDataSection() {
   std::vector<std::unique_ptr<const Record>> records = record_file_reader_->DataSection();
   for (auto& record : records) {
     record->Dump();
   }
 }
 
-void DumpRecordCommandImpl::DumpFeatureSection() {
-  std::vector<SectionDesc> sections = record_file_reader_->FeatureSectionDescriptors();
-  CHECK_EQ(sections.size(), features_.size());
-  for (size_t i = 0; i < features_.size(); ++i) {
-    int feature = features_[i];
-    SectionDesc& section = sections[i];
+void DumpRecordCommand::DumpFeatureSection() {
+  std::map<int, SectionDesc> section_map = record_file_reader_->FeatureSectionDescriptors();
+  for (auto& pair : section_map) {
+    int feature = pair.first;
+    SectionDesc& section = pair.second;
     printf("feature section for %s: offset %" PRId64 ", size %" PRId64 "\n",
            GetFeatureName(feature).c_str(), section.offset, section.size);
     if (feature == FEAT_BUILD_ID) {
@@ -178,27 +184,18 @@
       while (p < end) {
         const perf_event_header* header = reinterpret_cast<const perf_event_header*>(p);
         CHECK_LE(p + header->size, end);
-        CHECK_EQ(PERF_RECORD_BUILD_ID, header->type);
         BuildIdRecord record(header);
+        record.header.type = PERF_RECORD_BUILD_ID;  // Set type explicitly as perf doesn't set it.
         record.Dump(1);
         p += header->size;
       }
+    } else if (feature == FEAT_CMDLINE) {
+      std::vector<std::string> cmdline = record_file_reader_->ReadCmdlineFeature();
+      PrintIndented(1, "cmdline: %s\n", android::base::Join(cmdline, ' ').c_str());
     }
   }
 }
 
-class DumpRecordCommand : public Command {
- public:
-  DumpRecordCommand()
-      : Command("dump", "dump perf record file",
-                "Usage: simpleperf dumprecord [options] [perf_record_file]\n"
-                "    Dump different parts of a perf record file. Default file is perf.data.\n") {
-  }
-
-  bool Run(const std::vector<std::string>& args) override {
-    DumpRecordCommandImpl impl;
-    return impl.Run(args);
-  }
-};
-
-DumpRecordCommand dumprecord_cmd;
+__attribute__((constructor)) static void RegisterDumpRecordCommand() {
+  RegisterCommand("dump", [] { return std::unique_ptr<Command>(new DumpRecordCommand); });
+}
diff --git a/simpleperf/cmd_dumprecord_test.cpp b/simpleperf/cmd_dumprecord_test.cpp
index c470833..f23ae16 100644
--- a/simpleperf/cmd_dumprecord_test.cpp
+++ b/simpleperf/cmd_dumprecord_test.cpp
@@ -21,22 +21,22 @@
 class DumpRecordCommandTest : public ::testing::Test {
  protected:
   virtual void SetUp() {
-    record_cmd = Command::FindCommandByName("record");
+    record_cmd = CreateCommandInstance("record");
     ASSERT_TRUE(record_cmd != nullptr);
-    dumprecord_cmd = Command::FindCommandByName("dump");
+    dumprecord_cmd = CreateCommandInstance("dump");
     ASSERT_TRUE(dumprecord_cmd != nullptr);
   }
 
-  Command* record_cmd;
-  Command* dumprecord_cmd;
+  std::unique_ptr<Command> record_cmd;
+  std::unique_ptr<Command> dumprecord_cmd;
 };
 
 TEST_F(DumpRecordCommandTest, no_options) {
-  ASSERT_TRUE(record_cmd->Run({"record", "-a", "sleep", "1"}));
-  ASSERT_TRUE(dumprecord_cmd->Run({"dump"}));
+  ASSERT_TRUE(record_cmd->Run({"-a", "sleep", "1"}));
+  ASSERT_TRUE(dumprecord_cmd->Run({}));
 }
 
 TEST_F(DumpRecordCommandTest, record_file_option) {
-  ASSERT_TRUE(record_cmd->Run({"record", "-a", "-o", "perf2.data", "sleep", "1"}));
-  ASSERT_TRUE(dumprecord_cmd->Run({"dump", "perf2.data"}));
+  ASSERT_TRUE(record_cmd->Run({"-a", "-o", "perf2.data", "sleep", "1"}));
+  ASSERT_TRUE(dumprecord_cmd->Run({"perf2.data"}));
 }
diff --git a/simpleperf/cmd_help.cpp b/simpleperf/cmd_help.cpp
index 0f3839b..cc66376 100644
--- a/simpleperf/cmd_help.cpp
+++ b/simpleperf/cmd_help.cpp
@@ -39,10 +39,10 @@
 };
 
 bool HelpCommand::Run(const std::vector<std::string>& args) {
-  if (args.size() == 1) {
+  if (args.empty()) {
     PrintShortHelp();
   } else {
-    Command* cmd = Command::FindCommandByName(args[1]);
+    std::unique_ptr<Command> cmd = CreateCommandInstance(args[0]);
     if (cmd == nullptr) {
       LOG(ERROR) << "malformed command line: can't find help string for unknown command " << args[0];
       LOG(ERROR) << "try using \"--help\"";
@@ -55,9 +55,16 @@
 }
 
 void HelpCommand::PrintShortHelp() {
-  printf("Usage: simpleperf [--help] subcommand [args_for_subcommand]\n\n");
-  for (auto& command : Command::GetAllCommands()) {
-    printf("%-20s%s\n", command->Name().c_str(), command->ShortHelpString().c_str());
+  printf(
+      "Usage: simpleperf [common options] subcommand [args_for_subcommand]\n"
+      "common options:\n"
+      "    -h/--help     Print this help information.\n"
+      "    --log <severity> Set the minimum severity of logging. Possible severities\n"
+      "                     include debug, warning, error, fatal. Default is error.\n"
+      "subcommands:\n");
+  for (auto& cmd_name : GetAllCommandNames()) {
+    std::unique_ptr<Command> cmd = CreateCommandInstance(cmd_name);
+    printf("    %-20s%s\n", cmd_name.c_str(), cmd->ShortHelpString().c_str());
   }
 }
 
@@ -65,4 +72,6 @@
   printf("%s\n", command.LongHelpString().c_str());
 }
 
-HelpCommand help_command;
+__attribute__((constructor)) static void RegisterHelpCommand() {
+  RegisterCommand("help", [] { return std::unique_ptr<Command>(new HelpCommand); });
+}
diff --git a/simpleperf/cmd_list.cpp b/simpleperf/cmd_list.cpp
index 923a884..e65756a 100644
--- a/simpleperf/cmd_list.cpp
+++ b/simpleperf/cmd_list.cpp
@@ -15,21 +15,24 @@
  */
 
 #include <stdio.h>
+#include <map>
 #include <string>
 #include <vector>
 
 #include <base/logging.h>
 
 #include "command.h"
+#include "event_attr.h"
+#include "event_fd.h"
 #include "event_type.h"
-#include "perf_event.h"
 
-static void PrintEventTypesOfType(uint32_t type, const char* type_name,
-                                  const std::vector<const EventType>& event_types) {
-  printf("List of %s:\n", type_name);
+static void PrintEventTypesOfType(uint32_t type, const std::string& type_name,
+                                  const std::vector<EventType>& event_types) {
+  printf("List of %s:\n", type_name.c_str());
   for (auto& event_type : event_types) {
-    if (event_type.type == type && event_type.IsSupportedByKernel()) {
-      printf("  %s\n", event_type.name);
+    if (event_type.type == type &&
+        IsEventAttrSupportedByKernel(CreateDefaultPerfEventAttr(event_type))) {
+      printf("  %s\n", event_type.name.c_str());
     }
   }
   printf("\n");
@@ -38,8 +41,8 @@
 class ListCommand : public Command {
  public:
   ListCommand()
-      : Command("list", "list all available perf events",
-                "Usage: simpleperf list\n"
+      : Command("list", "list available event types",
+                "Usage: simpleperf list [hw|sw|cache|tracepoint]\n"
                 "    List all available perf events on this machine.\n") {
   }
 
@@ -47,17 +50,38 @@
 };
 
 bool ListCommand::Run(const std::vector<std::string>& args) {
-  if (args.size() != 1) {
-    LOG(ERROR) << "malformed command line: list subcommand needs no argument";
-    LOG(ERROR) << "try using \"help list\"";
-    return false;
-  }
-  auto& event_types = EventTypeFactory::GetAllEventTypes();
+  static std::map<std::string, std::pair<int, std::string>> type_map = {
+      {"hw", {PERF_TYPE_HARDWARE, "hardware events"}},
+      {"sw", {PERF_TYPE_SOFTWARE, "software events"}},
+      {"cache", {PERF_TYPE_HW_CACHE, "hw-cache events"}},
+      {"tracepoint", {PERF_TYPE_TRACEPOINT, "tracepoint events"}},
+  };
 
-  PrintEventTypesOfType(PERF_TYPE_HARDWARE, "hardware events", event_types);
-  PrintEventTypesOfType(PERF_TYPE_SOFTWARE, "software events", event_types);
-  PrintEventTypesOfType(PERF_TYPE_HW_CACHE, "hw-cache events", event_types);
+  std::vector<std::string> names;
+  if (args.empty()) {
+    for (auto& item : type_map) {
+      names.push_back(item.first);
+    }
+  } else {
+    for (auto& arg : args) {
+      if (type_map.find(arg) != type_map.end()) {
+        names.push_back(arg);
+      } else {
+        LOG(ERROR) << "unknown event type category: " << arg << ", try using \"help list\"";
+        return false;
+      }
+    }
+  }
+
+  auto& event_types = GetAllEventTypes();
+
+  for (auto& name : names) {
+    auto it = type_map.find(name);
+    PrintEventTypesOfType(it->second.first, it->second.second, event_types);
+  }
   return true;
 }
 
-ListCommand list_command;
+__attribute__((constructor)) static void RegisterListCommand() {
+  RegisterCommand("list", [] { return std::unique_ptr<Command>(new ListCommand); });
+}
diff --git a/simpleperf/cmd_list_test.cpp b/simpleperf/cmd_list_test.cpp
index 4b873a1..2bc6421 100644
--- a/simpleperf/cmd_list_test.cpp
+++ b/simpleperf/cmd_list_test.cpp
@@ -18,8 +18,24 @@
 
 #include "command.h"
 
-TEST(cmd_list, smoke) {
-  Command* list_cmd = Command::FindCommandByName("list");
-  ASSERT_TRUE(list_cmd != nullptr);
-  ASSERT_TRUE(list_cmd->Run({"list"}));
+class ListCommandTest : public ::testing::Test {
+ protected:
+  void SetUp() override {  // Every test starts from a freshly created "list" command.
+    list_cmd = CreateCommandInstance("list");
+    ASSERT_TRUE(list_cmd != nullptr);
+  }
+
+  std::unique_ptr<Command> list_cmd;
+};
+
+TEST_F(ListCommandTest, no_options) {  // No category argument at all.
+  ASSERT_TRUE(list_cmd->Run({}));
+}
+
+TEST_F(ListCommandTest, one_option) {  // A single category.
+  ASSERT_TRUE(list_cmd->Run({"sw"}));
+}
+
+TEST_F(ListCommandTest, multiple_options) {  // Several categories in one invocation.
+  ASSERT_TRUE(list_cmd->Run({"hw", "tracepoint"}));
 }
diff --git a/simpleperf/cmd_record.cpp b/simpleperf/cmd_record.cpp
index 98a0cd5..f91100a 100644
--- a/simpleperf/cmd_record.cpp
+++ b/simpleperf/cmd_record.cpp
@@ -17,7 +17,9 @@
 #include <libgen.h>
 #include <poll.h>
 #include <signal.h>
+#include <set>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #include <base/logging.h>
@@ -35,21 +37,69 @@
 
 static std::string default_measured_event_type = "cpu-cycles";
 
-class RecordCommandImpl {
+static std::unordered_map<std::string, uint64_t> branch_sampling_type_map = {
+    {"u", PERF_SAMPLE_BRANCH_USER},
+    {"k", PERF_SAMPLE_BRANCH_KERNEL},
+    {"any", PERF_SAMPLE_BRANCH_ANY},
+    {"any_call", PERF_SAMPLE_BRANCH_ANY_CALL},
+    {"any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN},
+    {"ind_call", PERF_SAMPLE_BRANCH_IND_CALL},
+};  // Filter name accepted by -j -> PERF_SAMPLE_BRANCH_* bit OR-ed into branch_sampling_.
+
+// Set from signal_handler(); sig_atomic_t is the flag type guaranteed safe to write in a handler.
+static volatile sig_atomic_t signaled;
+// Only records that a signal arrived; the record loop checks the flag.
+static void signal_handler(int) { signaled = 1; }
+
+class RecordCommand : public Command {
  public:
-  RecordCommandImpl()
-      : use_sample_freq_(true),
-        sample_freq_(1000),
+  RecordCommand()
+      : Command(
+            "record", "record sampling info in perf.data",
+            "Usage: simpleperf record [options] [command [command-args]]\n"
+            "    Gather sampling information when running [command].\n"
+            "    -a           System-wide collection.\n"
+            "    -b           Enable taken branch stack sampling. Same as '-j any'\n"
+            "    -c count     Set event sample period.\n"
+            "    -e event[:modifier]\n"
+            "                 Select the event to sample. Use `simpleperf list` to find\n"
+            "                 all possible event names. Modifiers can be added to define\n"
+            "                 how the event should be monitored. Possible modifiers are:\n"
+            "                   u - monitor user space events only\n"
+            "                   k - monitor kernel space events only\n"
+            "    -f freq      Set event sample frequency.\n"
+            "    -F freq      Same as '-f freq'.\n"
+            "    -g           Enables call-graph recording.\n"
+            "    -j branch_filter1,branch_filter2,...\n"
+            "                 Enable taken branch stack sampling. Each sample\n"
+            "                 captures a series of consecutive taken branches.\n"
+            "                 The following filters are defined:\n"
+            "                   any: any type of branch\n"
+            "                   any_call: any function call or system call\n"
+            "                   any_ret: any function return or system call return\n"
+            "                   ind_call: any indirect branch\n"
+            "                   u: only when the branch target is at the user level\n"
+            "                   k: only when the branch target is in the kernel\n"
+            "                 This option requires at least one branch type among any,\n"
+            "                 any_call, any_ret, ind_call.\n"
+            "    --no-inherit\n"
+            "                 Don't record created child threads/processes.\n"
+            "    -o record_file_name    Set record file name, default is perf.data.\n"
+            "    -p pid1,pid2,...\n"
+            "                 Record events on existing processes. Mutually exclusive with -a.\n"
+            "    -t tid1,tid2,...\n"
+            "                 Record events on existing threads. Mutually exclusive with -a.\n"),
+        use_sample_freq_(true),
+        sample_freq_(4000),
         system_wide_collection_(false),
-        measured_event_type_(nullptr),
+        branch_sampling_(0),
+        callchain_sampling_(false),
+        child_inherit_(true),
         perf_mmap_pages_(256),
         record_filename_("perf.data") {
-    // We need signal SIGCHLD to break poll().
-    saved_sigchild_handler_ = signal(SIGCHLD, [](int) {});
-  }
-
-  ~RecordCommandImpl() {
-    signal(SIGCHLD, saved_sigchild_handler_);
+    signaled = false;  // File-scope flag: clear any value left by an earlier command instance.
+    signal_handler_register_.reset(
+        new SignalHandlerRegister({SIGCHLD, SIGINT, SIGTERM}, signal_handler));
   }
 
   bool Run(const std::vector<std::string>& args);
@@ -59,11 +109,11 @@
  private:
   bool ParseOptions(const std::vector<std::string>& args, std::vector<std::string>* non_option_args);
   bool SetMeasuredEventType(const std::string& event_type_name);
-  void SetEventSelection();
+  bool SetEventSelection();
   bool WriteData(const char* data, size_t size);
   bool DumpKernelAndModuleMmaps();
   bool DumpThreadCommAndMmaps();
-  bool DumpAdditionalFeatures();
+  bool DumpAdditionalFeatures(const std::vector<std::string>& args);
   bool DumpBuildIdFeature();
 
   bool use_sample_freq_;    // Use sample_freq_ when true, otherwise using sample_period_.
@@ -71,7 +121,11 @@
   uint64_t sample_period_;  // Sample once when 'sample_period_' events occur.
 
   bool system_wide_collection_;
-  const EventType* measured_event_type_;
+  uint64_t branch_sampling_;
+  bool callchain_sampling_;
+  bool child_inherit_;
+  std::vector<pid_t> monitored_threads_;
+  std::unique_ptr<EventTypeAndModifier> measured_event_type_modifier_;
   EventSelectionSet event_selection_set_;
 
   // mmap pages used by each perf event file, should be power of 2.
@@ -80,30 +134,40 @@
   std::string record_filename_;
   std::unique_ptr<RecordFileWriter> record_file_writer_;
 
-  sighandler_t saved_sigchild_handler_;
+  std::unique_ptr<SignalHandlerRegister> signal_handler_register_;
 };
 
-bool RecordCommandImpl::Run(const std::vector<std::string>& args) {
+bool RecordCommand::Run(const std::vector<std::string>& args) {
   // 1. Parse options, and use default measured event type if not given.
   std::vector<std::string> workload_args;
   if (!ParseOptions(args, &workload_args)) {
     return false;
   }
-  if (measured_event_type_ == nullptr) {
+  if (measured_event_type_modifier_ == nullptr) {
     if (!SetMeasuredEventType(default_measured_event_type)) {
       return false;
     }
   }
-  SetEventSelection();
+  if (!SetEventSelection()) {
+    return false;
+  }
 
   // 2. Create workload.
-  if (workload_args.empty()) {
-    // TODO: change default workload to sleep 99999, and run record until Ctrl-C.
-    workload_args = std::vector<std::string>({"sleep", "1"});
+  std::unique_ptr<Workload> workload;
+  if (!workload_args.empty()) {
+    workload = Workload::CreateWorkload(workload_args);
+    if (workload == nullptr) {
+      return false;
+    }
   }
-  std::unique_ptr<Workload> workload = Workload::CreateWorkload(workload_args);
-  if (workload == nullptr) {
-    return false;
+  if (!system_wide_collection_ && monitored_threads_.empty()) {
+    if (workload != nullptr) {
+      monitored_threads_.push_back(workload->GetPid());
+      event_selection_set_.SetEnableOnExec(true);
+    } else {
+      LOG(ERROR) << "No threads to monitor. Try `simpleperf help record` for help\n";
+      return false;
+    }
   }
 
   // 3. Open perf_event_files, create memory mapped buffers for perf_event_files, add prepare poll
@@ -113,8 +177,7 @@
       return false;
     }
   } else {
-    event_selection_set_.EnableOnExec();
-    if (!event_selection_set_.OpenEventFilesForProcess(workload->GetPid())) {
+    if (!event_selection_set_.OpenEventFilesForThreadsOnAllCpus(monitored_threads_)) {
       return false;
     }
   }
@@ -126,8 +189,9 @@
 
   // 4. Open record file writer, and dump kernel/modules/threads mmap information.
   record_file_writer_ = RecordFileWriter::CreateInstance(
-      record_filename_, event_selection_set_.FindEventAttrByType(*measured_event_type_),
-      event_selection_set_.FindEventFdsByType(*measured_event_type_));
+      record_filename_,
+      event_selection_set_.FindEventAttrByType(measured_event_type_modifier_->event_type),
+      event_selection_set_.FindEventFdsByType(measured_event_type_modifier_->event_type));
   if (record_file_writer_ == nullptr) {
     return false;
   }
@@ -139,31 +203,28 @@
   }
 
   // 5. Write records in mmap buffers of perf_event_files to output file while workload is running.
-
-  // If monitoring only one process, we use the enable_on_exec flag, and don't need to start
-  // recording manually.
-  if (system_wide_collection_) {
+  if (!event_selection_set_.GetEnableOnExec()) {
     if (!event_selection_set_.EnableEvents()) {
       return false;
     }
   }
-  if (!workload->Start()) {
+  if (workload != nullptr && !workload->Start()) {
     return false;
   }
   auto callback =
-      std::bind(&RecordCommandImpl::WriteData, this, std::placeholders::_1, std::placeholders::_2);
+      std::bind(&RecordCommand::WriteData, this, std::placeholders::_1, std::placeholders::_2);
   while (true) {
     if (!event_selection_set_.ReadMmapEventData(callback)) {
       return false;
     }
-    if (workload->IsFinished()) {
+    if (signaled) {
       break;
     }
     poll(&pollfds[0], pollfds.size(), -1);
   }
 
   // 6. Dump additional features, and close record file.
-  if (!DumpAdditionalFeatures()) {
+  if (!DumpAdditionalFeatures(args)) {
     return false;
   }
   if (!record_file_writer_->Close()) {
@@ -172,12 +233,15 @@
   return true;
 }
 
-bool RecordCommandImpl::ParseOptions(const std::vector<std::string>& args,
-                                     std::vector<std::string>* non_option_args) {
+bool RecordCommand::ParseOptions(const std::vector<std::string>& args,
+                                 std::vector<std::string>* non_option_args) {
+  std::set<pid_t> tid_set;
   size_t i;
-  for (i = 1; i < args.size() && args[i].size() > 0 && args[i][0] == '-'; ++i) {
+  for (i = 0; i < args.size() && args[i].size() > 0 && args[i][0] == '-'; ++i) {
     if (args[i] == "-a") {
       system_wide_collection_ = true;
+    } else if (args[i] == "-b") {  // Same as '-j any': OR the bit so earlier -j filters survive.
+      branch_sampling_ |= PERF_SAMPLE_BRANCH_ANY;
     } else if (args[i] == "-c") {
       if (!NextArgumentOrError(args, &i)) {
         return false;
@@ -207,18 +271,55 @@
         return false;
       }
       use_sample_freq_ = true;
+    } else if (args[i] == "-g") {
+      callchain_sampling_ = true;
+    } else if (args[i] == "-j") {
+      if (!NextArgumentOrError(args, &i)) {
+        return false;
+      }
+      std::vector<std::string> branch_sampling_types = android::base::Split(args[i], ",");
+      for (auto& type : branch_sampling_types) {
+        auto it = branch_sampling_type_map.find(type);
+        if (it == branch_sampling_type_map.end()) {
+          LOG(ERROR) << "unrecognized branch sampling filter: " << type;
+          return false;
+        }
+        branch_sampling_ |= it->second;
+      }
+    } else if (args[i] == "--no-inherit") {
+      child_inherit_ = false;
     } else if (args[i] == "-o") {
       if (!NextArgumentOrError(args, &i)) {
         return false;
       }
       record_filename_ = args[i];
+    } else if (args[i] == "-p") {
+      if (!NextArgumentOrError(args, &i)) {
+        return false;
+      }
+      if (!GetValidThreadsFromProcessString(args[i], &tid_set)) {
+        return false;
+      }
+    } else if (args[i] == "-t") {
+      if (!NextArgumentOrError(args, &i)) {
+        return false;
+      }
+      if (!GetValidThreadsFromThreadString(args[i], &tid_set)) {
+        return false;
+      }
     } else {
-      LOG(ERROR) << "Unknown option for record command: '" << args[i] << "'\n";
-      LOG(ERROR) << "Try `simpleperf help record`";
+      ReportUnknownOption(args, i);
       return false;
     }
   }
 
+  monitored_threads_.insert(monitored_threads_.end(), tid_set.begin(), tid_set.end());
+  if (system_wide_collection_ && !monitored_threads_.empty()) {
+    LOG(ERROR)
+        << "Record system wide and existing processes/threads can't be used at the same time.";
+    return false;
+  }
+
   if (non_option_args != nullptr) {
     non_option_args->clear();
     for (; i < args.size(); ++i) {
@@ -228,36 +329,47 @@
   return true;
 }
 
-bool RecordCommandImpl::SetMeasuredEventType(const std::string& event_type_name) {
-  const EventType* event_type = EventTypeFactory::FindEventTypeByName(event_type_name);
-  if (event_type == nullptr) {
+bool RecordCommand::SetMeasuredEventType(const std::string& event_type_name) {
+  auto parsed = ParseEventType(event_type_name);  // Null when the name cannot be parsed.
+  if (!parsed) {
     return false;
   }
-  measured_event_type_ = event_type;
+  measured_event_type_modifier_ = std::move(parsed);  // Take ownership of the parsed result.
   return true;
 }
 
-void RecordCommandImpl::SetEventSelection() {
-  event_selection_set_.AddEventType(*measured_event_type_);
+bool RecordCommand::SetEventSelection() {  // Applies parsed options; false if one is rejected.
+  if (!event_selection_set_.AddEventType(*measured_event_type_modifier_)) {
+    return false;
+  }
   if (use_sample_freq_) {
     event_selection_set_.SetSampleFreq(sample_freq_);
   } else {
     event_selection_set_.SetSamplePeriod(sample_period_);
   }
   event_selection_set_.SampleIdAll();
+  if (!event_selection_set_.SetBranchSampling(branch_sampling_)) {  // Called even when 0.
+    return false;
+  }
+  if (callchain_sampling_) {  // Set by -g.
+    event_selection_set_.EnableCallChainSampling();
+  }
+  event_selection_set_.SetInherit(child_inherit_);  // Cleared by --no-inherit.
+  return true;
 }
 
-bool RecordCommandImpl::WriteData(const char* data, size_t size) {
+bool RecordCommand::WriteData(const char* data, size_t size) {  // Forwards to the file writer.
   return record_file_writer_->WriteData(data, size);
 }
 
-bool RecordCommandImpl::DumpKernelAndModuleMmaps() {
+bool RecordCommand::DumpKernelAndModuleMmaps() {
   KernelMmap kernel_mmap;
   std::vector<ModuleMmap> module_mmaps;
   if (!GetKernelAndModuleMmaps(&kernel_mmap, &module_mmaps)) {
     return false;
   }
-  const perf_event_attr& attr = event_selection_set_.FindEventAttrByType(*measured_event_type_);
+  const perf_event_attr& attr =
+      event_selection_set_.FindEventAttrByType(measured_event_type_modifier_->event_type);
   MmapRecord mmap_record = CreateMmapRecord(attr, true, UINT_MAX, 0, kernel_mmap.start_addr,
                                             kernel_mmap.len, kernel_mmap.pgoff, kernel_mmap.name);
   if (!record_file_writer_->WriteData(mmap_record.BinaryFormat())) {
@@ -277,47 +389,82 @@
   return true;
 }
 
-bool RecordCommandImpl::DumpThreadCommAndMmaps() {
+bool RecordCommand::DumpThreadCommAndMmaps() {
   std::vector<ThreadComm> thread_comms;
   if (!GetThreadComms(&thread_comms)) {
     return false;
   }
-  const perf_event_attr& attr = event_selection_set_.FindEventAttrByType(*measured_event_type_);
+  const perf_event_attr& attr =
+      event_selection_set_.FindEventAttrByType(measured_event_type_modifier_->event_type);
+
+  // Dump processes: a comm record plus the executable mmaps of each entry with pid == tid.
   for (auto& thread : thread_comms) {
-    CommRecord record = CreateCommRecord(attr, thread.tgid, thread.tid, thread.comm);
+    if (thread.pid != thread.tid) {  // Not a process entry; written by the second loop.
+      continue;
+    }
+    CommRecord record = CreateCommRecord(attr, thread.pid, thread.tid, thread.comm);
     if (!record_file_writer_->WriteData(record.BinaryFormat())) {
       return false;
     }
-    if (thread.is_process) {
-      std::vector<ThreadMmap> thread_mmaps;
-      if (!GetThreadMmapsInProcess(thread.tgid, &thread_mmaps)) {
-        // The thread may exit before we get its info.
-        continue;
+    std::vector<ThreadMmap> thread_mmaps;
+    if (!GetThreadMmapsInProcess(thread.pid, &thread_mmaps)) {
+      // The thread may exit before we get its info.
+      continue;
+    }
+    for (auto& thread_mmap : thread_mmaps) {
+      if (thread_mmap.executable == 0) {
+        continue;  // No need to dump non-executable mmap info.
       }
-      for (auto& thread_mmap : thread_mmaps) {
-        if (thread_mmap.executable == 0) {
-          continue;  // No need to dump non-executable mmap info.
-        }
-        MmapRecord record =
-            CreateMmapRecord(attr, false, thread.tgid, thread.tid, thread_mmap.start_addr,
-                             thread_mmap.len, thread_mmap.pgoff, thread_mmap.name);
-        if (!record_file_writer_->WriteData(record.BinaryFormat())) {
-          return false;
-        }
+      MmapRecord record =
+          CreateMmapRecord(attr, false, thread.pid, thread.tid, thread_mmap.start_addr,
+                           thread_mmap.len, thread_mmap.pgoff, thread_mmap.name);
+      if (!record_file_writer_->WriteData(record.BinaryFormat())) {
+        return false;
       }
     }
   }
+
+  // Dump threads: a fork record, then a comm record, for each entry with pid != tid.
+  for (auto& thread : thread_comms) {
+    if (thread.pid == thread.tid) {  // Process entries were already written by the first loop.
+      continue;
+    }
+    ForkRecord fork_record = CreateForkRecord(attr, thread.pid, thread.tid, thread.pid, thread.pid);
+    if (!record_file_writer_->WriteData(fork_record.BinaryFormat())) {
+      return false;
+    }
+    CommRecord comm_record = CreateCommRecord(attr, thread.pid, thread.tid, thread.comm);
+    if (!record_file_writer_->WriteData(comm_record.BinaryFormat())) {
+      return false;
+    }
+  }
   return true;
 }
 
-bool RecordCommandImpl::DumpAdditionalFeatures() {
-  if (!record_file_writer_->WriteFeatureHeader(1)) {
+bool RecordCommand::DumpAdditionalFeatures(const std::vector<std::string>& args) {
+  // Build-id and cmdline features are always written; branch-stack only with branch sampling.
+  const bool has_branch_stack = (branch_sampling_ != 0);
+  const size_t feature_count = has_branch_stack ? 3 : 2;
+  if (!record_file_writer_->WriteFeatureHeader(feature_count)) {
     return false;
   }
-  return DumpBuildIdFeature();
+  if (!DumpBuildIdFeature()) {
+    return false;
+  }
+  std::string exec_path = "simpleperf";
+  GetExecPath(&exec_path);
+  std::vector<std::string> cmdline = {exec_path, "record"};
+  cmdline.insert(cmdline.end(), args.begin(), args.end());
+  if (!record_file_writer_->WriteCmdlineFeature(cmdline)) {
+    return false;
+  }
+  if (has_branch_stack && !record_file_writer_->WriteBranchStackFeature()) {
+    return false;
+  }
+  return true;
 }
 
-bool RecordCommandImpl::DumpBuildIdFeature() {
+bool RecordCommand::DumpBuildIdFeature() {
   std::vector<std::string> hit_kernel_modules;
   std::vector<std::string> hit_user_files;
   if (!record_file_writer_->GetHitModules(&hit_kernel_modules, &hit_user_files)) {
@@ -363,26 +510,6 @@
   return true;
 }
 
-class RecordCommand : public Command {
- public:
-  RecordCommand()
-      : Command("record", "record sampling info in perf.data",
-                "Usage: simpleperf record [options] [command [command-args]]\n"
-                "    Gather sampling information when running [command]. If [command]\n"
-                "    is not specified, sleep 1 is used instead.\n"
-                "    -a           System-wide collection.\n"
-                "    -c count     Set event sample period.\n"
-                "    -e event     Select the event to sample (Use `simpleperf list`)\n"
-                "                 to find all possible event names.\n"
-                "    -f freq      Set event sample frequency.\n"
-                "    -F freq      Same as '-f freq'.\n"
-                "    -o record_file_name    Set record file name, default is perf.data.\n") {
-  }
-
-  bool Run(const std::vector<std::string>& args) override {
-    RecordCommandImpl impl;
-    return impl.Run(args);
-  }
-};
-
-RecordCommand record_command;
+__attribute__((constructor)) static void RegisterRecordCommand() {  // Runs before main().
+  RegisterCommand("record", [] { return std::unique_ptr<Command>{new RecordCommand()}; });
+}
diff --git a/simpleperf/cmd_record_test.cpp b/simpleperf/cmd_record_test.cpp
index f0a8878..a4e2be6 100644
--- a/simpleperf/cmd_record_test.cpp
+++ b/simpleperf/cmd_record_test.cpp
@@ -16,50 +16,47 @@
 
 #include <gtest/gtest.h>
 
+#include <base/stringprintf.h>
+
 #include "command.h"
 #include "environment.h"
 #include "record.h"
 #include "record_file.h"
+#include "test_util.h"
 
 using namespace PerfFileFormat;
 
-class RecordCommandTest : public ::testing::Test {
- protected:
-  virtual void SetUp() {
-    record_cmd = Command::FindCommandByName("record");
-    ASSERT_TRUE(record_cmd != nullptr);
-  }
-
-  Command* record_cmd;
-};
-
-TEST_F(RecordCommandTest, no_options) {
-  ASSERT_TRUE(record_cmd->Run({"record", "sleep", "1"}));
+static std::unique_ptr<Command> RecordCmd() {  // New "record" command instance per call.
+  return CreateCommandInstance("record");
 }
 
-TEST_F(RecordCommandTest, system_wide_option) {
-  ASSERT_TRUE(record_cmd->Run({"record", "-a", "sleep", "1"}));
+TEST(record_cmd, no_options) {
+  ASSERT_TRUE(RecordCmd()->Run({"sleep", "1"}));
 }
 
-TEST_F(RecordCommandTest, sample_period_option) {
-  ASSERT_TRUE(record_cmd->Run({"record", "-c", "100000", "sleep", "1"}));
+TEST(record_cmd, system_wide_option) {
+  ASSERT_TRUE(RecordCmd()->Run({"-a", "sleep", "1"}));
 }
 
-TEST_F(RecordCommandTest, event_option) {
-  ASSERT_TRUE(record_cmd->Run({"record", "-e", "cpu-clock", "sleep", "1"}));
+TEST(record_cmd, sample_period_option) {
+  ASSERT_TRUE(RecordCmd()->Run({"-c", "100000", "sleep", "1"}));
 }
 
-TEST_F(RecordCommandTest, freq_option) {
-  ASSERT_TRUE(record_cmd->Run({"record", "-f", "99", "sleep", "1"}));
-  ASSERT_TRUE(record_cmd->Run({"record", "-F", "99", "sleep", "1"}));
+TEST(record_cmd, event_option) {
+  ASSERT_TRUE(RecordCmd()->Run({"-e", "cpu-clock", "sleep", "1"}));
 }
 
-TEST_F(RecordCommandTest, output_file_option) {
-  ASSERT_TRUE(record_cmd->Run({"record", "-o", "perf2.data", "sleep", "1"}));
+TEST(record_cmd, freq_option) {
+  ASSERT_TRUE(RecordCmd()->Run({"-f", "99", "sleep", "1"}));
+  ASSERT_TRUE(RecordCmd()->Run({"-F", "99", "sleep", "1"}));
 }
 
-TEST_F(RecordCommandTest, dump_kernel_mmap) {
-  ASSERT_TRUE(record_cmd->Run({"record", "sleep", "1"}));
+TEST(record_cmd, output_file_option) {
+  ASSERT_TRUE(RecordCmd()->Run({"-o", "perf2.data", "sleep", "1"}));
+}
+
+TEST(record_cmd, dump_kernel_mmap) {
+  ASSERT_TRUE(RecordCmd()->Run({"sleep", "1"}));
   std::unique_ptr<RecordFileReader> reader = RecordFileReader::CreateInstance("perf.data");
   ASSERT_TRUE(reader != nullptr);
   std::vector<std::unique_ptr<const Record>> records = reader->DataSection();
@@ -77,8 +74,8 @@
   ASSERT_TRUE(have_kernel_mmap);
 }
 
-TEST_F(RecordCommandTest, dump_build_id_feature) {
-  ASSERT_TRUE(record_cmd->Run({"record", "sleep", "1"}));
+TEST(record_cmd, dump_build_id_feature) {
+  ASSERT_TRUE(RecordCmd()->Run({"sleep", "1"}));
   std::unique_ptr<RecordFileReader> reader = RecordFileReader::CreateInstance("perf.data");
   ASSERT_TRUE(reader != nullptr);
   const FileHeader* file_header = reader->FileHeader();
@@ -86,3 +83,51 @@
   ASSERT_TRUE(file_header->features[FEAT_BUILD_ID / 8] & (1 << (FEAT_BUILD_ID % 8)));
   ASSERT_GT(reader->FeatureSectionDescriptors().size(), 0u);
 }
+
+TEST(record_cmd, tracepoint_event) {  // System-wide (-a) recording of a tracepoint event.
+  ASSERT_TRUE(RecordCmd()->Run({"-a", "-e", "sched:sched_switch", "sleep", "1"}));
+}
+
+extern bool IsBranchSamplingSupported();
+
+TEST(record_cmd, branch_sampling) {
+  if (!IsBranchSamplingSupported()) {
+    GTEST_LOG_(INFO)
+        << "This test does nothing as branch stack sampling is not supported on this device.";
+    return;
+  }
+  ASSERT_TRUE(RecordCmd()->Run({"-a", "-b", "sleep", "1"}));
+  ASSERT_TRUE(RecordCmd()->Run({"-j", "any,any_call,any_ret,ind_call", "sleep", "1"}));
+  ASSERT_TRUE(RecordCmd()->Run({"-j", "any,k", "sleep", "1"}));
+  ASSERT_TRUE(RecordCmd()->Run({"-j", "any,u", "sleep", "1"}));
+  ASSERT_FALSE(RecordCmd()->Run({"-j", "u", "sleep", "1"}));  // No branch type given.
+}
+
+TEST(record_cmd, event_modifier) {  // Event name carrying a ":u" modifier.
+  ASSERT_TRUE(RecordCmd()->Run({"-e", "cpu-cycles:u", "sleep", "1"}));
+}
+
+TEST(record_cmd, callchain_sampling) {  // -g option.
+  ASSERT_TRUE(RecordCmd()->Run({"-g", "sleep", "1"}));
+}
+
+TEST(record_cmd, existing_processes) {  // -p with a comma-separated list of two pids.
+  std::vector<std::unique_ptr<Workload>> workloads;
+  CreateProcesses(2, &workloads);  // NOTE(review): result is not checked here — confirm.
+  std::string pid_list =
+      android::base::StringPrintf("%d,%d", workloads[0]->GetPid(), workloads[1]->GetPid());
+  ASSERT_TRUE(RecordCmd()->Run({"-p", pid_list}));
+}
+
+TEST(record_cmd, existing_threads) {  // -t with a comma-separated list of two ids.
+  std::vector<std::unique_ptr<Workload>> workloads;
+  CreateProcesses(2, &workloads);  // NOTE(review): result is not checked here — confirm.
+  // Process id can also be used as thread id in linux.
+  std::string tid_list =
+      android::base::StringPrintf("%d,%d", workloads[0]->GetPid(), workloads[1]->GetPid());
+  ASSERT_TRUE(RecordCmd()->Run({"-t", tid_list}));
+}
+
+TEST(record_cmd, no_monitored_threads) {  // No -a, -p, -t and no workload: nothing to monitor.
+  ASSERT_FALSE(RecordCmd()->Run({}));
+}
diff --git a/simpleperf/cmd_report.cpp b/simpleperf/cmd_report.cpp
new file mode 100644
index 0000000..0e8f8e6
--- /dev/null
+++ b/simpleperf/cmd_report.cpp
@@ -0,0 +1,646 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <inttypes.h>
+#include <functional>
+#include <map>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <base/logging.h>
+#include <base/stringprintf.h>
+#include <base/strings.h>
+
+#include "command.h"
+#include "environment.h"
+#include "event_attr.h"
+#include "event_type.h"
+#include "record.h"
+#include "record_file.h"
+#include "sample_tree.h"
+
+// Base class of a report column: holds the column title and the widest text seen so far.
+class Displayable {
+ public:
+  Displayable(const std::string& name) : name_(name), width_(name.size()) {}
+  virtual ~Displayable() {}
+
+  // Column title.
+  const std::string& Name() const { return name_; }
+  // Current column width; starts at the title's length.
+  size_t Width() const { return width_; }
+
+  // Returns the text shown in this column for |sample|.
+  virtual std::string Show(const SampleEntry& sample) const = 0;
+  // Widens the column when the text for |sample| is longer than the current width.
+  void AdjustWidth(const SampleEntry& sample) {
+    const size_t shown_size = Show(sample).size();
+    if (shown_size > width_) {
+      width_ = shown_size;
+    }
+  }
+
+ private:
+  const std::string name_;
+  size_t width_;
+};
+
+// "Children" column: (period + accumulated_period) as a percentage of the tree's total period.
+class AccumulatedOverheadItem : public Displayable {
+ public:
+  AccumulatedOverheadItem(const SampleTree& sample_tree)
+      : Displayable("Children"), sample_tree_(sample_tree) {}
+
+  std::string Show(const SampleEntry& sample) const override {
+    const uint64_t total = sample_tree_.TotalPeriod();
+    const uint64_t inclusive = sample.period + sample.accumulated_period;
+    const double percent = (total == 0) ? 0.0 : 100.0 * inclusive / total;  // 0% if empty.
+    return android::base::StringPrintf("%.2lf%%", percent);
+  }
+
+ private:
+  const SampleTree& sample_tree_;
+};
+
+// Column showing sample.period as a percentage of the tree's total period ("Self" by default).
+class SelfOverheadItem : public Displayable {
+ public:
+  SelfOverheadItem(const SampleTree& sample_tree, const std::string& name = "Self")
+      : Displayable(name), sample_tree_(sample_tree) {}
+
+  std::string Show(const SampleEntry& sample) const override {
+    const uint64_t total = sample_tree_.TotalPeriod();
+    const uint64_t self = sample.period;
+    const double percent = (total == 0) ? 0.0 : 100.0 * self / total;  // 0% if empty.
+    return android::base::StringPrintf("%.2lf%%", percent);
+  }
+
+ private:
+  const SampleTree& sample_tree_;
+};
+
+// "Sample" column: prints the entry's sample_count.
+class SampleCountItem : public Displayable {
+ public:
+  SampleCountItem() : Displayable("Sample") {}
+
+  std::string Show(const SampleEntry& sample) const override {
+    return android::base::StringPrintf("%" PRId64, sample.sample_count);
+  }
+};
+
+// Interface for report keys that can compare two sample entries.
+class Comparable {
+ public:
+  virtual ~Comparable() {}
+
+  virtual int Compare(const SampleEntry& sample1, const SampleEntry& sample2) const = 0;
+};
+
+// "Pid" sort key and column, read from the sample's thread entry.
+class PidItem : public Displayable, public Comparable {
+ public:
+  PidItem() : Displayable("Pid") {}
+
+  int Compare(const SampleEntry& lhs, const SampleEntry& rhs) const override {
+    return lhs.thread->pid - rhs.thread->pid;
+  }
+
+  std::string Show(const SampleEntry& sample) const override {
+    return android::base::StringPrintf("%d", sample.thread->pid);
+  }
+};
+
+// "Tid" sort key and column, read from the sample's thread entry.
+class TidItem : public Displayable, public Comparable {
+ public:
+  TidItem() : Displayable("Tid") {}
+
+  int Compare(const SampleEntry& lhs, const SampleEntry& rhs) const override {
+    return lhs.thread->tid - rhs.thread->tid;
+  }
+
+  std::string Show(const SampleEntry& sample) const override {
+    return android::base::StringPrintf("%d", sample.thread->tid);
+  }
+};
+
+// "Command" sort key and column, taken from the entry's thread_comm.
+class CommItem : public Displayable, public Comparable {
+ public:
+  CommItem() : Displayable("Command") {}
+
+  int Compare(const SampleEntry& lhs, const SampleEntry& rhs) const override {
+    return strcmp(lhs.thread_comm, rhs.thread_comm);
+  }
+
+  std::string Show(const SampleEntry& sample) const override {
+    return sample.thread_comm;
+  }
+};
+
+// Sort key and column for the dso path of the sample's map; default title "Shared Object".
+class DsoItem : public Displayable, public Comparable {
+ public:
+  DsoItem(const std::string& name = "Shared Object") : Displayable(name) {}
+
+  int Compare(const SampleEntry& lhs, const SampleEntry& rhs) const override {
+    return strcmp(lhs.map->dso->path.c_str(), rhs.map->dso->path.c_str());
+  }
+
+  std::string Show(const SampleEntry& sample) const override {
+    return sample.map->dso->path;
+  }
+};
+
+// Sort key and column for the sample's symbol name; default title "Symbol".
+class SymbolItem : public Displayable, public Comparable {
+ public:
+  SymbolItem(const std::string& name = "Symbol") : Displayable(name) {}
+
+  int Compare(const SampleEntry& lhs, const SampleEntry& rhs) const override {
+    return strcmp(lhs.symbol->name.c_str(), rhs.symbol->name.c_str());
+  }
+
+  std::string Show(const SampleEntry& sample) const override {
+    return sample.symbol->name;
+  }
+};
+
+// "Source Shared Object" sort key and column: dso path of the entry's branch_from map.
+class DsoFromItem : public Displayable, public Comparable {
+ public:
+  DsoFromItem() : Displayable("Source Shared Object") {}
+
+  int Compare(const SampleEntry& lhs, const SampleEntry& rhs) const override {
+    return strcmp(lhs.branch_from.map->dso->path.c_str(),
+                  rhs.branch_from.map->dso->path.c_str());
+  }
+
+  std::string Show(const SampleEntry& sample) const override {
+    return sample.branch_from.map->dso->path;
+  }
+};
+
+class DsoToItem : public DsoItem {  // DsoItem shown under the title "Target Shared Object".
+ public:
+  DsoToItem() : DsoItem("Target Shared Object") {}
+  // Compare() and Show() are inherited from DsoItem unchanged.
+};
+
+// "Source Symbol" sort key and column: symbol name of the entry's branch_from.
+class SymbolFromItem : public Displayable, public Comparable {
+ public:
+  SymbolFromItem() : Displayable("Source Symbol") {}
+
+  int Compare(const SampleEntry& lhs, const SampleEntry& rhs) const override {
+    return strcmp(lhs.branch_from.symbol->name.c_str(),
+                  rhs.branch_from.symbol->name.c_str());
+  }
+
+  std::string Show(const SampleEntry& sample) const override {
+    return sample.branch_from.symbol->name;
+  }
+};
+
+class SymbolToItem : public SymbolItem {  // SymbolItem shown under the title "Target Symbol".
+ public:
+  SymbolToItem() : SymbolItem("Target Symbol") {}
+  // Compare() and Show() are inherited from SymbolItem unchanged.
+};
+
+static std::set<std::string> branch_sort_keys = {  // Sort keys the help text ties to -b.
+    "dso_from", "dso_to", "symbol_from", "symbol_to",
+};
+
+class ReportCommand : public Command {
+ public:
+  ReportCommand()
+      : Command(
+            "report", "report sampling information in perf.data",
+            "Usage: simpleperf report [options]\n"
+            "    -b            Use the branch-to addresses in sampled taken branches instead of\n"
+            "                  the instruction addresses. Only valid for perf.data recorded with\n"
+            "                  -b/-j option.\n"
+            "    --children    Print the overhead accumulated by appearing in the callchain.\n"
+            "    -g            Print call graph.\n"
+            "    -i <file>     Specify path of record file, default is perf.data.\n"
+            "    -n            Print the sample count for each item.\n"
+            "    --no-demangle        Don't demangle symbol names.\n"
+            "    --sort key1,key2,...\n"
+            "                  Select the keys to sort and print the report. Possible keys\n"
+            "                  include pid, tid, comm, dso, symbol, dso_from, dso_to, symbol_from,\n"
+            "                  symbol_to. dso_from, dso_to, symbol_from, symbol_to can only be\n"
+            "                  used with -b option. Default keys are \"comm,pid,tid,dso,symbol\"\n"
+            "    --symfs <dir>  Look for files with symbols relative to this directory.\n"),
+        record_filename_("perf.data"),
+        use_branch_address_(false),
+        accumulate_callchain_(false),
+        print_callgraph_(false) {  // The body wires CompareSampleEntry into the sample tree.
+    compare_sample_func_t compare_sample_callback = std::bind(
+        &ReportCommand::CompareSampleEntry, this, std::placeholders::_1, std::placeholders::_2);
+    sample_tree_ = std::unique_ptr<SampleTree>(new SampleTree(compare_sample_callback));
+  }
+
+  bool Run(const std::vector<std::string>& args);
+
+ private:
+  bool ParseOptions(const std::vector<std::string>& args);
+  bool ReadEventAttrFromRecordFile();
+  void ReadSampleTreeFromRecordFile();
+  void ProcessSampleRecord(const SampleRecord& r);
+  void ReadFeaturesFromRecordFile();
+  int CompareSampleEntry(const SampleEntry& sample1, const SampleEntry& sample2);
+  void PrintReport();
+  void PrintReportContext();
+  void CollectReportWidth();
+  void CollectReportEntryWidth(const SampleEntry& sample);
+  void PrintReportHeader();
+  void PrintReportEntry(const SampleEntry& sample);
+  void PrintCallGraph(const SampleEntry& sample);
+
+  std::string record_filename_;
+  std::unique_ptr<RecordFileReader> record_file_reader_;
+  perf_event_attr event_attr_;
+  std::vector<std::unique_ptr<Displayable>> displayable_items_;
+  std::vector<Comparable*> comparable_items_;
+  std::unique_ptr<SampleTree> sample_tree_;
+  bool use_branch_address_;
+  std::string record_cmdline_;
+  bool accumulate_callchain_;
+  bool print_callgraph_;
+};
+
+// Entry point of the report command: parse options, load the record file, print.
+// Returns false as soon as option parsing, opening the record file, or reading
+// its event attr fails; returns true once the report has been printed.
+bool ReportCommand::Run(const std::vector<std::string>& args) {
+  // Step 1: command line -> member settings and report columns.
+  const bool options_ok = ParseOptions(args);
+  if (!options_ok) {
+    return false;
+  }
+
+  // Step 2: open the record file, check its attr section, then fill sample_tree_.
+  record_file_reader_ = RecordFileReader::CreateInstance(record_filename_);
+  if (!record_file_reader_ || !ReadEventAttrFromRecordFile()) {
+    return false;
+  }
+  ReadSampleTreeFromRecordFile();
+  ReadFeaturesFromRecordFile();
+
+  // Step 3: write the report to stdout.
+  PrintReport();
+  return true;
+}
+
+// Parses the report command line. Flags are stored in members first; then the report
+// columns are created in display order: overhead column(s), the optional sample count,
+// and one column per sort key (every sort key is also registered as a compare key).
+// Returns false for a missing option argument, an unknown option or sort key, a
+// rejected --symfs directory, or a branch-only sort key used without -b.
+bool ReportCommand::ParseOptions(const std::vector<std::string>& args) {
+  std::vector<std::string> sort_keys = {"comm", "pid", "tid", "dso", "symbol"};
+  bool print_sample_count = false;
+  for (size_t i = 0; i < args.size(); ++i) {
+    if (args[i] == "-b") {
+      use_branch_address_ = true;
+    } else if (args[i] == "--children") {
+      accumulate_callchain_ = true;
+    } else if (args[i] == "-g") {
+      // -g switches on callchain accumulation too.
+      print_callgraph_ = true;
+      accumulate_callchain_ = true;
+    } else if (args[i] == "-n") {
+      print_sample_count = true;
+    } else if (args[i] == "--no-demangle") {
+      DsoFactory::SetDemangle(false);
+    } else if (args[i] == "-i") {
+      if (!NextArgumentOrError(args, &i)) {
+        return false;
+      }
+      record_filename_ = args[i];
+    } else if (args[i] == "--sort") {
+      if (!NextArgumentOrError(args, &i)) {
+        return false;
+      }
+      sort_keys = android::base::Split(args[i], ",");
+    } else if (args[i] == "--symfs") {
+      if (!NextArgumentOrError(args, &i) || !DsoFactory::SetSymFsDir(args[i])) {
+        return false;
+      }
+    } else {
+      ReportUnknownOption(args, i);
+      return false;
+    }
+  }
+
+  // Overhead column(s) come first; with callchain accumulation there are two of them.
+  if (accumulate_callchain_) {
+    displayable_items_.push_back(
+        std::unique_ptr<Displayable>(new AccumulatedOverheadItem(*sample_tree_)));
+    displayable_items_.push_back(std::unique_ptr<Displayable>(new SelfOverheadItem(*sample_tree_)));
+  } else {
+    displayable_items_.push_back(
+        std::unique_ptr<Displayable>(new SelfOverheadItem(*sample_tree_, "Overhead")));
+  }
+  if (print_sample_count) {
+    displayable_items_.push_back(std::unique_ptr<Displayable>(new SampleCountItem));
+  }
+
+  // Registers one sort-key object both as a report column and as a compare key.
+  auto add_sort_item = [this](Displayable* column, Comparable* comparator) {
+    displayable_items_.push_back(std::unique_ptr<Displayable>(column));
+    comparable_items_.push_back(comparator);
+  };
+
+  for (auto& key : sort_keys) {
+    const bool branch_only = branch_sort_keys.find(key) != branch_sort_keys.end();
+    if (branch_only && !use_branch_address_) {
+      LOG(ERROR) << "sort key '" << key << "' can only be used with -b option.";
+      return false;
+    }
+    if (key == "pid") {
+      PidItem* item = new PidItem;
+      add_sort_item(item, item);
+    } else if (key == "tid") {
+      TidItem* item = new TidItem;
+      add_sort_item(item, item);
+    } else if (key == "comm") {
+      CommItem* item = new CommItem;
+      add_sort_item(item, item);
+    } else if (key == "dso") {
+      DsoItem* item = new DsoItem;
+      add_sort_item(item, item);
+    } else if (key == "symbol") {
+      SymbolItem* item = new SymbolItem;
+      add_sort_item(item, item);
+    } else if (key == "dso_from") {
+      DsoFromItem* item = new DsoFromItem;
+      add_sort_item(item, item);
+    } else if (key == "dso_to") {
+      DsoToItem* item = new DsoToItem;
+      add_sort_item(item, item);
+    } else if (key == "symbol_from") {
+      SymbolFromItem* item = new SymbolFromItem;
+      add_sort_item(item, item);
+    } else if (key == "symbol_to") {
+      SymbolToItem* item = new SymbolToItem;
+      add_sort_item(item, item);
+    } else {
+      LOG(ERROR) << "Unknown sort key: " << key;
+      return false;
+    }
+  }
+  return true;
+}
+
+bool ReportCommand::ReadEventAttrFromRecordFile() {
+  const auto attrs = record_file_reader_->AttrSection();
+  if (attrs.size() != 1) {  // Exactly one event attr is accepted.
+    LOG(ERROR) << "record file contains " << attrs.size() << " attrs";
+    return false;
+  }
+  event_attr_ = attrs.front()->attr;
+  const bool branch_stack_missing = (event_attr_.sample_type & PERF_SAMPLE_BRANCH_STACK) == 0;
+  if (use_branch_address_ && branch_stack_missing) {  // -b needs branch stack samples.
+    LOG(ERROR) << record_filename_ << " is not recorded with branch stack sampling option.";
+  }
+  return !(use_branch_address_ && branch_stack_missing);
+}
+
+// Feeds every record of the data section into sample_tree_; unhandled types are skipped.
+void ReportCommand::ReadSampleTreeFromRecordFile() {
+  sample_tree_->AddThread(0, 0, "swapper");  // pid 0 / tid 0
+  std::vector<std::unique_ptr<const Record>> records = record_file_reader_->DataSection();
+  for (auto& record : records) {
+    if (record->header.type == PERF_RECORD_MMAP) {
+      const auto* map_rec = static_cast<const MmapRecord*>(record.get());
+      const auto& d = map_rec->data;
+      const auto ts = map_rec->sample_id.time_data.time;
+      if ((map_rec->header.misc & PERF_RECORD_MISC_CPUMODE_MASK) != PERF_RECORD_MISC_KERNEL) {
+        sample_tree_->AddThreadMap(d.pid, d.tid, d.addr, d.len, d.pgoff, ts, map_rec->filename);
+      } else {
+        sample_tree_->AddKernelMap(d.addr, d.len, d.pgoff, ts, map_rec->filename);
+      }
+    } else if (record->header.type == PERF_RECORD_MMAP2) {
+      const auto* map_rec = static_cast<const Mmap2Record*>(record.get());
+      const auto& d = map_rec->data;
+      const auto ts = map_rec->sample_id.time_data.time;
+      if ((map_rec->header.misc & PERF_RECORD_MISC_CPUMODE_MASK) != PERF_RECORD_MISC_KERNEL) {
+        const bool placeholder = (map_rec->filename == DEFAULT_EXECNAME_FOR_THREAD_MMAP);
+        std::string filename = placeholder ? "[unknown]" : map_rec->filename;
+        sample_tree_->AddThreadMap(d.pid, d.tid, d.addr, d.len, d.pgoff, ts, filename);
+      } else {
+        sample_tree_->AddKernelMap(d.addr, d.len, d.pgoff, ts, map_rec->filename);
+      }
+    } else if (record->header.type == PERF_RECORD_SAMPLE) {
+      ProcessSampleRecord(*static_cast<const SampleRecord*>(record.get()));
+    } else if (record->header.type == PERF_RECORD_COMM) {
+      const auto& rec = *static_cast<const CommRecord*>(record.get());
+      sample_tree_->AddThread(rec.data.pid, rec.data.tid, rec.comm);
+    } else if (record->header.type == PERF_RECORD_FORK) {
+      const auto& rec = *static_cast<const ForkRecord*>(record.get());
+      sample_tree_->ForkThread(rec.data.pid, rec.data.tid, rec.data.ppid, rec.data.ptid);
+    }
+  }
+}
+
+// Adds one sample record to sample_tree_. With -b (and branch stack data in the record)
+// each branch entry with non-zero from/to becomes a branch sample. Otherwise the sampled
+// ip is added and, when callchains are accumulated, so is every ip of its callchain.
+void ReportCommand::ProcessSampleRecord(const SampleRecord& r) {
+  if (use_branch_address_ && (r.sample_type & PERF_SAMPLE_BRANCH_STACK)) {
+    for (auto& item : r.branch_stack_data.stack) {
+      if (item.from != 0 && item.to != 0) {
+        sample_tree_->AddBranchSample(r.tid_data.pid, r.tid_data.tid, item.from, item.to,
+                                      item.flags, r.time_data.time, r.period_data.period);
+      }
+    }
+  } else {
+    bool in_kernel = (r.header.misc & PERF_RECORD_MISC_CPUMODE_MASK) == PERF_RECORD_MISC_KERNEL;
+    SampleEntry* sample = sample_tree_->AddSample(r.tid_data.pid, r.tid_data.tid, r.ip_data.ip,
+                                                  r.time_data.time, r.period_data.period, in_kernel);
+    CHECK(sample != nullptr);
+    if (accumulate_callchain_ && (r.sample_type & PERF_SAMPLE_CALLCHAIN) != 0) {
+      std::vector<SampleEntry*> callchain(1, sample);
+      bool first_ip = true;
+      for (auto& ip : r.callchain_data.ips) {
+        if (ip >= PERF_CONTEXT_MAX) {
+          switch (ip) {
+            case PERF_CONTEXT_KERNEL:
+              in_kernel = true;
+              break;
+            case PERF_CONTEXT_USER:
+              in_kernel = false;
+              break;
+            default:
+              LOG(ERROR) << "Unexpected perf_context in callchain: " << ip;
+          }
+        } else {
+          if (first_ip) {
+            // Clear the flag before skipping: only the first ip may duplicate the sampled ip.
+            first_ip = false;
+            if (ip == r.ip_data.ip) {
+              continue;
+            }
+          }
+          SampleEntry* caller =
+              sample_tree_->AddCallChainSample(r.tid_data.pid, r.tid_data.tid, ip, r.time_data.time,
+                                               r.period_data.period, in_kernel, callchain);
+          callchain.push_back(caller);
+        }
+      }
+      if (print_callgraph_) {
+        std::set<SampleEntry*> added_set;
+        while (callchain.size() >= 2) {
+          SampleEntry* head = callchain[0];
+          callchain.erase(callchain.begin());
+          // Add only once for recursive calls on callchain.
+          if (!added_set.insert(head).second) {
+            continue;
+          }
+          sample_tree_->InsertCallChainForSample(head, callchain, r.period_data.period);
+        }
+      }
+    }
+  }
+}
+
+// Stores the record file's cmdline feature, joined with spaces, when it is non-empty.
+void ReportCommand::ReadFeaturesFromRecordFile() {
+  const std::vector<std::string> words = record_file_reader_->ReadCmdlineFeature();
+  if (words.empty()) return;
+  record_cmdline_ = android::base::Join(words, ' ');
+}
+
+// Compares key by key: the first compare item that tells the samples apart decides.
+int ReportCommand::CompareSampleEntry(const SampleEntry& sample1, const SampleEntry& sample2) {
+  int order = 0;
+  for (size_t i = 0; order == 0 && i < comparable_items_.size(); ++i) {
+    order = comparable_items_[i]->Compare(sample1, sample2);
+  }
+  // 0 means every item compared equal (or there are no items).
+  return order;
+}
+
+// Prints context lines, the column header, then one entry per sample; flushes stdout.
+void ReportCommand::PrintReport() {
+  PrintReportContext();
+  CollectReportWidth();  // The header and entries below are padded with Width().
+  PrintReportHeader();
+  sample_tree_->VisitAllSamples([this](const SampleEntry& entry) { PrintReportEntry(entry); });
+  fflush(stdout);
+}
+
+// Prints the report preamble: the recorded command line (when known), the sample count
+// with the event name (raw type/config if the type is not found) and the event count.
+void ReportCommand::PrintReportContext() {
+  const EventType* known_type = FindEventTypeByConfig(event_attr_.type, event_attr_.config);
+  const std::string event_label =
+      (known_type != nullptr)
+          ? std::string(known_type->name)
+          : android::base::StringPrintf("(type %u, config %llu)", event_attr_.type,
+                                        event_attr_.config);
+  if (!record_cmdline_.empty()) {
+    printf("Cmdline: %s\n", record_cmdline_.c_str());
+  }
+  printf("Samples: %" PRIu64 " of event '%s'\n", sample_tree_->TotalSamples(),
+         event_label.c_str());
+  printf("Event count: %" PRIu64 "\n\n", sample_tree_->TotalPeriod());
+}
+
+// Visits every sample once so each column can adjust its width before printing.
+void ReportCommand::CollectReportWidth() {
+  sample_tree_->VisitAllSamples([this](const SampleEntry& s) { CollectReportEntryWidth(s); });
+}
+
+void ReportCommand::CollectReportEntryWidth(const SampleEntry& sample) {
+  for (size_t col = 0; col < displayable_items_.size(); ++col) {
+    displayable_items_[col]->AdjustWidth(sample);  // Each column sees the sample once.
+  }
+}
+
+void ReportCommand::PrintReportHeader() {  // Column names; only the last one is unpadded.
+  size_t remaining = displayable_items_.size();
+  for (auto& column : displayable_items_) {
+    if (--remaining == 0) {
+      printf("%s\n", column->Name().c_str());
+    } else {
+      printf("%-*s  ", static_cast<int>(column->Width()), column->Name().c_str());
+    }
+  }
+}
+
+void ReportCommand::PrintReportEntry(const SampleEntry& sample) {  // Prints one report row.
+  size_t remaining = displayable_items_.size();
+  for (auto& column : displayable_items_) {
+    if (--remaining == 0) {
+      printf("%s\n", column->Show(sample).c_str());
+    } else {
+      printf("%-*s  ", static_cast<int>(column->Width()), column->Show(sample).c_str());
+    }
+  }
+  if (print_callgraph_) {  // With -g the row is followed by its call graph.
+    PrintCallGraph(sample);
+  }
+}
+
+// Prints one call graph node, then recurses into its children; stops with a warning once
+// depth exceeds 20. A percentage is shown only if the node's total differs from parent_period.
+static void PrintCallGraphEntry(size_t depth, std::string prefix,
+                                const std::unique_ptr<CallChainNode>& node, uint64_t parent_period,
+                                bool last) {
+  if (depth > 20) {
+    LOG(WARNING) << "truncated callgraph at depth " << depth;
+    return;
+  }
+  prefix.push_back('|');
+  printf("%s\n", prefix.c_str());
+  prefix.back() = last ? ' ' : '|';  // After the last sibling the bar is not continued.
+  const auto total_period = node->period + node->children_period;
+  const std::string marker =
+      (total_period == parent_period)
+          ? std::string("-- ")
+          : android::base::StringPrintf("--%.2lf%%-- ", 100.0 * total_period / parent_period);
+  printf("%s%s%s\n", prefix.c_str(), marker.c_str(), node->chain[0]->symbol->name.c_str());
+  prefix.append(marker.size(), ' ');
+  for (size_t i = 1; i < node->chain.size(); ++i) {
+    printf("%s%s\n", prefix.c_str(), node->chain[i]->symbol->name.c_str());
+  }
+  const size_t child_count = node->children.size();
+  for (size_t i = 0; i < child_count; ++i) {
+    PrintCallGraphEntry(depth + 1, prefix, node->children[i], node->children_period,
+                        i + 1 == child_count);
+  }
+}
+
+// Prints a sample's call graph: its own symbol first, then each callchain child subtree.
+void ReportCommand::PrintCallGraph(const SampleEntry& sample) {
+  const std::string indent = "       ";
+  printf("%s|\n%s-- %s\n", indent.c_str(), indent.c_str(), sample.symbol->name.c_str());
+  const auto& root = sample.callchain;
+  for (size_t i = 0; i < root.children.size(); ++i) {
+    PrintCallGraphEntry(1, indent + "   ", root.children[i], root.children_period,
+                        i + 1 == root.children.size());
+  }
+}
+
+__attribute__((constructor)) static void RegisterReportCommand() {  // Registers "report".
+  RegisterCommand("report", []() { return std::unique_ptr<Command>(new ReportCommand); });
+}
diff --git a/simpleperf/cmd_report_test.cpp b/simpleperf/cmd_report_test.cpp
new file mode 100644
index 0000000..a0dc596
--- /dev/null
+++ b/simpleperf/cmd_report_test.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "command.h"
+
+static std::unique_ptr<Command> RecordCmd() {  // Shorthand for the "record" command.
+  return CreateCommandInstance("record");
+}
+
+static std::unique_ptr<Command> ReportCmd() {  // Shorthand for the "report" command.
+  return CreateCommandInstance("report");
+}
+
+class ReportCommandTest : public ::testing::Test {  // Records the inputs the TEST_F cases read.
+ protected:
+  static void SetUpTestCase() {
+    ASSERT_TRUE(RecordCmd()->Run({"-a", "sleep", "1"}));  // No -o; presumably perf.data.
+    ASSERT_TRUE(RecordCmd()->Run({"-a", "-o", "perf2.data", "sleep", "1"}));
+    ASSERT_TRUE(RecordCmd()->Run({"-g", "-o", "perf_g.data", "sleep", "1"}));  // For -g tests.
+  }
+};
+
+TEST_F(ReportCommandTest, no_options) {  // No -i: the report reads its default perf.data.
+  ASSERT_TRUE(ReportCmd()->Run({}));
+}
+
+TEST_F(ReportCommandTest, input_file_option) {  // -i picks the file recorded as perf2.data.
+  ASSERT_TRUE(ReportCmd()->Run({"-i", "perf2.data"}));
+}
+
+TEST_F(ReportCommandTest, sort_option_pid) {  // --sort with a single key.
+  ASSERT_TRUE(ReportCmd()->Run({"--sort", "pid"}));
+}
+
+TEST_F(ReportCommandTest, sort_option_all) {  // Every sort key that is valid without -b.
+  ASSERT_TRUE(ReportCmd()->Run({"--sort", "comm,pid,tid,dso,symbol"}));
+}
+
+TEST_F(ReportCommandTest, children_option) {  // --children on the file recorded with -g.
+  ASSERT_TRUE(ReportCmd()->Run({"--children", "-i", "perf_g.data"}));
+}
+
+TEST_F(ReportCommandTest, callgraph_option) {  // -g report on the file recorded with -g.
+  ASSERT_TRUE(ReportCmd()->Run({"-g", "-i", "perf_g.data"}));
+}
+
+extern bool IsBranchSamplingSupported();
+
+TEST(report_cmd, use_branch_address) {
+  if (!IsBranchSamplingSupported()) {
+    GTEST_LOG_(INFO)
+        << "This test does nothing as branch stack sampling is not supported on this device.";
+    return;
+  }
+  ASSERT_TRUE(RecordCmd()->Run({"-b", "sleep", "1"}));  // Record with -b, then report with -b.
+  ASSERT_TRUE(
+      ReportCmd()->Run({"-b", "--sort", "comm,pid,dso_from,symbol_from,dso_to,symbol_to"}));
+}
diff --git a/simpleperf/cmd_stat.cpp b/simpleperf/cmd_stat.cpp
index c8e59d9..d4b1923 100644
--- a/simpleperf/cmd_stat.cpp
+++ b/simpleperf/cmd_stat.cpp
@@ -15,8 +15,10 @@
  */
 
 #include <inttypes.h>
+#include <signal.h>
 #include <stdio.h>
 #include <chrono>
+#include <set>
 #include <string>
 #include <vector>
 
@@ -25,88 +27,132 @@
 
 #include "command.h"
 #include "environment.h"
+#include "event_attr.h"
+#include "event_fd.h"
 #include "event_selection_set.h"
 #include "event_type.h"
-#include "perf_event.h"
 #include "utils.h"
 #include "workload.h"
 
 static std::vector<std::string> default_measured_event_types{
-    "cpu-cycles", "stalled-cycles-frontend", "stalled-cycles-backend", "instructions",
-    "branch-instructions", "branch-misses", "task-clock", "context-switches", "page-faults",
+    "cpu-cycles",   "stalled-cycles-frontend", "stalled-cycles-backend",
+    "instructions", "branch-instructions",     "branch-misses",
+    "task-clock",   "context-switches",        "page-faults",
 };
 
-class StatCommandImpl {
+static volatile bool signaled;
+static void signal_handler(int) {
+  signaled = true;
+}
+
+class StatCommand : public Command {
  public:
-  StatCommandImpl() : verbose_mode_(false), system_wide_collection_(false) {
+  StatCommand()
+      : Command("stat", "gather performance counter information",
+                "Usage: simpleperf stat [options] [command [command-args]]\n"
+                "    Gather performance counter information of running [command].\n"
+                "    -a           Collect system-wide information.\n"
+                "    -e event1[:modifier1],event2[:modifier2],...\n"
+                "                 Select the event list to count. Use `simpleperf list` to find\n"
+                "                 all possible event names. Modifiers can be added to define\n"
+                "                 how the event should be monitored. Possible modifiers are:\n"
+                "                   u - monitor user space events only\n"
+                "                   k - monitor kernel space events only\n"
+                "    --no-inherit\n"
+                "                 Don't stat created child threads/processes.\n"
+                "    -p pid1,pid2,...\n"
+                "                 Stat events on existing processes. Mutually exclusive with -a.\n"
+                "    -t tid1,tid2,...\n"
+                "                 Stat events on existing threads. Mutually exclusive with -a.\n"
+                "    --verbose    Show result in verbose mode.\n"),
+        verbose_mode_(false),
+        system_wide_collection_(false),
+        child_inherit_(true) {
+    signaled = false;
+    signal_handler_register_.reset(
+        new SignalHandlerRegister({SIGCHLD, SIGINT, SIGTERM}, signal_handler));
   }
 
   bool Run(const std::vector<std::string>& args);
 
  private:
   bool ParseOptions(const std::vector<std::string>& args, std::vector<std::string>* non_option_args);
-  bool AddMeasuredEventType(const std::string& event_type_name, bool report_unsupported_type = true);
+  bool AddMeasuredEventType(const std::string& event_type_name);
   bool AddDefaultMeasuredEventTypes();
+  bool SetEventSelection();
   bool ShowCounters(const std::map<const EventType*, std::vector<PerfCounter>>& counters_map,
                     std::chrono::steady_clock::duration counting_duration);
 
-  EventSelectionSet event_selection_set_;
   bool verbose_mode_;
   bool system_wide_collection_;
+  bool child_inherit_;
+  std::vector<pid_t> monitored_threads_;
+  std::vector<std::pair<std::string, EventTypeAndModifier>> measured_event_types_;
+  EventSelectionSet event_selection_set_;
+
+  std::unique_ptr<SignalHandlerRegister> signal_handler_register_;
 };
 
-bool StatCommandImpl::Run(const std::vector<std::string>& args) {
-  // 1. Parse options.
+bool StatCommand::Run(const std::vector<std::string>& args) {
+  // 1. Parse options, and use default measured event types if not given.
   std::vector<std::string> workload_args;
   if (!ParseOptions(args, &workload_args)) {
     return false;
   }
-
-  // 2. Add default measured event types.
-  if (event_selection_set_.Empty()) {
+  if (measured_event_types_.empty()) {
     if (!AddDefaultMeasuredEventTypes()) {
       return false;
     }
   }
-
-  // 3. Create workload.
-  if (workload_args.empty()) {
-    // TODO: change default workload to sleep 99999, and run stat until Ctrl-C.
-    workload_args = std::vector<std::string>({"sleep", "1"});
-  }
-  std::unique_ptr<Workload> workload = Workload::CreateWorkload(workload_args);
-  if (workload == nullptr) {
+  if (!SetEventSelection()) {
     return false;
   }
 
-  // 4. Open perf_event_files.
+  // 2. Create workload.
+  std::unique_ptr<Workload> workload;
+  if (!workload_args.empty()) {
+    workload = Workload::CreateWorkload(workload_args);
+    if (workload == nullptr) {
+      return false;
+    }
+  }
+  if (!system_wide_collection_ && monitored_threads_.empty()) {
+    if (workload != nullptr) {
+      monitored_threads_.push_back(workload->GetPid());
+      event_selection_set_.SetEnableOnExec(true);
+    } else {
+      LOG(ERROR) << "No threads to monitor. Try `simpleperf help stat` for help\n";
+      return false;
+    }
+  }
+
+  // 3. Open perf_event_files.
   if (system_wide_collection_) {
     if (!event_selection_set_.OpenEventFilesForAllCpus()) {
       return false;
     }
   } else {
-    event_selection_set_.EnableOnExec();
-    if (!event_selection_set_.OpenEventFilesForProcess(workload->GetPid())) {
+    if (!event_selection_set_.OpenEventFilesForThreads(monitored_threads_)) {
       return false;
     }
   }
 
-  // 5. Count events while workload running.
+  // 4. Count events while workload running.
   auto start_time = std::chrono::steady_clock::now();
-  // If monitoring only one process, we use the enable_on_exec flag, and don't need to start
-  // counting manually.
-  if (system_wide_collection_) {
+  if (!event_selection_set_.GetEnableOnExec()) {
     if (!event_selection_set_.EnableEvents()) {
       return false;
     }
   }
-  if (!workload->Start()) {
+  if (workload != nullptr && !workload->Start()) {
     return false;
   }
-  workload->WaitFinish();
+  while (!signaled) {
+    sleep(1);
+  }
   auto end_time = std::chrono::steady_clock::now();
 
-  // 6. Read and print counters.
+  // 5. Read and print counters.
   std::map<const EventType*, std::vector<PerfCounter>> counters_map;
   if (!event_selection_set_.ReadCounters(&counters_map)) {
     return false;
@@ -117,10 +163,11 @@
   return true;
 }
 
-bool StatCommandImpl::ParseOptions(const std::vector<std::string>& args,
-                                   std::vector<std::string>* non_option_args) {
+bool StatCommand::ParseOptions(const std::vector<std::string>& args,
+                               std::vector<std::string>* non_option_args) {
+  std::set<pid_t> tid_set;
   size_t i;
-  for (i = 1; i < args.size() && args[i].size() > 0 && args[i][0] == '-'; ++i) {
+  for (i = 0; i < args.size() && args[i].size() > 0 && args[i][0] == '-'; ++i) {
     if (args[i] == "-a") {
       system_wide_collection_ = true;
     } else if (args[i] == "-e") {
@@ -133,15 +180,36 @@
           return false;
         }
       }
+    } else if (args[i] == "--no-inherit") {
+      child_inherit_ = false;
+    } else if (args[i] == "-p") {
+      if (!NextArgumentOrError(args, &i)) {
+        return false;
+      }
+      if (!GetValidThreadsFromProcessString(args[i], &tid_set)) {
+        return false;
+      }
+    } else if (args[i] == "-t") {
+      if (!NextArgumentOrError(args, &i)) {
+        return false;
+      }
+      if (!GetValidThreadsFromThreadString(args[i], &tid_set)) {
+        return false;
+      }
     } else if (args[i] == "--verbose") {
       verbose_mode_ = true;
     } else {
-      LOG(ERROR) << "Unknown option for stat command: " << args[i];
-      LOG(ERROR) << "Try `simpleperf help stat`";
+      ReportUnknownOption(args, i);
       return false;
     }
   }
 
+  monitored_threads_.insert(monitored_threads_.end(), tid_set.begin(), tid_set.end());
+  if (system_wide_collection_ && !monitored_threads_.empty()) {
+    LOG(ERROR) << "Stat system wide and existing processes/threads can't be used at the same time.";
+    return false;
+  }
+
   if (non_option_args != nullptr) {
     non_option_args->clear();
     for (; i < args.size(); ++i) {
@@ -151,30 +219,41 @@
   return true;
 }
 
-bool StatCommandImpl::AddMeasuredEventType(const std::string& event_type_name,
-                                           bool report_unsupported_type) {
-  const EventType* event_type =
-      EventTypeFactory::FindEventTypeByName(event_type_name, report_unsupported_type);
-  if (event_type == nullptr) {
+bool StatCommand::AddMeasuredEventType(const std::string& event_type_name) {
+  std::unique_ptr<EventTypeAndModifier> event_type_modifier = ParseEventType(event_type_name);
+  if (event_type_modifier == nullptr) {
     return false;
   }
-  event_selection_set_.AddEventType(*event_type);
+  measured_event_types_.push_back(std::make_pair(event_type_name, *event_type_modifier));
   return true;
 }
 
-bool StatCommandImpl::AddDefaultMeasuredEventTypes() {
+bool StatCommand::AddDefaultMeasuredEventTypes() {
   for (auto& name : default_measured_event_types) {
     // It is not an error when some event types in the default list are not supported by the kernel.
-    AddMeasuredEventType(name, false);
+    const EventType* type = FindEventTypeByName(name);
+    if (type != nullptr && IsEventAttrSupportedByKernel(CreateDefaultPerfEventAttr(*type))) {
+      AddMeasuredEventType(name);
+    }
   }
-  if (event_selection_set_.Empty()) {
+  if (measured_event_types_.empty()) {
     LOG(ERROR) << "Failed to add any supported default measured types";
     return false;
   }
   return true;
 }
 
-bool StatCommandImpl::ShowCounters(
+bool StatCommand::SetEventSelection() {
+  for (auto& pair : measured_event_types_) {
+    if (!event_selection_set_.AddEventType(pair.second)) {
+      return false;
+    }
+  }
+  event_selection_set_.SetInherit(child_inherit_);
+  return true;
+}
+
+bool StatCommand::ShowCounters(
     const std::map<const EventType*, std::vector<PerfCounter>>& counters_map,
     std::chrono::steady_clock::duration counting_duration) {
   printf("Performance counter statistics:\n\n");
@@ -209,34 +288,20 @@
                                             sum_counter.time_enabled / sum_counter.time_running);
       }
     }
+    std::string event_type_name;
+    for (auto& pair : measured_event_types_) {
+      if (pair.second.event_type.name == event_type->name) {
+        event_type_name = pair.first;
+      }
+    }
     printf("%'30" PRId64 "%s  %s\n", scaled_count, scaled ? "(scaled)" : "       ",
-           event_type->name);
+           event_type_name.c_str());
   }
   printf("\nTotal test time: %lf seconds.\n",
          std::chrono::duration_cast<std::chrono::duration<double>>(counting_duration).count());
   return true;
 }
 
-class StatCommand : public Command {
- public:
-  StatCommand()
-      : Command("stat", "gather performance counter information",
-                "Usage: simpleperf stat [options] [command [command-args]]\n"
-                "    Gather performance counter information of running [command]. If [command]\n"
-                "    is not specified, sleep 1 is used instead.\n\n"
-                "    -a           Collect system-wide information.\n"
-                "    -e event1,event2,... Select the event list to count. Use `simpleperf list`\n"
-                "                         to find all possible event names.\n"
-                "    --verbose    Show result in verbose mode.\n") {
-  }
-
-  bool Run(const std::vector<std::string>& args) override {
-    // Keep the implementation in StatCommandImpl, so the resources used are cleaned up when the
-    // command finishes. This is useful when we need to call some commands multiple times, like
-    // in unit tests.
-    StatCommandImpl impl;
-    return impl.Run(args);
-  }
-};
-
-StatCommand stat_command;
+__attribute__((constructor)) static void RegisterStatCommand() {
+  RegisterCommand("stat", [] { return std::unique_ptr<Command>(new StatCommand); });
+}
diff --git a/simpleperf/cmd_stat_test.cpp b/simpleperf/cmd_stat_test.cpp
index 6a7a1cd..c6c4ef7 100644
--- a/simpleperf/cmd_stat_test.cpp
+++ b/simpleperf/cmd_stat_test.cpp
@@ -16,31 +16,56 @@
 
 #include <gtest/gtest.h>
 
+#include <base/stringprintf.h>
+
 #include "command.h"
+#include "test_util.h"
 
-class StatCommandTest : public ::testing::Test {
- protected:
-  virtual void SetUp() {
-    stat_cmd = Command::FindCommandByName("stat");
-    ASSERT_TRUE(stat_cmd != nullptr);
-  }
-
- protected:
-  Command* stat_cmd;
-};
-
-TEST_F(StatCommandTest, no_options) {
-  ASSERT_TRUE(stat_cmd->Run({"stat", "sleep", "1"}));
+static std::unique_ptr<Command> StatCmd() {
+  return CreateCommandInstance("stat");
 }
 
-TEST_F(StatCommandTest, event_option) {
-  ASSERT_TRUE(stat_cmd->Run({"stat", "-e", "cpu-clock,task-clock", "sleep", "1"}));
+TEST(stat_cmd, no_options) {
+  ASSERT_TRUE(StatCmd()->Run({"sleep", "1"}));
 }
 
-TEST_F(StatCommandTest, system_wide_option) {
-  ASSERT_TRUE(stat_cmd->Run({"stat", "-a", "sleep", "1"}));
+TEST(stat_cmd, event_option) {
+  ASSERT_TRUE(StatCmd()->Run({"-e", "cpu-clock,task-clock", "sleep", "1"}));
 }
 
-TEST_F(StatCommandTest, verbose_option) {
-  ASSERT_TRUE(stat_cmd->Run({"stat", "--verbose", "sleep", "1"}));
+TEST(stat_cmd, system_wide_option) {
+  ASSERT_TRUE(StatCmd()->Run({"-a", "sleep", "1"}));
+}
+
+TEST(stat_cmd, verbose_option) {
+  ASSERT_TRUE(StatCmd()->Run({"--verbose", "sleep", "1"}));
+}
+
+TEST(stat_cmd, tracepoint_event) {
+  ASSERT_TRUE(StatCmd()->Run({"-a", "-e", "sched:sched_switch", "sleep", "1"}));
+}
+
+TEST(stat_cmd, event_modifier) {
+  ASSERT_TRUE(StatCmd()->Run({"-e", "cpu-cycles:u,sched:sched_switch:k", "sleep", "1"}));
+}
+
+TEST(stat_cmd, existing_processes) {
+  std::vector<std::unique_ptr<Workload>> workloads;
+  CreateProcesses(2, &workloads);
+  std::string pid_list =
+      android::base::StringPrintf("%d,%d", workloads[0]->GetPid(), workloads[1]->GetPid());
+  ASSERT_TRUE(StatCmd()->Run({"-p", pid_list}));
+}
+
+TEST(stat_cmd, existing_threads) {
+  std::vector<std::unique_ptr<Workload>> workloads;
+  CreateProcesses(2, &workloads);
+  // Process id can be used as thread id in linux.
+  std::string tid_list =
+      android::base::StringPrintf("%d,%d", workloads[0]->GetPid(), workloads[1]->GetPid());
+  ASSERT_TRUE(StatCmd()->Run({"-t", tid_list}));
+}
+
+TEST(stat_cmd, no_monitored_threads) {
+  ASSERT_FALSE(StatCmd()->Run({""}));
 }
diff --git a/simpleperf/command.cpp b/simpleperf/command.cpp
index 79cbc44..8889d7f 100644
--- a/simpleperf/command.cpp
+++ b/simpleperf/command.cpp
@@ -17,43 +17,54 @@
 #include "command.h"
 
 #include <algorithm>
+#include <map>
 #include <string>
 #include <vector>
 
-static std::vector<Command*>& Commands() {
+#include <base/logging.h>
+
+bool Command::NextArgumentOrError(const std::vector<std::string>& args, size_t* pi) {
+  if (*pi + 1 == args.size()) {  // args[*pi] is the last element, so nothing follows the option
+    LOG(ERROR) << "No argument following " << args[*pi] << " option. Try `simpleperf help " << name_
+               << "`";
+    return false;
+  }
+  ++*pi;  // on success *pi now indexes the element after the option
+  return true;
+}
+
+void Command::ReportUnknownOption(const std::vector<std::string>& args, size_t i) {
+  LOG(ERROR) << "Unknown option for " << name_ << " command: '" << args[i]
+             << "'. Try `simpleperf help " << name_ << "`";
+}
+
+typedef std::function<std::unique_ptr<Command>(void)> callback_t;
+
+static std::map<std::string, callback_t>& CommandMap() {
   // commands is used in the constructor of Command. Defining it as a static
   // variable in a function makes sure it is initialized before use.
-  static std::vector<Command*> commands;
-  return commands;
+  static std::map<std::string, callback_t> command_map;
+  return command_map;
 }
 
-Command* Command::FindCommandByName(const std::string& cmd_name) {
-  for (auto& command : Commands()) {
-    if (command->Name() == cmd_name) {
-      return command;
-    }
+void RegisterCommand(const std::string& cmd_name,
+                     std::function<std::unique_ptr<Command>(void)> callback) {
+  CommandMap().insert(std::make_pair(cmd_name, callback));
+}
+
+void UnRegisterCommand(const std::string& cmd_name) {
+  CommandMap().erase(cmd_name);
+}
+
+std::unique_ptr<Command> CreateCommandInstance(const std::string& cmd_name) {
+  auto it = CommandMap().find(cmd_name);
+  return (it == CommandMap().end()) ? nullptr : (it->second)();
+}
+
+const std::vector<std::string> GetAllCommandNames() {  // names come out in std::map key order
+  std::vector<std::string> names;
+  for (const auto& pair : CommandMap()) {  // by reference: don't copy each name/factory pair
+    names.push_back(pair.first);
   }
-  return nullptr;
-}
-
-static bool CompareCommandByName(Command* cmd1, Command* cmd2) {
-  return cmd1->Name() < cmd2->Name();
-}
-
-const std::vector<Command*>& Command::GetAllCommands() {
-  std::sort(Commands().begin(), Commands().end(), CompareCommandByName);
-  return Commands();
-}
-
-void Command::RegisterCommand(Command* cmd) {
-  Commands().push_back(cmd);
-}
-
-void Command::UnRegisterCommand(Command* cmd) {
-  for (auto it = Commands().begin(); it != Commands().end(); ++it) {
-    if (*it == cmd) {
-      Commands().erase(it);
-      break;
-    }
-  }
+  return names;
 }
diff --git a/simpleperf/command.h b/simpleperf/command.h
index 46b49cb..e2c8453 100644
--- a/simpleperf/command.h
+++ b/simpleperf/command.h
@@ -17,6 +17,8 @@
 #ifndef SIMPLE_PERF_COMMAND_H_
 #define SIMPLE_PERF_COMMAND_H_
 
+#include <functional>
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -27,11 +29,9 @@
   Command(const std::string& name, const std::string& short_help_string,
           const std::string& long_help_string)
       : name_(name), short_help_string_(short_help_string), long_help_string_(long_help_string) {
-    RegisterCommand(this);
   }
 
   virtual ~Command() {
-    UnRegisterCommand(this);
   }
 
   const std::string& Name() const {
@@ -48,18 +48,22 @@
 
   virtual bool Run(const std::vector<std::string>& args) = 0;
 
-  static Command* FindCommandByName(const std::string& cmd_name);
-  static const std::vector<Command*>& GetAllCommands();
+ protected:
+  bool NextArgumentOrError(const std::vector<std::string>& args, size_t* pi);
+  void ReportUnknownOption(const std::vector<std::string>& args, size_t i);
 
  private:
   const std::string name_;
   const std::string short_help_string_;
   const std::string long_help_string_;
 
-  static void RegisterCommand(Command* cmd);
-  static void UnRegisterCommand(Command* cmd);
-
   DISALLOW_COPY_AND_ASSIGN(Command);
 };
 
+void RegisterCommand(const std::string& cmd_name,
+                     std::function<std::unique_ptr<Command>(void)> callback);
+void UnRegisterCommand(const std::string& cmd_name);
+std::unique_ptr<Command> CreateCommandInstance(const std::string& cmd_name);
+const std::vector<std::string> GetAllCommandNames();
+
 #endif  // SIMPLE_PERF_COMMAND_H_
diff --git a/simpleperf/command_test.cpp b/simpleperf/command_test.cpp
index 4a0baa6..18cb569 100644
--- a/simpleperf/command_test.cpp
+++ b/simpleperf/command_test.cpp
@@ -20,7 +20,7 @@
 
 class MockCommand : public Command {
  public:
-  MockCommand(const std::string& name) : Command(name, name + "_short_help", name + "_long_help") {
+  MockCommand() : Command("mock", "mock_short_help", "mock_long_help") {
   }
 
   bool Run(const std::vector<std::string>&) override {
@@ -28,20 +28,18 @@
   }
 };
 
-TEST(command, FindCommandByName) {
-  ASSERT_EQ(Command::FindCommandByName("mock1"), nullptr);
-  {
-    MockCommand mock1("mock1");
-    ASSERT_EQ(Command::FindCommandByName("mock1"), &mock1);
-  }
-  ASSERT_EQ(Command::FindCommandByName("mock1"), nullptr);
+TEST(command, CreateCommandInstance) {
+  ASSERT_TRUE(CreateCommandInstance("mock1") == nullptr);
+  RegisterCommand("mock1", [] { return std::unique_ptr<Command>(new MockCommand); });
+  ASSERT_TRUE(CreateCommandInstance("mock1") != nullptr);
+  UnRegisterCommand("mock1");
+  ASSERT_TRUE(CreateCommandInstance("mock1") == nullptr);
 }
 
 TEST(command, GetAllCommands) {
-  size_t command_count = Command::GetAllCommands().size();
-  {
-    MockCommand mock1("mock1");
-    ASSERT_EQ(command_count + 1, Command::GetAllCommands().size());
-  }
-  ASSERT_EQ(command_count, Command::GetAllCommands().size());
+  size_t command_count = GetAllCommandNames().size();
+  RegisterCommand("mock1", [] { return std::unique_ptr<Command>(new MockCommand); });
+  ASSERT_EQ(command_count + 1, GetAllCommandNames().size());
+  UnRegisterCommand("mock1");
+  ASSERT_EQ(command_count, GetAllCommandNames().size());
 }
diff --git a/simpleperf/cpu_offline_test.cpp b/simpleperf/cpu_offline_test.cpp
new file mode 100644
index 0000000..723518a
--- /dev/null
+++ b/simpleperf/cpu_offline_test.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <sys/stat.h>
+
+#include <base/file.h>
+
+#include "event_attr.h"
+#include "event_fd.h"
+#include "event_type.h"
+
+static std::unique_ptr<EventFd> OpenHardwareEventOnCpu0() {
+  std::unique_ptr<EventTypeAndModifier> event_type_modifier = ParseEventType("cpu-cycles");
+  if (event_type_modifier == nullptr) {
+    return nullptr;
+  }
+  perf_event_attr attr = CreateDefaultPerfEventAttr(event_type_modifier->event_type);
+  return EventFd::OpenEventFile(attr, getpid(), 0);
+}
+
+static const char* cpu1_online_path = "/sys/devices/system/cpu/cpu1/online";
+
+static bool HaveCpuOne() {
+  struct stat st;
+  return (stat(cpu1_online_path, &st) == 0 && S_ISREG(st.st_mode));
+}
+
+static void IsCpuOneOnline(bool* online, bool* has_error) {
+  *has_error = true;  // stays set if an ASSERT below returns from this helper early
+  std::string content;
+  ASSERT_TRUE(android::base::ReadFileToString(cpu1_online_path, &content));
+  ASSERT_GT(content.size(), 0U);
+  // Any first character other than '0' is reported as online.
+  *online = (content[0] != '0');
+  *has_error = false;
+}
+
+static void SetCpuOneOnline(bool online, bool* has_error, bool* interrupted) {
+  *interrupted = false;
+  errno = 0;  // so saved_errno reflects only the write below
+  const bool write_ok = android::base::WriteStringToFile(online ? "1" : "0", cpu1_online_path);
+  const int saved_errno = errno;
+  // Judge the outcome by the state read back, not only by the result of the write.
+  bool new_state;
+  IsCpuOneOnline(&new_state, has_error);
+  if (*has_error || new_state == online) {
+    return;
+  }
+  if (write_ok) {
+    // The write succeeded, yet cpu1 is in the other state: report it as an interruption.
+    *interrupted = true;
+    return;
+  }
+  *has_error = true;
+  FAIL() << "Failed to SetCpuOneOnline, online = " << online
+         << ", error = " << strerror(saved_errno) << ", new_state = " << new_state;
+}
+
+// On some devices like flo, the kernel can't work correctly if a cpu
+// is offlined when perf is monitoring a hardware event.
+TEST(cpu_offline, smoke) {
+  if (!HaveCpuOne()) {
+    GTEST_LOG_(INFO) << "This test does nothing on uniprocessor devices.";
+    return;
+  }
+
+  bool has_error;
+  bool interrupted;
+  bool saved_online = true;  // fallback so the restore never reads an indeterminate value
+  bool success = false;
+  IsCpuOneOnline(&saved_online, &has_error);
+  // A loop is used in case the test is interrupted by other processes controlling cpu hotplug,
+  // like mpdecision.
+  for (size_t loop_count = 0; !has_error && loop_count < 50; ++loop_count) {
+    SetCpuOneOnline(true, &has_error, &interrupted);
+    if (has_error || interrupted) {
+      continue;
+    }
+    std::unique_ptr<EventFd> event_fd = OpenHardwareEventOnCpu0();
+    EXPECT_TRUE(event_fd != nullptr);
+    if (event_fd == nullptr) {
+      break;  // an ASSERT_* here would return without restoring cpu1 below
+    }
+    bool online;
+    IsCpuOneOnline(&online, &has_error);
+    if (has_error || !online) {
+      continue;
+    }
+    SetCpuOneOnline(false, &has_error, &interrupted);
+    if (has_error || interrupted) {
+      continue;
+    }
+    event_fd = nullptr;  // drop the first event before opening another with cpu1 offline
+    event_fd = OpenHardwareEventOnCpu0();
+    EXPECT_TRUE(event_fd != nullptr);
+    success = (event_fd != nullptr);
+    break;
+  }
+  SetCpuOneOnline(saved_online, &has_error, &interrupted);  // always put cpu1 back
+  ASSERT_TRUE(success);
+}
diff --git a/simpleperf/darwin_support/asm/byteorder.h b/simpleperf/darwin_support/asm/byteorder.h
new file mode 100644
index 0000000..d118abc
--- /dev/null
+++ b/simpleperf/darwin_support/asm/byteorder.h
@@ -0,0 +1,15 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
diff --git a/simpleperf/darwin_support/linux/ioctl.h b/simpleperf/darwin_support/linux/ioctl.h
new file mode 100644
index 0000000..f580736
--- /dev/null
+++ b/simpleperf/darwin_support/linux/ioctl.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define __IO(type, nr)
+#define __IOR(type, nr, size)
+#define __IOW(type, nr, size)
diff --git a/simpleperf/darwin_support/linux/types.h b/simpleperf/darwin_support/linux/types.h
new file mode 100644
index 0000000..30478eb
--- /dev/null
+++ b/simpleperf/darwin_support/linux/types.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+typedef uint8_t __u8;
+typedef uint16_t __u16;
+typedef uint32_t __u32;
+typedef uint64_t __u64;
+typedef int64_t __s64;
diff --git a/simpleperf/dso.cpp b/simpleperf/dso.cpp
new file mode 100644
index 0000000..562727b
--- /dev/null
+++ b/simpleperf/dso.cpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dso.h"
+
+#include <stdlib.h>
+#include <base/logging.h>
+#include "environment.h"
+#include "read_elf.h"
+#include "utils.h"
+
+bool SymbolComparator::operator()(const std::unique_ptr<SymbolEntry>& symbol1,
+                                  const std::unique_ptr<SymbolEntry>& symbol2) {
+  return symbol1->addr < symbol2->addr;
+}
+
+const SymbolEntry* DsoEntry::FindSymbol(uint64_t offset_in_dso) {
+  std::unique_ptr<SymbolEntry> symbol(new SymbolEntry{  // search key: only addr is compared
+      "",             // name
+      offset_in_dso,  // addr
+      0,              // len
+  });
+
+  auto it = symbols.upper_bound(symbol);  // first symbol whose addr > offset_in_dso
+  if (it != symbols.begin()) {
+    --it;  // last symbol whose addr <= offset_in_dso
+    if ((*it)->addr <= offset_in_dso && (*it)->addr + (*it)->len > offset_in_dso) {
+      return (*it).get();
+    }
+  }
+  return nullptr;  // offset_in_dso does not fall inside any stored symbol
+}
+
+bool DsoFactory::demangle = true;
+
+void DsoFactory::SetDemangle(bool demangle) {
+  DsoFactory::demangle = demangle;
+}
+
+std::string DsoFactory::symfs_dir;
+
+bool DsoFactory::SetSymFsDir(const std::string& symfs_dir) {
+  std::string dirname = symfs_dir;
+  if (!dirname.empty() && dirname.back() != '/') {
+    dirname.push_back('/');  // stored with a trailing '/' so dso paths can be appended directly
+  }
+  std::vector<std::string> files;
+  std::vector<std::string> subdirs;
+  GetEntriesInDir(symfs_dir, &files, &subdirs);
+  if (files.empty() && subdirs.empty()) {  // no entries were found: reject the directory
+    LOG(ERROR) << "Invalid symfs_dir '" << symfs_dir << "'";
+    return false;
+  }
+  DsoFactory::symfs_dir = dirname;  // only updated on success
+  return true;
+}
+
+static bool IsKernelFunctionSymbol(const KernelSymbol& symbol) {
+  return (symbol.type == 'T' || symbol.type == 't' || symbol.type == 'W' || symbol.type == 'w');
+}
+
+static bool KernelSymbolCallback(const KernelSymbol& kernel_symbol, DsoEntry* dso) {
+  if (IsKernelFunctionSymbol(kernel_symbol)) {  // symbols of other types are skipped
+    SymbolEntry* symbol = new SymbolEntry{
+        kernel_symbol.name,  // name
+        kernel_symbol.addr,  // addr
+        0,                   // len, filled in afterwards by FixupSymbolLength()
+    };
+    dso->symbols.insert(std::unique_ptr<SymbolEntry>(symbol));
+  }
+  return false;  // NOTE(review): presumably "keep iterating" - confirm in ProcessKernelSymbols
+}
+
+static void FixupSymbolLength(DsoEntry* dso) {
+  SymbolEntry* prev_symbol = nullptr;
+  for (auto& symbol : dso->symbols) {  // the set iterates in increasing addr order
+    if (prev_symbol != nullptr && prev_symbol->len == 0) {
+      prev_symbol->len = symbol->addr - prev_symbol->addr;  // extend up to the next symbol
+    }
+    prev_symbol = symbol.get();
+  }
+  if (prev_symbol != nullptr && prev_symbol->len == 0) {
+    prev_symbol->len = ULLONG_MAX - prev_symbol->addr;  // last symbol: extend to the top address
+  }
+}
+
+// TODO: Fix the way to get kernel symbols. See b/22179177.
+std::unique_ptr<DsoEntry> DsoFactory::LoadKernel() {
+  std::unique_ptr<DsoEntry> dso(new DsoEntry);
+  dso->path = "[kernel.kallsyms]";
+
+  ProcessKernelSymbols("/proc/kallsyms",
+                       std::bind(&KernelSymbolCallback, std::placeholders::_1, dso.get()));
+  FixupSymbolLength(dso.get());
+  return dso;
+}
+
+static void ParseSymbolCallback(const ElfFileSymbol& elf_symbol, DsoEntry* dso,
+                                bool (*filter)(const ElfFileSymbol&)) {
+  if (filter(elf_symbol)) {
+    SymbolEntry* symbol = new SymbolEntry{
+        elf_symbol.name,           // name
+        elf_symbol.start_in_file,  // addr
+        elf_symbol.len,            // len
+    };
+    dso->symbols.insert(std::unique_ptr<SymbolEntry>(symbol));
+  }
+}
+
+static bool SymbolFilterForKernelModule(const ElfFileSymbol& elf_symbol) {
+  // TODO: Parse symbol outside of .text section.
+  return (elf_symbol.is_func && elf_symbol.is_in_text_section);
+}
+
+std::unique_ptr<DsoEntry> DsoFactory::LoadKernelModule(const std::string& dso_path) {
+  std::unique_ptr<DsoEntry> dso(new DsoEntry);
+  dso->path = dso_path;
+  ParseSymbolsFromElfFile(symfs_dir + dso_path, std::bind(ParseSymbolCallback, std::placeholders::_1,
+                                                          dso.get(), SymbolFilterForKernelModule));
+  FixupSymbolLength(dso.get());
+  return dso;
+}
+
+static bool SymbolFilterForDso(const ElfFileSymbol& elf_symbol) {
+  return elf_symbol.is_func || (elf_symbol.is_label && elf_symbol.is_in_text_section);
+}
+
+extern "C" char* __cxa_demangle(const char* mangled_name, char* buf, size_t* n, int* status);
+
+static void DemangleInPlace(std::string* name) {
+  int status;
+  bool is_linker_symbol = (name->find(linker_prefix) == 0);  // name starts with linker_prefix
+  const char* mangled_str = name->c_str();
+  if (is_linker_symbol) {
+    mangled_str += linker_prefix.size();  // demangle only the part after the prefix
+  }
+  char* demangled_name = __cxa_demangle(mangled_str, nullptr, nullptr, &status);
+  if (status == 0) {  // 0 reports a successful demangle
+    if (is_linker_symbol) {
+      *name = std::string("[linker]") + demangled_name;
+    } else {
+      *name = demangled_name;
+    }
+    free(demangled_name);  // the result was malloc()ed by __cxa_demangle
+  } else if (is_linker_symbol) {
+    std::string temp = std::string("[linker]") + mangled_str;  // mangled_str points into *name
+    *name = std::move(temp);
+  }
+}
+
+std::unique_ptr<DsoEntry> DsoFactory::LoadDso(const std::string& dso_path) {
+  std::unique_ptr<DsoEntry> dso(new DsoEntry);
+  dso->path = dso_path;
+  ParseSymbolsFromElfFile(symfs_dir + dso_path, std::bind(ParseSymbolCallback, std::placeholders::_1,
+                                                          dso.get(), SymbolFilterForDso));
+  if (demangle) {
+    for (auto& symbol : dso->symbols) {
+      DemangleInPlace(&symbol->name);
+    }
+  }
+  FixupSymbolLength(dso.get());
+  return dso;
+}
diff --git a/simpleperf/dso.h b/simpleperf/dso.h
new file mode 100644
index 0000000..2d79c92
--- /dev/null
+++ b/simpleperf/dso.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SIMPLE_PERF_DSO_H_
+#define SIMPLE_PERF_DSO_H_
+
+#include <memory>
+#include <set>
+#include <string>
+
+struct SymbolEntry {
+  std::string name;
+  uint64_t addr;
+  uint64_t len;
+};
+
+struct SymbolComparator {
+  bool operator()(const std::unique_ptr<SymbolEntry>& symbol1,
+                  const std::unique_ptr<SymbolEntry>& symbol2);
+};
+
+struct DsoEntry {
+  std::string path;
+  std::set<std::unique_ptr<SymbolEntry>, SymbolComparator> symbols;
+
+  const SymbolEntry* FindSymbol(uint64_t offset_in_dso);
+};
+
+class DsoFactory {
+ public:
+  static void SetDemangle(bool demangle);
+  static bool SetSymFsDir(const std::string& symfs_dir);
+  static std::unique_ptr<DsoEntry> LoadKernel();
+  static std::unique_ptr<DsoEntry> LoadKernelModule(const std::string& dso_path);
+  static std::unique_ptr<DsoEntry> LoadDso(const std::string& dso_path);
+
+ private:
+  static bool demangle;
+  static std::string symfs_dir;
+};
+
+#endif  // SIMPLE_PERF_DSO_H_
diff --git a/simpleperf/environment.cpp b/simpleperf/environment.cpp
index 0270b24..e139b03 100644
--- a/simpleperf/environment.cpp
+++ b/simpleperf/environment.cpp
@@ -182,7 +182,7 @@
     if (android::base::EndsWith(name, ".ko")) {
       std::string module_name = name.substr(0, name.size() - 3);
       std::replace(module_name.begin(), module_name.end(), '-', '_');
-      module_file_map->insert(std::make_pair(module_name, path + name));
+      module_file_map->insert(std::make_pair(module_name, path + "/" + name));
     }
   }
   for (auto& name : subdirs) {
@@ -273,7 +273,8 @@
   return false;
 }
 
-static bool GetThreadComm(pid_t pid, std::vector<ThreadComm>* thread_comms) {
+static std::vector<pid_t> GetThreadsInProcess(pid_t pid) {
+  std::vector<pid_t> result;
   std::string task_dirname = android::base::StringPrintf("/proc/%d/task", pid);
   std::vector<std::string> subdirs;
   GetEntriesInDir(task_dirname, nullptr, &subdirs);
@@ -282,17 +283,26 @@
     if (!StringToPid(name, &tid)) {
       continue;
     }
-    std::string status_file = task_dirname + "/" + name + "/status";
+    result.push_back(tid);
+  }
+  return result;
+}
+
+static bool GetThreadComm(pid_t pid, std::vector<ThreadComm>* thread_comms) {
+  std::vector<pid_t> tids = GetThreadsInProcess(pid);
+  for (auto& tid : tids) {
+    std::string status_file = android::base::StringPrintf("/proc/%d/task/%d/status", pid, tid);
     std::string comm;
     pid_t tgid;
+    // It is possible that the process or thread exited before we can read its status.
     if (!ReadThreadNameAndTgid(status_file, &comm, &tgid)) {
-      return false;
+      continue;
     }
+    CHECK_EQ(pid, tgid);
     ThreadComm thread;
     thread.tid = tid;
-    thread.tgid = tgid;
+    thread.pid = pid;
     thread.comm = comm;
-    thread.is_process = (tid == pid);
     thread_comms->push_back(thread);
   }
   return true;
@@ -356,3 +366,50 @@
   std::string notefile = "/sys/module/" + module_name + "/notes/.note.gnu.build-id";
   return GetBuildIdFromNoteFile(notefile, build_id);
 }
+
+bool GetValidThreadsFromProcessString(const std::string& pid_str, std::set<pid_t>* tid_set) {
+  std::vector<std::string> strs = android::base::Split(pid_str, ",");  // comma-separated pids
+  for (auto& s : strs) {
+    pid_t pid;
+    if (!StringToPid(s, &pid)) {
+      LOG(ERROR) << "Invalid pid '" << s << "'";
+      return false;  // tid_set may already hold the threads of earlier pids
+    }
+    std::vector<pid_t> tids = GetThreadsInProcess(pid);  // tids listed under /proc/<pid>/task
+    if (tids.empty()) {
+      LOG(ERROR) << "Non existing process '" << pid << "'";
+      return false;
+    }
+    tid_set->insert(tids.begin(), tids.end());
+  }
+  return true;
+}
+
+bool GetValidThreadsFromThreadString(const std::string& tid_str, std::set<pid_t>* tid_set) {
+  std::vector<std::string> strs = android::base::Split(tid_str, ",");  // comma-separated tids
+  for (auto& s : strs) {
+    pid_t tid;
+    if (!StringToPid(s, &tid)) {
+      LOG(ERROR) << "Invalid tid '" << s << "'";
+      return false;  // tid_set may already hold earlier tids
+    }
+    if (!IsDir(android::base::StringPrintf("/proc/%d", tid))) {  // valid iff /proc/<tid> is a dir
+      LOG(ERROR) << "Non existing thread '" << tid << "'";
+      return false;
+    }
+    tid_set->insert(tid);
+  }
+  return true;
+}
+
+bool GetExecPath(std::string* exec_path) {
+  char path[PATH_MAX];
+  ssize_t path_len = readlink("/proc/self/exe", path, sizeof(path));  // does not NUL-terminate
+  if (path_len <= 0 || path_len >= static_cast<ssize_t>(sizeof(path))) {
+    PLOG(ERROR) << "readlink failed";  // also reached when the link fills the whole buffer
+    return false;
+  }
+  path[path_len] = '\0';  // in bounds: path_len < sizeof(path) here
+  *exec_path = path;
+  return true;
+}
diff --git a/simpleperf/environment.h b/simpleperf/environment.h
index f81005c..aa6f5eb 100644
--- a/simpleperf/environment.h
+++ b/simpleperf/environment.h
@@ -18,13 +18,15 @@
 #define SIMPLE_PERF_ENVIRONMENT_H_
 
 #include <functional>
+#include <set>
 #include <string>
 #include <vector>
+
 #include "build_id.h"
 
 std::vector<int> GetOnlineCpus();
 
-static const char* DEFAULT_KERNEL_MMAP_NAME = "[kernel.kallsyms]_text";
+constexpr char DEFAULT_KERNEL_MMAP_NAME[] = "[kernel.kallsyms]_text";
 
 struct KernelMmap {
   std::string name;
@@ -43,14 +45,13 @@
 bool GetKernelAndModuleMmaps(KernelMmap* kernel_mmap, std::vector<ModuleMmap>* module_mmaps);
 
 struct ThreadComm {
-  pid_t tgid, tid;
+  pid_t pid, tid;
   std::string comm;
-  bool is_process;
 };
 
 bool GetThreadComms(std::vector<ThreadComm>* thread_comms);
 
-static const char* DEFAULT_EXECNAME_FOR_THREAD_MMAP = "//anon";
+constexpr char DEFAULT_EXECNAME_FOR_THREAD_MMAP[] = "//anon";
 
 struct ThreadMmap {
   uint64_t start_addr;
@@ -62,11 +63,16 @@
 
 bool GetThreadMmapsInProcess(pid_t pid, std::vector<ThreadMmap>* thread_mmaps);
 
-static const char* DEFAULT_KERNEL_FILENAME_FOR_BUILD_ID = "[kernel.kallsyms]";
+constexpr char DEFAULT_KERNEL_FILENAME_FOR_BUILD_ID[] = "[kernel.kallsyms]";
 
 bool GetKernelBuildId(BuildId* build_id);
 bool GetModuleBuildId(const std::string& module_name, BuildId* build_id);
 
+bool GetValidThreadsFromProcessString(const std::string& pid_str, std::set<pid_t>* tid_set);
+bool GetValidThreadsFromThreadString(const std::string& tid_str, std::set<pid_t>* tid_set);
+
+bool GetExecPath(std::string* exec_path);
+
 // Expose the following functions for unit tests.
 std::vector<int> GetOnlineCpusFromString(const std::string& s);
 
diff --git a/simpleperf/environment_fake.cpp b/simpleperf/environment_fake.cpp
new file mode 100644
index 0000000..e8c9dd8
--- /dev/null
+++ b/simpleperf/environment_fake.cpp
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Add fake functions to build successfully on non-linux environments.
+#include "environment.h"
+
+bool ProcessKernelSymbols(const std::string&, std::function<bool(const KernelSymbol&)>) {
+  return false;
+}
diff --git a/simpleperf/event_attr.cpp b/simpleperf/event_attr.cpp
index 2b05931..c7ee182 100644
--- a/simpleperf/event_attr.cpp
+++ b/simpleperf/event_attr.cpp
@@ -46,17 +46,17 @@
 
 static std::string SampleTypeToString(uint64_t sample_type) {
   static std::vector<std::pair<int, std::string>> sample_type_names = {
+      {PERF_SAMPLE_ADDR, "addr"},
+      {PERF_SAMPLE_CALLCHAIN, "callchain"},
+      {PERF_SAMPLE_CPU, "cpu"},
+      {PERF_SAMPLE_ID, "id"},
       {PERF_SAMPLE_IP, "ip"},
+      {PERF_SAMPLE_PERIOD, "period"},
+      {PERF_SAMPLE_RAW, "raw"},
+      {PERF_SAMPLE_READ, "read"},
+      {PERF_SAMPLE_STREAM_ID, "stream_id"},
       {PERF_SAMPLE_TID, "tid"},
       {PERF_SAMPLE_TIME, "time"},
-      {PERF_SAMPLE_ADDR, "addr"},
-      {PERF_SAMPLE_READ, "read"},
-      {PERF_SAMPLE_CALLCHAIN, "callchain"},
-      {PERF_SAMPLE_ID, "id"},
-      {PERF_SAMPLE_CPU, "cpu"},
-      {PERF_SAMPLE_PERIOD, "period"},
-      {PERF_SAMPLE_STREAM_ID, "stream_id"},
-      {PERF_SAMPLE_RAW, "raw"},
   };
   return BitsToString("sample_type", sample_type, sample_type_names);
 }
@@ -85,17 +85,22 @@
   attr.read_format =
       PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID;
   attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_PERIOD;
+
+  if (attr.type == PERF_TYPE_TRACEPOINT) {
+    attr.sample_freq = 0;
+    attr.sample_period = 1;
+  }
   return attr;
 }
 
 void DumpPerfEventAttr(const perf_event_attr& attr, size_t indent) {
   std::string event_name = "unknown";
-  const EventType* event_type = EventTypeFactory::FindEventTypeByConfig(attr.type, attr.config);
+  const EventType* event_type = FindEventTypeByConfig(attr.type, attr.config);
   if (event_type != nullptr) {
     event_name = event_type->name;
   }
 
-  PrintIndented(indent, "event_attr: for event %s\n", event_name.c_str());
+  PrintIndented(indent, "event_attr: for event type %s\n", event_name.c_str());
 
   PrintIndented(indent + 1, "type %u, size %u, config %llu\n", attr.type, attr.size, attr.config);
 
@@ -111,21 +116,21 @@
   PrintIndented(indent + 1, "read_format (0x%llx) %s\n", attr.read_format,
                 ReadFormatToString(attr.read_format).c_str());
 
-  PrintIndented(indent + 1, "disabled %llu, inherit %llu, pinned %llu, exclusive %llu\n",
-                attr.disabled, attr.inherit, attr.pinned, attr.exclusive);
+  PrintIndented(indent + 1, "disabled %u, inherit %u, pinned %u, exclusive %u\n", attr.disabled,
+                attr.inherit, attr.pinned, attr.exclusive);
 
-  PrintIndented(indent + 1, "exclude_user %llu, exclude_kernel %llu, exclude_hv %llu\n",
+  PrintIndented(indent + 1, "exclude_user %u, exclude_kernel %u, exclude_hv %u\n",
                 attr.exclude_user, attr.exclude_kernel, attr.exclude_hv);
 
-  PrintIndented(indent + 1, "exclude_idle %llu, mmap %llu, comm %llu, freq %llu\n",
-                attr.exclude_idle, attr.mmap, attr.comm, attr.freq);
+  PrintIndented(indent + 1, "exclude_idle %u, mmap %u, comm %u, freq %u\n", attr.exclude_idle,
+                attr.mmap, attr.comm, attr.freq);
 
-  PrintIndented(indent + 1, "inherit_stat %llu, enable_on_exec %llu, task %llu\n",
-                attr.inherit_stat, attr.enable_on_exec, attr.task);
+  PrintIndented(indent + 1, "inherit_stat %u, enable_on_exec %u, task %u\n", attr.inherit_stat,
+                attr.enable_on_exec, attr.task);
 
-  PrintIndented(indent + 1, "watermark %llu, precise_ip %llu, mmap_data %llu\n", attr.watermark,
+  PrintIndented(indent + 1, "watermark %u, precise_ip %u, mmap_data %u\n", attr.watermark,
                 attr.precise_ip, attr.mmap_data);
 
-  PrintIndented(indent + 1, "sample_id_all %llu, exclude_host %llu, exclude_guest %llu\n",
+  PrintIndented(indent + 1, "sample_id_all %u, exclude_host %u, exclude_guest %u\n",
                 attr.sample_id_all, attr.exclude_host, attr.exclude_guest);
 }
diff --git a/simpleperf/event_attr.h b/simpleperf/event_attr.h
index 52f4aca..79d3df4 100644
--- a/simpleperf/event_attr.h
+++ b/simpleperf/event_attr.h
@@ -17,8 +17,7 @@
 #ifndef SIMPLE_PERF_EVENT_ATTR_H_
 #define SIMPLE_PERF_EVENT_ATTR_H_
 
-#include <stdint.h>
-#include <string>
+#include <stddef.h>
 
 #include "perf_event.h"
 
diff --git a/simpleperf/event_fd.cpp b/simpleperf/event_fd.cpp
index 386685c..9c5e4ab 100644
--- a/simpleperf/event_fd.cpp
+++ b/simpleperf/event_fd.cpp
@@ -22,6 +22,7 @@
 #include <sys/mman.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
+#include <atomic>
 #include <memory>
 
 #include <base/file.h>
@@ -37,36 +38,27 @@
   return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
 }
 
-std::unique_ptr<EventFd> EventFd::OpenEventFileForProcess(const perf_event_attr& attr, pid_t pid) {
-  return OpenEventFile(attr, pid, -1);
-}
-
-std::unique_ptr<EventFd> EventFd::OpenEventFileForCpu(const perf_event_attr& attr, int cpu) {
-  return OpenEventFile(attr, -1, cpu);
-}
-
-std::unique_ptr<EventFd> EventFd::OpenEventFile(const perf_event_attr& attr, pid_t pid, int cpu) {
+std::unique_ptr<EventFd> EventFd::OpenEventFile(const perf_event_attr& attr, pid_t tid, int cpu,
+                                                bool report_error) {
   perf_event_attr perf_attr = attr;
   std::string event_name = "unknown event";
-  const EventType* event_type =
-      EventTypeFactory::FindEventTypeByConfig(perf_attr.type, perf_attr.config);
+  const EventType* event_type = FindEventTypeByConfig(perf_attr.type, perf_attr.config);
   if (event_type != nullptr) {
     event_name = event_type->name;
   }
-  int perf_event_fd = perf_event_open(&perf_attr, pid, cpu, -1, 0);
+  int perf_event_fd = perf_event_open(&perf_attr, tid, cpu, -1, 0);
   if (perf_event_fd == -1) {
-    // It depends whether the perf_event_file configuration is supported by the kernel and the
-    // machine. So fail to open the file is not an error.
-    PLOG(DEBUG) << "open perf_event_file (event " << event_name << ", pid " << pid << ", cpu "
-                << cpu << ") failed";
+    (report_error ? PLOG(ERROR) : PLOG(DEBUG)) << "open perf_event_file (event " << event_name
+                                               << ", tid " << tid << ", cpu " << cpu << ") failed";
     return nullptr;
   }
   if (fcntl(perf_event_fd, F_SETFD, FD_CLOEXEC) == -1) {
-    PLOG(ERROR) << "fcntl(FD_CLOEXEC) for perf_event_file (event " << event_name << ", pid " << pid
-                << ", cpu " << cpu << ") failed";
+    (report_error ? PLOG(ERROR) : PLOG(DEBUG)) << "fcntl(FD_CLOEXEC) for perf_event_file (event "
+        << event_name << ", tid " << tid << ", cpu " << cpu << ") failed";
+    close(perf_event_fd);  // Log first (PLOG reads errno), then release the fd so it is not leaked.
     return nullptr;
   }
-  return std::unique_ptr<EventFd>(new EventFd(perf_event_fd, event_name, pid, cpu));
+  return std::unique_ptr<EventFd>(new EventFd(perf_event_fd, event_name, tid, cpu));
 }
 
 EventFd::~EventFd() {
@@ -77,8 +69,8 @@
 }
 
 std::string EventFd::Name() const {
-  return android::base::StringPrintf("perf_event_file(event %s, pid %d, cpu %d)",
-                                     event_name_.c_str(), pid_, cpu_);
+  return android::base::StringPrintf("perf_event_file(event %s, tid %d, cpu %d)",
+                                     event_name_.c_str(), tid_, cpu_);
 }
 
 uint64_t EventFd::Id() const {
@@ -176,3 +168,8 @@
   poll_fd->fd = perf_event_fd_;
   poll_fd->events = POLLIN;
 }
+
+bool IsEventAttrSupportedByKernel(perf_event_attr attr) {
+  auto event_fd = EventFd::OpenEventFile(attr, getpid(), -1, false);
+  return event_fd != nullptr;
+}
diff --git a/simpleperf/event_fd.h b/simpleperf/event_fd.h
index 36ea0cb..1fe5af0 100644
--- a/simpleperf/event_fd.h
+++ b/simpleperf/event_fd.h
@@ -37,13 +37,12 @@
 // EventFd represents an opened perf_event_file.
 class EventFd {
  public:
-  static std::unique_ptr<EventFd> OpenEventFileForProcess(const perf_event_attr& attr, pid_t pid);
-  static std::unique_ptr<EventFd> OpenEventFileForCpu(const perf_event_attr& attr, int cpu);
-  static std::unique_ptr<EventFd> OpenEventFile(const perf_event_attr& attr, pid_t pid, int cpu);
+  static std::unique_ptr<EventFd> OpenEventFile(const perf_event_attr& attr, pid_t tid, int cpu,
+                                                bool report_error = true);
 
   ~EventFd();
 
-  // Give information about this perf_event_file, like (event_name, pid, cpu).
+  // Give information about this perf_event_file, like (event_name, tid, cpu).
   std::string Name() const;
 
   uint64_t Id() const;
@@ -72,11 +71,11 @@
   void PreparePollForMmapData(pollfd* poll_fd);
 
  private:
-  EventFd(int perf_event_fd, const std::string& event_name, pid_t pid, int cpu)
+  EventFd(int perf_event_fd, const std::string& event_name, pid_t tid, int cpu)
       : perf_event_fd_(perf_event_fd),
         id_(0),
         event_name_(event_name),
-        pid_(pid),
+        tid_(tid),
         cpu_(cpu),
         mmap_addr_(nullptr),
         mmap_len_(0) {
@@ -85,7 +84,7 @@
   int perf_event_fd_;
   mutable uint64_t id_;
   const std::string event_name_;
-  pid_t pid_;
+  pid_t tid_;
   int cpu_;
 
   void* mmap_addr_;
@@ -98,4 +97,6 @@
   DISALLOW_COPY_AND_ASSIGN(EventFd);
 };
 
+bool IsEventAttrSupportedByKernel(perf_event_attr attr);
+
 #endif  // SIMPLE_PERF_EVENT_FD_H_
diff --git a/simpleperf/event_selection_set.cpp b/simpleperf/event_selection_set.cpp
index 61f1705..a9a0f96 100644
--- a/simpleperf/event_selection_set.cpp
+++ b/simpleperf/event_selection_set.cpp
@@ -17,22 +17,54 @@
 #include "event_selection_set.h"
 
 #include <base/logging.h>
+#include <base/stringprintf.h>
 
 #include "environment.h"
 #include "event_attr.h"
 #include "event_type.h"
 
-void EventSelectionSet::AddEventType(const EventType& event_type) {
-  EventSelection selection;
-  selection.event_type = &event_type;
-  selection.event_attr = CreateDefaultPerfEventAttr(event_type);
-  selections_.push_back(std::move(selection));
+bool IsBranchSamplingSupported() {
+  const EventType* type = FindEventTypeByName("cpu-cycles");
+  if (type == nullptr) {
+    return false;
+  }
+  perf_event_attr attr = CreateDefaultPerfEventAttr(*type);
+  attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
+  attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY;
+  return IsEventAttrSupportedByKernel(attr);
 }
 
-void EventSelectionSet::EnableOnExec() {
-  for (auto& selection : selections_) {
-    selection.event_attr.enable_on_exec = 1;
+bool EventSelectionSet::AddEventType(const EventTypeAndModifier& event_type_modifier) {
+  EventSelection selection;
+  selection.event_type = event_type_modifier.event_type;
+  selection.event_attr = CreateDefaultPerfEventAttr(event_type_modifier.event_type);
+  selection.event_attr.exclude_user = event_type_modifier.exclude_user;
+  selection.event_attr.exclude_kernel = event_type_modifier.exclude_kernel;
+  selection.event_attr.exclude_hv = event_type_modifier.exclude_hv;
+  selection.event_attr.exclude_host = event_type_modifier.exclude_host;
+  selection.event_attr.exclude_guest = event_type_modifier.exclude_guest;
+  selection.event_attr.precise_ip = event_type_modifier.precise_ip;
+  if (!IsEventAttrSupportedByKernel(selection.event_attr)) {
+    LOG(ERROR) << "Event type '" << selection.event_type.name << "' is not supported by the kernel";
+    return false;
   }
+  selections_.push_back(std::move(selection));
+  return true;
+}
+
+void EventSelectionSet::SetEnableOnExec(bool enable) {
+  for (auto& selection : selections_) {
+    selection.event_attr.enable_on_exec = (enable ? 1 : 0);
+  }
+}
+
+bool EventSelectionSet::GetEnableOnExec() {
+  for (auto& selection : selections_) {
+    if (selection.event_attr.enable_on_exec == 0) {
+      return false;
+    }
+  }
+  return true;
 }
 
 void EventSelectionSet::SampleIdAll() {
@@ -57,38 +89,78 @@
   }
 }
 
-bool EventSelectionSet::OpenEventFilesForAllCpus() {
-  std::vector<int> cpus = GetOnlineCpus();
-  if (cpus.empty()) {
+bool EventSelectionSet::SetBranchSampling(uint64_t branch_sample_type) {
+  if (branch_sample_type != 0 &&
+      (branch_sample_type & (PERF_SAMPLE_BRANCH_ANY | PERF_SAMPLE_BRANCH_ANY_CALL |
+                             PERF_SAMPLE_BRANCH_ANY_RETURN | PERF_SAMPLE_BRANCH_IND_CALL)) == 0) {
+    LOG(ERROR) << "Invalid branch_sample_type: 0x" << std::hex << branch_sample_type;
+    return false;
+  }
+  if (branch_sample_type != 0 && !IsBranchSamplingSupported()) {
+    LOG(ERROR) << "branch stack sampling is not supported on this device.";
     return false;
   }
   for (auto& selection : selections_) {
-    for (auto& cpu : cpus) {
-      auto event_fd = EventFd::OpenEventFileForCpu(selection.event_attr, cpu);
-      if (event_fd != nullptr) {
-        selection.event_fds.push_back(std::move(event_fd));
-      }
+    perf_event_attr& attr = selection.event_attr;
+    if (branch_sample_type != 0) {
+      attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
+    } else {
+      attr.sample_type &= ~PERF_SAMPLE_BRANCH_STACK;
     }
-    // As the online cpus can be enabled or disabled at runtime, we may not open event file for
-    // all cpus successfully. But we should open at least one cpu successfully.
-    if (selection.event_fds.empty()) {
-      LOG(ERROR) << "failed to open perf event file for event_type " << selection.event_type->name
-                 << " on all cpus";
-      return false;
-    }
+    attr.branch_sample_type = branch_sample_type;
   }
   return true;
 }
 
-bool EventSelectionSet::OpenEventFilesForProcess(pid_t pid) {
+void EventSelectionSet::EnableCallChainSampling() {
   for (auto& selection : selections_) {
-    auto event_fd = EventFd::OpenEventFileForProcess(selection.event_attr, pid);
-    if (event_fd == nullptr) {
-      PLOG(ERROR) << "failed to open perf event file for event type " << selection.event_type->name
-                  << " on pid " << pid;
-      return false;
+    selection.event_attr.sample_type |= PERF_SAMPLE_CALLCHAIN;
+  }
+}
+
+void EventSelectionSet::SetInherit(bool enable) {
+  for (auto& selection : selections_) {
+    selection.event_attr.inherit = (enable ? 1 : 0);
+  }
+}
+
+bool EventSelectionSet::OpenEventFilesForAllCpus() {
+  return OpenEventFilesForThreadsOnAllCpus({-1});
+}
+
+bool EventSelectionSet::OpenEventFilesForThreads(const std::vector<pid_t>& threads) {
+  return OpenEventFiles(threads, {-1});
+}
+
+bool EventSelectionSet::OpenEventFilesForThreadsOnAllCpus(const std::vector<pid_t>& threads) {
+  std::vector<int> cpus = GetOnlineCpus();
+  if (cpus.empty()) {
+    return false;
+  }
+  return OpenEventFiles(threads, cpus);
+}
+
+bool EventSelectionSet::OpenEventFiles(const std::vector<pid_t>& threads,
+                                       const std::vector<int>& cpus) {
+  for (auto& selection : selections_) {
+    for (auto& tid : threads) {
+      size_t open_per_thread = 0;
+      for (auto& cpu : cpus) {
+        auto event_fd = EventFd::OpenEventFile(selection.event_attr, tid, cpu);
+        if (event_fd != nullptr) {
+          selection.event_fds.push_back(std::move(event_fd));
+          ++open_per_thread;
+        }
+      }
+      // Cpus can go online or offline at runtime, so opening the event file may fail on some of
+      // them. That is tolerated, but each thread must have at least one event file opened.
+      if (open_per_thread == 0) {
+        PLOG(ERROR) << "failed to open perf event file for event_type " << selection.event_type.name
+                    << " for "
+                    << (tid == -1 ? "all threads" : android::base::StringPrintf("thread %d", tid));
+        return false;
+      }
     }
-    selection.event_fds.push_back(std::move(event_fd));
   }
   return true;
 }
@@ -115,7 +187,7 @@
       }
       counters.push_back(counter);
     }
-    counters_map->insert(std::make_pair(selection.event_type, counters));
+    counters_map->insert(std::make_pair(&selection.event_type, counters));
   }
   return true;
 }
@@ -191,7 +263,7 @@
 EventSelectionSet::EventSelection* EventSelectionSet::FindSelectionByType(
     const EventType& event_type) {
   for (auto& selection : selections_) {
-    if (strcmp(selection.event_type->name, event_type.name) == 0) {
+    if (selection.event_type.name == event_type.name) {
       return &selection;
     }
   }
diff --git a/simpleperf/event_selection_set.h b/simpleperf/event_selection_set.h
index 78be069..e52ec5f 100644
--- a/simpleperf/event_selection_set.h
+++ b/simpleperf/event_selection_set.h
@@ -25,10 +25,9 @@
 #include <base/macros.h>
 
 #include "event_fd.h"
+#include "event_type.h"
 #include "perf_event.h"
 
-struct EventType;
-
 // EventSelectionSet helps to monitor events.
 // Firstly, the user creates an EventSelectionSet, and adds the specific event types to monitor.
 // Secondly, the user defines how to monitor the events (by setting enable_on_exec flag,
@@ -47,15 +46,20 @@
     return selections_.empty();
   }
 
-  void AddEventType(const EventType& event_type);
+  bool AddEventType(const EventTypeAndModifier& event_type_modifier);
 
-  void EnableOnExec();
+  void SetEnableOnExec(bool enable);
+  bool GetEnableOnExec();
   void SampleIdAll();
   void SetSampleFreq(uint64_t sample_freq);
   void SetSamplePeriod(uint64_t sample_period);
+  bool SetBranchSampling(uint64_t branch_sample_type);
+  void EnableCallChainSampling();
+  void SetInherit(bool enable);
 
   bool OpenEventFilesForAllCpus();
-  bool OpenEventFilesForProcess(pid_t pid);
+  bool OpenEventFilesForThreads(const std::vector<pid_t>& threads);
+  bool OpenEventFilesForThreadsOnAllCpus(const std::vector<pid_t>& threads);
   bool EnableEvents();
   bool ReadCounters(std::map<const EventType*, std::vector<PerfCounter>>* counters_map);
   void PreparePollForEventFiles(std::vector<pollfd>* pollfds);
@@ -67,8 +71,10 @@
   const std::vector<std::unique_ptr<EventFd>>& FindEventFdsByType(const EventType& event_type);
 
  private:
+  bool OpenEventFiles(const std::vector<pid_t>& threads, const std::vector<int>& cpus);
+
   struct EventSelection {
-    const EventType* event_type;
+    EventType event_type;
     perf_event_attr event_attr;
     std::vector<std::unique_ptr<EventFd>> event_fds;
   };
diff --git a/simpleperf/event_type.cpp b/simpleperf/event_type.cpp
index ee0e161..526cfa5 100644
--- a/simpleperf/event_type.cpp
+++ b/simpleperf/event_type.cpp
@@ -17,39 +17,78 @@
 #include "event_type.h"
 
 #include <unistd.h>
+#include <algorithm>
 #include <string>
 #include <vector>
 
+#include <base/file.h>
 #include <base/logging.h>
 
 #include "event_attr.h"
 #include "event_fd.h"
+#include "utils.h"
 
 #define EVENT_TYPE_TABLE_ENTRY(name, type, config) \
   { name, type, config }                           \
   ,
 
-static std::vector<const EventType> event_type_array = {
+static const std::vector<EventType> static_event_type_array = {
 #include "event_type_table.h"
 };
 
-static bool IsEventTypeSupportedByKernel(const EventType& event_type) {
-  auto event_fd = EventFd::OpenEventFileForProcess(CreateDefaultPerfEventAttr(event_type), getpid());
-  return event_fd != nullptr;
+static const std::vector<EventType> GetTracepointEventTypes() {
+  std::vector<EventType> result;
+  const std::string tracepoint_dirname = "/sys/kernel/debug/tracing/events";
+  std::vector<std::string> system_dirs;
+  GetEntriesInDir(tracepoint_dirname, nullptr, &system_dirs);
+  for (auto& system_name : system_dirs) {
+    std::string system_path = tracepoint_dirname + "/" + system_name;
+    std::vector<std::string> event_dirs;
+    GetEntriesInDir(system_path, nullptr, &event_dirs);
+    for (auto& event_name : event_dirs) {
+      std::string id_path = system_path + "/" + event_name + "/id";
+      std::string id_content;
+      if (!android::base::ReadFileToString(id_path, &id_content)) {
+        continue;
+      }
+      char* endptr;
+      uint64_t id = strtoull(id_content.c_str(), &endptr, 10);
+      if (endptr == id_content.c_str()) {
+        LOG(DEBUG) << "unexpected id '" << id_content << "' in " << id_path;
+        continue;
+      }
+      result.push_back(EventType(system_name + ":" + event_name, PERF_TYPE_TRACEPOINT, id));
+    }
+  }
+  std::sort(result.begin(), result.end(),
+            [](const EventType& type1, const EventType& type2) { return type1.name < type2.name; });
+  return result;
 }
 
-bool EventType::IsSupportedByKernel() const {
-  return IsEventTypeSupportedByKernel(*this);
-}
-
-const std::vector<const EventType>& EventTypeFactory::GetAllEventTypes() {
+const std::vector<EventType>& GetAllEventTypes() {
+  static std::vector<EventType> event_type_array;
+  if (event_type_array.empty()) {
+    event_type_array.insert(event_type_array.end(), static_event_type_array.begin(),
+                            static_event_type_array.end());
+    const std::vector<EventType> tracepoint_array = GetTracepointEventTypes();
+    event_type_array.insert(event_type_array.end(), tracepoint_array.begin(),
+                            tracepoint_array.end());
+  }
   return event_type_array;
 }
 
-const EventType* EventTypeFactory::FindEventTypeByName(const std::string& name,
-                                                       bool report_unsupported_type) {
+const EventType* FindEventTypeByConfig(uint32_t type, uint64_t config) {
+  for (auto& event_type : GetAllEventTypes()) {
+    if (event_type.type == type && event_type.config == config) {
+      return &event_type;
+    }
+  }
+  return nullptr;
+}
+
+const EventType* FindEventTypeByName(const std::string& name) {
   const EventType* result = nullptr;
-  for (auto& event_type : event_type_array) {
+  for (auto& event_type : GetAllEventTypes()) {
     if (event_type.name == name) {
       result = &event_type;
       break;
@@ -60,19 +99,77 @@
                << "', try `simpleperf list` to list all possible event type names";
     return nullptr;
   }
-  if (!result->IsSupportedByKernel()) {
-    (report_unsupported_type ? PLOG(ERROR) : PLOG(DEBUG)) << "Event type '" << result->name
-                                                          << "' is not supported by the kernel";
-    return nullptr;
-  }
   return result;
 }
 
-const EventType* EventTypeFactory::FindEventTypeByConfig(uint32_t type, uint64_t config) {
-  for (auto& event_type : event_type_array) {
-    if (event_type.type == type && event_type.config == config) {
-      return &event_type;
+std::unique_ptr<EventTypeAndModifier> ParseEventType(const std::string& event_type_str) {
+  static std::string modifier_characters = "ukhGHp";
+  std::unique_ptr<EventTypeAndModifier> event_type_modifier(new EventTypeAndModifier);
+  std::string name = event_type_str;
+  std::string modifier;
+  size_t comm_pos = event_type_str.rfind(':');
+  if (comm_pos != std::string::npos) {
+    bool match_modifier = true;
+    for (size_t i = comm_pos + 1; i < event_type_str.size(); ++i) {
+      char c = event_type_str[i];
+      if (c != ' ' && modifier_characters.find(c) == std::string::npos) {
+        match_modifier = false;
+        break;
+      }
+    }
+    if (match_modifier) {
+      name = event_type_str.substr(0, comm_pos);
+      modifier = event_type_str.substr(comm_pos + 1);
     }
   }
-  return nullptr;
+  const EventType* event_type = FindEventTypeByName(name);
+  if (event_type == nullptr) {
+    // Try if the modifier belongs to the event type name, like some tracepoint events.
+    if (!modifier.empty()) {
+      name = event_type_str;
+      modifier.clear();
+      event_type = FindEventTypeByName(name);
+    }
+    if (event_type == nullptr) {
+      return nullptr;
+    }
+  }
+  event_type_modifier->event_type = *event_type;
+  if (modifier.find_first_of("ukh") != std::string::npos) {
+    event_type_modifier->exclude_user = true;
+    event_type_modifier->exclude_kernel = true;
+    event_type_modifier->exclude_hv = true;
+  }
+  if (modifier.find_first_of("GH") != std::string::npos) {
+    event_type_modifier->exclude_guest = true;
+    event_type_modifier->exclude_host = true;
+  }
+
+  for (auto& c : modifier) {
+    switch (c) {
+      case 'u':
+        event_type_modifier->exclude_user = false;
+        break;
+      case 'k':
+        event_type_modifier->exclude_kernel = false;
+        break;
+      case 'h':
+        event_type_modifier->exclude_hv = false;
+        break;
+      case 'G':
+        event_type_modifier->exclude_guest = false;
+        break;
+      case 'H':
+        event_type_modifier->exclude_host = false;
+        break;
+      case 'p':
+        event_type_modifier->precise_ip++;
+        break;
+      case ' ':
+        break;
+      default:
+        LOG(ERROR) << "Unknown event type modifier '" << c << "'";
+    }
+  }
+  return event_type_modifier;
 }
diff --git a/simpleperf/event_type.h b/simpleperf/event_type.h
index b486a29..df2f782 100644
--- a/simpleperf/event_type.h
+++ b/simpleperf/event_type.h
@@ -18,6 +18,7 @@
 #define SIMPLE_PERF_EVENT_H_
 
 #include <stdint.h>
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -27,19 +28,41 @@
 // the event type is supported by the kernel.
 
 struct EventType {
-  bool IsSupportedByKernel() const;
+  EventType(const std::string& name, uint32_t type, uint64_t config)
+      : name(name), type(type), config(config) {
+  }
 
-  const char* name;
+  EventType() : type(0), config(0) {
+  }
+
+  std::string name;
   uint32_t type;
   uint64_t config;
 };
 
-class EventTypeFactory {
- public:
-  static const std::vector<const EventType>& GetAllEventTypes();
-  static const EventType* FindEventTypeByName(const std::string& name,
-                                              bool report_unsupported_type = true);
-  static const EventType* FindEventTypeByConfig(uint32_t type, uint64_t config);
+const std::vector<EventType>& GetAllEventTypes();
+const EventType* FindEventTypeByConfig(uint32_t type, uint64_t config);
+const EventType* FindEventTypeByName(const std::string& name);
+
+struct EventTypeAndModifier {
+  EventType event_type;
+  bool exclude_user;
+  bool exclude_kernel;
+  bool exclude_hv;
+  bool exclude_host;
+  bool exclude_guest;
+  unsigned precise_ip : 2;  // Bumped once per 'p' modifier; unsigned so "pp" reads back as 2.
+
+  EventTypeAndModifier()
+      : exclude_user(false),
+        exclude_kernel(false),
+        exclude_hv(false),
+        exclude_host(false),
+        exclude_guest(false),
+        precise_ip(0) {
+  }
 };
 
+std::unique_ptr<EventTypeAndModifier> ParseEventType(const std::string& event_type_str);
+
 #endif  // SIMPLE_PERF_EVENT_H_
diff --git a/simpleperf/event_type_table.h b/simpleperf/event_type_table.h
index 895cc85..a77be0a 100644
--- a/simpleperf/event_type_table.h
+++ b/simpleperf/event_type_table.h
@@ -20,43 +20,43 @@
 {"alignment-faults", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_ALIGNMENT_FAULTS},
 {"emulation-faults", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS},
 
-{"L1-dcache-loades", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
+{"L1-dcache-loads", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"L1-dcache-load-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
 {"L1-dcache-stores", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"L1-dcache-store-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
 {"L1-dcache-prefetches", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"L1-dcache-prefetch-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
-{"L1-icache-loades", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_L1I) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
+{"L1-icache-loads", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_L1I) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"L1-icache-load-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_L1I) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
 {"L1-icache-stores", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_L1I) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"L1-icache-store-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_L1I) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
 {"L1-icache-prefetches", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_L1I) | (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"L1-icache-prefetch-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_L1I) | (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
-{"LLC-loades", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_LL) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
+{"LLC-loads", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_LL) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"LLC-load-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_LL) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
 {"LLC-stores", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_LL) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"LLC-store-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_LL) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
 {"LLC-prefetches", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_LL) | (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"LLC-prefetch-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_LL) | (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
-{"dTLB-loades", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_DTLB) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
+{"dTLB-loads", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_DTLB) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"dTLB-load-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_DTLB) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
 {"dTLB-stores", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_DTLB) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"dTLB-store-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_DTLB) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
 {"dTLB-prefetches", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_DTLB) | (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"dTLB-prefetch-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_DTLB) | (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
-{"iTLB-loades", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_ITLB) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
+{"iTLB-loads", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_ITLB) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"iTLB-load-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_ITLB) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
 {"iTLB-stores", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_ITLB) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"iTLB-store-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_ITLB) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
 {"iTLB-prefetches", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_ITLB) | (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"iTLB-prefetch-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_ITLB) | (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
-{"branch-loades", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_BPU) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
+{"branch-loads", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_BPU) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"branch-load-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_BPU) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
 {"branch-stores", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_BPU) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"branch-store-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_BPU) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
 {"branch-prefetches", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_BPU) | (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"branch-prefetch-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_BPU) | (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
-{"node-loades", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_NODE) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
+{"node-loads", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_NODE) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"node-load-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_NODE) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
 {"node-stores", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_NODE) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))},
 {"node-store-misses", PERF_TYPE_HW_CACHE, ((PERF_COUNT_HW_CACHE_NODE) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))},
diff --git a/simpleperf/generate_event_type_table.py b/simpleperf/generate_event_type_table.py
index b3fb897..9fbd278 100755
--- a/simpleperf/generate_event_type_table.py
+++ b/simpleperf/generate_event_type_table.py
@@ -81,7 +81,7 @@
                       ["branch", "PERF_COUNT_HW_CACHE_BPU"],
                       ["node", "PERF_COUNT_HW_CACHE_NODE"],
                       ]
-    hw_cache_ops = [["loades", "load", "PERF_COUNT_HW_CACHE_OP_READ"],
+    hw_cache_ops = [["loads", "load", "PERF_COUNT_HW_CACHE_OP_READ"],
                     ["stores", "store", "PERF_COUNT_HW_CACHE_OP_WRITE"],
                     ["prefetches", "prefetch",
                      "PERF_COUNT_HW_CACHE_OP_PREFETCH"],
diff --git a/simpleperf/main.cpp b/simpleperf/main.cpp
index 173026e..7cc04b8 100644
--- a/simpleperf/main.cpp
+++ b/simpleperf/main.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <string.h>
+#include <map>
 #include <string>
 #include <vector>
 
@@ -22,9 +23,17 @@
 
 #include "command.h"
 
+static std::map<std::string, android::base::LogSeverity> log_severity_map = {
+    {"debug", android::base::DEBUG},
+    {"warning", android::base::WARNING},
+    {"error", android::base::ERROR},
+    {"fatal", android::base::FATAL},
+};
+
 int main(int argc, char** argv) {
   InitLogging(argv, android::base::StderrLogger);
   std::vector<std::string> args;
+  android::base::LogSeverity log_severity = android::base::ERROR;
 
   if (argc == 1) {
     args.push_back("help");
@@ -32,18 +41,34 @@
     for (int i = 1; i < argc; ++i) {
       if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) {
         args.insert(args.begin(), "help");
+      } else if (strcmp(argv[i], "--log") == 0) {
+        if (i + 1 < argc) {
+          ++i;
+          auto it = log_severity_map.find(argv[i]);
+          if (it != log_severity_map.end()) {
+            log_severity = it->second;
+          } else {
+            LOG(ERROR) << "Unknown log severity: " << argv[i];
+            return 1;
+          }
+        } else {
+          LOG(ERROR) << "Missing argument for --log option.\n";
+          return 1;
+        }
       } else {
         args.push_back(argv[i]);
       }
     }
   }
+  android::base::ScopedLogSeverity severity(log_severity);
 
-  Command* command = Command::FindCommandByName(args[0]);
+  std::unique_ptr<Command> command = CreateCommandInstance(args[0]);
   if (command == nullptr) {
     LOG(ERROR) << "malformed command line: unknown command " << args[0];
     return 1;
   }
   std::string command_name = args[0];
+  args.erase(args.begin());
 
   LOG(DEBUG) << "command '" << command_name << "' starts running";
   bool result = command->Run(args);
diff --git a/simpleperf/perf_event.h b/simpleperf/perf_event.h
index a91eb6b..1688dc9 100644
--- a/simpleperf/perf_event.h
+++ b/simpleperf/perf_event.h
@@ -17,6 +17,14 @@
 #ifndef SIMPLE_PERF_PERF_EVENT_H_
 #define SIMPLE_PERF_PERF_EVENT_H_
 
+#if defined(USE_BIONIC_PERF_EVENT_H)
+
+#include <libc/kernel/uapi/linux/perf_event.h>
+
+#else
+
 #include <linux/perf_event.h>
 
+#endif
+
 #endif  // SIMPLE_PERF_PERF_EVENT_H_
diff --git a/simpleperf/read_elf.cpp b/simpleperf/read_elf.cpp
index 1873b30..4d41165 100644
--- a/simpleperf/read_elf.cpp
+++ b/simpleperf/read_elf.cpp
@@ -32,10 +32,11 @@
 
 #pragma clang diagnostic pop
 
-#include <elf.h>
-
 #include "utils.h"
 
+#define ELF_NOTE_GNU "GNU"
+#define NT_GNU_BUILD_ID 3
+
 static bool GetBuildIdFromNoteSection(const char* section, size_t section_size, BuildId* build_id) {
   const char* p = section;
   const char* end = p + section_size;
@@ -77,7 +78,7 @@
 bool GetBuildIdFromELFFile(const llvm::object::ELFFile<ELFT>* elf, BuildId* build_id) {
   for (auto section_iterator = elf->begin_sections(); section_iterator != elf->end_sections();
        ++section_iterator) {
-    if (section_iterator->sh_type == SHT_NOTE) {
+    if (section_iterator->sh_type == llvm::ELF::SHT_NOTE) {
       auto contents = elf->getSectionContents(&*section_iterator);
       if (contents.getError()) {
         LOG(DEBUG) << "read note section error";
@@ -114,3 +115,101 @@
   }
   return result;
 }
+
+bool IsArmMappingSymbol(const char* name) {
+  // Mapping symbols ("ELF for ARM Architecture", "ELF for ARM 64-bit Architecture") match
+  // ^\$(a|d|t|x)(\..*)?$. strchr() also finds the terminator, so an empty name[1] is rejected first.
+  return name[0] == '$' && name[1] != '\0' && strchr("adtx", name[1]) != nullptr &&
+         (name[2] == '\0' || name[2] == '.');
+}
+
+// Reports each named symbol that sits in a named section of elf to callback; always returns true.
+template <class ELFT>
+bool ParseSymbolsFromELFFile(const llvm::object::ELFFile<ELFT>* elf,
+                             std::function<void(const ElfFileSymbol&)> callback) {
+  bool is_arm = (elf->getHeader()->e_machine == llvm::ELF::EM_ARM ||
+                 elf->getHeader()->e_machine == llvm::ELF::EM_AARCH64);
+  auto begin = elf->begin_symbols();
+  auto end = elf->end_symbols();
+  if (begin == end) {  // No regular symbols: fall back to the dynamic symbols.
+    begin = elf->begin_dynamic_symbols();
+    end = elf->end_dynamic_symbols();
+  }
+  for (; begin != end; ++begin) {
+    auto& elf_symbol = *begin;
+    // Value-initialize; memset() over a struct holding a std::string is undefined behavior.
+    ElfFileSymbol symbol = {};
+
+    auto shdr = elf->getSection(&elf_symbol);
+    if (shdr == nullptr) {
+      continue;
+    }
+    auto section_name = elf->getSectionName(shdr);
+    if (section_name.getError() || section_name.get().empty()) {
+      continue;
+    }
+    if (section_name.get() == ".text") {
+      symbol.is_in_text_section = true;
+    }
+
+    auto symbol_name = elf->getSymbolName(begin);
+    if (symbol_name.getError()) {
+      continue;
+    }
+    symbol.name = symbol_name.get();
+    if (symbol.name.empty()) {
+      continue;
+    }
+
+    symbol.start_in_file = elf_symbol.st_value - shdr->sh_addr + shdr->sh_offset;
+    if ((symbol.start_in_file & 1) != 0 && is_arm) {
+      // Arm sets bit 0 to mark it as thumb code, remove the flag.
+      symbol.start_in_file &= ~1;
+    }
+    symbol.len = elf_symbol.st_size;
+    int type = elf_symbol.getType();
+    if (type == llvm::ELF::STT_FUNC) {
+      symbol.is_func = true;
+    } else if (type == llvm::ELF::STT_NOTYPE) {
+      if (symbol.is_in_text_section) {
+        symbol.is_label = true;
+        if (is_arm) {
+          // Remove mapping symbols in arm.
+          const char* p = (symbol.name.compare(0, linker_prefix.size(), linker_prefix) == 0)
+                              ? symbol.name.c_str() + linker_prefix.size()
+                              : symbol.name.c_str();
+          if (IsArmMappingSymbol(p)) {
+            symbol.is_label = false;
+          }
+        }
+      }
+    }
+
+    callback(symbol);
+  }
+  return true;
+}
+
+// Passes each symbol parsed from ELF file filename to callback; returns false on failure.
+bool ParseSymbolsFromElfFile(const std::string& filename,
+                             std::function<void(const ElfFileSymbol&)> callback) {
+  auto owning_binary = llvm::object::createBinary(llvm::StringRef(filename));
+  if (owning_binary.getError()) {
+    PLOG(DEBUG) << "can't open file '" << filename << "'";
+    return false;
+  }
+  bool result = false;
+  if (auto obj = llvm::dyn_cast<llvm::object::ObjectFile>(owning_binary.get().getBinary())) {
+    if (auto elf = llvm::dyn_cast<llvm::object::ELF32LEObjectFile>(obj)) {
+      result = ParseSymbolsFromELFFile(elf->getELFFile(), callback);
+    } else if (auto elf = llvm::dyn_cast<llvm::object::ELF64LEObjectFile>(obj)) {
+      result = ParseSymbolsFromELFFile(elf->getELFFile(), callback);
+    } else {
+      PLOG(DEBUG) << "unknown elf format in file " << filename;
+    }
+  }
+  if (!result) {
+    PLOG(DEBUG) << "can't parse symbols from file " << filename;
+  }
+  return result;
+}
diff --git a/simpleperf/read_elf.h b/simpleperf/read_elf.h
index bc65fea..96eb2f3 100644
--- a/simpleperf/read_elf.h
+++ b/simpleperf/read_elf.h
@@ -17,10 +17,29 @@
 #ifndef SIMPLE_PERF_READ_ELF_H_
 #define SIMPLE_PERF_READ_ELF_H_
 
+#include <functional>
 #include <string>
 #include "build_id.h"
 
 bool GetBuildIdFromNoteFile(const std::string& filename, BuildId* build_id);
 bool GetBuildIdFromElfFile(const std::string& filename, BuildId* build_id);
 
+// The symbol prefix used to indicate that the symbol belongs to android linker.
+static const std::string linker_prefix = "__dl_";
+
+struct ElfFileSymbol {
+  uint64_t start_in_file;
+  uint64_t len;
+  bool is_func;
+  bool is_label;
+  bool is_in_text_section;
+  std::string name;
+};
+
+bool ParseSymbolsFromElfFile(const std::string& filename,
+                             std::function<void(const ElfFileSymbol&)> callback);
+
+// Expose the following functions for unit tests.
+bool IsArmMappingSymbol(const char* name);
+
 #endif  // SIMPLE_PERF_READ_ELF_H_
diff --git a/simpleperf/read_elf_test.cpp b/simpleperf/read_elf_test.cpp
new file mode 100644
index 0000000..c0ff660
--- /dev/null
+++ b/simpleperf/read_elf_test.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "read_elf.h"
+
+#include <gtest/gtest.h>
+
+static void ParseSymbol(const ElfFileSymbol& symbol, bool* result) {
+  if (symbol.is_func) {
+    *result = true;
+  }
+}
+
+TEST(read_elf, parse_symbols_from_elf_file) {
+  char elf_file[PATH_MAX];
+  ssize_t elf_file_len = readlink("/proc/self/exe", elf_file, sizeof(elf_file));
+  ASSERT_GT(elf_file_len, 0L);
+  ASSERT_LT(static_cast<size_t>(elf_file_len), sizeof(elf_file));
+  elf_file[elf_file_len] = '\0';
+
+  bool result = false;
+  ASSERT_TRUE(
+      ParseSymbolsFromElfFile(elf_file, std::bind(ParseSymbol, std::placeholders::_1, &result)));
+  ASSERT_TRUE(result);
+}
+
+TEST(read_elf, arm_mapping_symbol) {
+  ASSERT_TRUE(IsArmMappingSymbol("$a"));
+  ASSERT_FALSE(IsArmMappingSymbol("$b"));
+  ASSERT_TRUE(IsArmMappingSymbol("$a.anything"));
+  ASSERT_FALSE(IsArmMappingSymbol("$a_no_dot"));
+}
diff --git a/simpleperf/record.cpp b/simpleperf/record.cpp
index 46910b9..f6b2560 100644
--- a/simpleperf/record.cpp
+++ b/simpleperf/record.cpp
@@ -27,16 +27,12 @@
 
 static std::string RecordTypeToString(int record_type) {
   static std::unordered_map<int, std::string> record_type_names = {
-      {PERF_RECORD_MMAP, "mmap"},
-      {PERF_RECORD_LOST, "lost"},
-      {PERF_RECORD_COMM, "comm"},
-      {PERF_RECORD_EXIT, "exit"},
-      {PERF_RECORD_THROTTLE, "throttle"},
-      {PERF_RECORD_UNTHROTTLE, "unthrottle"},
-      {PERF_RECORD_FORK, "fork"},
-      {PERF_RECORD_READ, "read"},
-      {PERF_RECORD_SAMPLE, "sample"},
-      {PERF_RECORD_BUILD_ID, "build_id"},
+      {PERF_RECORD_MMAP, "mmap"},         {PERF_RECORD_LOST, "lost"},
+      {PERF_RECORD_COMM, "comm"},         {PERF_RECORD_EXIT, "exit"},
+      {PERF_RECORD_THROTTLE, "throttle"}, {PERF_RECORD_UNTHROTTLE, "unthrottle"},
+      {PERF_RECORD_FORK, "fork"},         {PERF_RECORD_READ, "read"},
+      {PERF_RECORD_SAMPLE, "sample"},     {PERF_RECORD_BUILD_ID, "build_id"},
+      {PERF_RECORD_MMAP2, "mmap2"},
   };
 
   auto it = record_type_names.find(record_type);
@@ -47,9 +43,10 @@
 }
 
 template <class T>
-void MoveFromBinaryFormat(T& data, const char*& p) {
-  data = *reinterpret_cast<const T*>(p);
-  p += sizeof(T);
+void MoveFromBinaryFormat(T* data_p, size_t n, const char*& p) {
+  size_t size = n * sizeof(T);
+  memcpy(data_p, p, size);
+  p += size;
 }
 
 template <class T>
@@ -182,8 +179,8 @@
 }
 
 void MmapRecord::DumpData(size_t indent) const {
-  PrintIndented(indent, "pid %u, tid %u, addr %p, len 0x%" PRIx64 "\n", data.pid, data.tid,
-                reinterpret_cast<void*>(data.addr), data.len);
+  PrintIndented(indent, "pid %u, tid %u, addr 0x%" PRIx64 ", len 0x%" PRIx64 "\n", data.pid,
+                data.tid, data.addr, data.len);
   PrintIndented(indent, "pgoff 0x%" PRIx64 ", filename %s\n", data.pgoff, filename.c_str());
 }
 
@@ -198,6 +195,27 @@
   return buf;
 }
 
+Mmap2Record::Mmap2Record(const perf_event_attr& attr, const perf_event_header* pheader)
+    : Record(pheader) {
+  const char* record_end = reinterpret_cast<const char*>(pheader) + pheader->size;
+  const char* cursor = reinterpret_cast<const char*>(pheader + 1);  // Data follows the header.
+  MoveFromBinaryFormat(data, cursor);
+  filename = cursor;  // The file name is read as a NUL-terminated string.
+  cursor += ALIGN(filename.size() + 1, 8);
+  CHECK_LE(cursor, record_end);
+  sample_id.ReadFromBinaryFormat(attr, cursor, record_end);
+}
+
+void Mmap2Record::DumpData(size_t indent) const {  // Prints each field of data and the filename.
+  PrintIndented(indent, "pid %u, tid %u, addr 0x%" PRIx64 ", len 0x%" PRIx64 "\n", data.pid,
+                data.tid, data.addr, data.len);
+  PrintIndented(indent,
+                "pgoff 0x%" PRIx64 ", maj %u, min %u, ino %" PRIu64 ", ino_generation %" PRIu64 "\n",
+                data.pgoff, data.maj, data.min, data.ino, data.ino_generation);
+  PrintIndented(indent, "prot %u, flags %u, filename %s\n", data.prot, data.flags,
+                filename.c_str());
+}
+
 CommRecord::CommRecord(const perf_event_attr& attr, const perf_event_header* pheader)
     : Record(pheader) {
   const char* p = reinterpret_cast<const char*>(pheader + 1);
@@ -224,7 +242,7 @@
   return buf;
 }
 
-ExitRecord::ExitRecord(const perf_event_attr& attr, const perf_event_header* pheader)
+ExitOrForkRecord::ExitOrForkRecord(const perf_event_attr& attr, const perf_event_header* pheader)
     : Record(pheader) {
   const char* p = reinterpret_cast<const char*>(pheader + 1);
   const char* end = reinterpret_cast<const char*>(pheader) + pheader->size;
@@ -233,11 +251,20 @@
   sample_id.ReadFromBinaryFormat(attr, p, end);
 }
 
-void ExitRecord::DumpData(size_t indent) const {
+void ExitOrForkRecord::DumpData(size_t indent) const {
   PrintIndented(indent, "pid %u, ppid %u, tid %u, ptid %u\n", data.pid, data.ppid, data.tid,
                 data.ptid);
 }
 
+std::vector<char> ForkRecord::BinaryFormat() const {
+  std::vector<char> bytes(header.size);  // Sized from header.size, then filled front to back.
+  char* cursor = bytes.data();
+  MoveToBinaryFormat(header, cursor);
+  MoveToBinaryFormat(data, cursor);
+  sample_id.WriteToBinaryFormat(cursor);  // The sample id goes last, after header and data.
+  return bytes;
+}
+
 SampleRecord::SampleRecord(const perf_event_attr& attr, const perf_event_header* pheader)
     : Record(pheader) {
   const char* p = reinterpret_cast<const char*>(pheader + 1);
@@ -268,6 +295,18 @@
   if (sample_type & PERF_SAMPLE_PERIOD) {
     MoveFromBinaryFormat(period_data, p);
   }
+  if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+    uint64_t nr;
+    MoveFromBinaryFormat(nr, p);
+    callchain_data.ips.resize(nr);
+    MoveFromBinaryFormat(callchain_data.ips.data(), nr, p);
+  }
+  if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+    uint64_t nr;
+    MoveFromBinaryFormat(nr, p);
+    branch_stack_data.stack.resize(nr);
+    MoveFromBinaryFormat(branch_stack_data.stack.data(), nr, p);
+  }
   // TODO: Add parsing of other PERF_SAMPLE_*.
   CHECK_LE(p, end);
   if (p < end) {
@@ -301,6 +340,19 @@
   if (sample_type & PERF_SAMPLE_PERIOD) {
     PrintIndented(indent, "period %" PRId64 "\n", period_data.period);
   }
+  if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+    PrintIndented(indent, "callchain nr=%" PRIu64 "\n", callchain_data.ips.size());
+    for (auto& ip : callchain_data.ips) {
+      PrintIndented(indent + 1, "0x%" PRIx64 "\n", ip);
+    }
+  }
+  if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+    PrintIndented(indent, "branch_stack nr=%" PRIu64 "\n", branch_stack_data.stack.size());
+    for (auto& item : branch_stack_data.stack) {
+      PrintIndented(indent + 1, "from 0x%" PRIx64 ", to 0x%" PRIx64 ", flags 0x%" PRIx64 "\n",
+                    item.from, item.to, item.flags);
+    }
+  }
 }
 
 BuildIdRecord::BuildIdRecord(const perf_event_header* pheader) : Record(pheader) {
@@ -341,10 +393,14 @@
   switch (pheader->type) {
     case PERF_RECORD_MMAP:
       return std::unique_ptr<const Record>(new MmapRecord(attr, pheader));
+    case PERF_RECORD_MMAP2:
+      return std::unique_ptr<const Record>(new Mmap2Record(attr, pheader));
     case PERF_RECORD_COMM:
       return std::unique_ptr<const Record>(new CommRecord(attr, pheader));
     case PERF_RECORD_EXIT:
       return std::unique_ptr<const Record>(new ExitRecord(attr, pheader));
+    case PERF_RECORD_FORK:
+      return std::unique_ptr<const Record>(new ForkRecord(attr, pheader));
     case PERF_RECORD_SAMPLE:
       return std::unique_ptr<const Record>(new SampleRecord(attr, pheader));
     default:
@@ -384,6 +440,21 @@
   return record;
 }
 
+ForkRecord CreateForkRecord(const perf_event_attr& attr, uint32_t pid, uint32_t tid, uint32_t ppid,
+                            uint32_t ptid) {
+  ForkRecord result;
+  result.data.pid = pid;
+  result.data.tid = tid;
+  result.data.ppid = ppid;
+  result.data.ptid = ptid;
+  result.data.time = 0;  // The time field of a created fork record is always written as 0.
+  result.header.type = PERF_RECORD_FORK;
+  result.header.misc = 0;
+  const size_t id_size = result.sample_id.CreateContent(attr);
+  result.header.size = sizeof(result.header) + sizeof(result.data) + id_size;  // Whole record.
+  return result;
+}
+
 BuildIdRecord CreateBuildIdRecord(bool in_kernel, pid_t pid, const BuildId& build_id,
                                   const std::string& filename) {
   BuildIdRecord record;
diff --git a/simpleperf/record.h b/simpleperf/record.h
index 83f60db..6f41234 100644
--- a/simpleperf/record.h
+++ b/simpleperf/record.h
@@ -68,6 +68,19 @@
   uint64_t period;
 };
 
+struct PerfSampleCallChainType {
+  std::vector<uint64_t> ips;
+};
+
+struct PerfSampleBranchStackType {
+  struct BranchStackItemType {
+    uint64_t from;
+    uint64_t to;
+    uint64_t flags;
+  };
+  std::vector<BranchStackItemType> stack;
+};
+
 // SampleId is optional at the end of a record in binary format. Its content is determined by
 // sample_id_all and sample_type in perf_event_attr. To avoid the complexity of referring to
 // perf_event_attr each time, we copy sample_id_all and sample_type inside the SampleId structure.
@@ -136,6 +149,29 @@
   void DumpData(size_t indent) const override;
 };
 
+struct Mmap2Record : public Record {
+  struct Mmap2RecordDataType {
+    uint32_t pid, tid;
+    uint64_t addr;
+    uint64_t len;
+    uint64_t pgoff;
+    uint32_t maj;
+    uint32_t min;
+    uint64_t ino;
+    uint64_t ino_generation;
+    uint32_t prot, flags;
+  } data;
+  std::string filename;
+
+  Mmap2Record() {
+  }
+
+  Mmap2Record(const perf_event_attr& attr, const perf_event_header* pheader);
+
+ protected:
+  void DumpData(size_t indent) const override;
+};
+
 struct CommRecord : public Record {
   struct CommRecordDataType {
     uint32_t pid, tid;
@@ -152,19 +188,36 @@
   void DumpData(size_t indent) const override;
 };
 
-struct ExitRecord : public Record {
-  struct ExitRecordDataType {
+struct ExitOrForkRecord : public Record {
+  struct ExitOrForkRecordDataType {
     uint32_t pid, ppid;
     uint32_t tid, ptid;
     uint64_t time;
   } data;
 
-  ExitRecord(const perf_event_attr& attr, const perf_event_header* pheader);
+  ExitOrForkRecord() {
+  }
+  ExitOrForkRecord(const perf_event_attr& attr, const perf_event_header* pheader);
 
  protected:
   void DumpData(size_t indent) const override;
 };
 
+struct ExitRecord : public ExitOrForkRecord {
+  ExitRecord(const perf_event_attr& attr, const perf_event_header* pheader)
+      : ExitOrForkRecord(attr, pheader) {
+  }
+};
+
+struct ForkRecord : public ExitOrForkRecord {
+  ForkRecord() {
+  }
+  ForkRecord(const perf_event_attr& attr, const perf_event_header* pheader)
+      : ExitOrForkRecord(attr, pheader) {
+  }
+  std::vector<char> BinaryFormat() const;
+};
+
 struct SampleRecord : public Record {
   uint64_t sample_type;  // sample_type is a bit mask determining which fields below are valid.
 
@@ -177,6 +230,9 @@
   PerfSampleCpuType cpu_data;             // Valid if PERF_SAMPLE_CPU.
   PerfSamplePeriodType period_data;       // Valid if PERF_SAMPLE_PERIOD.
 
+  PerfSampleCallChainType callchain_data;       // Valid if PERF_SAMPLE_CALLCHAIN.
+  PerfSampleBranchStackType branch_stack_data;  // Valid if PERF_SAMPLE_BRANCH_STACK.
+
   SampleRecord(const perf_event_attr& attr, const perf_event_header* pheader);
 
  protected:
@@ -206,6 +262,8 @@
                             const std::string& filename);
 CommRecord CreateCommRecord(const perf_event_attr& attr, uint32_t pid, uint32_t tid,
                             const std::string& comm);
+ForkRecord CreateForkRecord(const perf_event_attr& attr, uint32_t pid, uint32_t tid, uint32_t ppid,
+                            uint32_t ptid);
 BuildIdRecord CreateBuildIdRecord(bool in_kernel, pid_t pid, const BuildId& build_id,
                                   const std::string& filename);
 #endif  // SIMPLE_PERF_RECORD_H_
diff --git a/simpleperf/record_file.h b/simpleperf/record_file.h
index 694486c..d8b4413 100644
--- a/simpleperf/record_file.h
+++ b/simpleperf/record_file.h
@@ -18,6 +18,7 @@
 #define SIMPLE_PERF_RECORD_FILE_H_
 
 #include <stdio.h>
+#include <map>
 #include <memory>
 #include <string>
 #include <vector>
@@ -52,6 +53,8 @@
 
   bool WriteFeatureHeader(size_t feature_count);
   bool WriteBuildIdFeature(const std::vector<BuildIdRecord>& build_id_records);
+  bool WriteCmdlineFeature(const std::vector<std::string>& cmdline);
+  bool WriteBranchStackFeature();
 
   // Normally, Close() should be called after writing. But if something
   // wrong happens and we need to finish in advance, the destructor
@@ -67,6 +70,9 @@
                              std::vector<std::string>* hit_user_files);
   bool WriteFileHeader();
   bool Write(const void* buf, size_t len);
+  bool SeekFileEnd(uint64_t* file_end);
+  bool WriteFeatureBegin(uint64_t* start_offset);
+  bool WriteFeatureEnd(int feature, uint64_t start_offset);
 
   const std::string filename_;
   FILE* record_fp_;
@@ -95,10 +101,11 @@
   std::vector<const PerfFileFormat::FileAttr*> AttrSection();
   std::vector<uint64_t> IdsForAttr(const PerfFileFormat::FileAttr* attr);
   std::vector<std::unique_ptr<const Record>> DataSection();
-  std::vector<PerfFileFormat::SectionDesc> FeatureSectionDescriptors();
+  const std::map<int, PerfFileFormat::SectionDesc>& FeatureSectionDescriptors();
   const char* DataAtOffset(uint64_t offset) {
     return mmap_addr_ + offset;
   }
+  std::vector<std::string> ReadCmdlineFeature();
   bool Close();
 
  private:
@@ -111,6 +118,8 @@
   const char* mmap_addr_;
   size_t mmap_len_;
 
+  std::map<int, PerfFileFormat::SectionDesc> feature_sections_;
+
   DISALLOW_COPY_AND_ASSIGN(RecordFileReader);
 };
 
diff --git a/simpleperf/record_file_format.h b/simpleperf/record_file_format.h
index 9758f11..da6434b 100644
--- a/simpleperf/record_file_format.h
+++ b/simpleperf/record_file_format.h
@@ -63,7 +63,7 @@
   uint64_t size;
 };
 
-static const char* PERF_MAGIC = "PERFILE2";
+constexpr char PERF_MAGIC[] = "PERFILE2";
 
 struct FileHeader {
   char magic[8];
diff --git a/simpleperf/record_file_reader.cpp b/simpleperf/record_file_reader.cpp
new file mode 100644
index 0000000..8407d32
--- /dev/null
+++ b/simpleperf/record_file_reader.cpp
@@ -0,0 +1,198 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "record_file.h"
+
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <set>
+#include <vector>
+
+#include <base/logging.h>
+
+#include "perf_event.h"
+#include "record.h"
+#include "utils.h"
+
+using namespace PerfFileFormat;
+
+// Opens filename read-only and maps it into memory. Returns nullptr if the file can't be opened
+// or mapped.
+std::unique_ptr<RecordFileReader> RecordFileReader::CreateInstance(const std::string& filename) {
+  const int record_fd = open(filename.c_str(), O_RDONLY | O_CLOEXEC);
+  if (record_fd == -1) {
+    PLOG(ERROR) << "failed to open record file '" << filename << "'";
+    return nullptr;
+  }
+  std::unique_ptr<RecordFileReader> instance(new RecordFileReader(filename, record_fd));
+  // When mapping fails, instance is destroyed on return and its destructor closes the file.
+  return instance->MmapFile() ? std::move(instance) : nullptr;
+}
+
+RecordFileReader::RecordFileReader(const std::string& filename, int fd)
+    : filename_(filename), record_fd_(fd), mmap_addr_(nullptr), mmap_len_(0) {
+}
+
+RecordFileReader::~RecordFileReader() {
+  if (record_fd_ != -1) {
+    Close();
+  }
+}
+
+bool RecordFileReader::Close() {  // Unmaps and closes the file; false if either step fails.
+  bool result = (mmap_addr_ == nullptr || munmap(const_cast<char*>(mmap_addr_), mmap_len_) == 0);
+  if (!result) {
+    PLOG(ERROR) << "failed to munmap() record file '" << filename_ << "'";
+  }
+  if (close(record_fd_) == -1) {
+    PLOG(ERROR) << "failed to close record file '" << filename_ << "'";
+    result = false;
+  }
+  mmap_addr_ = nullptr;  // Nothing is mapped now, so a repeated Close() skips munmap().
+  record_fd_ = -1;
+  return result;
+}
+
+// Maps the whole record file read-only and remembers the mapping; returns false on failure.
+bool RecordFileReader::MmapFile() {
+  off_t file_size = lseek(record_fd_, 0, SEEK_END);
+  if (file_size == -1) {
+    PLOG(ERROR) << "failed to get size of record file '" << filename_ << "'";
+    return false;
+  }
+  void* mmap_addr = mmap(nullptr, file_size, PROT_READ, MAP_SHARED, record_fd_, 0);
+  if (mmap_addr == MAP_FAILED) {
+    PLOG(ERROR) << "failed to mmap() record file '" << filename_ << "'";
+    return false;
+  }
+  mmap_addr_ = reinterpret_cast<const char*>(mmap_addr);
+  mmap_len_ = file_size;
+  return true;
+}
+
+const FileHeader* RecordFileReader::FileHeader() {
+  return reinterpret_cast<const struct FileHeader*>(mmap_addr_);
+}
+
+// Returns one pointer into the mapped file for each FileAttr entry of the attr section.
+std::vector<const FileAttr*> RecordFileReader::AttrSection() {
+  const struct FileHeader* file_header = FileHeader();
+  const FileAttr* attrs = reinterpret_cast<const FileAttr*>(mmap_addr_ + file_header->attrs.offset);
+  std::vector<const FileAttr*> result;
+  for (size_t i = 0, n = file_header->attrs.size / file_header->attr_size; i < n; ++i) {
+    result.push_back(&attrs[i]);
+  }
+  return result;
+}
+
+// Returns a copy of the uint64_t ids stored in the part of the mapped file that attr->ids
+// describes (offset and size in bytes).
+std::vector<uint64_t> RecordFileReader::IdsForAttr(const FileAttr* attr) {
+  const uint64_t* ids = reinterpret_cast<const uint64_t*>(mmap_addr_ + attr->ids.offset);
+  const size_t id_count = attr->ids.size / sizeof(uint64_t);
+  std::vector<uint64_t> result;
+  result.assign(ids, ids + id_count);
+  return result;
+}
+
+// Ordering used to sort records: the time of a sample record is taken from its time_data, the
+// time of any other record from its sample_id.
+static bool IsRecordHappensBefore(const std::unique_ptr<const Record>& r1,
+                                  const std::unique_ptr<const Record>& r2) {
+  const bool r1_is_sample = (r1->header.type == PERF_RECORD_SAMPLE);
+  const bool r2_is_sample = (r2->header.type == PERF_RECORD_SAMPLE);
+  const uint64_t r1_time = r1_is_sample
+                               ? static_cast<const SampleRecord*>(r1.get())->time_data.time
+                               : r1->sample_id.time_data.time;
+  const uint64_t r2_time = r2_is_sample
+                               ? static_cast<const SampleRecord*>(r2.get())->time_data.time
+                               : r2->sample_id.time_data.time;
+  // The record with the smaller time happens first.
+  if (r1_time != r2_time) {
+    return r1_time < r2_time;
+  }
+  // At the same time, a non-sample record goes before a sample record, because it may contain
+  // information needed to parse sample records. Otherwise the order doesn't matter.
+  return !r1_is_sample && r2_is_sample;
+}
+
+// Returns the data section's records, time-sorted if PERF_SAMPLE_TIME and sample_id_all are set.
+std::vector<std::unique_ptr<const Record>> RecordFileReader::DataSection() {
+  std::vector<std::unique_ptr<const Record>> result;
+  auto file_attrs = AttrSection();
+  CHECK(file_attrs.size() > 0);
+  perf_event_attr attr = file_attrs[0]->attr;
+  const char* p = mmap_addr_ + FileHeader()->data.offset;
+  const char* end = p + FileHeader()->data.size;
+  while (p < end) {
+    const perf_event_header* record_header = reinterpret_cast<const perf_event_header*>(p);
+    if (record_header->size == 0 || p + record_header->size > end) {
+      break;  // Corrupt or truncated record; a zero size would otherwise never advance p.
+    }
+    result.push_back(ReadRecordFromBuffer(attr, record_header));
+    p += record_header->size;
+  }
+  if ((attr.sample_type & PERF_SAMPLE_TIME) && attr.sample_id_all) {
+    std::sort(result.begin(), result.end(), IsRecordHappensBefore);
+  }
+  return result;
+}
+
+// Returns the section descriptor of each feature whose bit is set in the file header, keyed by
+// feature number. The map is filled while it is still empty; the descriptors are read, one per
+// set bit in ascending bit order, from right after the data section.
+const std::map<int, SectionDesc>& RecordFileReader::FeatureSectionDescriptors() {
+  if (!feature_sections_.empty()) {
+    return feature_sections_;
+  }
+  const struct FileHeader* header = FileHeader();
+  const uint64_t descriptors_offset = header->data.offset + header->data.size;
+  const SectionDesc* desc = reinterpret_cast<const SectionDesc*>(mmap_addr_ + descriptors_offset);
+  for (size_t byte = 0; byte < sizeof(header->features); ++byte) {
+    for (size_t bit = 0; bit < 8; ++bit) {
+      if (header->features[byte] & (1 << bit)) {
+        feature_sections_.insert(std::make_pair(static_cast<int>(byte * 8 + bit), *desc));
+        ++desc;
+      }
+    }
+  }
+  return feature_sections_;
+}
+
+// Returns the arguments stored in the FEAT_CMDLINE section, or an empty vector if it is absent.
+std::vector<std::string> RecordFileReader::ReadCmdlineFeature() {
+  const std::map<int, SectionDesc>& section_map = FeatureSectionDescriptors();
+  auto it = section_map.find(FEAT_CMDLINE);
+  if (it == section_map.end()) {
+    return std::vector<std::string>();
+  }
+  const char* p = DataAtOffset(it->second.offset);
+  const char* end = DataAtOffset(it->second.offset + it->second.size);
+  std::vector<std::string> cmdline;
+  uint32_t arg_count;
+  MoveFromBinaryFormat(arg_count, p);
+  CHECK_LE(p, end);
+  for (size_t i = 0; i < arg_count; ++i) {
+    uint32_t len;
+    MoveFromBinaryFormat(len, p);
+    CHECK_LE(p + len, end);
+    cmdline.emplace_back(p, strnlen(p, len));  // Never read past the len bytes of this argument.
+    p += len;
+  }
+  return cmdline;
+}
diff --git a/simpleperf/record_file_test.cpp b/simpleperf/record_file_test.cpp
index fffaa2a..6e6bc13 100644
--- a/simpleperf/record_file_test.cpp
+++ b/simpleperf/record_file_test.cpp
@@ -32,10 +32,12 @@
  protected:
   virtual void SetUp() {
     filename = "temporary.record_file";
-    const EventType* event_type = EventTypeFactory::FindEventTypeByName("cpu-cycles");
-    ASSERT_TRUE(event_type != nullptr);
-    event_attr = CreateDefaultPerfEventAttr(*event_type);
-    std::unique_ptr<EventFd> event_fd = EventFd::OpenEventFileForProcess(event_attr, getpid());
+    std::unique_ptr<EventTypeAndModifier> event_type_modifier = ParseEventType("cpu-cycles");
+    ASSERT_TRUE(event_type_modifier != nullptr);
+    event_attr = CreateDefaultPerfEventAttr(event_type_modifier->event_type);
+    event_attr.sample_id_all = 1;
+    event_attr.sample_type |= PERF_SAMPLE_TIME;
+    std::unique_ptr<EventFd> event_fd = EventFd::OpenEventFile(event_attr, getpid(), -1);
     ASSERT_TRUE(event_fd != nullptr);
     event_fds.push_back(std::move(event_fd));
   }
@@ -80,18 +82,48 @@
   // Read and check data section.
   std::vector<std::unique_ptr<const Record>> records = reader->DataSection();
   ASSERT_EQ(1u, records.size());
-  ASSERT_EQ(mmap_record.header.type, records[0]->header.type);
   CheckRecordEqual(mmap_record, *records[0]);
 
   // Read and check feature section.
   ASSERT_TRUE(file_header->features[FEAT_BUILD_ID / 8] & (1 << (FEAT_BUILD_ID % 8)));
-  std::vector<SectionDesc> sections = reader->FeatureSectionDescriptors();
+  std::map<int, SectionDesc> sections = reader->FeatureSectionDescriptors();
   ASSERT_EQ(1u, sections.size());
-  const perf_event_header* header =
-      reinterpret_cast<const perf_event_header*>(reader->DataAtOffset(sections[0].offset));
+  ASSERT_TRUE(sections.find(FEAT_BUILD_ID) != sections.end());
+  const perf_event_header* header = reinterpret_cast<const perf_event_header*>(
+      reader->DataAtOffset(sections[FEAT_BUILD_ID].offset));
   ASSERT_TRUE(header != nullptr);
-  ASSERT_EQ(sections[0].size, header->size);
+  ASSERT_EQ(sections[FEAT_BUILD_ID].size, header->size);
   CheckRecordEqual(build_id_record, BuildIdRecord(header));
 
   ASSERT_TRUE(reader->Close());
 }
+
+TEST_F(RecordFileTest, records_sorted_by_time) {
+  // Write to a record file.
+  std::unique_ptr<RecordFileWriter> writer =
+      RecordFileWriter::CreateInstance(filename, event_attr, event_fds);
+  ASSERT_TRUE(writer != nullptr);
+
+  // Write three mmap records whose timestamps are out of order (2, 1, 3).
+  MmapRecord r1 = CreateMmapRecord(event_attr, true, 1, 1, 0x100, 0x2000, 0x3000, "mmap_record1");
+  MmapRecord r2 = r1;
+  MmapRecord r3 = r1;
+  r1.sample_id.time_data.time = 2;
+  r2.sample_id.time_data.time = 1;
+  r3.sample_id.time_data.time = 3;
+  ASSERT_TRUE(writer->WriteData(r1.BinaryFormat()));
+  ASSERT_TRUE(writer->WriteData(r2.BinaryFormat()));
+  ASSERT_TRUE(writer->WriteData(r3.BinaryFormat()));
+  ASSERT_TRUE(writer->Close());
+
+  // Read the file back and expect the records in ascending time order: r2, r1, r3.
+  std::unique_ptr<RecordFileReader> reader = RecordFileReader::CreateInstance(filename);
+  ASSERT_TRUE(reader != nullptr);
+  std::vector<std::unique_ptr<const Record>> records = reader->DataSection();
+  ASSERT_EQ(3u, records.size());
+  CheckRecordEqual(r2, *records[0]);
+  CheckRecordEqual(r1, *records[1]);
+  CheckRecordEqual(r3, *records[2]);
+
+  ASSERT_TRUE(reader->Close());
+}
diff --git a/simpleperf/record_file.cpp b/simpleperf/record_file_writer.cpp
similarity index 68%
rename from simpleperf/record_file.cpp
rename to simpleperf/record_file_writer.cpp
index 54a4dda..deb0ada 100644
--- a/simpleperf/record_file.cpp
+++ b/simpleperf/record_file_writer.cpp
@@ -35,6 +35,10 @@
 std::unique_ptr<RecordFileWriter> RecordFileWriter::CreateInstance(
     const std::string& filename, const perf_event_attr& event_attr,
     const std::vector<std::unique_ptr<EventFd>>& event_fds) {
+  // Remove old perf.data to avoid file ownership problems.
+  if (!RemovePossibleFile(filename)) {
+    return nullptr;
+  }
   FILE* fp = fopen(filename.c_str(), "web+");
   if (fp == nullptr) {
     PLOG(ERROR) << "failed to open record file '" << filename << "'";
@@ -191,16 +195,11 @@
     PLOG(ERROR) << "fflush() failed";
     return false;
   }
-  if (fseek(record_fp_, 0, SEEK_END) == -1) {
-    PLOG(ERROR) << "fseek() failed";
+  uint64_t file_size;
+  if (!SeekFileEnd(&file_size)) {
     return false;
   }
-  long file_size = ftell(record_fp_);
-  if (file_size == -1) {
-    PLOG(ERROR) << "ftell() failed";
-    return false;
-  }
-  size_t mmap_len = file_size;
+  size_t mmap_len = static_cast<size_t>(file_size);
   void* mmap_addr = mmap(nullptr, mmap_len, PROT_READ, MAP_SHARED, fileno(record_fp_), 0);
   if (mmap_addr == MAP_FAILED) {
     PLOG(ERROR) << "mmap() failed";
@@ -217,6 +216,20 @@
   return true;
 }
 
+bool RecordFileWriter::SeekFileEnd(uint64_t* file_end) {
+  if (fseek(record_fp_, 0, SEEK_END) == -1) {
+    PLOG(ERROR) << "fseek() failed";
+    return false;
+  }
+  long offset = ftell(record_fp_);
+  if (offset == -1) {
+    PLOG(ERROR) << "ftell() failed";
+    return false;
+  }
+  *file_end = static_cast<uint64_t>(offset);
+  return true;
+}
+
 bool RecordFileWriter::WriteFeatureHeader(size_t feature_count) {
   feature_count_ = feature_count;
   current_feature_index_ = 0;
@@ -232,17 +245,8 @@
 }
 
 bool RecordFileWriter::WriteBuildIdFeature(const std::vector<BuildIdRecord>& build_id_records) {
-  if (current_feature_index_ >= feature_count_) {
-    return false;
-  }
-  // Always write features at the end of the file.
-  if (fseek(record_fp_, 0, SEEK_END) == -1) {
-    PLOG(ERROR) << "fseek() failed";
-    return false;
-  }
-  long section_start = ftell(record_fp_);
-  if (section_start == -1) {
-    PLOG(ERROR) << "ftell() failed";
+  uint64_t start_offset;
+  if (!WriteFeatureBegin(&start_offset)) {
     return false;
   }
   for (auto& record : build_id_records) {
@@ -251,27 +255,67 @@
       return false;
     }
   }
-  long section_end = ftell(record_fp_);
-  if (section_end == -1) {
+  return WriteFeatureEnd(FEAT_BUILD_ID, start_offset);
+}
+
+bool RecordFileWriter::WriteCmdlineFeature(const std::vector<std::string>& cmdline) {
+  uint64_t start_offset;
+  if (!WriteFeatureBegin(&start_offset)) {
     return false;
   }
+  uint32_t arg_count = cmdline.size();
+  if (!Write(&arg_count, sizeof(arg_count))) {
+    return false;
+  }
+  for (auto& arg : cmdline) {
+    uint32_t len = static_cast<uint32_t>(ALIGN(arg.size() + 1, 64));
+    if (!Write(&len, sizeof(len))) {
+      return false;
+    }
+    std::vector<char> array(len, '\0');
+    std::copy(arg.begin(), arg.end(), array.begin());
+    if (!Write(array.data(), array.size())) {
+      return false;
+    }
+  }
+  return WriteFeatureEnd(FEAT_CMDLINE, start_offset);
+}
 
-  // Write feature section descriptor for build_id feature.
+bool RecordFileWriter::WriteBranchStackFeature() {
+  uint64_t start_offset;
+  if (!WriteFeatureBegin(&start_offset)) {
+    return false;
+  }
+  return WriteFeatureEnd(FEAT_BRANCH_STACK, start_offset);
+}
+
+bool RecordFileWriter::WriteFeatureBegin(uint64_t* start_offset) {
+  CHECK_LT(current_feature_index_, feature_count_);
+  if (!SeekFileEnd(start_offset)) {
+    return false;
+  }
+  return true;
+}
+
+bool RecordFileWriter::WriteFeatureEnd(int feature, uint64_t start_offset) {
+  uint64_t end_offset;
+  if (!SeekFileEnd(&end_offset)) {
+    return false;
+  }
   SectionDesc desc;
-  desc.offset = section_start;
-  desc.size = section_end - section_start;
+  desc.offset = start_offset;
+  desc.size = end_offset - start_offset;
   uint64_t feature_offset = data_section_offset_ + data_section_size_;
   if (fseek(record_fp_, feature_offset + current_feature_index_ * sizeof(SectionDesc), SEEK_SET) ==
       -1) {
     PLOG(ERROR) << "fseek() failed";
     return false;
   }
-  if (fwrite(&desc, sizeof(SectionDesc), 1, record_fp_) != 1) {
-    PLOG(ERROR) << "fwrite() failed";
+  if (!Write(&desc, sizeof(SectionDesc))) {
     return false;
   }
   ++current_feature_index_;
-  features_.push_back(FEAT_BUILD_ID);
+  features_.push_back(feature);
   return true;
 }
 
@@ -317,120 +361,3 @@
   record_fp_ = nullptr;
   return result;
 }
-
-std::unique_ptr<RecordFileReader> RecordFileReader::CreateInstance(const std::string& filename) {
-  int fd = open(filename.c_str(), O_RDONLY | O_CLOEXEC);
-  if (fd == -1) {
-    PLOG(ERROR) << "failed to open record file '" << filename << "'";
-    return nullptr;
-  }
-  auto reader = std::unique_ptr<RecordFileReader>(new RecordFileReader(filename, fd));
-  if (!reader->MmapFile()) {
-    return nullptr;
-  }
-  return reader;
-}
-
-RecordFileReader::RecordFileReader(const std::string& filename, int fd)
-    : filename_(filename), record_fd_(fd), mmap_addr_(nullptr), mmap_len_(0) {
-}
-
-RecordFileReader::~RecordFileReader() {
-  if (record_fd_ != -1) {
-    Close();
-  }
-}
-
-bool RecordFileReader::Close() {
-  bool result = true;
-  if (munmap(const_cast<char*>(mmap_addr_), mmap_len_) == -1) {
-    PLOG(ERROR) << "failed to munmap() record file '" << filename_ << "'";
-    result = false;
-  }
-  if (close(record_fd_) == -1) {
-    PLOG(ERROR) << "failed to close record file '" << filename_ << "'";
-    result = false;
-  }
-  record_fd_ = -1;
-  return result;
-}
-
-bool RecordFileReader::MmapFile() {
-  off64_t file_size = lseek64(record_fd_, 0, SEEK_END);
-  if (file_size == -1) {
-    return false;
-  }
-  size_t mmap_len = file_size;
-  void* mmap_addr = mmap(nullptr, mmap_len, PROT_READ, MAP_SHARED, record_fd_, 0);
-  if (mmap_addr == MAP_FAILED) {
-    PLOG(ERROR) << "failed to mmap() record file '" << filename_ << "'";
-    return false;
-  }
-
-  mmap_addr_ = reinterpret_cast<const char*>(mmap_addr);
-  mmap_len_ = mmap_len;
-  return true;
-}
-
-const FileHeader* RecordFileReader::FileHeader() {
-  return reinterpret_cast<const struct FileHeader*>(mmap_addr_);
-}
-
-std::vector<const FileAttr*> RecordFileReader::AttrSection() {
-  std::vector<const FileAttr*> result;
-  const struct FileHeader* header = FileHeader();
-  size_t attr_count = header->attrs.size / header->attr_size;
-  const FileAttr* attr = reinterpret_cast<const FileAttr*>(mmap_addr_ + header->attrs.offset);
-  for (size_t i = 0; i < attr_count; ++i) {
-    result.push_back(attr++);
-  }
-  return result;
-}
-
-std::vector<uint64_t> RecordFileReader::IdsForAttr(const FileAttr* attr) {
-  std::vector<uint64_t> result;
-  size_t id_count = attr->ids.size / sizeof(uint64_t);
-  const uint64_t* id = reinterpret_cast<const uint64_t*>(mmap_addr_ + attr->ids.offset);
-  for (size_t i = 0; i < id_count; ++i) {
-    result.push_back(*id++);
-  }
-  return result;
-}
-
-std::vector<std::unique_ptr<const Record>> RecordFileReader::DataSection() {
-  std::vector<std::unique_ptr<const Record>> result;
-  const struct FileHeader* header = FileHeader();
-  auto file_attrs = AttrSection();
-  CHECK(file_attrs.size() > 0);
-  perf_event_attr attr = file_attrs[0]->attr;
-
-  const char* end = mmap_addr_ + header->data.offset + header->data.size;
-  const char* p = mmap_addr_ + header->data.offset;
-  while (p < end) {
-    const perf_event_header* header = reinterpret_cast<const perf_event_header*>(p);
-    if (p + header->size <= end) {
-      result.push_back(std::move(ReadRecordFromBuffer(attr, header)));
-    }
-    p += header->size;
-  }
-  return result;
-}
-
-std::vector<SectionDesc> RecordFileReader::FeatureSectionDescriptors() {
-  std::vector<SectionDesc> result;
-  const struct FileHeader* header = FileHeader();
-  size_t feature_count = 0;
-  for (size_t i = 0; i < sizeof(header->features); ++i) {
-    for (size_t j = 0; j < 8; ++j) {
-      if (header->features[i] & (1 << j)) {
-        ++feature_count;
-      }
-    }
-  }
-  uint64_t feature_section_offset = header->data.offset + header->data.size;
-  const SectionDesc* p = reinterpret_cast<const SectionDesc*>(mmap_addr_ + feature_section_offset);
-  for (size_t i = 0; i < feature_count; ++i) {
-    result.push_back(*p++);
-  }
-  return result;
-}
diff --git a/simpleperf/record_test.cpp b/simpleperf/record_test.cpp
index d9e9a4b..96262a8 100644
--- a/simpleperf/record_test.cpp
+++ b/simpleperf/record_test.cpp
@@ -24,9 +24,9 @@
 class RecordTest : public ::testing::Test {
  protected:
   virtual void SetUp() {
-    const EventType* event_type = EventTypeFactory::FindEventTypeByName("cpu-cycles");
-    ASSERT_TRUE(event_type != nullptr);
-    event_attr = CreateDefaultPerfEventAttr(*event_type);
+    const EventType* type = FindEventTypeByName("cpu-cycles");
+    ASSERT_TRUE(type != nullptr);
+    event_attr = CreateDefaultPerfEventAttr(*type);
   }
 
   template <class RecordType>
diff --git a/simpleperf/sample_tree.cpp b/simpleperf/sample_tree.cpp
new file mode 100644
index 0000000..3f0e5b3
--- /dev/null
+++ b/simpleperf/sample_tree.cpp
@@ -0,0 +1,272 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sample_tree.h"
+
+#include <base/logging.h>
+
+#include "environment.h"
+
+bool MapComparator::operator()(const MapEntry* map1, const MapEntry* map2) const {
+  if (map1->start_addr != map2->start_addr) {
+    return map1->start_addr < map2->start_addr;
+  }
+  if (map1->len != map2->len) {
+    return map1->len < map2->len;
+  }
+  if (map1->time != map2->time) {
+    return map1->time < map2->time;
+  }
+  return false;
+}
+
+void SampleTree::AddThread(int pid, int tid, const std::string& comm) {
+  auto it = thread_tree_.find(tid);
+  if (it == thread_tree_.end()) {
+    ThreadEntry* thread = new ThreadEntry{
+        pid, tid,
+        "unknown",                             // comm
+        std::set<MapEntry*, MapComparator>(),  // maps
+    };
+    auto pair = thread_tree_.insert(std::make_pair(tid, std::unique_ptr<ThreadEntry>(thread)));
+    CHECK(pair.second);
+    it = pair.first;
+  }
+  thread_comm_storage_.push_back(std::unique_ptr<std::string>(new std::string(comm)));
+  it->second->comm = thread_comm_storage_.back()->c_str();
+}
+
+void SampleTree::ForkThread(int pid, int tid, int ppid, int ptid) {
+  ThreadEntry* parent = FindThreadOrNew(ppid, ptid);
+  ThreadEntry* child = FindThreadOrNew(pid, tid);
+  child->comm = parent->comm;
+  child->maps = parent->maps;
+}
+
+static void RemoveOverlappedMap(std::set<MapEntry*, MapComparator>* map_set, const MapEntry* map) {
+  for (auto it = map_set->begin(); it != map_set->end();) {  // Tests below avoid start + len wrap.
+    if ((*it)->start_addr >= map->start_addr && (*it)->start_addr - map->start_addr >= map->len) {
+      break;  // Entries are ordered by start_addr, so nothing later can overlap `map`.
+    }
+    if ((*it)->start_addr <= map->start_addr && map->start_addr - (*it)->start_addr >= (*it)->len) {
+      ++it;  // Ends at or before map->start_addr: no overlap, keep it.
+    } else {
+      it = map_set->erase(it);  // Overlaps the address range of `map`: drop it.
+    }
+  }
+}
+
+void SampleTree::AddKernelMap(uint64_t start_addr, uint64_t len, uint64_t pgoff, uint64_t time,
+                              const std::string& filename) {
+  // kernel map len can be 0 when record command is not run in supervisor mode.
+  if (len == 0) {
+    return;
+  }
+  DsoEntry* dso = FindKernelDsoOrNew(filename);
+  MapEntry* map = new MapEntry{
+      start_addr, len, pgoff, time, dso,
+  };
+  map_storage_.push_back(std::unique_ptr<MapEntry>(map));
+  RemoveOverlappedMap(&kernel_map_tree_, map);
+  auto pair = kernel_map_tree_.insert(map);
+  CHECK(pair.second);
+}
+
+DsoEntry* SampleTree::FindKernelDsoOrNew(const std::string& filename) {
+  if (filename == DEFAULT_KERNEL_MMAP_NAME) {
+    if (kernel_dso_ == nullptr) {
+      kernel_dso_ = DsoFactory::LoadKernel();
+    }
+    return kernel_dso_.get();
+  }
+  auto it = module_dso_tree_.find(filename);
+  if (it == module_dso_tree_.end()) {
+    module_dso_tree_[filename] = DsoFactory::LoadKernelModule(filename);
+    it = module_dso_tree_.find(filename);
+  }
+  return it->second.get();
+}
+
+void SampleTree::AddThreadMap(int pid, int tid, uint64_t start_addr, uint64_t len, uint64_t pgoff,
+                              uint64_t time, const std::string& filename) {
+  ThreadEntry* thread = FindThreadOrNew(pid, tid);
+  DsoEntry* dso = FindUserDsoOrNew(filename);
+  MapEntry* map = new MapEntry{
+      start_addr, len, pgoff, time, dso,
+  };
+  map_storage_.push_back(std::unique_ptr<MapEntry>(map));
+  RemoveOverlappedMap(&thread->maps, map);
+  auto pair = thread->maps.insert(map);
+  CHECK(pair.second);
+}
+
+ThreadEntry* SampleTree::FindThreadOrNew(int pid, int tid) {
+  auto it = thread_tree_.find(tid);
+  if (it == thread_tree_.end()) {
+    AddThread(pid, tid, "unknown");
+    it = thread_tree_.find(tid);
+  } else {
+    CHECK_EQ(pid, it->second.get()->pid) << "tid = " << tid;
+  }
+  return it->second.get();
+}
+
+DsoEntry* SampleTree::FindUserDsoOrNew(const std::string& filename) {
+  auto it = user_dso_tree_.find(filename);
+  if (it == user_dso_tree_.end()) {
+    user_dso_tree_[filename] = DsoFactory::LoadDso(filename);
+    it = user_dso_tree_.find(filename);
+  }
+  return it->second.get();
+}
+
+static bool IsIpInMap(uint64_t ip, const MapEntry* map) {
+  return ip >= map->start_addr && ip - map->start_addr < map->len;  // No start + len overflow.
+}
+
+const MapEntry* SampleTree::FindMap(const ThreadEntry* thread, uint64_t ip, bool in_kernel) {
+  // Key with start_addr = ip and maximal len/time: no map starting at or below ip sorts after it.
+  MapEntry find_map = {
+      ip,          // start_addr
+      ULLONG_MAX,  // len
+      0,           // pgoff
+      ULLONG_MAX,  // time
+      nullptr,     // dso
+  };
+  if (!in_kernel) {
+    auto it = thread->maps.upper_bound(&find_map);  // First map that sorts after the key.
+    if (it != thread->maps.begin() && IsIpInMap(ip, *--it)) {  // Step back to the candidate map.
+      return *it;
+    }
+  } else {  // Same lookup, against the kernel maps instead of the thread's maps.
+    auto it = kernel_map_tree_.upper_bound(&find_map);
+    if (it != kernel_map_tree_.begin() && IsIpInMap(ip, *--it)) {
+      return *it;
+    }
+  }
+  return &unknown_map_;  // No recorded map covers ip.
+}
+
+SampleEntry* SampleTree::AddSample(int pid, int tid, uint64_t ip, uint64_t time, uint64_t period,
+                                   bool in_kernel) {
+  const ThreadEntry* thread = FindThreadOrNew(pid, tid);
+  const MapEntry* map = FindMap(thread, ip, in_kernel);
+  const SymbolEntry* symbol = FindSymbol(map, ip);
+
+  SampleEntry value(ip, time, period, 0, 1, thread, map, symbol);
+
+  return InsertSample(value);
+}
+
+void SampleTree::AddBranchSample(int pid, int tid, uint64_t from_ip, uint64_t to_ip,
+                                 uint64_t branch_flags, uint64_t time, uint64_t period) {
+  const ThreadEntry* thread = FindThreadOrNew(pid, tid);
+  const MapEntry* from_map = FindMap(thread, from_ip, false);
+  if (from_map == &unknown_map_) {
+    from_map = FindMap(thread, from_ip, true);
+  }
+  const SymbolEntry* from_symbol = FindSymbol(from_map, from_ip);
+  const MapEntry* to_map = FindMap(thread, to_ip, false);
+  if (to_map == &unknown_map_) {
+    to_map = FindMap(thread, to_ip, true);
+  }
+  const SymbolEntry* to_symbol = FindSymbol(to_map, to_ip);
+
+  SampleEntry value(to_ip, time, period, 0, 1, thread, to_map, to_symbol);
+  value.branch_from.ip = from_ip;
+  value.branch_from.map = from_map;
+  value.branch_from.symbol = from_symbol;
+  value.branch_from.flags = branch_flags;
+
+  InsertSample(value);
+}
+
+SampleEntry* SampleTree::AddCallChainSample(int pid, int tid, uint64_t ip, uint64_t time,
+                                            uint64_t period, bool in_kernel,
+                                            const std::vector<SampleEntry*>& callchain) {
+  const ThreadEntry* thread = FindThreadOrNew(pid, tid);
+  const MapEntry* map = FindMap(thread, ip, in_kernel);
+  const SymbolEntry* symbol = FindSymbol(map, ip);
+
+  SampleEntry value(ip, time, 0, period, 0, thread, map, symbol);  // period 0, accumulated = period
+
+  auto it = sample_tree_.find(&value);
+  if (it != sample_tree_.end()) {
+    SampleEntry* sample = *it;
+    // Already in this callchain (recursion): return it without adding the period again.
+    if (std::find(callchain.begin(), callchain.end(), sample) != callchain.end()) {
+      return sample;
+    }
+  }
+  return InsertSample(value);  // Merges into an equal existing entry or allocates a new one.
+}
+
+SampleEntry* SampleTree::InsertSample(SampleEntry& value) {
+  SampleEntry* result;
+  auto it = sample_tree_.find(&value);
+  if (it == sample_tree_.end()) {
+    result = AllocateSample(value);
+    auto pair = sample_tree_.insert(result);
+    CHECK(pair.second);
+  } else {
+    result = *it;
+    result->period += value.period;
+    result->accumulated_period += value.accumulated_period;
+    result->sample_count += value.sample_count;
+  }
+  total_samples_ += value.sample_count;
+  total_period_ += value.period;
+  return result;
+}
+
+SampleEntry* SampleTree::AllocateSample(SampleEntry& value) {
+  SampleEntry* sample = new SampleEntry(std::move(value));
+  sample_storage_.push_back(std::unique_ptr<SampleEntry>(sample));
+  return sample;
+}
+
+const SymbolEntry* SampleTree::FindSymbol(const MapEntry* map, uint64_t ip) {
+  uint64_t offset_in_file;
+  if (map->dso == kernel_dso_.get()) {
+    offset_in_file = ip;
+  } else {
+    offset_in_file = ip - map->start_addr + map->pgoff;
+  }
+  const SymbolEntry* symbol = map->dso->FindSymbol(offset_in_file);
+  if (symbol == nullptr) {
+    symbol = &unknown_symbol_;
+  }
+  return symbol;
+}
+
+void SampleTree::InsertCallChainForSample(SampleEntry* sample,
+                                          const std::vector<SampleEntry*>& callchain,
+                                          uint64_t period) {
+  sample->callchain.AddCallChain(callchain, period);
+}
+
+void SampleTree::VisitAllSamples(std::function<void(const SampleEntry&)> callback) {
+  if (sorted_sample_tree_.size() != sample_tree_.size()) {
+    sorted_sample_tree_.clear();
+    for (auto& sample : sample_tree_) {
+      sample->callchain.SortByPeriod();
+      sorted_sample_tree_.insert(sample);
+    }
+  }
+  for (auto& sample : sorted_sample_tree_) {
+    callback(*sample);
+  }
+}
diff --git a/simpleperf/sample_tree.h b/simpleperf/sample_tree.h
new file mode 100644
index 0000000..2e97ceb
--- /dev/null
+++ b/simpleperf/sample_tree.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SIMPLE_PERF_SAMPLE_TREE_H_
+#define SIMPLE_PERF_SAMPLE_TREE_H_
+
+#include <limits.h>
+#include <functional>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "callchain.h"
+#include "dso.h"
+
+struct MapEntry {
+  uint64_t start_addr;
+  uint64_t len;
+  uint64_t pgoff;
+  uint64_t time;  // Map creation time.
+  DsoEntry* dso;
+};
+
+struct MapComparator {
+  bool operator()(const MapEntry* map1, const MapEntry* map2) const;
+};
+
+struct ThreadEntry {
+  int pid;
+  int tid;
+  const char* comm;  // It always refers to the latest comm.
+  std::set<MapEntry*, MapComparator> maps;
+};
+
+struct BranchFromEntry {
+  uint64_t ip;
+  const MapEntry* map;
+  const SymbolEntry* symbol;
+  uint64_t flags;
+
+  BranchFromEntry() : ip(0), map(nullptr), symbol(nullptr), flags(0) {
+  }
+};
+
+struct SampleEntry {
+  uint64_t ip;
+  uint64_t time;
+  uint64_t period;
+  uint64_t accumulated_period;  // Accumulated when appearing in other samples' callchain.
+  uint64_t sample_count;
+  const ThreadEntry* thread;
+  const char* thread_comm;  // It refers to the thread comm when the sample happens.
+  const MapEntry* map;
+  const SymbolEntry* symbol;
+  BranchFromEntry branch_from;
+  CallChainRoot callchain;  // A callchain tree representing all callchains in the sample records.
+
+  SampleEntry(uint64_t ip, uint64_t time, uint64_t period, uint64_t accumulated_period,
+              uint64_t sample_count, const ThreadEntry* thread, const MapEntry* map,
+              const SymbolEntry* symbol)
+      : ip(ip),
+        time(time),
+        period(period),
+        accumulated_period(accumulated_period),
+        sample_count(sample_count),
+        thread(thread),
+        thread_comm(thread->comm),
+        map(map),
+        symbol(symbol) {
+  }
+
+  // The data member 'callchain' can only move, not copy.
+  SampleEntry(SampleEntry&&) = default;
+  SampleEntry(SampleEntry&) = delete;
+};
+
+typedef std::function<int(const SampleEntry&, const SampleEntry&)> compare_sample_func_t;
+
+class SampleTree {
+ public:
+  SampleTree(compare_sample_func_t sample_compare_function)
+      : sample_comparator_(sample_compare_function),
+        sample_tree_(sample_comparator_),
+        sorted_sample_comparator_(sample_compare_function),
+        sorted_sample_tree_(sorted_sample_comparator_),
+        total_samples_(0),
+        total_period_(0) {
+    unknown_map_ = MapEntry{
+        0,              // start_addr
+        ULLONG_MAX,     // len
+        0,              // pgoff
+        0,              // time
+        &unknown_dso_,  // dso
+    };
+    unknown_dso_.path = "unknown";
+    unknown_symbol_ = SymbolEntry{
+        "unknown",   // name
+        0,           // addr
+        ULLONG_MAX,  // len
+    };
+  }
+
+  void AddThread(int pid, int tid, const std::string& comm);
+  void ForkThread(int pid, int tid, int ppid, int ptid);
+  void AddKernelMap(uint64_t start_addr, uint64_t len, uint64_t pgoff, uint64_t time,
+                    const std::string& filename);
+  void AddThreadMap(int pid, int tid, uint64_t start_addr, uint64_t len, uint64_t pgoff,
+                    uint64_t time, const std::string& filename);
+  SampleEntry* AddSample(int pid, int tid, uint64_t ip, uint64_t time, uint64_t period,
+                         bool in_kernel);
+  void AddBranchSample(int pid, int tid, uint64_t from_ip, uint64_t to_ip, uint64_t branch_flags,
+                       uint64_t time, uint64_t period);
+  SampleEntry* AddCallChainSample(int pid, int tid, uint64_t ip, uint64_t time, uint64_t period,
+                                  bool in_kernel, const std::vector<SampleEntry*>& callchain);
+  void InsertCallChainForSample(SampleEntry* sample, const std::vector<SampleEntry*>& callchain,
+                                uint64_t period);
+  void VisitAllSamples(std::function<void(const SampleEntry&)> callback);
+
+  uint64_t TotalSamples() const {
+    return total_samples_;
+  }
+
+  uint64_t TotalPeriod() const {
+    return total_period_;
+  }
+
+ private:
+  ThreadEntry* FindThreadOrNew(int pid, int tid);
+  const MapEntry* FindMap(const ThreadEntry* thread, uint64_t ip, bool in_kernel);
+  DsoEntry* FindKernelDsoOrNew(const std::string& filename);
+  DsoEntry* FindUserDsoOrNew(const std::string& filename);
+  const SymbolEntry* FindSymbol(const MapEntry* map, uint64_t ip);
+  SampleEntry* InsertSample(SampleEntry& value);
+  SampleEntry* AllocateSample(SampleEntry& value);
+
+  struct SampleComparator {
+    bool operator()(SampleEntry* sample1, SampleEntry* sample2) const {
+      return compare_function(*sample1, *sample2) < 0;
+    }
+    SampleComparator(compare_sample_func_t compare_function) : compare_function(compare_function) {
+    }
+
+    compare_sample_func_t compare_function;
+  };
+
+  struct SortedSampleComparator {
+    bool operator()(SampleEntry* sample1, SampleEntry* sample2) const {
+      uint64_t period1 = sample1->period + sample1->accumulated_period;
+      uint64_t period2 = sample2->period + sample2->accumulated_period;
+      if (period1 != period2) {
+        return period1 > period2;
+      }
+      return compare_function(*sample1, *sample2) < 0;
+    }
+    SortedSampleComparator(compare_sample_func_t compare_function)
+        : compare_function(compare_function) {
+    }
+
+    compare_sample_func_t compare_function;
+  };
+
+  std::unordered_map<int, std::unique_ptr<ThreadEntry>> thread_tree_;
+  std::vector<std::unique_ptr<std::string>> thread_comm_storage_;
+
+  std::set<MapEntry*, MapComparator> kernel_map_tree_;
+  std::vector<std::unique_ptr<MapEntry>> map_storage_;
+  MapEntry unknown_map_;
+
+  std::unique_ptr<DsoEntry> kernel_dso_;
+  std::unordered_map<std::string, std::unique_ptr<DsoEntry>> module_dso_tree_;
+  std::unordered_map<std::string, std::unique_ptr<DsoEntry>> user_dso_tree_;
+  DsoEntry unknown_dso_;
+  SymbolEntry unknown_symbol_;
+
+  SampleComparator sample_comparator_;
+  std::set<SampleEntry*, SampleComparator> sample_tree_;
+  SortedSampleComparator sorted_sample_comparator_;
+  std::set<SampleEntry*, SortedSampleComparator> sorted_sample_tree_;
+  std::vector<std::unique_ptr<SampleEntry>> sample_storage_;
+
+  uint64_t total_samples_;
+  uint64_t total_period_;
+};
+
+#endif  // SIMPLE_PERF_SAMPLE_TREE_H_
diff --git a/simpleperf/sample_tree_test.cpp b/simpleperf/sample_tree_test.cpp
new file mode 100644
index 0000000..9b37f47
--- /dev/null
+++ b/simpleperf/sample_tree_test.cpp
@@ -0,0 +1,187 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "sample_tree.h"
+
+struct ExpectedSampleInMap {  // One expected row, checked field by field against a SampleEntry.
+  int pid;  // Compared with sample.thread->pid.
+  int tid;  // Compared with sample.thread->tid.
+  const char* comm;  // Compared with sample.thread_comm as a C string.
+  std::string dso_name;  // Compared with sample.map->dso->path.
+  uint64_t map_start_addr;  // Compared with sample.map->start_addr.
+  size_t sample_count;  // Compared with sample.sample_count.
+};
+
+static void SampleMatchExpectation(const SampleEntry& sample, const ExpectedSampleInMap& expected,
+                                   bool* has_error) {  // Checks sample; result in *has_error.
+  *has_error = true;  // Stays true if any ASSERT_* below fails and returns early.
+  ASSERT_TRUE(sample.thread != nullptr);  // Checked before sample.thread is dereferenced.
+  ASSERT_EQ(expected.pid, sample.thread->pid);
+  ASSERT_EQ(expected.tid, sample.thread->tid);
+  ASSERT_STREQ(expected.comm, sample.thread_comm);
+  ASSERT_TRUE(sample.map != nullptr);  // Checked before sample.map is dereferenced.
+  ASSERT_EQ(expected.dso_name, sample.map->dso->path);
+  ASSERT_EQ(expected.map_start_addr, sample.map->start_addr);
+  ASSERT_EQ(expected.sample_count, sample.sample_count);
+  *has_error = false;  // Reached only when every check above passed.
+}
+
+static void CheckSampleCallback(const SampleEntry& sample,
+                                std::vector<ExpectedSampleInMap>& expected_samples, size_t* pos) {
+  ASSERT_LT(*pos, expected_samples.size());  // Visiting more samples than expected fails.
+  bool has_error;  // Assigned by SampleMatchExpectation before it is read below.
+  SampleMatchExpectation(sample, expected_samples[*pos], &has_error);
+  ASSERT_FALSE(has_error) << "Error matching sample at pos " << *pos;
+  ++*pos;  // Advance to the next expected entry only after a successful match.
+}
+
+// Three-way sample order (<0, 0, >0) by pid, tid, comm, dso path, then map start address.
+static int CompareSampleFunction(const SampleEntry& sample1, const SampleEntry& sample2) {
+  if (sample1.thread->pid != sample2.thread->pid) {
+    return sample1.thread->pid - sample2.thread->pid;
+  }
+  if (sample1.thread->tid != sample2.thread->tid) {
+    return sample1.thread->tid - sample2.thread->tid;
+  }
+  if (int comm_order = strcmp(sample1.thread_comm, sample2.thread_comm)) {
+    return comm_order;
+  }
+  if (sample1.map->dso->path != sample2.map->dso->path) {
+    return sample1.map->dso->path > sample2.map->dso->path ? 1 : -1;
+  }
+  // Compare instead of subtracting: a wide address difference does not fit in the int result.
+  return (sample1.map->start_addr > sample2.map->start_addr) -
+         (sample1.map->start_addr < sample2.map->start_addr);
+}
+
+void VisitSampleTree(SampleTree* sample_tree,
+                     const std::vector<ExpectedSampleInMap>& expected_samples) {
+  size_t pos = 0;  // Index of the next expected entry; advanced by CheckSampleCallback.
+  sample_tree->VisitAllSamples(  // std::bind stores its own copy of expected_samples.
+      std::bind(&CheckSampleCallback, std::placeholders::_1, expected_samples, &pos));
+  ASSERT_EQ(expected_samples.size(), pos);  // Every expected entry must have been matched.
+}
+
+class SampleTreeTest : public testing::Test {  // Fixture with preloaded threads and maps.
+ protected:
+  virtual void SetUp() {
+    sample_tree = std::unique_ptr<SampleTree>(new SampleTree(CompareSampleFunction));
+    sample_tree->AddThread(1, 1, "p1t1");  // Arguments look like (pid, tid, comm) -- confirm.
+    sample_tree->AddThread(1, 11, "p1t11");
+    sample_tree->AddThread(2, 2, "p2t2");
+    sample_tree->AddThreadMap(1, 1, 1, 5, 0, 0, "process1_thread1");  // 3rd arg: start address.
+    sample_tree->AddThreadMap(1, 1, 6, 5, 0, 0, "process1_thread1_map2");  // 4th: length? confirm.
+    sample_tree->AddThreadMap(1, 11, 1, 10, 0, 0, "process1_thread11");
+    sample_tree->AddThreadMap(2, 2, 1, 20, 0, 0, "process2_thread2");
+    sample_tree->AddKernelMap(10, 20, 0, 0, "kernel");  // Kernel map starting at address 10.
+  }
+
+  void VisitSampleTree(const std::vector<ExpectedSampleInMap>& expected_samples) {
+    ::VisitSampleTree(sample_tree.get(), expected_samples);  // Forwards to the free function.
+  }
+
+  std::unique_ptr<SampleTree> sample_tree;  // Recreated by SetUp() for every test.
+};
+
+TEST_F(SampleTreeTest, ip_in_map) {  // ips 1, 2 and 5 all land in the map starting at 1.
+  sample_tree->AddSample(1, 1, 1, 0, 0, false);
+  sample_tree->AddSample(1, 1, 2, 0, 0, false);
+  sample_tree->AddSample(1, 1, 5, 0, 0, false);
+  std::vector<ExpectedSampleInMap> expected_samples = {
+      {1, 1, "p1t1", "process1_thread1", 1, 3},  // One entry with sample_count 3.
+  };
+  VisitSampleTree(expected_samples);
+}
+
+TEST_F(SampleTreeTest, different_pid) {  // Same ip in two processes gives two entries.
+  sample_tree->AddSample(1, 1, 1, 0, 0, false);
+  sample_tree->AddSample(2, 2, 1, 0, 0, false);
+  std::vector<ExpectedSampleInMap> expected_samples = {
+      {1, 1, "p1t1", "process1_thread1", 1, 1}, {2, 2, "p2t2", "process2_thread2", 1, 1},
+  };
+  VisitSampleTree(expected_samples);  // Expected order: pid 1 before pid 2.
+}
+
+TEST_F(SampleTreeTest, different_tid) {  // Same pid and ip, two tids, gives two entries.
+  sample_tree->AddSample(1, 1, 1, 0, 0, false);
+  sample_tree->AddSample(1, 11, 1, 0, 0, false);
+  std::vector<ExpectedSampleInMap> expected_samples = {
+      {1, 1, "p1t1", "process1_thread1", 1, 1}, {1, 11, "p1t11", "process1_thread11", 1, 1},
+  };
+  VisitSampleTree(expected_samples);  // Expected order: tid 1 before tid 11.
+}
+
+TEST_F(SampleTreeTest, different_comm) {  // A comm change splits samples of one thread.
+  sample_tree->AddSample(1, 1, 1, 0, 0, false);
+  sample_tree->AddThread(1, 1, "p1t1_comm2");  // Re-adding the thread gives it a new comm.
+  sample_tree->AddSample(1, 1, 1, 0, 0, false);
+  std::vector<ExpectedSampleInMap> expected_samples = {
+      {1, 1, "p1t1", "process1_thread1", 1, 1}, {1, 1, "p1t1_comm2", "process1_thread1", 1, 1},
+  };
+  VisitSampleTree(expected_samples);  // The earlier sample keeps the old comm.
+}
+
+TEST_F(SampleTreeTest, different_map) {  // ips 1 and 6 fall into two maps of one thread.
+  sample_tree->AddSample(1, 1, 1, 0, 0, false);
+  sample_tree->AddSample(1, 1, 6, 0, 0, false);
+  std::vector<ExpectedSampleInMap> expected_samples = {
+      {1, 1, "p1t1", "process1_thread1", 1, 1}, {1, 1, "p1t1", "process1_thread1_map2", 6, 1},
+  };
+  VisitSampleTree(expected_samples);  // One entry per map.
+}
+
+TEST_F(SampleTreeTest, unmapped_sample) {  // ips 0, 31 and 70 hit none of thread (1, 1)'s maps.
+  sample_tree->AddSample(1, 1, 0, 0, 0, false);
+  sample_tree->AddSample(1, 1, 31, 0, 0, false);
+  sample_tree->AddSample(1, 1, 70, 0, 0, false);
+  // Match the unknown map.
+  std::vector<ExpectedSampleInMap> expected_samples = {
+      {1, 1, "p1t1", "unknown", 0, 3},  // All three merge into one "unknown" entry at 0.
+  };
+  VisitSampleTree(expected_samples);
+}
+
+TEST_F(SampleTreeTest, map_kernel) {  // The last AddSample flag picks kernel vs. thread maps.
+  sample_tree->AddSample(1, 1, 10, 0, 0, true);   // ip 10 with the flag set: kernel map.
+  sample_tree->AddSample(1, 1, 10, 0, 0, false);  // Same ip, flag clear: thread map at 6.
+  std::vector<ExpectedSampleInMap> expected_samples = {
+      {1, 1, "p1t1", "kernel", 10, 1}, {1, 1, "p1t1", "process1_thread1_map2", 6, 1},
+  };
+  VisitSampleTree(expected_samples);
+}
+
+TEST(sample_tree, overlapped_map) {  // Later ips inside a replaced map resolve to "unknown".
+  auto sample_tree = std::unique_ptr<SampleTree>(new SampleTree(CompareSampleFunction));
+  sample_tree->AddThread(1, 1, "thread1");
+  sample_tree->AddThreadMap(1, 1, 1, 10, 0, 0, "map1");  // Add map 1.
+  sample_tree->AddSample(1, 1, 5, 0, 0, false);          // Hit map 1.
+  sample_tree->AddThreadMap(1, 1, 5, 20, 0, 0, "map2");  // Add map 2.
+  sample_tree->AddSample(1, 1, 6, 0, 0, false);          // Hit map 2.
+  sample_tree->AddSample(1, 1, 4, 0, 0, false);          // Hit unknown map.
+  sample_tree->AddThreadMap(1, 1, 2, 7, 0, 0, "map3");   // Add map 3.
+  sample_tree->AddSample(1, 1, 7, 0, 0, false);          // Hit map 3.
+  sample_tree->AddSample(1, 1, 10, 0, 0, false);         // Hit unknown map.
+
+  std::vector<ExpectedSampleInMap> expected_samples = {
+      {1, 1, "thread1", "map1", 1, 1},  // Samples recorded before a map is replaced keep it.
+      {1, 1, "thread1", "map2", 5, 1},
+      {1, 1, "thread1", "map3", 2, 1},
+      {1, 1, "thread1", "unknown", 0, 2},  // ips 4 and 10 fell outside every live map.
+  };
+  VisitSampleTree(sample_tree.get(), expected_samples);
+}
diff --git a/simpleperf/test_util.h b/simpleperf/test_util.h
new file mode 100644
index 0000000..34155a3
--- /dev/null
+++ b/simpleperf/test_util.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "workload.h"
+
+static void CreateProcesses(size_t count, std::vector<std::unique_ptr<Workload>>* workloads) {
+  workloads->clear();  // Start empty; every entry appended below is a started "sleep 1".
+  for (size_t remaining = count; remaining > 0; --remaining) {
+    std::unique_ptr<Workload> sleeper = Workload::CreateWorkload({"sleep", "1"});
+    ASSERT_TRUE(sleeper != nullptr);  // ASSERT_* returns early, keeping earlier entries.
+    ASSERT_TRUE(sleeper->Start());
+    workloads->push_back(std::move(sleeper));
+  }
+}
diff --git a/simpleperf/utils.cpp b/simpleperf/utils.cpp
index 349cf5d..b212263 100644
--- a/simpleperf/utils.cpp
+++ b/simpleperf/utils.cpp
@@ -20,6 +20,7 @@
 #include <errno.h>
 #include <stdarg.h>
 #include <stdio.h>
+#include <sys/stat.h>
 #include <unistd.h>
 
 #include <base/logging.h>
@@ -36,16 +37,6 @@
   return (value != 0 && ((value & (value - 1)) == 0));
 }
 
-bool NextArgumentOrError(const std::vector<std::string>& args, size_t* pi) {
-  if (*pi + 1 == args.size()) {
-    LOG(ERROR) << "No argument following " << args[*pi] << " option. Try `simpleperf help "
-               << args[0] << "`";
-    return false;
-  }
-  ++*pi;
-  return true;
-}
-
 void GetEntriesInDir(const std::string& dirpath, std::vector<std::string>* files,
                      std::vector<std::string>* subdirs) {
   if (files != nullptr) {
@@ -76,3 +67,28 @@
   }
   closedir(dir);
 }
+
+// Returns true only when dirpath can be stat()ed and names a directory.
+// Any stat() failure (for example a missing path) yields false.
+bool IsDir(const std::string& dirpath) {
+  struct stat path_info;
+  if (stat(dirpath.c_str(), &path_info) != 0) {
+    return false;
+  }
+  return S_ISDIR(path_info.st_mode);
+}
+
+// Removes filename if stat() finds it; false if it is not a regular file or unlink() fails.
+bool RemovePossibleFile(const std::string& filename) {
+  struct stat file_info;
+  const bool exists = (stat(filename.c_str(), &file_info) == 0);
+  if (exists && !S_ISREG(file_info.st_mode)) {
+    LOG(ERROR) << filename << " is not a file.";
+    return false;
+  }
+  if (exists && unlink(filename.c_str()) == -1) {
+    PLOG(ERROR) << "unlink(" << filename << ") failed";
+    return false;
+  }
+  return true;
+}
diff --git a/simpleperf/utils.h b/simpleperf/utils.h
index fba3558..1496974 100644
--- a/simpleperf/utils.h
+++ b/simpleperf/utils.h
@@ -17,6 +17,7 @@
 #ifndef SIMPLE_PERF_UTILS_H_
 #define SIMPLE_PERF_UTILS_H_
 
+#include <signal.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -52,13 +53,38 @@
   size_t bufsize_;
 };
 
+class SignalHandlerRegister {  // Installs a handler for signals; restores old ones when destroyed.
+ public:
+  SignalHandlerRegister(const std::vector<int>& signums, void (*handler)(int)) {
+    for (auto& sig : signums) {
+      sig_t old_handler = signal(sig, handler);
+      if (old_handler != SIG_ERR) {  // Nothing was installed, so there is nothing to restore.
+        saved_signal_handlers_.push_back(std::make_pair(sig, old_handler));
+      }
+    }
+  }
+  ~SignalHandlerRegister() {  // Reverse order: a signal listed twice gets its original handler.
+    for (auto it = saved_signal_handlers_.rbegin(); it != saved_signal_handlers_.rend(); ++it) {
+      signal(it->first, it->second);
+    }
+  }
+ private:
+  std::vector<std::pair<int, sig_t>> saved_signal_handlers_;
+};
+
+template <class T>
+void MoveFromBinaryFormat(T& data, const char*& p) {  // Reads one T at p; advances p past it.
+  char* dst = reinterpret_cast<char*>(&data);  // Copy bytewise: p may be misaligned for T.
+  for (size_t i = 0; i < sizeof(T); ++i) dst[i] = *p++;
+}
+
 void PrintIndented(size_t indent, const char* fmt, ...);
 
 bool IsPowerOfTwo(uint64_t value);
 
-bool NextArgumentOrError(const std::vector<std::string>& args, size_t* pi);
-
 void GetEntriesInDir(const std::string& dirpath, std::vector<std::string>* files,
                      std::vector<std::string>* subdirs);
+bool IsDir(const std::string& dirpath);
+bool RemovePossibleFile(const std::string& filename);
 
 #endif  // SIMPLE_PERF_UTILS_H_
diff --git a/simpleperf/workload.cpp b/simpleperf/workload.cpp
index f8e4edd..9138afa 100644
--- a/simpleperf/workload.cpp
+++ b/simpleperf/workload.cpp
@@ -31,6 +31,21 @@
   return nullptr;
 }
 
+// Reaps a created child (SIGKILL if a non-blocking wait does not reap it), then closes fds.
+Workload::~Workload() {
+  const bool child_created = (work_pid_ != -1 && work_state_ != NotYetCreateNewProcess);
+  if (child_created && !Workload::WaitChildProcess(false)) {
+    kill(work_pid_, SIGKILL);
+    Workload::WaitChildProcess(true);
+  }
+  if (start_signal_fd_ != -1) {
+    close(start_signal_fd_);
+  }
+  if (exec_child_fd_ != -1) {
+    close(exec_child_fd_);
+  }
+}
+
 static void ChildProcessFn(std::vector<std::string>& args, int start_signal_fd, int exec_child_fd);
 
 bool Workload::CreateNewProcess() {
@@ -110,38 +125,26 @@
   char exec_child_failed;
   ssize_t nread = TEMP_FAILURE_RETRY(read(exec_child_fd_, &exec_child_failed, 1));
   if (nread != 0) {
-    LOG(ERROR) << "exec child failed";
+    ((nread == -1) ? PLOG(ERROR) : LOG(ERROR)) << "exec child failed, nread = " << nread;
     return false;
   }
   work_state_ = Started;
   return true;
 }
 
-bool Workload::IsFinished() {
-  if (work_state_ == Started) {
-    WaitChildProcess(true);
-  }
-  return work_state_ == Finished;
-}
-
-void Workload::WaitFinish() {
-  CHECK(work_state_ == Started || work_state_ == Finished);
-  if (work_state_ == Started) {
-    WaitChildProcess(false);
-  }
-}
-
-void Workload::WaitChildProcess(bool no_hang) {
+bool Workload::WaitChildProcess(bool wait_forever) {
+  bool finished = false;
   int status;
-  pid_t result = TEMP_FAILURE_RETRY(waitpid(work_pid_, &status, (no_hang ? WNOHANG : 0)));
+  pid_t result = TEMP_FAILURE_RETRY(waitpid(work_pid_, &status, (wait_forever ? 0 : WNOHANG)));
   if (result == work_pid_) {
-    work_state_ = Finished;
+    finished = true;
     if (WIFSIGNALED(status)) {
-      LOG(ERROR) << "work process was terminated by signal " << strsignal(WTERMSIG(status));
+      LOG(WARNING) << "child process was terminated by signal " << strsignal(WTERMSIG(status));
     } else if (WIFEXITED(status) && WEXITSTATUS(status) != 0) {
-      LOG(ERROR) << "work process exited with exit code " << WEXITSTATUS(status);
+      LOG(WARNING) << "child process exited with exit code " << WEXITSTATUS(status);
     }
   } else if (result == -1) {
-    PLOG(FATAL) << "waitpid() failed";
+    PLOG(ERROR) << "waitpid() failed";
   }
+  return finished;
 }
diff --git a/simpleperf/workload.h b/simpleperf/workload.h
index 57622c8..4bb0ee5 100644
--- a/simpleperf/workload.h
+++ b/simpleperf/workload.h
@@ -30,24 +30,14 @@
     NotYetCreateNewProcess,
     NotYetStartNewProcess,
     Started,
-    Finished,
   };
 
  public:
   static std::unique_ptr<Workload> CreateWorkload(const std::vector<std::string>& args);
 
-  ~Workload() {
-    if (start_signal_fd_ != -1) {
-      close(start_signal_fd_);
-    }
-    if (exec_child_fd_ != -1) {
-      close(exec_child_fd_);
-    }
-  }
+  ~Workload();
 
   bool Start();
-  bool IsFinished();
-  void WaitFinish();
   pid_t GetPid() {
     return work_pid_;
   }
@@ -62,7 +52,7 @@
   }
 
   bool CreateNewProcess();
-  void WaitChildProcess(bool no_hang);
+  bool WaitChildProcess(bool wait_forever);
 
   WorkState work_state_;
   std::vector<std::string> args_;
diff --git a/simpleperf/workload_test.cpp b/simpleperf/workload_test.cpp
index 0cc67b8..f250328 100644
--- a/simpleperf/workload_test.cpp
+++ b/simpleperf/workload_test.cpp
@@ -16,24 +16,25 @@
 
 #include <gtest/gtest.h>
 
-#include <workload.h>
+#include <signal.h>
 
-#include <chrono>
+#include "utils.h"
+#include "workload.h"
 
-using namespace std::chrono;
+static volatile sig_atomic_t signaled;  // Written by signal_handler; sig_atomic_t is handler-safe.
+static void signal_handler(int) {
+  signaled = true;
+}
 
-TEST(workload, smoke) {
+TEST(workload, success) {
+  signaled = false;
+  SignalHandlerRegister signal_handler_register({SIGCHLD}, signal_handler);
   auto workload = Workload::CreateWorkload({"sleep", "1"});
   ASSERT_TRUE(workload != nullptr);
-  ASSERT_FALSE(workload->IsFinished());
   ASSERT_TRUE(workload->GetPid() != 0);
-  auto start_time = steady_clock::now();
   ASSERT_TRUE(workload->Start());
-  ASSERT_FALSE(workload->IsFinished());
-  workload->WaitFinish();
-  ASSERT_TRUE(workload->IsFinished());
-  auto end_time = steady_clock::now();
-  ASSERT_TRUE(end_time >= start_time + seconds(1));
+  while (!signaled) {
+  }
 }
 
 TEST(workload, execvp_failure) {
@@ -41,3 +42,42 @@
   ASSERT_TRUE(workload != nullptr);
   ASSERT_FALSE(workload->Start());
 }
+
+static void run_signaled_workload() {  // Kills the child with SIGABRT, then exits with 0.
+  {
+    signaled = false;
+    SignalHandlerRegister signal_handler_register({SIGCHLD}, signal_handler);
+    auto workload = Workload::CreateWorkload({"sleep", "10"});
+    ASSERT_TRUE(workload != nullptr);
+    ASSERT_TRUE(workload->Start());
+    ASSERT_EQ(0, kill(workload->GetPid(), SIGABRT));
+    while (!signaled) {  // Spin until the SIGCHLD handler sets the flag.
+    }
+  }  // workload and the handler registration are destroyed here, before exit().
+  // Make sure all destructors are called before exit().
+  exit(0);
+}
+
+TEST(workload, signaled_warning) {  // Expects exit code 0 and the signal warning on stderr.
+  ASSERT_EXIT(run_signaled_workload(), testing::ExitedWithCode(0),
+              "child process was terminated by signal");
+}
+
+static void run_exit_nonzero_workload() {  // Runs "ls nonexistdir", then exits with 0.
+  {
+    signaled = false;
+    SignalHandlerRegister signal_handler_register({SIGCHLD}, signal_handler);
+    auto workload = Workload::CreateWorkload({"ls", "nonexistdir"});
+    ASSERT_TRUE(workload != nullptr);
+    ASSERT_TRUE(workload->Start());
+    while (!signaled) {  // Spin until the SIGCHLD handler sets the flag.
+    }
+  }  // workload and the handler registration are destroyed here, before exit().
+  // Make sure all destructors are called before exit().
+  exit(0);
+}
+
+TEST(workload, exit_nonzero_warning) {  // Expects exit code 0 and the exit-code warning.
+  ASSERT_EXIT(run_exit_nonzero_workload(), testing::ExitedWithCode(0),
+              "child process exited with exit code");
+}
diff --git a/squashfs_utils/Android.mk b/squashfs_utils/Android.mk
index c3d2f2d..2e0456a 100644
--- a/squashfs_utils/Android.mk
+++ b/squashfs_utils/Android.mk
@@ -9,6 +9,14 @@
 LOCAL_MODULE := libsquashfs_utils
 include $(BUILD_STATIC_LIBRARY)
 
+include $(CLEAR_VARS)
+LOCAL_SRC_FILES := squashfs_utils.c
+LOCAL_STATIC_LIBRARIES := libcutils
+LOCAL_C_INCLUDES := external/squashfs-tools/squashfs-tools
+LOCAL_CFLAGS := -Wall -Werror -D_GNU_SOURCE -DSQUASHFS_NO_KLOG
+LOCAL_MODULE := libsquashfs_utils_host
+include $(BUILD_HOST_STATIC_LIBRARY)
+
 ifeq ($(HOST_OS),linux)
 
 include $(CLEAR_VARS)
diff --git a/squashfs_utils/squashfs_utils.c b/squashfs_utils/squashfs_utils.c
index 6189189..1db424b 100644
--- a/squashfs_utils/squashfs_utils.c
+++ b/squashfs_utils/squashfs_utils.c
@@ -16,6 +16,7 @@
 
 #include "squashfs_utils.h"
 
+#include <cutils/fs.h>
 #include <cutils/klog.h>
 #include <errno.h>
 #include <fcntl.h>
@@ -25,9 +26,39 @@
 
 #include "squashfs_fs.h"
 
+#ifdef SQUASHFS_NO_KLOG
+#include <stdio.h>
+#define ERROR(x...)   fprintf(stderr, x)
+#else
 #define ERROR(x...)   KLOG_ERROR("squashfs_utils", x)
+#endif
 
-int squashfs_parse_sb(char *blk_device, struct squashfs_info *info) {
+size_t squashfs_get_sb_size(void)  /* Size in bytes of struct squashfs_super_block. */
+{
+    return sizeof(struct squashfs_super_block);
+}
+
+/* Fills info from the raw superblock in buf; returns 0, or -1 if the magic does not match. */
+int squashfs_parse_sb_buffer(const void *buf, struct squashfs_info *info)
+{
+    const struct squashfs_super_block *sb =
+        (const struct squashfs_super_block *)buf;
+
+    if (sb->s_magic != SQUASHFS_MAGIC) {
+        return -1;
+    }
+    info->block_size = sb->block_size;
+    info->inodes = sb->inodes;
+    info->bytes_used = sb->bytes_used;
+    // by default mksquashfs pads the filesystem to 4K blocks; a size that is
+    // already 4K-aligned gets no padding, so round up instead of always adding
+    info->bytes_used_4K_padded =
+        (sb->bytes_used + (4096 - 1)) & ~(uint64_t)(4096 - 1);
+    return 0;
+}
+
+int squashfs_parse_sb(const char *blk_device, struct squashfs_info *info)
+{
     int ret = 0;
     struct squashfs_super_block sb;
     int data_device;
@@ -44,19 +75,13 @@
         ret = -1;
         goto cleanup;
     }
-    if (sb.s_magic != SQUASHFS_MAGIC) {
+
+    if (squashfs_parse_sb_buffer(&sb, info) == -1) {
         ERROR("Not a valid squashfs filesystem\n");
         ret = -1;
         goto cleanup;
     }
 
-    info->block_size = sb.block_size;
-    info->inodes = sb.inodes;
-    info->bytes_used = sb.bytes_used;
-    // by default mksquashfs pads the filesystem to 4K blocks
-    info->bytes_used_4K_padded =
-        sb.bytes_used + (4096 - (sb.bytes_used & (4096 - 1)));
-
 cleanup:
     close(data_device);
     return ret;
diff --git a/squashfs_utils/squashfs_utils.h b/squashfs_utils/squashfs_utils.h
index ccad32d..465429f 100644
--- a/squashfs_utils/squashfs_utils.h
+++ b/squashfs_utils/squashfs_utils.h
@@ -17,6 +17,7 @@
 #ifndef _SQUASHFS_UTILS_H_
 #define _SQUASHFS_UTILS_H_
 
+#include <stddef.h>
 #include <stdint.h>
 
 #ifdef __cplusplus
@@ -30,7 +31,9 @@
     uint64_t bytes_used_4K_padded;
 };
 
-int squashfs_parse_sb(char *blk_device, struct squashfs_info *info);
+size_t squashfs_get_sb_size();
+int squashfs_parse_sb_buffer(const void *data, struct squashfs_info *info);
+int squashfs_parse_sb(const char *blk_device, struct squashfs_info *info);
 
 #ifdef __cplusplus
 }
diff --git a/tests/iptables/qtaguid/Android.mk b/tests/iptables/qtaguid/Android.mk
index a661678..b92b662 100644
--- a/tests/iptables/qtaguid/Android.mk
+++ b/tests/iptables/qtaguid/Android.mk
@@ -23,5 +23,6 @@
 LOCAL_SHARED_LIBRARIES += libcutils libutils liblog
 LOCAL_STATIC_LIBRARIES += libtestUtil
 LOCAL_C_INCLUDES += system/extras/tests/include
+LOCAL_CFLAGS += -fno-strict-aliasing
 
 include $(BUILD_NATIVE_TEST)
diff --git a/tests/net_test/anycast_test.py b/tests/net_test/anycast_test.py
new file mode 100755
index 0000000..4b03664
--- /dev/null
+++ b/tests/net_test/anycast_test.py
@@ -0,0 +1,105 @@
+#!/usr/bin/python
+#
+# Copyright 2014 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from socket import *  # pylint: disable=wildcard-import
+import threading
+import time
+import unittest
+
+import cstruct
+import multinetwork_base
+import net_test
+
+IPV6_JOIN_ANYCAST = 27
+IPV6_LEAVE_ANYCAST = 28
+
+# pylint: disable=invalid-name
+IPv6Mreq = cstruct.Struct("IPv6Mreq", "=16si", "multiaddr ifindex")
+
+
+_CLOSE_HUNG = False
+
+
+def CauseOops():  # Writes "c" to sysrq-trigger; per kernel sysrq docs this crashes the kernel.
+  open("/proc/sysrq-trigger", "w").write("c")
+
+
+class CloseFileDescriptorThread(threading.Thread):  # Closes fd on a background daemon thread.
+
+  def __init__(self, fd):
+    super(CloseFileDescriptorThread, self).__init__()
+    self.daemon = True
+    self._fd = fd  # Object whose close() is called in run().
+    self.finished = False  # Becomes True only after close() has returned.
+
+  def run(self):
+    global _CLOSE_HUNG
+    _CLOSE_HUNG = True  # Stays True for as long as close() below has not returned.
+    self._fd.close()
+    _CLOSE_HUNG = False
+    self.finished = True
+
+
+class AnycastTest(multinetwork_base.MultiNetworkBaseTest):  # Anycast join vs. tun fd close.
+
+  _TEST_NETID = 123  # Asserted below not to be in self.tuns before the test creates it.
+
+  def AnycastSetsockopt(self, s, is_add, netid, addr):  # Joins (is_add) or leaves anycast addr.
+    ifindex = self.ifindices[netid]
+    self.assertTrue(ifindex)
+    ipv6mreq = IPv6Mreq((addr, ifindex))
+    option = IPV6_JOIN_ANYCAST if is_add else IPV6_LEAVE_ANYCAST
+    s.setsockopt(IPPROTO_IPV6, option, ipv6mreq.Pack())
+
+  def testAnycastNetdeviceUnregister(self):  # Closes a tun while a socket holds an anycast join.
+    netid = self._TEST_NETID
+    self.assertNotIn(netid, self.tuns)
+    self.tuns[netid] = self.CreateTunInterface(netid)
+    self.SendRA(netid)
+    iface = self.GetInterfaceName(netid)
+    self.ifindices[netid] = net_test.GetInterfaceIndex(iface)
+
+    s = socket(AF_INET6, SOCK_DGRAM, 0)
+    addr = self.MyAddress(6, netid)
+    self.assertIsNotNone(addr)
+
+    addr = inet_pton(AF_INET6, addr)
+    addr = addr[:8] + os.urandom(8)  # Keep the first 8 bytes, randomize the last 8.
+    self.AnycastSetsockopt(s, True, netid, addr)
+
+    # Close the tun fd in the background.
+    # This will hang if the kernel has the bug.
+    thread = CloseFileDescriptorThread(self.tuns[netid])
+    thread.start()
+    time.sleep(0.1)  # Give the background close() a moment to return.
+
+    # Make teardown work.
+    del self.tuns[netid]
+    # Check that the interface is gone.
+    try:
+      self.assertIsNone(self.MyAddress(6, netid))
+    finally:
+      # This doesn't seem to help, but still.
+      self.AnycastSetsockopt(s, False, netid, addr)
+    self.assertTrue(thread.finished)  # Fails if close() has not returned yet.
+
+
+if __name__ == "__main__":
+  unittest.main(exit=False)  # exit=False so the hang check below still runs.
+  if _CLOSE_HUNG:  # Still True only if the background close() never returned.
+    time.sleep(3)
+    CauseOops()
diff --git a/tests/net_test/iproute.py b/tests/net_test/iproute.py
index cde1803..9cc8257 100644
--- a/tests/net_test/iproute.py
+++ b/tests/net_test/iproute.py
@@ -33,6 +33,7 @@
 # Request constants.
 NLM_F_REQUEST = 1
 NLM_F_ACK = 4
+NLM_F_REPLACE = 0x100
 NLM_F_EXCL = 0x200
 NLM_F_CREATE = 0x400
 NLM_F_DUMP = 0x300
@@ -64,6 +65,7 @@
 RTM_GETROUTE = 26
 RTM_NEWNEIGH = 28
 RTM_DELNEIGH = 29
+RTM_GETNEIGH = 30
 RTM_NEWRULE = 32
 RTM_DELRULE = 33
 RTM_GETRULE = 34
@@ -133,12 +135,16 @@
     "family prefixlen flags scope index")
 IFACacheinfo = cstruct.Struct(
     "IFACacheinfo", "=IIII", "prefered valid cstamp tstamp")
+NDACacheinfo = cstruct.Struct(
+    "NDACacheinfo", "=IIII", "confirmed used updated refcnt")
 
 
 ### Neighbour table entry constants. See include/uapi/linux/neighbour.h.
 # Neighbour cache entry attributes.
 NDA_DST = 1
 NDA_LLADDR = 2
+NDA_CACHEINFO = 3
+NDA_PROBES = 4
 
 # Neighbour cache entry states.
 NUD_PERMANENT = 0x80
@@ -155,6 +161,7 @@
 FRA_FWMARK = 10
 FRA_SUPPRESS_PREFIXLEN = 14
 FRA_TABLE = 15
+FRA_FWMASK = 16
 FRA_OIFNAME = 17
 FRA_UID_START = 18
 FRA_UID_END = 19
@@ -180,6 +187,7 @@
 IFLA_NUM_RX_QUEUES = 32
 IFLA_CARRIER = 33
 
+
 def CommandVerb(command):
   return ["NEW", "DEL", "GET", "SET"][command % 4]
 
@@ -191,7 +199,7 @@
 def CommandName(command):
   try:
     return "RTM_%s%s" % (CommandVerb(command), CommandSubject(command))
-  except KeyError:
+  except IndexError:
     return "RTM_%d" % command
 
 
@@ -288,12 +296,12 @@
       # Don't know what this is. Leave it as an integer.
       name = nla_type
 
-    if name in ["FRA_PRIORITY", "FRA_FWMARK", "FRA_TABLE",
+    if name in ["FRA_PRIORITY", "FRA_FWMARK", "FRA_TABLE", "FRA_FWMASK",
                 "FRA_UID_START", "FRA_UID_END",
                 "RTA_OIF", "RTA_PRIORITY", "RTA_TABLE", "RTA_MARK",
                 "IFLA_MTU", "IFLA_TXQLEN", "IFLA_GROUP", "IFLA_EXT_MASK",
                 "IFLA_PROMISCUITY", "IFLA_NUM_RX_QUEUES",
-                "IFLA_NUM_TX_QUEUES"]:
+                "IFLA_NUM_TX_QUEUES", "NDA_PROBES"]:
       data = struct.unpack("=I", nla_data)[0]
     elif name == "FRA_SUPPRESS_PREFIXLEN":
       data = struct.unpack("=i", nla_data)[0]
@@ -311,6 +319,8 @@
       data = RTACacheinfo(nla_data)
     elif name == "IFA_CACHEINFO":
       data = IFACacheinfo(nla_data)
+    elif name == "NDA_CACHEINFO":
+      data = NDACacheinfo(nla_data)
     elif name in ["NDA_LLADDR", "IFLA_ADDRESS"]:
       data = ":".join(x.encode("hex") for x in nla_data)
     else:
@@ -398,13 +408,14 @@
   def _AddressFamily(self, version):
     return {4: socket.AF_INET, 6: socket.AF_INET6}[version]
 
-  def _SendNlRequest(self, command, data):
+  def _SendNlRequest(self, command, data, flags=0):
     """Sends a netlink request and expects an ack."""
-    flags = NLM_F_REQUEST
+    flags |= NLM_F_REQUEST
     if CommandVerb(command) != "GET":
       flags |= NLM_F_ACK
     if CommandVerb(command) == "NEW":
-      flags |= (NLM_F_EXCL | NLM_F_CREATE)
+      if not flags & NLM_F_REPLACE:
+        flags |= (NLM_F_EXCL | NLM_F_CREATE)
 
     length = len(NLMsgHdr) + len(data)
     nlmsg = NLMsgHdr((length, command, flags, self.seq, self.pid)).Pack()
@@ -488,7 +499,7 @@
 
     if nlmsghdr.type == NLMSG_ERROR or nlmsghdr.type == NLMSG_DONE:
       print "done"
-      return None, data
+      return (None, None), data
 
     nlmsg, data = cstruct.Read(data, msgtype)
     self._Debug("    %s" % nlmsg)
@@ -511,12 +522,10 @@
       self._ExpectDone()
     return out
 
-  def MaybeDebugCommand(self, command, data):
-    subject = CommandSubject(command)
-    if "ALL" not in self.NL_DEBUG and subject not in self.NL_DEBUG:
-      return
-    name = CommandName(command)
+  def CommandToString(self, command, data):
     try:
+      name = CommandName(command)
+      subject = CommandSubject(command)
       struct_type = {
           "ADDR": IfAddrMsg,
           "LINK": IfinfoMsg,
@@ -525,14 +534,24 @@
           "RULE": RTMsg,
       }[subject]
       parsed = self._ParseNLMsg(data, struct_type)
-      print "%s %s" % (name, str(parsed))
-    except KeyError:
+      return "%s %s" % (name, str(parsed))
+    except IndexError:
       raise ValueError("Don't know how to print command type %s" % name)
 
+  def MaybeDebugCommand(self, command, data):  # Prints the command if NL_DEBUG selects it.
+    subject = CommandSubject(command)
+    if "ALL" not in self.NL_DEBUG and subject not in self.NL_DEBUG:
+      return  # Neither "ALL" nor this command's subject is enabled.
+    print self.CommandToString(command, data)
+
   def MaybeDebugMessage(self, message):
     hdr = NLMsgHdr(message)
     self.MaybeDebugCommand(hdr.type, message)
 
+  def PrintMessage(self, message):  # Prints the message unconditionally (no NL_DEBUG check).
+    hdr = NLMsgHdr(message)  # The header's type field is passed on as the command.
+    print self.CommandToString(hdr.type, message)
+
   def _Dump(self, command, msg, msgtype):
     """Sends a dump request and returns a list of decoded messages."""
     # Create a netlink dump request containing the msg.
@@ -640,12 +659,12 @@
     routes = self._GetMsgList(RTMsg, data, False)
     return routes
 
-  def _Neighbour(self, version, is_add, addr, lladdr, dev, state):
+  def _Neighbour(self, version, is_add, addr, lladdr, dev, state, flags=0):
     """Adds or deletes a neighbour cache entry."""
     family = self._AddressFamily(version)
 
     # Convert the link-layer address to a raw byte string.
-    if is_add:
+    if is_add and lladdr:
       lladdr = lladdr.split(":")
       if len(lladdr) != 6:
         raise ValueError("Invalid lladdr %s" % ":".join(lladdr))
@@ -653,10 +672,10 @@
 
     ndmsg = NdMsg((family, dev, state, 0, RTN_UNICAST)).Pack()
     ndmsg += self._NlAttrIPAddress(NDA_DST, family, addr)
-    if is_add:
+    if is_add and lladdr:
       ndmsg += self._NlAttr(NDA_LLADDR, lladdr)
     command = RTM_NEWNEIGH if is_add else RTM_DELNEIGH
-    self._SendNlRequest(command, ndmsg)
+    self._SendNlRequest(command, ndmsg, flags)
 
   def AddNeighbour(self, version, addr, lladdr, dev):
     self._Neighbour(version, True, addr, lladdr, dev, NUD_PERMANENT)
@@ -664,6 +683,18 @@
   def DelNeighbour(self, version, addr, lladdr, dev):
     self._Neighbour(version, False, addr, lladdr, dev, 0)
 
+  def UpdateNeighbour(self, version, addr, lladdr, dev, state):  # Add request + NLM_F_REPLACE.
+    self._Neighbour(version, True, addr, lladdr, dev, state,
+                    flags=NLM_F_REPLACE)
+
+  def DumpNeighbours(self, version):  # RTM_GETNEIGH dump for the IP version's address family.
+    ndmsg = NdMsg((self._AddressFamily(version), 0, 0, 0, 0))  # Only the family is non-zero.
+    return self._Dump(RTM_GETNEIGH, ndmsg, NdMsg)
+
+  def ParseNeighbourMessage(self, msg):  # Parses msg as an NdMsg netlink message.
+    msg, _ = self._ParseNLMsg(msg, NdMsg)  # Second element of the result is discarded.
+    return msg
+
 
 if __name__ == "__main__":
   iproute = IPRoute()
diff --git a/tests/net_test/multinetwork_base.py b/tests/net_test/multinetwork_base.py
index 8940258..97e4d37 100644
--- a/tests/net_test/multinetwork_base.py
+++ b/tests/net_test/multinetwork_base.py
@@ -190,6 +190,10 @@
     return {4: cls._MyIPv4Address(netid),
             6: cls._MyIPv6Address(netid)}[version]
 
+  @classmethod
+  def MyLinkLocalAddress(cls, netid):  # NOTE(review): True presumably means link-local; confirm.
+    return net_test.GetLinkAddress(cls.GetInterfaceName(netid), True)
+
   @staticmethod
   def IPv6Prefix(netid):
     return "2001:db8:%02x::" % netid
@@ -221,7 +225,7 @@
     return f
 
   @classmethod
-  def SendRA(cls, netid, retranstimer=None):
+  def SendRA(cls, netid, retranstimer=None, reachabletime=0):
     validity = 300                 # seconds
     macaddr = cls.RouterMacAddress(netid)
     lladdr = cls._RouterAddress(netid, 6)
@@ -238,7 +242,8 @@
 
     ra = (scapy.Ether(src=macaddr, dst="33:33:00:00:00:01") /
           scapy.IPv6(src=lladdr, hlim=255) /
-          scapy.ICMPv6ND_RA(retranstimer=retranstimer,
+          scapy.ICMPv6ND_RA(reachabletime=reachabletime,
+                            retranstimer=retranstimer,
                             routerlifetime=routerlifetime) /
           scapy.ICMPv6NDOptSrcLLAddr(lladdr=macaddr) /
           scapy.ICMPv6NDOptPrefixInfo(prefix=cls.IPv6Prefix(netid),
@@ -430,13 +435,12 @@
       raise ValueError("Unknown interface selection mode %s" % mode)
 
   def BuildSocket(self, version, constructor, netid, routing_mode):
-    uid = self.UidForNetid(netid) if routing_mode == "uid" else None
-    with net_test.RunAsUid(uid):
-      family = self.GetProtocolFamily(version)
-      s = constructor(family)
+    s = constructor(self.GetProtocolFamily(version))
 
     if routing_mode not in [None, "uid"]:
       self.SelectInterface(s, netid, routing_mode)
+    elif routing_mode == "uid":
+      os.fchown(s.fileno(), self.UidForNetid(netid), -1)
 
     return s
 
diff --git a/tests/net_test/multinetwork_test.py b/tests/net_test/multinetwork_test.py
index b66d765..e9cd8f1 100755
--- a/tests/net_test/multinetwork_test.py
+++ b/tests/net_test/multinetwork_test.py
@@ -33,23 +33,16 @@
 PING_SEQ = 3
 PING_TOS = 0x83
 
+# For brevity.
+UDP_PAYLOAD = net_test.UDP_PAYLOAD
+
 IPV6_FLOWINFO = 11
 
-
-UDP_PAYLOAD = str(scapy.DNS(rd=1,
-                            id=random.randint(0, 65535),
-                            qd=scapy.DNSQR(qname="wWW.GoOGle.CoM",
-                                           qtype="AAAA")))
-
-
 IPV4_MARK_REFLECT_SYSCTL = "/proc/sys/net/ipv4/fwmark_reflect"
 IPV6_MARK_REFLECT_SYSCTL = "/proc/sys/net/ipv6/fwmark_reflect"
 SYNCOOKIES_SYSCTL = "/proc/sys/net/ipv4/tcp_syncookies"
 TCP_MARK_ACCEPT_SYSCTL = "/proc/sys/net/ipv4/tcp_fwmark_accept"
 
-HAVE_MARK_REFLECT = os.path.isfile(IPV4_MARK_REFLECT_SYSCTL)
-HAVE_TCP_MARK_ACCEPT = os.path.isfile(TCP_MARK_ACCEPT_SYSCTL)
-
 # The IP[V6]UNICAST_IF socket option was added between 3.1 and 3.4.
 HAVE_UNICAST_IF = net_test.LINUX_VERSION >= (3, 4, 0)
 
@@ -581,27 +574,21 @@
   def SYNToClosedPort(self, *args):
     return Packets.SYN(999, *args)
 
-  @unittest.skipUnless(HAVE_MARK_REFLECT, "no mark reflection")
   def testIPv4ICMPErrorsReflectMark(self):
     self.CheckReflection(4, Packets.UDP, Packets.ICMPPortUnreachable)
 
-  @unittest.skipUnless(HAVE_MARK_REFLECT, "no mark reflection")
   def testIPv6ICMPErrorsReflectMark(self):
     self.CheckReflection(6, Packets.UDP, Packets.ICMPPortUnreachable)
 
-  @unittest.skipUnless(HAVE_MARK_REFLECT, "no mark reflection")
   def testIPv4PingRepliesReflectMarkAndTos(self):
     self.CheckReflection(4, Packets.ICMPEcho, Packets.ICMPReply)
 
-  @unittest.skipUnless(HAVE_MARK_REFLECT, "no mark reflection")
   def testIPv6PingRepliesReflectMarkAndTos(self):
     self.CheckReflection(6, Packets.ICMPEcho, Packets.ICMPReply)
 
-  @unittest.skipUnless(HAVE_MARK_REFLECT, "no mark reflection")
   def testIPv4RSTsReflectMark(self):
     self.CheckReflection(4, self.SYNToClosedPort, Packets.RST)
 
-  @unittest.skipUnless(HAVE_MARK_REFLECT, "no mark reflection")
   def testIPv6RSTsReflectMark(self):
     self.CheckReflection(6, self.SYNToClosedPort, Packets.RST)
 
@@ -723,9 +710,8 @@
 
           listenport = listensocket.getsockname()[1]
 
-          if HAVE_TCP_MARK_ACCEPT:
-            accept_sysctl = 1 if mode == self.MODE_INCOMING_MARK else 0
-            self._SetTCPMarkAcceptSysctl(accept_sysctl)
+          accept_sysctl = 1 if mode == self.MODE_INCOMING_MARK else 0
+          self._SetTCPMarkAcceptSysctl(accept_sysctl)
 
           bound_dev = iif if mode == self.MODE_BINDTODEVICE else None
           self.BindToDevice(listensocket, bound_dev)
@@ -756,11 +742,9 @@
     self.CheckTCP(4, [None, self.MODE_BINDTODEVICE, self.MODE_EXPLICIT_MARK])
     self.CheckTCP(6, [None, self.MODE_BINDTODEVICE, self.MODE_EXPLICIT_MARK])
 
-  @unittest.skipUnless(HAVE_TCP_MARK_ACCEPT, "fwmark writeback not supported")
   def testIPv4MarkAccept(self):
     self.CheckTCP(4, [self.MODE_INCOMING_MARK])
 
-  @unittest.skipUnless(HAVE_TCP_MARK_ACCEPT, "fwmark writeback not supported")
   def testIPv6MarkAccept(self):
     self.CheckTCP(6, [self.MODE_INCOMING_MARK])
 
@@ -975,7 +959,6 @@
   # table the original packet used, and thus it won't be able to clone the
   # correct route.
 
-  @unittest.skipUnless(HAVE_MARK_REFLECT, "no mark reflection")
   def testIPv4UnmarkedSocketPMTU(self):
     self.SetMarkReflectSysctls(1)
     try:
@@ -983,7 +966,6 @@
     finally:
       self.SetMarkReflectSysctls(0)
 
-  @unittest.skipUnless(HAVE_MARK_REFLECT, "no mark reflection")
   def testIPv6UnmarkedSocketPMTU(self):
     self.SetMarkReflectSysctls(1)
     try:
diff --git a/tests/net_test/neighbour_test.py b/tests/net_test/neighbour_test.py
new file mode 100755
index 0000000..828a86b
--- /dev/null
+++ b/tests/net_test/neighbour_test.py
@@ -0,0 +1,216 @@
+#!/usr/bin/python
+#
+# Copyright 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import errno
+import random
+from socket import *  # pylint: disable=wildcard-import
+import time
+import unittest
+
+from scapy import all as scapy
+
+import multinetwork_base
+import net_test
+
+
+RTMGRP_NEIGH = 4
+
+NUD_INCOMPLETE = 0x01
+NUD_REACHABLE = 0x02
+NUD_STALE = 0x04
+NUD_DELAY = 0x08
+NUD_PROBE = 0x10
+NUD_FAILED = 0x20
+NUD_PERMANENT = 0x80
+
+
+# TODO: Support IPv4.
+class NeighbourTest(multinetwork_base.MultiNetworkBaseTest):
+
+  # Set a 500-ms retrans timer so we can test for ND retransmits without
+  # waiting too long. Apparently this cannot go below 500ms.
+  RETRANS_TIME_MS = 500
+
+  # This can only be in seconds, so 1000 is the minimum.
+  DELAY_TIME_MS = 1000
+
+  # Unfortunately, this must be above the delay timer or the kernel ND code will
+  # not behave correctly (e.g., go straight from REACHABLE into DELAY). This
+  # is fuzzed by the kernel from 0.5x to 1.5x of its value, so we need a value
+  # that's 2x the delay timer.
+  REACHABLE_TIME_MS = 2 * DELAY_TIME_MS
+
+  @classmethod
+  def setUpClass(cls):
+    super(NeighbourTest, cls).setUpClass()
+    for netid in cls.tuns:
+      iface = cls.GetInterfaceName(netid)
+      # This can't be set in an RA.
+      cls.SetSysctl(
+          "/proc/sys/net/ipv6/neigh/%s/delay_first_probe_time" % iface,
+          cls.DELAY_TIME_MS / 1000)
+
+  def setUp(self):
+    super(NeighbourTest, self).setUp()
+
+    self.sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)
+    self.sock.bind((0, RTMGRP_NEIGH))
+    net_test.SetNonBlocking(self.sock)
+
+    for netid in self.tuns:
+      self.SendRA(netid,
+                  retranstimer=self.RETRANS_TIME_MS,
+                  reachabletime=self.REACHABLE_TIME_MS)
+
+    self.netid = random.choice(self.tuns.keys())
+    self.ifindex = self.ifindices[self.netid]
+
+  def GetNeighbour(self, addr):
+    version = 6 if ":" in addr else 4
+    for msg, args in self.iproute.DumpNeighbours(version):
+      if args["NDA_DST"] == addr:
+        return msg, args
+
+  def GetNdEntry(self, addr):
+    return self.GetNeighbour(addr)
+
+  def CheckNoNdEvents(self):
+    self.assertRaisesErrno(errno.EAGAIN, self.sock.recvfrom, 4096, MSG_PEEK)
+
+  def assertNeighbourState(self, state, addr):
+    self.assertEquals(state, self.GetNdEntry(addr)[0].state)
+
+  def assertNeighbourAttr(self, addr, name, value):
+    self.assertEquals(value, self.GetNdEntry(addr)[1][name])
+
+  def ExpectNeighbourNotification(self, addr, state, attrs=None):
+    msg = self.sock.recv(4096)
+    msg, actual_attrs = self.iproute.ParseNeighbourMessage(msg)
+    self.assertEquals(addr, actual_attrs["NDA_DST"])
+    self.assertEquals(state, msg.state)
+    if attrs:
+      for name in attrs:
+        self.assertEquals(attrs[name], actual_attrs[name])
+
+  def ExpectUnicastProbe(self, addr):
+    version = 6 if ":" in addr else 4
+    if version == 6:
+      expected = (
+          scapy.IPv6(src=self.MyLinkLocalAddress(self.netid), dst=addr) /
+          scapy.ICMPv6ND_NS(tgt=addr) /
+          scapy.ICMPv6NDOptSrcLLAddr(lladdr=self.MyMacAddress(self.netid))
+      )
+      self.ExpectPacketOn(self.netid, "Unicast probe", expected)
+    else:
+      raise NotImplementedError
+
+  def ReceiveUnicastAdvertisement(self, addr, mac):
+    version = 6 if ":" in addr else 4
+    if version == 6:
+      packet = (
+          scapy.Ether(src=mac, dst=self.MyMacAddress(self.netid)) /
+          scapy.IPv6(src=addr, dst=self.MyLinkLocalAddress(self.netid)) /
+          scapy.ICMPv6ND_NA(tgt=addr, S=1, O=0) /
+          scapy.ICMPv6NDOptDstLLAddr(lladdr=mac)
+      )
+      self.ReceiveEtherPacketOn(self.netid, packet)
+    else:
+      raise NotImplementedError
+
+  def MonitorSleep(self, intervalseconds, addr):
+    slept = 0
+    while slept < intervalseconds:
+      time.sleep(0.1)
+      slept += 0.1
+      print self.GetNdEntry(addr)
+
+  def SleepMs(self, ms):
+    time.sleep(ms / 1000.0)
+
+  def testNotifications(self):
+    router4 = self._RouterAddress(self.netid, 4)
+    router6 = self._RouterAddress(self.netid, 6)
+    self.assertNeighbourState(NUD_PERMANENT, router4)
+    self.assertNeighbourState(NUD_STALE, router6)
+
+    # Send a packet and check that we go into DELAY.
+    routing_mode = random.choice(["mark", "oif", "uid"])
+    s = self.BuildSocket(6, net_test.UDPSocket, self.netid, routing_mode)
+    s.connect((net_test.IPV6_ADDR, 53))
+    s.send(net_test.UDP_PAYLOAD)
+    self.assertNeighbourState(NUD_DELAY, router6)
+
+    # Wait for the probe interval, then check that we're in PROBE, and that the
+    # kernel has notified us.
+    self.SleepMs(self.DELAY_TIME_MS)
+    self.ExpectNeighbourNotification(router6, NUD_PROBE)
+    self.assertNeighbourState(NUD_PROBE, router6)
+    self.ExpectUnicastProbe(router6)
+
+    # Respond to the NS and verify we're in REACHABLE again.
+    self.ReceiveUnicastAdvertisement(router6, self.RouterMacAddress(self.netid))
+    self.assertNeighbourState(NUD_REACHABLE, router6)
+
+    # Wait until the reachable time has passed, and verify we're in STALE.
+    self.SleepMs(self.REACHABLE_TIME_MS * 1.5)
+    self.assertNeighbourState(NUD_STALE, router6)
+    self.ExpectNeighbourNotification(router6, NUD_STALE)
+
+    # Send a packet, and verify we go into DELAY and then to PROBE.
+    s.send(net_test.UDP_PAYLOAD)
+    self.assertNeighbourState(NUD_DELAY, router6)
+    self.SleepMs(self.DELAY_TIME_MS)
+    self.assertNeighbourState(NUD_PROBE, router6)
+    self.ExpectNeighbourNotification(router6, NUD_PROBE)
+
+    # Wait for the probes to time out, and expect a FAILED notification.
+    self.assertNeighbourAttr(router6, "NDA_PROBES", 1)
+    self.ExpectUnicastProbe(router6)
+
+    self.SleepMs(self.RETRANS_TIME_MS)
+    self.ExpectUnicastProbe(router6)
+    self.assertNeighbourAttr(router6, "NDA_PROBES", 2)
+
+    self.SleepMs(self.RETRANS_TIME_MS)
+    self.ExpectUnicastProbe(router6)
+    self.assertNeighbourAttr(router6, "NDA_PROBES", 3)
+
+    self.SleepMs(self.RETRANS_TIME_MS)
+    self.assertNeighbourState(NUD_FAILED, router6)
+    self.ExpectNeighbourNotification(router6, NUD_FAILED, {"NDA_PROBES": 3})
+
+  def testRepeatedProbes(self):
+    router4 = self._RouterAddress(self.netid, 4)
+    router6 = self._RouterAddress(self.netid, 6)
+    routermac = self.RouterMacAddress(self.netid)
+    self.assertNeighbourState(NUD_PERMANENT, router4)
+    self.assertNeighbourState(NUD_STALE, router6)
+
+    def ForceProbe(addr, mac):
+      self.iproute.UpdateNeighbour(6, addr, None, self.ifindex, NUD_PROBE)
+      self.assertNeighbourState(NUD_PROBE, addr)
+      self.SleepMs(1)  # TODO: Why is this necessary?
+      self.assertNeighbourState(NUD_PROBE, addr)
+      self.ExpectUnicastProbe(addr)
+      self.ReceiveUnicastAdvertisement(addr, mac)
+      self.assertNeighbourState(NUD_REACHABLE, addr)
+
+    for i in xrange(5):
+      ForceProbe(router6, routermac)
+
+
+if __name__ == "__main__":
+  unittest.main()
diff --git a/tests/net_test/net_test.py b/tests/net_test/net_test.py
index a87b71b..be034d0 100755
--- a/tests/net_test/net_test.py
+++ b/tests/net_test/net_test.py
@@ -16,6 +16,7 @@
 
 import fcntl
 import os
+import random
 from socket import *  # pylint: disable=wildcard-import
 import struct
 import unittest
@@ -63,6 +64,12 @@
                          "st tx_queue rx_queue tr tm->when retrnsmt"
                          "   uid  timeout inode ref pointer drops\n")
 
+# Arbitrary packet payload.
+UDP_PAYLOAD = str(scapy.DNS(rd=1,
+                            id=random.randint(0, 65535),
+                            qd=scapy.DNSQR(qname="wWW.GoOGle.CoM",
+                                           qtype="AAAA")))
+
 # Unix group to use if we want to open sockets as non-root.
 AID_INET = 3003
 
diff --git a/tests/net_test/run_net_test.sh b/tests/net_test/run_net_test.sh
index fae1145..119171e 100755
--- a/tests/net_test/run_net_test.sh
+++ b/tests/net_test/run_net_test.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Kernel configration options.
+# Kernel configuration options.
 OPTIONS=" IPV6 IPV6_ROUTER_PREF IPV6_MULTIPLE_TABLES IPV6_ROUTE_INFO"
 OPTIONS="$OPTIONS TUN SYN_COOKIES IP_ADVANCED_ROUTER IP_MULTIPLE_TABLES"
 OPTIONS="$OPTIONS NETFILTER NETFILTER_ADVANCED NETFILTER_XTABLES"
@@ -11,6 +11,9 @@
 # For 3.1 kernels, where devtmpfs is not on by default.
 OPTIONS="$OPTIONS DEVTMPFS DEVTMPFS_MOUNT"
 
+# These two break the flo kernel due to differences in -Werror on recent GCC.
+DISABLE_OPTIONS=" CONFIG_REISERFS_FS CONFIG_ANDROID_PMEM"
+
 # How many tap interfaces to create.
 NUMTAPINTERFACES=2
 
@@ -73,6 +76,10 @@
 cmdline=${OPTIONS// / -e }
 ./scripts/config $cmdline
 
+# Disable the kernel config options listed in $DISABLE_OPTIONS.
+cmdline=${DISABLE_OPTIONS// / -d }
+./scripts/config $cmdline
+
 # olddefconfig doesn't work on old kernels.
 if ! make olddefconfig ARCH=um SUBARCH=x86_64 CROSS_COMPILE= ; then
   cat >&2 << EOF
@@ -86,7 +93,7 @@
 fi
 
 # Compile the kernel.
-make -j12 linux ARCH=um SUBARCH=x86_64 CROSS_COMPILE=
+make -j32 linux ARCH=um SUBARCH=x86_64 CROSS_COMPILE=
 
 # Get the absolute path to the test file that's being run.
 dir=/host$(dirname $(readlink -f $0))
diff --git a/tests/net_test/srcaddr_selection_test.py b/tests/net_test/srcaddr_selection_test.py
index eb09b7f..0781b2d 100755
--- a/tests/net_test/srcaddr_selection_test.py
+++ b/tests/net_test/srcaddr_selection_test.py
@@ -34,11 +34,6 @@
 IPV6_PREFER_SRC_PUBLIC = 0x0002
 
 
-USE_OPTIMISTIC_SYSCTL = "/proc/sys/net/ipv6/conf/default/use_optimistic"
-
-HAVE_USE_OPTIMISTIC = os.path.isfile(USE_OPTIMISTIC_SYSCTL)
-
-
 class IPv6SourceAddressSelectionTest(multinetwork_base.MultiNetworkBaseTest):
 
   def SetDAD(self, ifname, value):
@@ -123,20 +118,22 @@
       self.SetDAD(self.GetInterfaceName(netid), 0)
       self.SetOptimisticDAD(self.GetInterfaceName(netid), 0)
       self.SetUseTempaddrs(self.GetInterfaceName(netid), 0)
-      if HAVE_USE_OPTIMISTIC:
-        self.SetUseOptimistic(self.GetInterfaceName(netid), 0)
+      self.SetUseOptimistic(self.GetInterfaceName(netid), 0)
 
     # [1]  Pick an interface on which to test.
     self.test_netid = random.choice(self.tuns.keys())
     self.test_ip = self.MyAddress(6, self.test_netid)
     self.test_ifindex = self.ifindices[self.test_netid]
     self.test_ifname = self.GetInterfaceName(self.test_netid)
+    self.test_lladdr = net_test.GetLinkAddress(self.test_ifname, True)
 
     # [2]  Delete the test interface's IPv6 address.
     self.iproute.DelAddress(self.test_ip, 64, self.test_ifindex)
     self.assertAddressNotPresent(self.test_ip)
 
     self.assertAddressNotUsable(self.test_ip, self.test_netid)
+    # Verify that the link-local address is not tentative.
+    self.assertFalse(self.AddressIsTentative(self.test_lladdr));
 
 
 class TentativeAddressTest(MultiInterfaceSourceAddressSelectionTest):
@@ -194,7 +191,6 @@
 
 class OptimisticAddressOkayTest(MultiInterfaceSourceAddressSelectionTest):
 
-  @unittest.skipUnless(HAVE_USE_OPTIMISTIC, "use_optimistic not supported")
   def testModifiedRfc6724Behaviour(self):
     # [3]  Get an IPv6 address back, in optimistic DAD start-up.
     self.SetDAD(self.test_ifname, 1)  # Enable DAD
@@ -214,7 +210,6 @@
 
 class ValidBeforeOptimisticTest(MultiInterfaceSourceAddressSelectionTest):
 
-  @unittest.skipUnless(HAVE_USE_OPTIMISTIC, "use_optimistic not supported")
   def testModifiedRfc6724Behaviour(self):
     # [3]  Add a valid IPv6 address to this interface and verify it is
     # selected as the source address.
@@ -243,7 +238,6 @@
 
 class DadFailureTest(MultiInterfaceSourceAddressSelectionTest):
 
-  @unittest.skipUnless(HAVE_USE_OPTIMISTIC, "use_optimistic not supported")
   def testDadFailure(self):
     # [3]  Get an IPv6 address back, in optimistic DAD start-up.
     self.SetDAD(self.test_ifname, 1)  # Enable DAD
@@ -275,9 +269,6 @@
 
 class NoNsFromOptimisticTest(MultiInterfaceSourceAddressSelectionTest):
 
-  @unittest.skipUnless(HAVE_USE_OPTIMISTIC, "use_optimistic not supported")
-  @unittest.skipUnless(net_test.LinuxVersion() >= (3, 18, 0),
-                       "correct optimistic bind() not supported")
   def testSendToOnlinkDestination(self):
     # [3]  Get an IPv6 address back, in optimistic DAD start-up.
     self.SetDAD(self.test_ifname, 1)  # Enable DAD
@@ -297,11 +288,14 @@
     onlink_dest = self.GetRandomDestination(self.IPv6Prefix(self.test_netid))
     self.SendWithSourceAddress(self.test_ip, self.test_netid, onlink_dest)
 
-    expected_ns = multinetwork_test.Packets.NS(
-        net_test.GetLinkAddress(self.test_ifname, True),
-        onlink_dest,
-        self.MyMacAddress(self.test_netid))[1]
-    self.ExpectPacketOn(self.test_netid, "link-local NS", expected_ns)
+    if net_test.LinuxVersion() >= (3, 18, 0):
+      # Older versions will actually choose the optimistic address to
+      # originate Neighbor Solicitations (RFC violation).
+      expected_ns = multinetwork_test.Packets.NS(
+          self.test_lladdr,
+          onlink_dest,
+          self.MyMacAddress(self.test_netid))[1]
+      self.ExpectPacketOn(self.test_netid, "link-local NS", expected_ns)
 
 
 # TODO(ek): add tests listening for netlink events.
diff --git a/verity/Android.mk b/verity/Android.mk
index 75face6..586ca58 100644
--- a/verity/Android.mk
+++ b/verity/Android.mk
@@ -100,6 +100,6 @@
 LOCAL_SRC_FILES := build_verity_tree.cpp
 LOCAL_MODULE_TAGS := optional
 LOCAL_STATIC_LIBRARIES := libsparse_host libz
-LOCAL_SHARED_LIBRARIES := libcrypto-host
+LOCAL_SHARED_LIBRARIES := libcrypto-host libbase
 LOCAL_CFLAGS += -Wall -Werror
 include $(BUILD_HOST_EXECUTABLE)
diff --git a/verity/build_verity_tree.cpp b/verity/build_verity_tree.cpp
index e7bfa40..5a6a6ee 100644
--- a/verity/build_verity_tree.cpp
+++ b/verity/build_verity_tree.cpp
@@ -16,6 +16,8 @@
 #include <string.h>
 #include <unistd.h>
 
+#include <base/file.h>
+
 struct sparse_hash_ctx {
     unsigned char *hashes;
     const unsigned char *salt;
@@ -353,7 +355,9 @@
     if (fd < 0) {
         FATAL("failed to open output file '%s'\n", verity_filename);
     }
-    write(fd, verity_tree, verity_blocks * block_size);
+    if (!android::base::WriteFully(fd, verity_tree, verity_blocks * block_size)) {
+        FATAL("failed to write '%s'\n", verity_filename);
+    }
     close(fd);
 
     delete[] verity_tree_levels;