Merge "Support verifying the boot signature against the given pubkey."
diff --git a/ext4_utils/Android.mk b/ext4_utils/Android.mk
index 16b73c4..d9df220 100644
--- a/ext4_utils/Android.mk
+++ b/ext4_utils/Android.mk
@@ -29,8 +29,7 @@
 LOCAL_EXPORT_C_INCLUDE_DIRS := \
     $(LOCAL_PATH)/include
 LOCAL_STATIC_LIBRARIES := \
-    libsparse_host \
-    libz
+    libsparse_host
 LOCAL_STATIC_LIBRARIES_darwin += libselinux
 LOCAL_STATIC_LIBRARIES_linux += libselinux
 LOCAL_MODULE_HOST_OS := darwin linux windows
@@ -89,8 +88,7 @@
     libcutils \
     libext2_uuid \
     libselinux \
-    libsparse \
-    libz
+    libsparse
 include $(BUILD_SHARED_LIBRARY)
 
 
diff --git a/libfec/fec_verity.cpp b/libfec/fec_verity.cpp
index 5dea53d..3f636dd 100644
--- a/libfec/fec_verity.cpp
+++ b/libfec/fec_verity.cpp
@@ -312,6 +312,11 @@
 
     debug("valid");
 
+    if (v->hash) {
+        delete[] v->hash;
+        v->hash = NULL;
+    }
+
     v->hash = data_hashes.release();
     return 0;
 }
@@ -319,7 +324,7 @@
 /* reads, corrects and parses the verity table, validates parameters, and if
    `f->flags' does not have `FEC_VERITY_DISABLE' set, calls `verify_tree' to
    load and validate the hash tree */
-static int parse_table(fec_handle *f, uint64_t offset, uint32_t size)
+static int parse_table(fec_handle *f, uint64_t offset, uint32_t size, bool useecc)
 {
     check(f);
     check(size >= VERITY_MIN_TABLE_SIZE);
@@ -335,8 +340,13 @@
         return -1;
     }
 
-    if (fec_pread(f, table.get(), size, offset) != (ssize_t)size) {
-        error("failed to read verity table: %s", strerror(errno));
+    if (!useecc) {
+        if (!raw_pread(f, table.get(), size, offset)) {
+            error("failed to read verity table: %s", strerror(errno));
+            return -1;
+        }
+    } else if (fec_pread(f, table.get(), size, offset) != (ssize_t)size) {
+        error("failed to ecc read verity table: %s", strerror(errno));
         return -1;
     }
 
@@ -430,7 +440,18 @@
         check(v->data_blocks == v->hash_start / FEC_BLOCKSIZE);
     }
 
+    if (v->salt) {
+        delete[] v->salt;
+        v->salt = NULL;
+    }
+
     v->salt = salt.release();
+
+    if (v->table) {
+        delete[] v->table;
+        v->table = NULL;
+    }
+
     v->table = table.release();
 
     if (!(f->flags & FEC_VERITY_DISABLE)) {
@@ -589,7 +610,10 @@
 
     v->metadata_start = offset;
 
-    if (parse_table(f, offset + sizeof(v->header), v->header.length) == -1) {
+    if (parse_table(f, offset + sizeof(v->header), v->header.length,
+            false) == -1 &&
+        parse_table(f, offset + sizeof(v->header), v->header.length,
+            true)  == -1) {
         return -1;
     }
 
diff --git a/simpleperf/IOEventLoop.cpp b/simpleperf/IOEventLoop.cpp
index de232ea..44de289 100644
--- a/simpleperf/IOEventLoop.cpp
+++ b/simpleperf/IOEventLoop.cpp
@@ -25,9 +25,10 @@
   IOEventLoop* loop;
   event* e;
   std::function<bool()> callback;
+  bool enabled;
 
   IOEvent(IOEventLoop* loop, const std::function<bool()>& callback)
-      : loop(loop), e(nullptr), callback(callback) {}
+      : loop(loop), e(nullptr), callback(callback), enabled(false) {}
 
   ~IOEvent() {
     if (e != nullptr) {
@@ -80,6 +81,14 @@
   return AddEvent(fd, EV_READ | EV_PERSIST, nullptr, callback);
 }
 
+IOEventRef IOEventLoop::AddWriteEvent(int fd,
+                                      const std::function<bool()>& callback) {
+  if (!MakeFdNonBlocking(fd)) {
+    return nullptr;
+  }
+  return AddEvent(fd, EV_WRITE | EV_PERSIST, nullptr, callback);
+}
+
 bool IOEventLoop::AddSignalEvent(int sig,
                                  const std::function<bool()>& callback) {
   return AddEvent(sig, EV_SIGNAL | EV_PERSIST, nullptr, callback) != nullptr;
@@ -115,6 +124,7 @@
     LOG(ERROR) << "event_add() failed";
     return nullptr;
   }
+  e->enabled = true;
   events_.push_back(std::move(e));
   return events_.back().get();
 }
@@ -138,14 +148,33 @@
   return true;
 }
 
+bool IOEventLoop::DisableEvent(IOEventRef ref) {
+  if (ref->enabled) {
+    if (event_del(ref->e) != 0) {
+      LOG(ERROR) << "event_del() failed";
+      return false;
+    }
+    ref->enabled = false;
+  }
+  return true;
+}
+
+bool IOEventLoop::EnableEvent(IOEventRef ref) {
+  if (!ref->enabled) {
+    if (event_add(ref->e, nullptr) != 0) {
+      LOG(ERROR) << "event_add() failed";
+      return false;
+    }
+    ref->enabled = true;
+  }
+  return true;
+}
+
 bool IOEventLoop::DelEvent(IOEventRef ref) {
+  DisableEvent(ref);
   IOEventLoop* loop = ref->loop;
   for (auto it = loop->events_.begin(); it != loop->events_.end(); ++it) {
     if (it->get() == ref) {
-      if (event_del((*it)->e) != 0) {
-        LOG(ERROR) << "event_del() failed";
-        return false;
-      }
       loop->events_.erase(it);
       break;
     }
diff --git a/simpleperf/IOEventLoop.h b/simpleperf/IOEventLoop.h
index f35e9e0..9b96629 100644
--- a/simpleperf/IOEventLoop.h
+++ b/simpleperf/IOEventLoop.h
@@ -40,6 +40,10 @@
   // to control the Event, otherwise return nullptr.
   IOEventRef AddReadEvent(int fd, const std::function<bool()>& callback);
 
+  // Register a write Event, so [callback] is called when [fd] can be written
+  // without blocking.
+  IOEventRef AddWriteEvent(int fd, const std::function<bool()>& callback);
+
   // Register a signal Event, so [callback] is called each time signal [sig]
   // happens.
   bool AddSignalEvent(int sig, const std::function<bool()>& callback);
@@ -60,6 +64,11 @@
   // Exit the loop started by RunLoop().
   bool ExitLoop();
 
+  // Disable an Event, which can be enabled later.
+  static bool DisableEvent(IOEventRef ref);
+  // Enable a disabled Event.
+  static bool EnableEvent(IOEventRef ref);
+
   // Unregister an Event.
   static bool DelEvent(IOEventRef ref);
 
diff --git a/simpleperf/IOEventLoop_test.cpp b/simpleperf/IOEventLoop_test.cpp
index 68475c7..90bb4fa 100644
--- a/simpleperf/IOEventLoop_test.cpp
+++ b/simpleperf/IOEventLoop_test.cpp
@@ -25,10 +25,8 @@
   int fd[2];
   ASSERT_EQ(0, pipe(fd));
   IOEventLoop loop;
-  static int count;
-  static int retry_count;
-  count = 0;
-  retry_count = 0;
+  int count = 0;
+  int retry_count = 0;
   ASSERT_NE(nullptr, loop.AddReadEvent(fd[0], [&]() {
     while (true) {
       char c;
@@ -62,10 +60,45 @@
   close(fd[1]);
 }
 
+TEST(IOEventLoop, write) {
+  int fd[2];
+  ASSERT_EQ(0, pipe(fd));
+  IOEventLoop loop;
+  int count = 0;
+  ASSERT_NE(nullptr, loop.AddWriteEvent(fd[1], [&]() {
+    int ret = 0;
+    char buf[4096];
+    while ((ret = write(fd[1], buf, sizeof(buf))) > 0) {
+    }
+    if (ret == -1 && errno == EAGAIN) {
+      if (++count == 100) {
+        loop.ExitLoop();
+      }
+      return true;
+    }
+    return false;
+  }));
+  std::thread thread([&]() {
+    usleep(500000);
+    while (true) {
+      usleep(1000);
+      char buf[4096];
+      if (read(fd[0], buf, sizeof(buf)) <= 0) {
+        break;
+      }
+    }
+  });
+  ASSERT_TRUE(loop.RunLoop());
+  // close fd[1] to make read thread stop.
+  close(fd[1]);
+  thread.join();
+  close(fd[0]);
+  ASSERT_EQ(100, count);
+}
+
 TEST(IOEventLoop, signal) {
   IOEventLoop loop;
-  static int count;
-  count = 0;
+  int count = 0;
   ASSERT_TRUE(loop.AddSignalEvent(SIGINT, [&]() {
     if (++count == 100) {
       loop.ExitLoop();
@@ -87,8 +120,7 @@
   timeval tv;
   tv.tv_sec = 0;
   tv.tv_usec = 1000;
-  static int count;
-  count = 0;
+  int count = 0;
   IOEventLoop loop;
   ASSERT_TRUE(loop.AddPeriodicEvent(tv, [&]() {
     if (++count == 100) {
@@ -113,8 +145,7 @@
   int fd[2];
   ASSERT_EQ(0, pipe(fd));
   IOEventLoop loop;
-  static int count;
-  count = 0;
+  int count = 0;
   IOEventRef ref = loop.AddReadEvent(fd[0], [&]() {
     count++;
     return IOEventLoop::DelEvent(ref);
@@ -134,3 +165,40 @@
   close(fd[0]);
   close(fd[1]);
 }
+
+TEST(IOEventLoop, disable_enable_event) {
+  int fd[2];
+  ASSERT_EQ(0, pipe(fd));
+  IOEventLoop loop;
+  int count = 0;
+  IOEventRef ref = loop.AddWriteEvent(fd[1], [&]() {
+    count++;
+    return IOEventLoop::DisableEvent(ref);
+  });
+  ASSERT_NE(nullptr, ref);
+
+  timeval tv;
+  tv.tv_sec = 0;
+  tv.tv_usec = 500000;
+  int periodic_count = 0;
+  ASSERT_TRUE(loop.AddPeriodicEvent(tv, [&]() {
+    periodic_count++;
+    if (periodic_count == 1) {
+      if (count != 1) {
+        return false;
+      }
+      return IOEventLoop::EnableEvent(ref);
+    } else {
+      if (count != 2) {
+        return false;
+      }
+      return loop.ExitLoop();
+    }
+  }));
+
+  ASSERT_TRUE(loop.RunLoop());
+  ASSERT_EQ(2, count);
+  ASSERT_EQ(2, periodic_count);
+  close(fd[0]);
+  close(fd[1]);
+}
diff --git a/simpleperf/README.md b/simpleperf/README.md
new file mode 100644
index 0000000..3acd33a
--- /dev/null
+++ b/simpleperf/README.md
@@ -0,0 +1,736 @@
+# Introduction of simpleperf
+## What is simpleperf
+Simpleperf is a native profiling tool for Android. Its command-line interface
+supports broadly the same options as the linux-tools perf, but also supports
+various Android-specific improvements.
+
+Simpleperf is part of the Android Open Source Project. The source code is at
+https://android.googlesource.com/platform/system/extras/+/master/simpleperf/.
+Bugs and feature requests can be submitted at
+http://github.com/android-ndk/ndk/issues.
+
+## How simpleperf works
+Modern CPUs have a hardware component called the performance monitoring unit
+(PMU). The PMU has several hardware counters, counting events like how many cpu
+cycles have happened, how many instructions have executed, or how many cache
+misses have happened.
+
+The Linux kernel wraps these hardware counters into hardware perf events. In
+addition, the Linux kernel also provides hardware independent software events
+and tracepoint events. The Linux kernel exposes all this to userspace via the
+perf_event_open system call, which simpleperf uses.
+
+Simpleperf has three main functions: stat, record and report.
+
+The stat command gives a summary of how many events have happened in the
+profiled processes in a time period. Here’s how it works:
+1. Given user options, simpleperf enables profiling by making a system call to
+linux kernel.
+2. Linux kernel enables counters while scheduling on the profiled processes.
+3. After profiling, simpleperf reads counters from linux kernel, and reports a
+counter summary.
+
+The record command records samples of the profiled process in a time period.
+Here’s how it works:
+1. Given user options, simpleperf enables profiling by making a system call to
+linux kernel.
+2. Simpleperf creates mapped buffers between simpleperf and linux kernel.
+3. Linux kernel enables counters while scheduling on the profiled processes.
+4. Each time a given number of events happen, linux kernel dumps a sample to a
+mapped buffer.
+5. Simpleperf reads samples from the mapped buffers and generates perf.data.
+
+The report command reads a "perf.data" file and any shared libraries used by
+the profiled processes, and outputs a report showing where the time was spent.
+
+## Main simpleperf commands
+Simpleperf supports several subcommands, including list, stat, record, report.
+Each subcommand supports different options. This section only covers the most
+important subcommands and options. To see all subcommands and options,
+use --help.
+
+    # List all subcommands.
+    $simpleperf --help
+
+    # Print help message for record subcommand.
+    $simpleperf record --help
+
+### simpleperf list
+simpleperf list is used to list all events available on the device. Different
+devices may support different events because of differences in hardware and
+kernel.
+
+    $simpleperf list
+    List of hw-cache events:
+      branch-loads
+      ...
+    List of hardware events:
+      cpu-cycles
+      instructions
+      ...
+    List of software events:
+      cpu-clock
+      task-clock
+      ...
+
+### simpleperf stat
+simpleperf stat is used to get raw event counter information of the profiled program
+or system-wide. By passing options, we can select which events to use, which
+processes/threads to monitor, how long to monitor and the print interval.
+Below is an example.
+
+    # Stat using default events (cpu-cycles,instructions,...), and monitor
+    # process 7394 for 10 seconds.
+    $simpleperf stat -p 7394 --duration 10
+    Performance counter statistics:
+
+     1,320,496,145  cpu-cycles         # 0.131736 GHz                     (100%)
+       510,426,028  instructions       # 2.587047 cycles per instruction  (100%)
+         4,692,338  branch-misses      # 468.118 K/sec                    (100%)
+    886.008130(ms)  task-clock         # 0.088390 cpus used               (100%)
+               753  context-switches   # 75.121 /sec                      (100%)
+               870  page-faults        # 86.793 /sec                      (100%)
+
+    Total test time: 10.023829 seconds.
+
+#### Select events
+We can select which events to use via -e option. Below are examples:
+
+    # Stat event cpu-cycles.
+    $simpleperf stat -e cpu-cycles -p 11904 --duration 10
+
+    # Stat event cache-references and cache-misses.
+    $simpleperf stat -e cache-references,cache-misses -p 11904 --duration 10
+
+When running the stat command, if the number of hardware events is larger than
+the number of hardware counters available in the PMU, the kernel shares hardware
+counters between events, so each event is only monitored for part of the total
+time. In the example below, there is a percentage at the end of each row,
+showing the percentage of the total time that each event was actually monitored.
+
+    # Stat using event cache-references, cache-references:u,....
+    $simpleperf stat -p 7394 -e     cache-references,cache-references:u,cache-references:k,cache-misses,cache-misses:u,cache-misses:k,instructions --duration 1
+    Performance counter statistics:
+
+    4,331,018  cache-references     # 4.861 M/sec    (87%)
+    3,064,089  cache-references:u   # 3.439 M/sec    (87%)
+    1,364,959  cache-references:k   # 1.532 M/sec    (87%)
+       91,721  cache-misses         # 102.918 K/sec  (87%)
+       45,735  cache-misses:u       # 51.327 K/sec   (87%)
+       38,447  cache-misses:k       # 43.131 K/sec   (87%)
+    9,688,515  instructions         # 10.561 M/sec   (89%)
+
+    Total test time: 1.026802 seconds.
+
+In the example above, each event is monitored about 87% of the total time. But
+there is no guarantee that any pair of events are always monitored at the same
+time. If we want to have some events monitored at the same time, we can use
+--group option. Below is an example.
+
+    # Stat using event cache-references, cache-references:u,....
+    $simpleperf stat -p 7394 --group cache-references,cache-misses --group cache-references:u,cache-misses:u --group cache-references:k,cache-misses:k -e instructions --duration 1
+    Performance counter statistics:
+
+    3,638,900  cache-references     # 4.786 M/sec          (74%)
+       65,171  cache-misses         # 1.790953% miss rate  (74%)
+    2,390,433  cache-references:u   # 3.153 M/sec          (74%)
+       32,280  cache-misses:u       # 1.350383% miss rate  (74%)
+      879,035  cache-references:k   # 1.251 M/sec          (68%)
+       30,303  cache-misses:k       # 3.447303% miss rate  (68%)
+    8,921,161  instructions         # 10.070 M/sec         (86%)
+
+    Total test time: 1.029843 seconds.
+
+#### Select target to monitor
+We can select which processes or threads to monitor via -p option or -t option.
+Monitoring a process is the same as monitoring all threads in the process.
+Simpleperf can also fork a child process to run the new command and then monitor
+the child process. Below are examples.
+
+    # Stat process 11904 and 11905.
+    $simpleperf stat -p 11904,11905 --duration 10
+
+    # Stat thread 11904 and 11905.
+    $simpleperf stat -t 11904,11905 --duration 10
+
+    # Start a child process running `ls`, and stat it.
+    $simpleperf stat ls
+
+#### Decide how long to monitor
+When monitoring existing threads, we can use --duration option to decide how long
+to monitor. When monitoring a child process running a new command, simpleperf
+monitors until the child process ends. In this case, we can use Ctrl-C to stop monitoring
+at any time. Below are examples.
+
+    # Stat process 11904 for 10 seconds.
+    $simpleperf stat -p 11904 --duration 10
+
+    # Stat until the child process running `ls` finishes.
+    $simpleperf stat ls
+
+    # Stop monitoring using Ctrl-C.
+    $simpleperf stat -p 11904 --duration 10
+    ^C
+
+#### Decide the print interval
+When monitoring perf counters, we can also use --interval option to decide the print
+interval. Below are examples.
+
+    # Print stat for process 11904 every 300ms.
+    $simpleperf stat -p 11904 --duration 10 --interval 300
+    # Stop by using Ctrl-C.
+    ^C
+
+    # Print system wide stat at interval of 300ms for 10 seconds (rooted device only).
+    # system wide profiling needs root privilege
+    $su 0 simpleperf stat -a --duration 10 --interval 300
+
+#### Display counters in systrace
+simpleperf can also work with systrace to dump counters in the collected trace.
+Below is an example of doing a system-wide stat.
+
+    # capture instructions (kernel only) and cache misses with interval of 300 milliseconds for 15 seconds
+    $su 0 simpleperf stat -e instructions:k,cache-misses -a --interval 300 --duration 15
+    # on host launch systrace to collect trace for 10 seconds
+    (HOST)$external/chromium-trace/systrace.py --time=10 -o new.html sched gfx view
+    # open the collected new.html in a browser and the perf counters will be shown
+
+### simpleperf record
+simpleperf record is used to dump records of the profiled program. By passing
+options, we can select which events to use, which processes/threads to monitor,
+what frequency to dump records, how long to monitor, and where to store records.
+
+    # Record on process 7394 for 10 seconds, using default event (cpu-cycles),
+    # using default sample frequency (4000 samples per second), writing records
+    # to perf.data.
+    $simpleperf record -p 7394 --duration 10
+    simpleperf I 07-11 21:44:11 17522 17522 cmd_record.cpp:316] Samples recorded: 21430. Samples lost: 0.
+
+#### Select events
+In most cases, the cpu-cycles event is used to evaluate consumed cpu time.
+As a hardware event, it is both accurate and efficient. We can also use other
+events via -e option. Below is an example.
+
+    # Record using event instructions.
+    $simpleperf record -e instructions -p 11904 --duration 10
+
+#### Select target to monitor
+The way to select target in record command is similar to that in stat command.
+Below are examples.
+
+    # Record process 11904 and 11905.
+    $simpleperf record -p 11904,11905 --duration 10
+
+    # Record thread 11904 and 11905.
+    $simpleperf record -t 11904,11905 --duration 10
+
+    # Record a child process running `ls`.
+    $simpleperf record ls
+
+#### Set the frequency to record
+We can set the frequency to dump records via the -f or -c options. For example,
+-f 4000 means dumping approximately 4000 records every second when the monitored
+thread runs. If a monitored thread runs 0.2s in one second (it can be preempted
+or blocked at other times), simpleperf dumps about 4000 * 0.2 / 1.0 = 800
+records every second. Another way is using -c option. For example, -c 10000
+means dumping one record whenever 10000 events happen. Below are examples.
+
+    # Record with sample frequency 1000: sample 1000 times every second running.
+    $simpleperf record -f 1000 -p 11904,11905 --duration 10
+
+    # Record with sample period 100000: sample 1 time every 100000 events.
+    $simpleperf record -c 100000 -t 11904,11905 --duration 10
+
+#### Decide how long to monitor
+The way to decide how long to monitor in record command is similar to that in
+stat command. Below are examples.
+
+    # Record process 11904 for 10 seconds.
+    $simpleperf record -p 11904 --duration 10
+
+    # Record until the child process running `ls` finishes.
+    $simpleperf record ls
+
+    # Stop monitoring using Ctrl-C.
+    $simpleperf record -p 11904 --duration 10
+    ^C
+
+#### Set the path to store records
+By default, simpleperf stores records in perf.data in current directory. We can
+use -o option to set the path to store records. Below is an example.
+
+    # Write records to data/perf2.data.
+    $simpleperf record -p 11904 -o data/perf2.data --duration 10
+
+### simpleperf report
+simpleperf report is used to report based on perf.data generated by simpleperf
+record command. Report command groups records into different sample entries,
+sorts sample entries based on how many events each sample entry contains, and
+prints out each sample entry. By passing options, we can select where to find
+perf.data and executable binaries used by the monitored program, filter out
+uninteresting records, and decide how to group records.
+
+Below is an example. Records are grouped into 4 sample entries, each entry is
+a row. There are several columns, each column shows a piece of information
+belonging to a sample entry. The first column is Overhead, which shows the
+percentage of events inside current sample entry in total events. As the
+perf event is cpu-cycles, the overhead can be seen as the percentage of cpu
+time used in each function.
+
+    # Reports perf.data, using only records sampled in libsudo-game-jni.so,
+    # grouping records using thread name(comm), process id(pid), thread id(tid),
+    # function name(symbol), and showing sample count for each row.
+    $simpleperf report --dsos /data/app/com.example.sudogame-2/lib/arm64/libsudo-game-jni.so --sort comm,pid,tid,symbol -n
+    Cmdline: /data/data/com.example.sudogame/simpleperf record -p 7394 --duration 10
+    Arch: arm64
+    Event: cpu-cycles (type 0, config 0)
+    Samples: 28235
+    Event count: 546356211
+
+    Overhead  Sample  Command    Pid   Tid   Symbol
+    59.25%    16680   sudogame  7394  7394  checkValid(Board const&, int, int)
+    20.42%    5620    sudogame  7394  7394  canFindSolution_r(Board&, int, int)
+    13.82%    4088    sudogame  7394  7394  randomBlock_r(Board&, int, int, int, int, int)
+    6.24%     1756    sudogame  7394  7394  @plt
+
+#### Set the path to read records
+By default, simpleperf reads perf.data in current directory. We can use -i
+option to select another file to read records.
+
+    $simpleperf report -i data/perf2.data
+
+#### Set the path to find executable binaries
+If reporting function symbols, simpleperf needs to read executable binaries
+used by the monitored processes to get symbol table and debug information. By
+default, the paths are the executable binaries used by monitored processes while
+recording. However, these binaries may not exist when reporting or not contain
+symbol table and debug information. So we can use --symfs to redirect the paths.
+Below is an example.
+
+    $simpleperf report
+    # In this case, when simpleperf wants to read executable binary /A/b,
+    # it reads file in /A/b.
+
+    $simpleperf report --symfs /debug_dir
+    # In this case, when simpleperf wants to read executable binary /A/b,
+    # it prefers file in /debug_dir/A/b to file in /A/b.
+
+#### Filter records
+When reporting, it happens that not all records are of interest. Simpleperf
+supports five filters to select records of interest. Below are examples.
+
+    # Report records in threads having name sudogame.
+    $simpleperf report --comms sudogame
+
+    # Report records in process 7394 or 7395
+    $simpleperf report --pids 7394,7395
+
+    # Report records in thread 7394 or 7395.
+    $simpleperf report --tids 7394,7395
+
+    # Report records in libsudo-game-jni.so.
+    $simpleperf report --dsos /data/app/com.example.sudogame-2/lib/arm64/libsudo-game-jni.so
+
+    # Report records in function checkValid or canFindSolution_r.
+    $simpleperf report --symbols "checkValid(Board const&, int, int);canFindSolution_r(Board&, int, int)"
+
+#### Decide how to group records into sample entries
+Simpleperf uses --sort option to decide how to group sample entries. Below are
+examples.
+
+    # Group records based on their process id: records having the same process
+    # id are in the same sample entry.
+    $simpleperf report --sort pid
+
+    # Group records based on their thread id and thread comm: records having
+    # the same thread id and thread name are in the same sample entry.
+    $simpleperf report --sort tid,comm
+
+    # Group records based on their binary and function: records in the same
+    # binary and function are in the same sample entry.
+    $simpleperf report --sort dso,symbol
+
+    # Default option: --sort comm,pid,tid,dso,symbol. Group records in the same
+    # thread, and belong to the same function in the same binary.
+    $simpleperf report
+
+## Features of simpleperf
+Simpleperf works similarly to linux-tools-perf, but it has the following improvements:
+1. Aware of Android environment. Simpleperf handles some Android specific
+situations when profiling. For example, it can profile embedded shared libraries
+in apk, read symbol table and debug information from .gnu_debugdata section. If
+possible, it gives suggestions when facing errors, like how to disable
+perf_harden to enable profiling.
+2. Support unwinding while recording. If we want to use -g option to record and
+report call-graph of a program, we need to dump user stack and register set in
+each record, and then unwind the stack to find the call chain. Simpleperf
+supports unwinding while recording, so it doesn’t need to store user stack in
+perf.data. So we can profile for a longer time with limited space on device.
+3. Build in static binaries. Simpleperf is a static binary, so it doesn’t need
+supporting shared libraries to run. It means there is no limitation of Android
+version that simpleperf can run on, although some devices don’t support
+profiling.
+
+# Steps to profile native libraries
+After introducing simpleperf, this section uses a simple example to show how to
+profile jni native libraries on Android using simpleperf. The example profiles
+an app called com.example.sudogame, which uses a jni native library
+sudo-game-jni.so. We focus on sudo-game-jni.so, not the java code or system
+libraries.
+
+## 1. Run debug version of the app on device
+We need to run the debug version of the app, because we can’t use *run-as* for
+non-debuggable apps.
+
+## 2. Download simpleperf to the app’s directory
+Use *uname* to find the architecture on device.
+
+    $adb shell uname -m
+    aarch64
+
+"aarch64" means we should download arm64 version of simpleperf to device.
+
+    $adb push device/arm64/simpleperf /data/local/tmp
+    $adb shell run-as com.example.sudogame cp /data/local/tmp/simpleperf .
+    $adb shell run-as com.example.sudogame chmod a+x simpleperf
+    $adb shell run-as com.example.sudogame ls -l
+    -rwxrwxrwx 1 u0_a90 u0_a90 3059208 2016-01-01 10:40 simpleperf
+
+Note that some apps use arm native libraries even on arm64 devices (We can
+verify this by checking /proc/<process\_id\_of\_app>/maps). In that case, we
+should use arm/simpleperf instead of arm64/simpleperf.
+
+## 3. Enable profiling
+Android devices may disable profiling by default, and we need to enable
+profiling.
+
+    $adb shell setprop security.perf_harden 0
+
+## 4. Find the target process/thread to record
+
+    # Use `ps` to get process id of sudogame.
+    $adb shell ps  | grep sudogame
+    u0_a102   15869 545   1629824 76888 SyS_epoll_ 0000000000 S com.example.sudogame
+
+    # Use `ps -t` to get thread ids of process 15869.
+    # If this doesn’t work, you can try `ps -eT`.
+    $adb shell ps -t  | grep 15869
+    u0_a102   15869 545   1629824 76888 SyS_epoll_ 0000000000 S com.example.sudogame
+    u0_a102   15874 15869 1629824 76888 futex_wait 0000000000 S Jit thread pool
+    ...
+
+## 5. Record perf.data
+
+    # Record process 15869 for 30s, and use the app while recording it.
+    $adb shell run-as com.example.sudogame ./simpleperf record -p 15869 --duration 30
+    simpleperf W 07-12 20:00:33 16022 16022 environment.cpp:485] failed to read /proc/sys/kernel/kptr_restrict: Permission denied
+    simpleperf I 07-12 20:01:03 16022 16022 cmd_record.cpp:315] Samples recorded: 81445. Samples lost: 0.
+
+    $adb shell run-as com.example.sudogame ls -lh perf.data
+    -rw-rw-rw- 1 u0_a102 u0_a102 4.3M 2016-07-12 20:01 perf.data
+
+Now we have recorded perf.data with 81445 records. There is a warning about
+failing to read kptr_restrict. It doesn’t matter in our case, but is a notification that we
+can’t read kernel symbol addresses.
+
+## 6. Report perf.data
+Below are several examples reporting on device.
+
+### Report samples in different binaries
+
+    # Report how samples distribute on different binaries.
+    $adb shell run-as com.example.sudogame ./simpleperf report -n --sort dso
+    simpleperf W 07-12 19:15:10 11389 11389 dso.cpp:309] Symbol addresses in /proc/kallsyms are all zero. `echo 0 >/proc/sys/kernel/kptr_restrict` if possible.
+    Cmdline: /data/data/com.example.sudogame/simpleperf record -p 15869 --duration 30
+    Arch: arm64
+    Event: cpu-cycles (type 0, config 0)
+    Samples: 81445
+    Event count: 34263925309
+
+    Overhead  Sample  Shared Object
+    75.31%    58231   [kernel.kallsyms]
+    8.44%     6845    /system/lib64/libc.so
+    4.30%     4667    /vendor/lib64/egl/libGLESv2_adreno.so
+    2.30%     2433    /system/lib64/libhwui.so
+    1.88%     1952    /system/lib64/libart.so
+    1.88%     1967    /system/framework/arm64/boot-framework.oat
+    1.59%     1218    /system/lib64/libcutils.so
+    0.69%     728     /system/lib64/libskia.so
+    0.63%     489     /data/app/com.example.sudogame-2/lib/arm64/libsudo-game-jni.so
+    0.34%     312     /system/lib64/libart-compiler.so
+    ...
+
+According to the report above, most time is spent in kernel, and
+libsudo-game-jni.so costs only 0.63% by itself. It seems libsudo-game-jni.so
+can’t be the bottleneck. However, it is possible we didn’t record long enough
+to hit the hot spot, or code in libsudo-game-jni.so calls other libraries
+consuming most time.
+
+### Report samples in different functions
+
+    # Report how samples distribute inside libsudo-game-jni.so.
+    $adb shell run-as com.example.sudogame ./simpleperf report -n --dsos /data/app/com.example.sudogame-2/lib/arm64/libsudo-game-jni.so --sort symbol
+    ...
+    Overhead  Sample  Symbol
+    94.45%    461     unknown
+    5.22%     26      @plt
+    0.20%     1       Java_com_example_sudogame_GameModel_findConflictPairs
+    0.14%     1       Java_com_example_sudogame_GameModel_canFindSolution
+
+In the report above, most samples belong to unknown symbol. It is because the
+libsudo-game-jni.so used on device doesn’t contain symbol table. We need to
+download shared library with symbol table to device. In Android Studio 2.1.2,
+the binary with symbol table is in
+[app_dir]/app/build/intermediates/binaries/debug/obj/arm64-v8a (for arm64).
+
+    # Make a proper directory to download binary to device. This directory
+    # should be the same as the directory of
+    # /data/app/com.example.sudogame-2/lib/arm64/libsudo-game-jni.so.
+    $adb shell run-as com.example.sudogame mkdir -p data/app/com.example.sudogame-2/lib/arm64
+    # Download binary with symbol table.
+    $adb push [app_dir]/app/build/intermediates/binaries/debug/obj/arm64-v8a/libsudo-game-jni.so /data/local/tmp
+    $adb shell run-as com.example.sudogame cp /data/local/tmp/libsudo-game-jni.so data/app/com.example.sudogame-2/lib/arm64
+
+    # Report how samples distribute inside libsudo-game-jni.so with debug binary
+    # support.
+    $adb shell run-as com.example.sudogame ./simpleperf report -n --dsos /data/app/com.example.sudogame-2/lib/arm64/libsudo-game-jni.so --sort symbol --symfs .
+    ...
+    Overhead  Sample  Symbol
+    71.08%    347     checkValid(Board const&, int, int)
+    15.13%    74      randomBlock_r(Board&, int, int, int, int, int)
+    7.94%     38      canFindSolution_r(Board&, int, int)
+    5.22%     26      @plt
+    0.30%     2       randomBoard(Board&)
+    0.20%     1       Java_com_example_sudogame_GameModel_findConflictPairs
+    0.14%     1       Java_com_example_sudogame_GameModel_canFindSolution
+
+With the help of debug version of libsudo-game-jni.so, the report above shows that most
+time in libsudo-game-jni.so is spent in function checkValid. So now we can look
+into it further.
+
+### Report samples in one function
+
+    # Report how samples distribute inside checkValid() function.
+    # adb shell command can’t pass '(' in arguments, so we run the command
+    # inside `adb shell`.
+    $adb shell
+    device$ run-as com.example.sudogame ./simpleperf report -n --symbols "checkValid(Board const&, int, int)" --sort vaddr_in_file --symfs .
+    ...
+    Overhead  Sample  VaddrInFile
+    14.90%    50      0x24d8
+    8.48%     29      0x251c
+    5.52%     19      0x2468
+    ...
+
+The report above shows samples hitting different places inside function
+checkValid(). By using objdump to disassemble libsudo-game-jni.so, we can find
+which are the hottest instructions in checkValid() function.
+
+    # Disassemble libsudo-game-jni.so.
+    $aarch64-linux-android-objdump -d -S -l libsudo-game-jni.so >libsudo-game-jni.asm
+
+## 7. Record and report call graph
+### What is a call graph
+A call graph is a tree showing function call relations. For example, a program
+starts at main() function, and main() calls functionOne() and functionTwo(),
+and functionOne() calls functionTwo() and functionThree(). Then the call graph
+is as below.
+
+    main() -> functionOne()
+          |    |
+          |    |-> functionTwo()
+          |    |
+          |     ->  functionThree()
+           -> functionTwo()
+
+### Record dwarf based call graph
+To generate call graph, simpleperf needs to generate call chain for each record.
+Simpleperf requests kernel to dump user stack and user register set for each
+record, then it backtraces the user stack to find the function call chain. To
+parse the call chain, it needs support of dwarf call frame information, which
+usually resides in .eh_frame or .debug_frame section of the binary. So we need
+to use --symfs to point out where libsudo-game-jni.so with debug information is.
+
+    # Record thread 11546 for 30s, use the app while recording it.
+    $adb shell run-as com.example.sudogame ./simpleperf record -t 11546 -g --symfs . --duration 30
+    simpleperf I 01-01 07:13:08  9415  9415 cmd_record.cpp:336] Samples recorded: 65279. Samples lost: 16740.
+    simpleperf W 01-01 07:13:08  9415  9415 cmd_record.cpp:343] Lost 20.4099% of samples, consider increasing mmap_pages(-m), or decreasing sample frequency(-f), or increasing sample period(-c).
+
+    $adb shell run-as com.example.sudogame ls -lh perf.data
+    -rw-rw-rw- 1 u0_a96 u0_a96 8.3M 2016-01-01 08:49 perf.data
+
+Note that the kernel can’t dump a user stack >= 64K, so the dwarf based call graph
+doesn’t contain call chains consuming >= 64K of stack. So avoiding allocating
+large memory on the stack is a good way to improve the dwarf based call graph.
+
+### Record stack frame based call graph
+Another way to generate call graph is to rely on the kernel parsing the call
+chain for each record. To make it possible, kernel has to be able to identify
+the stack frame of each function call. This is not always possible, because
+compilers can optimize away stack frames, or use a stack frame style not
+recognized by the kernel. So how well it works depends on how the profiled binaries were compiled.
+
+    # Record thread 11546 for 30s, use the app while recording it.
+    $adb shell run-as com.example.sudogame ./simpleperf record -t 11546 --call-graph fp --symfs . --duration 30
+    simpleperf W 01-02 05:43:24 23277 23277 environment.cpp:485] failed to read /proc/sys/kernel/kptr_restrict: Permission denied
+    simpleperf I 01-02 05:43:54 23277 23277 cmd_record.cpp:323] Samples recorded: 95023. Samples lost: 0.
+
+    $adb shell run-as com.example.sudogame ls -lh perf.data
+    -rw-rw-rw- 1 u0_a96 u0_a96 39M 2016-01-02 05:43 perf.data
+
+### Report call graph
+#### Report call graph on device
+    # Report call graph.
+    $adb shell run-as com.example.sudogame ./simpleperf report -n -g --symfs .
+    Cmdline: /data/data/com.example.sudogame/simpleperf record -t 11546 -g --symfs . -f 1000 --duration 30
+    Arch: arm64
+    Event: cpu-cycles (type 0, config 0)
+    Samples: 23840
+    Event count: 41301992088
+
+    Children  Self    Sample  Command          Pid    Tid    Shared Object                                                   Symbol
+    97.98%    0.69%   162     xample.sudogame  11546  11546  /data/app/com.example.sudogame-1/lib/arm64/libsudo-game-jni.so  checkValid(Board const&, int, int)
+       |
+       -- checkValid(Board const&, int, int)
+          |
+          |--99.95%-- __android_log_print
+          |           |
+          |           |--92.19%-- __android_log_buf_write
+          |           |           |
+          |           |           |--73.50%-- libcutils.so[+1120c]
+    ...
+
+#### Report call graph in callee mode
+Call graph can be shown in two modes. One is caller mode, showing how functions
+call others. The other is callee mode, showing how functions are called by
+others. We can use the *-g callee* option to show the call graph in callee mode.
+
+    # Report call graph.
+    $host/simpleperf report -n -g callee --symfs .
+    Cmdline: /data/data/com.example.sudogame/simpleperf record -t 11546 -g --symfs . -f 1000 --duration 30
+    Arch: arm64
+    Event: cpu-cycles (type 0, config 0)
+    Samples: 23840
+    Event count: 41301992088
+
+    Children  Self    Sample  Command          Pid    Tid    Shared Object                                                   Symbol
+    97.58%    0.21%   48      xample.sudogame  11546  11546  /system/lib64/liblog.so                                         __android_log_print
+       |
+       -- __android_log_print
+          |
+          |--99.70%-- checkValid(Board const&, int, int)
+          |           |
+          |           |--99.31%-- canFindSolution_r(Board&, int, int)
+    ...
+
+#### Report using report.py
+The call graph generated by simpleperf report may be hard to read in text mode.
+Simpleperf provides a python script, report.py, that shows the call graph in a GUI.
+It can be used as below.
+
+    # Show call graph in GUI.
+    $adb shell run-as com.example.sudogame ./simpleperf report -n -g --symfs . >perf.report
+    $python report.py perf.report
+
+# Steps to profile java code on rooted devices
+Simpleperf only supports profiling native instructions in binaries in ELF
+format. If the java code is executed by interpreter, or with jit cache, it
+can’t be profiled by simpleperf. As Android supports ahead-of-time compilation,
+it can compile java bytecode into native instructions. We currently need root
+privilege to force Android to fully compile java code into native instructions
+in ELF binaries with debug information (this could be fixed by a
+profileable="true" in AndroidManifest that causes PackageManager to pass -g to
+dex2oat). We also need root privilege to read compiled native binaries
+(because installd writes them to a directory whose uid/gid is system:install).
+So profiling java code can currently only be done on rooted devices.
+
+## 1. Fully compile java code into native instructions
+### On Android N
+
+    # Restart adb as root. It needs root privilege to setprop below.
+    $adb root
+    # Set the property to compile with debug information.
+    $adb shell setprop dalvik.vm.dex2oat-flags -g
+
+    # Fully compile the app instead of using interpreter or jit.
+    $adb shell cmd package compile -f -m speed com.example.sudogame
+
+    # Restart the app on device.
+
+### On Android M
+
+    # Restart adb as root. It needs root privilege to setprop below.
+    $adb root
+    # Set the property to compile with debug information.
+    $adb shell setprop dalvik.vm.dex2oat-flags -g
+
+    # Reinstall the app.
+    $adb install -r app-debug.apk
+
+### On Android L
+
+    # Restart adb as root. It needs root privilege to setprop below.
+    $adb root
+    # Set the property to compile with debug information.
+    $adb shell setprop dalvik.vm.dex2oat-flags --include-debug-symbols
+
+    # Reinstall the app.
+    $adb install -r app-debug.apk
+
+## 2. Record perf.data
+
+    # Change to the app’s data directory.
+    $ adb root && adb shell
+    device# cd `run-as com.example.sudogame pwd`
+
+    # Record as root as simpleperf needs to read the generated native binary.
+    device#./simpleperf record -t 25636 -g --symfs . -f 1000 --duration 30
+    simpleperf I 01-02 07:18:20 27182 27182 cmd_record.cpp:323] Samples recorded: 23552. Samples lost: 39.
+
+    device#ls -lh perf.data
+    -rw-rw-rw- 1 root root 11M 2016-01-02 07:18 perf.data
+
+## 3. Report perf.data
+    # Report how samples distribute on different binaries.
+    device#./simpleperf report -n --sort dso
+    Cmdline: /data/data/com.example.sudogame/simpleperf record -t 25636 -g --symfs . -f 1000 --duration 30
+    Arch: arm64
+    Event: cpu-cycles (type 0, config 0)
+    Samples: 23552
+    Event count: 40662494587
+
+    Overhead  Sample  Shared Object
+    85.73%    20042   [kernel.kallsyms]
+    9.41%     2198    /system/lib64/libc.so
+    2.29%     535     /system/lib64/libcutils.so
+    0.95%     222     /data/app/com.example.sudogame-1/lib/arm64/libsudo-game-jni.so
+    ...
+    0.04%     16      /system/lib64/libandroid_runtime.so
+    0.03%     10      /data/app/com.example.sudogame-1/oat/arm64/base.odex
+    ...
+
+As in the report above, there are samples in
+/data/app/com.example.sudogame-1/oat/arm64/base.odex, which is the native binary
+compiled from java code.
+
+    # Report call graph.
+    device#./simpleperf report -n -g --symfs .
+    Cmdline: /data/data/com.example.sudogame/simpleperf record -t 25636 -g --symfs . -f 1000 --duration 30
+    Arch: arm64
+    Event: cpu-cycles (type 0, config 0)
+    Samples: 23552
+    Event count: 40662494587
+
+    Children  Self    Sample  Command          Pid    Tid    Shared Object                                                   Symbol
+    98.32%    0.00%   1       xample.sudogame  25636  25636  /data/app/com.example.sudogame-1/oat/arm64/base.odex            void com.example.sudogame.GameModel.reInit()
+       |
+       -- void com.example.sudogame.GameModel.reInit()
+          |
+          |--98.98%-- boolean com.example.sudogame.GameModel.canFindSolution(int[][])
+          |           Java_com_example_sudogame_GameModel_canFindSolution
+          |           |
+          |           |--99.93%-- canFindSolution(Board&)
+    ...
+
+As in the report above, reInit() and canFindSolution() are java
+functions.
diff --git a/simpleperf/SampleDisplayer.h b/simpleperf/SampleDisplayer.h
index 606f639..bc74e3d 100644
--- a/simpleperf/SampleDisplayer.h
+++ b/simpleperf/SampleDisplayer.h
@@ -96,6 +96,10 @@
 template <typename SampleT, typename CallChainNodeT>
 class CallgraphDisplayer {
  public:
+  CallgraphDisplayer(uint32_t max_stack = UINT32_MAX,  // entries deeper than this are not printed
+                     double percent_limit = 0.0)  // entries below this percentage are not printed
+      : max_stack_(max_stack), percent_limit_(percent_limit) {}
+
   virtual ~CallgraphDisplayer() {}
 
   void operator()(FILE* fp, const SampleT* sample) {
@@ -113,21 +117,23 @@
   void DisplayCallGraphEntry(FILE* fp, size_t depth, std::string prefix,
                              const std::unique_ptr<CallChainNodeT>& node,
                              uint64_t parent_period, bool last) {
-    if (depth > 20) {
-      LOG(WARNING) << "truncated callgraph at depth " << depth;
+    if (depth > max_stack_) {
       return;
     }
-    prefix += "|";
-    fprintf(fp, "%s\n", prefix.c_str());
-    if (last) {
-      prefix.back() = ' ';
-    }
     std::string percentage_s = "-- ";
     if (node->period + node->children_period != parent_period) {
       double percentage =
           100.0 * (node->period + node->children_period) / parent_period;
+      if (percentage < percent_limit_) {
+        return;
+      }
       percentage_s = android::base::StringPrintf("--%.2f%%-- ", percentage);
     }
+    prefix += "|";
+    fprintf(fp, "%s\n", prefix.c_str());
+    if (last) {
+      prefix.back() = ' ';
+    }
     fprintf(fp, "%s%s%s\n", prefix.c_str(), percentage_s.c_str(),
             PrintSampleName(node->chain[0]).c_str());
     prefix.append(percentage_s.size(), ' ');
@@ -146,6 +152,10 @@
   virtual std::string PrintSampleName(const SampleT* sample) {
     return sample->symbol->DemangledName();
   }
+
+ private:
+  uint32_t max_stack_;
+  double percent_limit_;
 };
 
 // SampleDisplayer is a class using a collections of display functions to show a
diff --git a/simpleperf/cmd_kmem_test.cpp b/simpleperf/cmd_kmem_test.cpp
index dd18858..3bc4473 100644
--- a/simpleperf/cmd_kmem_test.cpp
+++ b/simpleperf/cmd_kmem_test.cpp
@@ -23,8 +23,6 @@
 #include <memory>
 
 #include "command.h"
-#include "environment.h"
-#include "event_selection_set.h"
 #include "get_test_data.h"
 #include "record.h"
 #include "record_file.h"
@@ -71,6 +69,7 @@
 }
 
 #if defined(__linux__)
+#include "environment.h"
 
 static bool RunKmemRecordCmd(std::vector<std::string> v,
                              const char* output_file = nullptr) {
diff --git a/simpleperf/cmd_record.cpp b/simpleperf/cmd_record.cpp
index a076778..0462eba 100644
--- a/simpleperf/cmd_record.cpp
+++ b/simpleperf/cmd_record.cpp
@@ -26,6 +26,7 @@
 
 #include <android-base/logging.h>
 #include <android-base/file.h>
+#include <android-base/parsedouble.h>
 #include <android-base/parseint.h>
 #include <android-base/strings.h>
 
@@ -187,7 +188,7 @@
   bool PostUnwind(const std::vector<std::string>& args);
   bool DumpAdditionalFeatures(const std::vector<std::string>& args);
   bool DumpBuildIdFeature();
-  void CollectHitFileInfo(Record* record);
+  void CollectHitFileInfo(const SampleRecord& r);
 
   bool use_sample_freq_;
   uint64_t sample_freq_;  // Sample 'sample_freq_' times per second.
@@ -216,9 +217,6 @@
 
   uint64_t start_sampling_time_in_ns_;  // nanoseconds from machine starting
 
-  std::set<std::string> hit_kernel_modules_;
-  std::set<std::string> hit_user_files_;
-
   uint64_t sample_record_count_;
   uint64_t lost_record_count_;
 };
@@ -418,10 +416,8 @@
       if (!NextArgumentOrError(args, &i)) {
         return false;
       }
-      errno = 0;
-      char* endptr;
-      duration_in_sec_ = strtod(args[i].c_str(), &endptr);
-      if (duration_in_sec_ <= 0 || *endptr != '\0' || errno == ERANGE) {
+      if (!android::base::ParseDouble(args[i].c_str(), &duration_in_sec_,
+                                      1e-9)) {
         LOG(ERROR) << "Invalid duration: " << args[i].c_str();
         return false;
       }
@@ -725,7 +721,7 @@
         dump_processes.find(thread.pid) == dump_processes.end()) {
       continue;
     }
-    CommRecord record(attr, thread.pid, thread.tid, thread.comm, event_id);
+    CommRecord record(attr, thread.pid, thread.tid, thread.comm, event_id, 0);
     if (!ProcessRecord(&record)) {
       return false;
     }
@@ -760,7 +756,8 @@
     if (!ProcessRecord(&fork_record)) {
       return false;
     }
-    CommRecord comm_record(attr, thread.pid, thread.tid, thread.comm, event_id);
+    CommRecord comm_record(attr, thread.pid, thread.tid, thread.comm, event_id,
+                           0);
     if (!ProcessRecord(&comm_record)) {
       return false;
     }
@@ -778,7 +775,6 @@
   }
   UpdateRecordForEmbeddedElfPath(record);
   thread_tree_.Update(*record);
-  CollectHitFileInfo(record);
   if (unwind_dwarf_callchain_ && !post_unwind_) {
     if (!UnwindRecord(record)) {
       return false;
@@ -786,8 +782,9 @@
   }
   if (record->type() == PERF_RECORD_SAMPLE) {
     sample_record_count_++;
+    auto& r = *static_cast<SampleRecord*>(record);
+    CollectHitFileInfo(r);
     if (dump_symbols_) {
-      auto& r = *static_cast<SampleRecord*>(record);
       if (!DumpSymbolForRecord(r, false)) {
         return false;
       }
@@ -988,16 +985,19 @@
 bool RecordCommand::DumpBuildIdFeature() {
   std::vector<BuildIdRecord> build_id_records;
   BuildId build_id;
-  // Add build_ids for kernel/modules.
-  for (const auto& filename : hit_kernel_modules_) {
-    if (filename == DEFAULT_KERNEL_FILENAME_FOR_BUILD_ID) {
+  std::vector<Dso*> dso_v = thread_tree_.GetAllDsos();
+  for (Dso* dso : dso_v) {
+    if (!dso->IsHit()) {
+      continue;
+    }
+    if (dso->type() == DSO_KERNEL) {
       if (!GetKernelBuildId(&build_id)) {
         continue;
       }
-      build_id_records.push_back(BuildIdRecord(
-          true, UINT_MAX, build_id, DEFAULT_KERNEL_FILENAME_FOR_BUILD_ID));
-    } else {
-      std::string path = filename;
+      build_id_records.push_back(
+          BuildIdRecord(true, UINT_MAX, build_id, dso->Path()));
+    } else if (dso->type() == DSO_KERNEL_MODULE) {
+      std::string path = dso->Path();
       std::string module_name = basename(&path[0]);
       if (android::base::EndsWith(module_name, ".ko")) {
         module_name = module_name.substr(0, module_name.size() - 3);
@@ -1006,34 +1006,31 @@
         LOG(DEBUG) << "can't read build_id for module " << module_name;
         continue;
       }
-      build_id_records.push_back(
-          BuildIdRecord(true, UINT_MAX, build_id, filename));
-    }
-  }
-  // Add build_ids for user elf files.
-  for (const auto& filename : hit_user_files_) {
-    if (filename == DEFAULT_EXECNAME_FOR_THREAD_MMAP) {
-      continue;
-    }
-    auto tuple = SplitUrlInApk(filename);
-    if (std::get<0>(tuple)) {
-      ElfStatus result = GetBuildIdFromApkFile(std::get<1>(tuple),
-                                               std::get<2>(tuple), &build_id);
-      if (result != ElfStatus::NO_ERROR) {
-        LOG(DEBUG) << "can't read build_id from file " << filename << ": "
-                   << result;
-        continue;
-      }
+      build_id_records.push_back(BuildIdRecord(true, UINT_MAX, build_id, path));
     } else {
-      ElfStatus result = GetBuildIdFromElfFile(filename, &build_id);
-      if (result != ElfStatus::NO_ERROR) {
-        LOG(DEBUG) << "can't read build_id from file " << filename << ": "
-                   << result;
+      if (dso->Path() == DEFAULT_EXECNAME_FOR_THREAD_MMAP) {
         continue;
       }
+      auto tuple = SplitUrlInApk(dso->Path());
+      if (std::get<0>(tuple)) {
+        ElfStatus result = GetBuildIdFromApkFile(std::get<1>(tuple),
+                                                 std::get<2>(tuple), &build_id);
+        if (result != ElfStatus::NO_ERROR) {
+          LOG(DEBUG) << "can't read build_id from file " << dso->Path() << ": "
+                     << result;
+          continue;
+        }
+      } else {
+        ElfStatus result = GetBuildIdFromElfFile(dso->Path(), &build_id);
+        if (result != ElfStatus::NO_ERROR) {
+          LOG(DEBUG) << "can't read build_id from file " << dso->Path() << ": "
+                     << result;
+          continue;
+        }
+      }
+      build_id_records.push_back(
+          BuildIdRecord(false, UINT_MAX, build_id, dso->Path()));
     }
-    build_id_records.push_back(
-        BuildIdRecord(false, UINT_MAX, build_id, filename));
   }
   if (!record_file_writer_->WriteBuildIdFeature(build_id_records)) {
     return false;
@@ -1041,17 +1038,19 @@
   return true;
 }
 
-void RecordCommand::CollectHitFileInfo(Record* record) {
-  if (record->type() == PERF_RECORD_SAMPLE) {
-    const auto& r = *static_cast<SampleRecord*>(record);
-    bool in_kernel = r.InKernel();
-    const ThreadEntry* thread =
-        thread_tree_.FindThreadOrNew(r.tid_data.pid, r.tid_data.tid);
-    const MapEntry* map = thread_tree_.FindMap(thread, r.ip_data.ip, in_kernel);
-    if (in_kernel) {
-      hit_kernel_modules_.insert(map->dso->Path());
-    } else {
-      hit_user_files_.insert(map->dso->Path());
+void RecordCommand::CollectHitFileInfo(const SampleRecord& r) {  // flags every DSO that sample r lands in
+  const ThreadEntry* thread =
+      thread_tree_.FindThreadOrNew(r.tid_data.pid, r.tid_data.tid);
+  const MapEntry* map =
+      thread_tree_.FindMap(thread, r.ip_data.ip, r.InKernel());
+  map->dso->SetHitFlag();  // DSO containing the sampled ip itself
+  if (r.sample_type & PERF_SAMPLE_CALLCHAIN) {  // callchain frames count as hits too
+    size_t ip_nr = r.callchain_data.ip_nr;
+    const uint64_t* ips = r.callchain_data.ips;
+    for (size_t i = 0; i < ip_nr; ++i) {
+      // Even if a sample is in kernel, its callchain can be in user space.
+      map = thread_tree_.FindMap(thread, ips[i]);  // no in_kernel hint here, for the reason above
+      map->dso->SetHitFlag();
     }
   }
 }
diff --git a/simpleperf/cmd_report.cpp b/simpleperf/cmd_report.cpp
index 830ec13..7fd0f61 100644
--- a/simpleperf/cmd_report.cpp
+++ b/simpleperf/cmd_report.cpp
@@ -25,6 +25,7 @@
 #include <vector>
 
 #include <android-base/logging.h>
+#include <android-base/parsedouble.h>
 #include <android-base/parseint.h>
 #include <android-base/stringprintf.h>
 #include <android-base/strings.h>
@@ -276,10 +277,12 @@
 "                      the graph shows how functions call others.\n"
 "                      Default is caller mode.\n"
 "-i <file>  Specify path of record file, default is perf.data.\n"
+"--max-stack <frames>  Set max stack frames shown when printing call graph.\n"
 "-n         Print the sample count for each item.\n"
 "--no-demangle         Don't demangle symbol names.\n"
 "--no-show-ip          Don't show vaddr in file for unknown symbols.\n"
 "-o report_file_name   Set report file name, default is stdout.\n"
+"--percent-limit <percent>  Set min percentage shown when printing call graph.\n"
 "--pids pid1,pid2,...  Report only for selected pids.\n"
 "--sort key1,key2,...  Select keys used to sort and print the report. The\n"
 "                      appearance order of keys decides the order of keys used\n"
@@ -312,7 +315,9 @@
         system_wide_collection_(false),
         accumulate_callchain_(false),
         print_callgraph_(false),
-        callgraph_show_callee_(false) {}
+        callgraph_show_callee_(false),
+        callgraph_max_stack_(UINT32_MAX),
+        callgraph_percent_limit_(0) {}
 
   bool Run(const std::vector<std::string>& args);
 
@@ -341,6 +346,8 @@
   bool accumulate_callchain_;
   bool print_callgraph_;
   bool callgraph_show_callee_;
+  uint32_t callgraph_max_stack_;
+  double callgraph_percent_limit_;
 
   std::string report_filename_;
 };
@@ -423,6 +430,14 @@
       }
       record_filename_ = args[i];
 
+    } else if (args[i] == "--max-stack") {
+      if (!NextArgumentOrError(args, &i)) {
+        return false;
+      }
+      if (!android::base::ParseUint(args[i].c_str(), &callgraph_max_stack_)) {
+        LOG(ERROR) << "invalid arg for --max-stack: " << args[i];
+        return false;
+      }
     } else if (args[i] == "-n") {
       print_sample_count = true;
 
@@ -435,7 +450,16 @@
         return false;
       }
       report_filename_ = args[i];
-
+    } else if (args[i] == "--percent-limit") {
+      if (!NextArgumentOrError(args, &i)) {
+        return false;
+      }
+      if (!android::base::ParseDouble(args[i].c_str(),
+                                      &callgraph_percent_limit_, 0.0)) {
+        LOG(ERROR) << "invalid arg for --percent-limit: " << args[i];
+        // Reject the malformed value instead of silently reporting without it.
+        return false;
+      }
     } else if (args[i] == "--pids" || args[i] == "--tids") {
       const std::string& option = args[i];
       std::unordered_set<int>& filter =
@@ -562,7 +584,8 @@
         displayer.AddExclusiveDisplayFunction(
             ReportCmdCallgraphDisplayerWithVaddrInFile());
       } else {
-        displayer.AddExclusiveDisplayFunction(ReportCmdCallgraphDisplayer());
+        displayer.AddExclusiveDisplayFunction(ReportCmdCallgraphDisplayer(
+            callgraph_max_stack_, callgraph_percent_limit_));
       }
     }
   }
diff --git a/simpleperf/cmd_report_test.cpp b/simpleperf/cmd_report_test.cpp
index 704076a..e00b5ee 100644
--- a/simpleperf/cmd_report_test.cpp
+++ b/simpleperf/cmd_report_test.cpp
@@ -415,6 +415,28 @@
   ASSERT_TRUE(success);
 }
 
+TEST_F(ReportCommandTest, max_stack_and_percent_limit_option) {
+  Report(PERF_DATA_MAX_STACK_AND_PERCENT_LIMIT, {"-g"});  // baseline: neither limit given
+  ASSERT_TRUE(success);
+  ASSERT_NE(content.find("89.03"), std::string::npos);  // presumably a call graph percentage in the test data
+
+  Report(PERF_DATA_MAX_STACK_AND_PERCENT_LIMIT, {"-g", "--max-stack", "0"});
+  ASSERT_TRUE(success);
+  ASSERT_EQ(content.find("89.03"), std::string::npos);  // absent with --max-stack 0
+  Report(PERF_DATA_MAX_STACK_AND_PERCENT_LIMIT, {"-g", "--max-stack", "1"});
+  ASSERT_TRUE(success);
+  ASSERT_NE(content.find("89.03"), std::string::npos);  // present again with --max-stack 1
+
+  Report(PERF_DATA_MAX_STACK_AND_PERCENT_LIMIT,
+         {"-g", "--percent-limit", "90"});
+  ASSERT_TRUE(success);
+  ASSERT_EQ(content.find("89.03"), std::string::npos);  // absent when the limit is 90
+  Report(PERF_DATA_MAX_STACK_AND_PERCENT_LIMIT,
+         {"-g", "--percent-limit", "70"});
+  ASSERT_TRUE(success);
+  ASSERT_NE(content.find("89.03"), std::string::npos);  // present when the limit is 70
+}
+
 #if defined(__linux__)
 #include "event_selection_set.h"
 
diff --git a/simpleperf/cmd_stat.cpp b/simpleperf/cmd_stat.cpp
index 9c5cf8f..06258f3 100644
--- a/simpleperf/cmd_stat.cpp
+++ b/simpleperf/cmd_stat.cpp
@@ -27,6 +27,7 @@
 #include <vector>
 
 #include <android-base/logging.h>
+#include <android-base/parsedouble.h>
 #include <android-base/strings.h>
 
 #include "command.h"
@@ -452,10 +453,8 @@
       if (!NextArgumentOrError(args, &i)) {
         return false;
       }
-      errno = 0;
-      char* endptr;
-      duration_in_sec_ = strtod(args[i].c_str(), &endptr);
-      if (duration_in_sec_ <= 0 || *endptr != '\0' || errno == ERANGE) {
+      if (!android::base::ParseDouble(args[i].c_str(), &duration_in_sec_,
+                                      1e-9)) {
         LOG(ERROR) << "Invalid duration: " << args[i].c_str();
         return false;
       }
@@ -463,10 +462,8 @@
       if (!NextArgumentOrError(args, &i)) {
         return false;
       }
-      errno = 0;
-      char* endptr;
-      interval_in_ms_ = strtod(args[i].c_str(), &endptr);
-      if (interval_in_ms_ <= 0 || *endptr != '\0' || errno == ERANGE) {
+      if (!android::base::ParseDouble(args[i].c_str(), &interval_in_ms_,
+                                      1e-9)) {
         LOG(ERROR) << "Invalid interval: " << args[i].c_str();
         return false;
       }
diff --git a/simpleperf/dso.cpp b/simpleperf/dso.cpp
index bd6c1ba..b22ebbf 100644
--- a/simpleperf/dso.cpp
+++ b/simpleperf/dso.cpp
@@ -138,7 +138,8 @@
       debug_file_path_(path),
       min_vaddr_(std::numeric_limits<uint64_t>::max()),
       is_loaded_(false),
-      has_dumped_(false) {
+      has_dumped_(false),
+      hit_flag_(false) {
   // Check if file matching path_ exists in symfs directory before using it as
   // debug_file_path_.
   if (!symfs_dir_.empty()) {
@@ -191,7 +192,10 @@
   auto it = symbols_.upper_bound(Symbol("", vaddr_in_dso, 0));
   if (it != symbols_.begin()) {
     --it;
-    if (it->addr <= vaddr_in_dso && it->addr + it->len > vaddr_in_dso) {
+    // If vaddr_in_dso is ULLONG_MAX, then it->addr + it->len overflows,
+    // and we allow this situation.
+    if (it->addr <= vaddr_in_dso && (it->addr + it->len > vaddr_in_dso ||
+                                     it->addr + it->len < it->addr)) {
       return &*it;
     }
   }
diff --git a/simpleperf/dso.h b/simpleperf/dso.h
index c381e6d..1d53f3f 100644
--- a/simpleperf/dso.h
+++ b/simpleperf/dso.h
@@ -93,6 +93,10 @@
 
   void SetDumped() { has_dumped_ = true; }
 
+  // Set when there are samples hit in current dso.
+  void SetHitFlag() { hit_flag_ = true; }
+  bool IsHit() const { return hit_flag_; }
+
   // Return the minimum virtual address in program header.
   uint64_t MinVirtualAddress();
   void SetMinVirtualAddress(uint64_t min_vaddr) { min_vaddr_ = min_vaddr; }
@@ -130,6 +134,7 @@
   std::set<Symbol, SymbolComparator> symbols_;
   bool is_loaded_;
   bool has_dumped_;
+  bool hit_flag_;
 };
 
 const char* DsoTypeToString(DsoType dso_type);
diff --git a/simpleperf/get_test_data.h b/simpleperf/get_test_data.h
index 339871e..4550843 100644
--- a/simpleperf/get_test_data.h
+++ b/simpleperf/get_test_data.h
@@ -99,4 +99,7 @@
 // generated_by_linux_perf.data is generated by `perf record -F 1 -a -g -- sleep 0.1`.
 static const std::string PERF_DATA_GENERATED_BY_LINUX_PERF = "generated_by_linux_perf.data";
 
+// generated by `simpleperf record -g ls`.
+static const std::string PERF_DATA_MAX_STACK_AND_PERCENT_LIMIT = "perf_test_max_stack_and_percent_limit.data";
+
 #endif  // SIMPLE_PERF_GET_TEST_DATA_H_
diff --git a/simpleperf/record.cpp b/simpleperf/record.cpp
index 2bcba4d..be92ac1 100644
--- a/simpleperf/record.cpp
+++ b/simpleperf/record.cpp
@@ -24,7 +24,6 @@
 #include <android-base/stringprintf.h>
 
 #include "dso.h"
-#include "environment.h"
 #include "perf_regs.h"
 #include "tracing.h"
 #include "utils.h"
@@ -56,31 +55,11 @@
   return android::base::StringPrintf("unknown(%d)", record_type);
 }
 
-template <class T>
-void MoveFromBinaryFormat(T* data_p, size_t n, const char*& p) {
-  size_t size = n * sizeof(T);
-  memcpy(data_p, p, size);
-  p += size;
-}
-
-template <class T>
-void MoveToBinaryFormat(const T& data, char*& p) {
-  *reinterpret_cast<T*>(p) = data;
-  p += sizeof(T);
-}
-
 template <>
 void MoveToBinaryFormat(const RecordHeader& data, char*& p) {
   data.MoveToBinaryFormat(p);
 }
 
-template <class T>
-void MoveToBinaryFormat(const T* data_p, size_t n, char*& p) {
-  size_t size = n * sizeof(T);
-  memcpy(p, data_p, size);
-  p += size;
-}
-
 SampleId::SampleId() { memset(this, 0, sizeof(SampleId)); }
 
 // Return sample_id size in binary format.
@@ -321,12 +300,13 @@
 }
 
 CommRecord::CommRecord(const perf_event_attr& attr, uint32_t pid, uint32_t tid,
-                       const std::string& comm, uint64_t event_id) {
+                       const std::string& comm, uint64_t event_id, uint64_t time) {
   SetTypeAndMisc(PERF_RECORD_COMM, 0);
   CommRecordDataType data;
   data.pid = pid;
   data.tid = tid;
   size_t sample_id_size = sample_id.CreateContent(attr, event_id);
+  sample_id.time_data.time = time;
   SetSize(header_size() + sizeof(data) + Align(comm.size() + 1, 8) +
           sample_id_size);
   char* new_binary = new char[size()];
@@ -476,6 +456,83 @@
   }
 }
 
+SampleRecord::SampleRecord(const perf_event_attr& attr, uint64_t id,  // builds a sample record from field values
+                           uint64_t ip, uint32_t pid, uint32_t tid,
+                           uint64_t time, uint32_t cpu, uint64_t period,
+                           const std::vector<uint64_t>& ips) {  // ips: callchain entries
+  SetTypeAndMisc(PERF_RECORD_SAMPLE, PERF_RECORD_MISC_USER);
+  sample_type = attr.sample_type;
+  CHECK_EQ(0u, sample_type & ~(PERF_SAMPLE_IP | PERF_SAMPLE_TID  // any other sample_type bit aborts here
+      | PERF_SAMPLE_TIME | PERF_SAMPLE_ID | PERF_SAMPLE_CPU
+      | PERF_SAMPLE_PERIOD | PERF_SAMPLE_CALLCHAIN));
+  ip_data.ip = ip;
+  tid_data.pid = pid;
+  tid_data.tid = tid;
+  time_data.time = time;
+  id_data.id = id;
+  cpu_data.cpu = cpu;
+  cpu_data.res = 0;
+  period_data.period = period;
+  callchain_data.ip_nr = ips.size();
+  raw_data.size = 0;  // this and the next four fields are zeroed and never serialized below
+  branch_stack_data.stack_nr = 0;
+  regs_user_data.abi = 0;
+  regs_user_data.reg_mask = 0;
+  stack_user_data.size = 0;
+
+  uint32_t size = header_size();  // first pass: total byte size of the selected fields
+  if (sample_type & PERF_SAMPLE_IP) {
+    size += sizeof(ip_data);
+  }
+  if (sample_type & PERF_SAMPLE_TID) {
+    size += sizeof(tid_data);
+  }
+  if (sample_type & PERF_SAMPLE_TIME) {
+    size += sizeof(time_data);
+  }
+  if (sample_type & PERF_SAMPLE_ID) {
+    size += sizeof(id_data);
+  }
+  if (sample_type & PERF_SAMPLE_CPU) {
+    size += sizeof(cpu_data);
+  }
+  if (sample_type & PERF_SAMPLE_PERIOD) {
+    size += sizeof(period_data);
+  }
+  if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+    size += sizeof(uint64_t) * (ips.size() + 1);  // one extra slot for the ip_nr count written before the ips
+  }
+  SetSize(size);
+  char* new_binary = new char[size];
+  char* p = new_binary;  // second pass: write the fields in the same order as they were sized
+  MoveToBinaryFormat(header, p);
+  if (sample_type & PERF_SAMPLE_IP) {
+    MoveToBinaryFormat(ip_data, p);
+  }
+  if (sample_type & PERF_SAMPLE_TID) {
+    MoveToBinaryFormat(tid_data, p);
+  }
+  if (sample_type & PERF_SAMPLE_TIME) {
+    MoveToBinaryFormat(time_data, p);
+  }
+  if (sample_type & PERF_SAMPLE_ID) {
+    MoveToBinaryFormat(id_data, p);
+  }
+  if (sample_type & PERF_SAMPLE_CPU) {
+    MoveToBinaryFormat(cpu_data, p);
+  }
+  if (sample_type & PERF_SAMPLE_PERIOD) {
+    MoveToBinaryFormat(period_data, p);
+  }
+  if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+    MoveToBinaryFormat(callchain_data.ip_nr, p);
+    callchain_data.ips = reinterpret_cast<uint64_t*>(p);  // ips points into new_binary, at the copy made on the next line
+    MoveToBinaryFormat(ips.data(), ips.size(), p);
+  }
+  CHECK_EQ(p, new_binary + size);  // bytes written must equal the size computed in the first pass
+  UpdateBinary(new_binary);  // NOTE(review): presumably hands new_binary over to the record; confirm ownership
+}
+
 void SampleRecord::ReplaceRegAndStackWithCallChain(
     const std::vector<uint64_t>& ips) {
   uint32_t size_added_in_callchain = sizeof(uint64_t) * (ips.size() + 1);
diff --git a/simpleperf/record.h b/simpleperf/record.h
index 02854b4..2cbe35a 100644
--- a/simpleperf/record.h
+++ b/simpleperf/record.h
@@ -319,7 +319,7 @@
   CommRecord(const perf_event_attr& attr, const char* p);
 
   CommRecord(const perf_event_attr& attr, uint32_t pid, uint32_t tid,
-             const std::string& comm, uint64_t event_id);
+             const std::string& comm, uint64_t event_id, uint64_t time);
 
  protected:
   void DumpData(size_t indent) const override;
@@ -385,6 +385,10 @@
   PerfSampleStackUserType stack_user_data;  // Valid if PERF_SAMPLE_STACK_USER.
 
   SampleRecord(const perf_event_attr& attr, const char* p);
+  SampleRecord(const perf_event_attr& attr, uint64_t id, uint64_t ip,
+               uint32_t pid, uint32_t tid, uint64_t time, uint32_t cpu,
+               uint64_t period, const std::vector<uint64_t>& ips);
+
   void ReplaceRegAndStackWithCallChain(const std::vector<uint64_t>& ips);
   uint64_t Timestamp() const override;
   uint32_t Cpu() const override;
diff --git a/simpleperf/record_equal_test.h b/simpleperf/record_equal_test.h
index bf0568e..9496271 100644
--- a/simpleperf/record_equal_test.h
+++ b/simpleperf/record_equal_test.h
@@ -30,10 +30,44 @@
   ASSERT_STREQ(r1.filename, r2.filename);
 }
 
+static void CheckSampleRecordDataEqual(const SampleRecord& r1, const SampleRecord& r2) {
+  ASSERT_EQ(r1.sample_type, r2.sample_type);  // Fields are compared only when both carry the same set.
+  const auto has = [&r1](uint64_t bit) { return (r1.sample_type & bit) != 0; };
+  if (has(PERF_SAMPLE_IP)) {
+    EXPECT_EQ(r1.ip_data.ip, r2.ip_data.ip);
+  }
+  if (has(PERF_SAMPLE_TID)) {
+    EXPECT_EQ(r1.tid_data.pid, r2.tid_data.pid);
+    EXPECT_EQ(r1.tid_data.tid, r2.tid_data.tid);
+  }
+  if (has(PERF_SAMPLE_TIME)) {
+    EXPECT_EQ(r1.time_data.time, r2.time_data.time);
+  }
+  if (has(PERF_SAMPLE_ID)) {
+    EXPECT_EQ(r1.id_data.id, r2.id_data.id);
+  }
+  if (has(PERF_SAMPLE_CPU)) {
+    EXPECT_EQ(r1.cpu_data.cpu, r2.cpu_data.cpu);
+  }
+  if (has(PERF_SAMPLE_PERIOD)) {
+    EXPECT_EQ(r1.period_data.period, r2.period_data.period);
+  }
+  if (!has(PERF_SAMPLE_CALLCHAIN)) return;  // The call chain is the last field checked.
+  ASSERT_EQ(r1.callchain_data.ip_nr, r2.callchain_data.ip_nr);
+  for (size_t i = 0; i < r1.callchain_data.ip_nr; ++i) {
+    EXPECT_EQ(r1.callchain_data.ips[i], r2.callchain_data.ips[i]);
+  }
+}
+
 static void CheckRecordEqual(const Record& r1, const Record& r2) {
   ASSERT_EQ(r1.type(), r2.type());
   ASSERT_EQ(r1.misc(), r2.misc());
   ASSERT_EQ(r1.size(), r2.size());
+  if (r1.type() == PERF_RECORD_SAMPLE) {
+    CheckSampleRecordDataEqual(static_cast<const SampleRecord&>(r1),
+                               static_cast<const SampleRecord&>(r2));
+    return;
+  }
   ASSERT_EQ(0, memcmp(&r1.sample_id, &r2.sample_id, sizeof(r1.sample_id)));
   if (r1.type() == PERF_RECORD_MMAP) {
     CheckMmapRecordDataEqual(static_cast<const MmapRecord&>(r1), static_cast<const MmapRecord&>(r2));
diff --git a/simpleperf/record_file_reader.cpp b/simpleperf/record_file_reader.cpp
index 68fcc4c..c300fc7 100644
--- a/simpleperf/record_file_reader.cpp
+++ b/simpleperf/record_file_reader.cpp
@@ -351,11 +351,15 @@
   const char* end = buf.data() + buf.size();
   std::vector<BuildIdRecord> result;
   while (p < end) {
-    BuildIdRecord record(p);
+    auto header = reinterpret_cast<const perf_event_header*>(p);
+    CHECK_LE(p + header->size, end);
+    char* binary = new char[header->size];
+    memcpy(binary, p, header->size);
+    p += header->size;
+    BuildIdRecord record(binary);
+    record.OwnBinary();
     // Set type explicitly as the perf.data produced by perf doesn't set it.
     record.SetTypeAndMisc(PERF_RECORD_BUILD_ID, record.misc());
-    CHECK_LE(p + record.size(), end);
-    p += record.size();
     result.push_back(std::move(record));
   }
   return result;
diff --git a/simpleperf/record_test.cpp b/simpleperf/record_test.cpp
index da5a3ce..45febc5 100644
--- a/simpleperf/record_test.cpp
+++ b/simpleperf/record_test.cpp
@@ -27,6 +27,7 @@
     const EventType* type = FindEventTypeByName("cpu-cycles");
     ASSERT_TRUE(type != nullptr);
     event_attr = CreateDefaultPerfEventAttr(*type);
+    event_attr.sample_id_all = 1;
   }
 
   void CheckRecordMatchBinary(const Record& record) {
@@ -47,7 +48,15 @@
 }
 
 TEST_F(RecordTest, CommRecordMatchBinary) {
-  CommRecord record(event_attr, 1, 2, "CommRecord", 0);
+  CommRecord record(event_attr, 1, 2, "CommRecord", 0, 7);  // pid, tid, comm, event_id, time
+  CheckRecordMatchBinary(record);
+}
+
+TEST_F(RecordTest, SampleRecordMatchBinary) {
+  // Request ip, tid, time, id, cpu, period and call chain so each one is serialized.
+  event_attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_ID |
+                           PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD | PERF_SAMPLE_CALLCHAIN;
+  SampleRecord record(event_attr, 1, 2, 3, 4, 5, 6, 7, {8, 9, 10});
   CheckRecordMatchBinary(record);
 }
 
diff --git a/simpleperf/sample_tree_test.cpp b/simpleperf/sample_tree_test.cpp
index e288968..c2a0c71 100644
--- a/simpleperf/sample_tree_test.cpp
+++ b/simpleperf/sample_tree_test.cpp
@@ -232,3 +232,9 @@
   };
   CheckSamples(sample_tree_builder.GetSamples(), expected_samples);
 }
+
+TEST(thread_tree, symbol_ULLONG_MAX) {  // Lookup at the largest address must still yield a symbol.
+  ThreadTree thread_tree;
+  thread_tree.ShowIpForUnknownSymbol();
+  ASSERT_TRUE(thread_tree.FindKernelSymbol(ULLONG_MAX) != nullptr);
+}
diff --git a/simpleperf/testdata/perf_test_max_stack_and_percent_limit.data b/simpleperf/testdata/perf_test_max_stack_and_percent_limit.data
new file mode 100644
index 0000000..b3fc225
--- /dev/null
+++ b/simpleperf/testdata/perf_test_max_stack_and_percent_limit.data
Binary files differ
diff --git a/simpleperf/thread_tree.cpp b/simpleperf/thread_tree.cpp
index 56e5fbf..69dab72 100644
--- a/simpleperf/thread_tree.cpp
+++ b/simpleperf/thread_tree.cpp
@@ -305,4 +305,16 @@
   }
 }
 
+std::vector<Dso*> ThreadTree::GetAllDsos() const {
+  // Order of the result: the kernel dso, then module dsos, then user dsos.
+  std::vector<Dso*> dsos{kernel_dso_.get()};
+  for (const auto& entry : module_dso_tree_) {
+    dsos.push_back(entry.second.get());
+  }
+  for (const auto& entry : user_dso_tree_) {
+    dsos.push_back(entry.second.get());
+  }
+  return dsos;
+}
+
 }  // namespace simpleperf
diff --git a/simpleperf/thread_tree.h b/simpleperf/thread_tree.h
index 703bc88..89bf059 100644
--- a/simpleperf/thread_tree.h
+++ b/simpleperf/thread_tree.h
@@ -112,6 +112,8 @@
   // Update thread tree with information provided by record.
   void Update(const Record& record);
 
+  std::vector<Dso*> GetAllDsos() const;
+
  private:
   Dso* FindKernelDsoOrNew(const std::string& filename);
   Dso* FindUserDsoOrNew(const std::string& filename);
diff --git a/simpleperf/utils.h b/simpleperf/utils.h
index a032681..89962a6 100644
--- a/simpleperf/utils.h
+++ b/simpleperf/utils.h
@@ -107,10 +107,34 @@
 
 template <class T>
 void MoveFromBinaryFormat(T& data, const char*& p) {
+  static_assert(std::is_standard_layout<T>::value, "not standard layout");  // T is read via a raw cast below.
   data = *reinterpret_cast<const T*>(p);
   p += sizeof(T);
 }
 
+template <class T>
+void MoveFromBinaryFormat(T* data_p, size_t n, const char*& p) {  // Copies n Ts from p; advances p.
+  static_assert(std::is_standard_layout<T>::value, "not standard layout");
+  const size_t size = n * sizeof(T);
+  if (size != 0) memcpy(data_p, p, size);  // memcpy needs valid pointers even for 0 bytes.
+  p += size;
+}
+
+template <class T>
+void MoveToBinaryFormat(const T& data, char*& p) {  // Writes data at p; advances p by sizeof(T).
+  static_assert(std::is_standard_layout<T>::value, "not standard layout");
+  memcpy(p, &data, sizeof(T));  // p may be misaligned for T, so avoid a typed store.
+  p += sizeof(T);
+}
+
+template <class T>
+void MoveToBinaryFormat(const T* data_p, size_t n, char*& p) {  // Writes n Ts at p; advances p.
+  static_assert(std::is_standard_layout<T>::value, "not standard layout");
+  const size_t size = n * sizeof(T);
+  if (size != 0) memcpy(p, data_p, size);  // data_p may be null when n == 0 (e.g. empty vector).
+  p += size;
+}
+
 void PrintIndented(size_t indent, const char* fmt, ...);
 void FprintIndented(FILE* fp, size_t indent, const char* fmt, ...);
 
diff --git a/verity/build_verity_metadata.py b/verity/build_verity_metadata.py
index 51e629a..71e66b4 100755
--- a/verity/build_verity_metadata.py
+++ b/verity/build_verity_metadata.py
@@ -1,5 +1,20 @@
 #! /usr/bin/env python
+#
+# Copyright (C) 2013 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
+import argparse
 import os
 import sys
 import struct
@@ -27,12 +42,16 @@
     block = block.ljust(METADATA_SIZE, '\x00')
     return block
 
-def sign_verity_table(table, signer_path, key_path):
+def sign_verity_table(table, signer_path, key_path, signer_args=None):
+    """Signs table with the signer at signer_path; returns the signature file's contents."""
+    # signer_args is an optional string placed right after signer_path in the command.
+    signer_args = '' if signer_args is None else signer_args
     with tempfile.NamedTemporaryFile(suffix='.table') as table_file:
         with tempfile.NamedTemporaryFile(suffix='.sig') as signature_file:
             table_file.write(table)
             table_file.flush()
-            cmd = " ".join((signer_path, table_file.name, key_path, signature_file.name))
+            argv = (signer_path, signer_args, table_file.name, key_path, signature_file.name)
+            cmd = " ".join(argv)
             print cmd
             run(cmd)
             return signature_file.read()
@@ -49,12 +68,12 @@
                 salt)
     return table
 
-def build_verity_metadata(data_blocks, metadata_image, root_hash,
-                            salt, block_device, signer_path, signing_key):
+def build_verity_metadata(data_blocks, metadata_image, root_hash, salt,
+        block_device, signer_path, signing_key, signer_args=None):
     # build the verity table
     verity_table = build_verity_table(block_device, data_blocks, root_hash, salt)
     # build the verity table signature
-    signature = sign_verity_table(verity_table, signer_path, signing_key)
+    signature = sign_verity_table(verity_table, signer_path, signing_key, signer_args)
     # build the metadata block
     metadata_block = build_metadata_block(verity_table, signature)
     # write it to the outfile
@@ -62,17 +81,30 @@
         f.write(metadata_block)
 
 if __name__ == "__main__":
-    if len(sys.argv) == 3 and sys.argv[1] == "-s":
-        print get_verity_metadata_size(int(sys.argv[2]))
-    elif len(sys.argv) == 8:
-        data_image_blocks = int(sys.argv[1]) / 4096
-        metadata_image = sys.argv[2]
-        root_hash = sys.argv[3]
-        salt = sys.argv[4]
-        block_device = sys.argv[5]
-        signer_path = sys.argv[6]
-        signing_key = sys.argv[7]
-        build_verity_metadata(data_image_blocks, metadata_image, root_hash,
-                                salt, block_device, signer_path, signing_key)
+    parser = argparse.ArgumentParser()
+    subparsers = parser.add_subparsers()  # 'size' and 'build'; set_defaults(dest=...) records which ran.
+
+    parser_size = subparsers.add_parser('size')
+    parser_size.add_argument('partition_size', type=int, action='store', help='partition size')
+    parser_size.set_defaults(dest='size')
+
+    parser_build = subparsers.add_parser('build')
+    parser_build.add_argument('blocks', type=int, help='data image size in bytes')
+    parser_build.add_argument('metadata_image', action='store', help='metadata image')
+    parser_build.add_argument('root_hash', action='store', help='root hash')
+    parser_build.add_argument('salt', action='store', help='salt')
+    parser_build.add_argument('block_device', action='store', help='block device')
+    parser_build.add_argument('signer_path', action='store', help='verity signer path')
+    parser_build.add_argument('signing_key', action='store', help='verity signing key')
+    parser_build.add_argument('--signer_args', action='store', help='verity signer args')
+    parser_build.set_defaults(dest='build')
+
+    args = parser.parse_args()
+
+    if args.dest == 'size':
+        print get_verity_metadata_size(args.partition_size)
     else:
-        exit(-1)
+        build_verity_metadata(args.blocks // 4096, args.metadata_image,  # bytes -> 4096-byte blocks
+                              args.root_hash, args.salt, args.block_device,
+                              args.signer_path, args.signing_key,
+                              args.signer_args)