Merge "llkd: add live-lock daemon" am: 656bc24630
am: e40838d37a
Change-Id: I7998a53bfc73c36b9687af95c673266f613a5b86
diff --git a/libcutils/include/private/android_filesystem_config.h b/libcutils/include/private/android_filesystem_config.h
index 8209167..3be8ad0 100644
--- a/libcutils/include/private/android_filesystem_config.h
+++ b/libcutils/include/private/android_filesystem_config.h
@@ -130,6 +130,7 @@
#define AID_INCIDENTD 1067 /* incidentd daemon */
#define AID_SECURE_ELEMENT 1068 /* secure element subsystem */
#define AID_LMKD 1069 /* low memory killer daemon */
+#define AID_LLKD 1070 /* live lock daemon */
/* Changes to this file must be made in AOSP, *not* in internal branches. */
#define AID_SHELL 2000 /* adb and debug shell user */
diff --git a/llkd/Android.bp b/llkd/Android.bp
new file mode 100644
index 0000000..a6edd26
--- /dev/null
+++ b/llkd/Android.bp
@@ -0,0 +1,42 @@
+// Header-only module: exposes this directory's "include" path to dependents.
+cc_library_headers {
+ name: "llkd_headers",
+
+ export_include_dirs: ["include"],
+}
+
+// Static library built from libllkd.cpp. It also exports the "include" path
+// and is compiled with -Werror.
+cc_library_static {
+ name: "libllkd",
+
+ srcs: [
+ "libllkd.cpp",
+ ],
+
+ shared_libs: [
+ "libbase",
+ "libcutils",
+ "liblog",
+ ],
+
+ export_include_dirs: ["include"],
+
+ cflags: ["-Werror"],
+}
+
+// Standalone llkd binary built from llkd.cpp; links libllkd statically and
+// lists llkd.rc as its init_rc script.
+cc_binary {
+ name: "llkd",
+
+ srcs: [
+ "llkd.cpp",
+ ],
+ shared_libs: [
+ "libbase",
+ "libcutils",
+ "liblog",
+ ],
+ static_libs: [
+ "libllkd",
+ ],
+ cflags: ["-Werror"],
+
+ init_rc: ["llkd.rc"],
+}
diff --git a/llkd/OWNERS b/llkd/OWNERS
new file mode 100644
index 0000000..b6af537
--- /dev/null
+++ b/llkd/OWNERS
@@ -0,0 +1,2 @@
+salyzyn@google.com
+surenb@google.com
diff --git a/llkd/README.md b/llkd/README.md
new file mode 100644
index 0000000..146a998
--- /dev/null
+++ b/llkd/README.md
@@ -0,0 +1,116 @@
+Android Live-LocK Daemon
+========================
+
+Introduction
+------------
+
+Android Live-LocK Daemon (llkd) is used to catch kernel deadlocks and mitigate.
+
+Code is structured to allow integration into another service either as part
+of the main loop, or spun off as a thread should that be necessary. A default
+standalone implementation is provided by the llkd component.
+
+The 'C' interface from libllkd component is thus:
+
+ #include "llkd.h"
+ bool llkInit(const char* threadname) /* return true if enabled */
+ unsigned llkCheckMilliseconds(void) /* ms to sleep for next check */
+
+If a threadname is provided, a thread will be automatically spawned, otherwise
+caller must call llkCheckMilliseconds in its main loop. Function will return
+the period of time before the next expected call to this handler.
+
+Operations
+----------
+
+If a thread is in D or Z state with no forward progress for longer than
+ro.llk.timeout_ms, or ro.llk.[D|Z].timeout_ms, kill the process or parent
+process respectively. If another scan shows the same process continues to
+exist, then have a confirmed live-lock condition and need to panic. Panic
+the kernel in a manner to provide the greatest bugreporting details as to the
+condition. Add an alarm self watchdog, set to double the expected time to flow
+through the mainloop, should llkd ever get locked up. Sampling is every
+ro.llk.check_ms.
+
+Default will not monitor init, or [kthreadd] and all that [kthreadd] spawns.
+This reduces the effectiveness of llkd by limiting its coverage. If there is
+value in covering [kthreadd] spawned threads, the requirement will be that
+the drivers not remain in a persistent 'D' state, or that they have mechanisms
+to recover the thread should it be killed externally (this is good driver
+coding hygiene, a common request to add such to publicly reviewed kernel.org
+maintained drivers). For instance use wait_event_interruptible() instead of
+wait_event(). The blacklists can be adjusted accordingly if these
+conditions are met to cover kernel components.
+
+An accompanying gTest set has been added, and will set up a persistent D or Z
+process, with and without forward progress, but not in a live-lock state
+because that would require a buggy kernel, or a module or kernel modification
+to stimulate. The test will check that llkd will mitigate first by killing
+the appropriate process. D state is set up by vfork() waiting for exec() in
+child process. Z state is set up by fork() and an un-waited for child process.
+Should be noted that both of these conditions should never happen on Android
+on purpose, and llkd effectively sweeps up processes that create these
+conditions. If the test can, it will reconfigure llkd to expedite the test
+duration by adjusting the ro.llk.* Android properties. Tests run the D state
+with some scheduling progress to ensure that ABA checking prevents false
+triggers.
+
+Android Properties
+------------------
+
+Android Properties llkd responds to (<prop>_ms params are in milliseconds):
+
+#### ro.config.low_ram
+default false, if true do not sysrq t (dump all threads).
+
+#### ro.llk.enable
+default false, allow live-lock daemon to be enabled.
+
+#### ro.khungtask.enable
+default false, allow [khungtask] daemon to be enabled.
+
+#### ro.llk.mlockall
+default true, enable call to mlockall().
+
+#### ro.khungtask.timeout
+default value 12 minutes, [khungtask] maximum timelimit.
+
+#### ro.llk.timeout_ms
+default 10 minutes, D or Z maximum timelimit, double this value and it sets
+the alarm watchdog for llkd.
+
+#### ro.llk.D.timeout_ms
+default ro.llk.timeout_ms, D maximum timelimit.
+
+#### ro.llk.Z.timeout_ms
+default ro.llk.timeout_ms, Z maximum timelimit.
+
+#### ro.llk.check_ms
+default 2 minutes (ro.llk.timeout_ms / 5), sampling interval of threads for D or Z.
+
+#### ro.llk.blacklist.process
+default 0,1,2 (kernel, init and [kthreadd]) plus process names
+init,[kthreadd],[khungtaskd],lmkd,lmkd.llkd,llkd,watchdogd,
+[watchdogd],[watchdogd/0],...,[watchdogd/<get_nprocs-1>].
+
+#### ro.llk.blacklist.parent
+default 0,2 (kernel and [kthreadd]).
+
+#### ro.llk.blacklist.uid
+default <empty>, comma separated list of uid numbers or names.
+
+Architectural Concerns
+----------------------
+
+- Figure out how to communicate the kernel panic better to bootstat canonical
+ boot reason determination. This may require an alteration to bootstat, or
+ some logging from llkd. Would like to see boot reason to be
+ watchdog,livelock as a minimum requirement. Or more specifically would want
+ watchdog,livelock,device or watchdog,livelock,zombie be reported.
+ Currently reports panic,sysrq (user requested panic) or panic depending on
+ system support of pstore.
+- Create kernel module and associated gTest to actually test panic.
+- Create gTest to test out blacklist (ro.llk.blacklist.<properties> should
+ generally not be inputs). Could require more test-only interfaces to libllkd.
+- Speed up gTest using something else than ro.llk.<properties>, which should
+ not be inputs.
diff --git a/llkd/include/llkd.h b/llkd/include/llkd.h
new file mode 100644
index 0000000..2ae28ed
--- /dev/null
+++ b/llkd/include/llkd.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _LLKD_H_
+#define _LLKD_H_
+
+#ifndef LOG_TAG
+#define LOG_TAG "livelock"
+#endif
+
+#include <stdbool.h>
+#include <sys/cdefs.h>
+
+__BEGIN_DECLS
+
+/*
+ * llkInit: per the accompanying README, returns true if llkd is enabled. When
+ * a non-NULL threadname is supplied a thread is spawned automatically;
+ * otherwise the caller is expected to call llkCheckMilliseconds() from its own
+ * main loop.
+ */
+bool llkInit(const char* threadname); /* threadname NULL, not spawned */
+/* Per the README: returns the milliseconds to sleep before the next check. */
+unsigned llkCheckMilliseconds(void);
+
+/* clang-format off */
+/* Android property names (*_PROPERTY) paired with built-in fallbacks (*_DEFAULT). */
+#define LLK_ENABLE_PROPERTY "ro.llk.enable"
+#define LLK_ENABLE_DEFAULT false
+#define KHT_ENABLE_PROPERTY "ro.khungtask.enable"
+#define LLK_MLOCKALL_PROPERTY "ro.llk.mlockall"
+#define LLK_MLOCKALL_DEFAULT true
+/* Timeouts: the llk values are in milliseconds (see the _ms suffix). */
+#define LLK_TIMEOUT_MS_PROPERTY "ro.llk.timeout_ms"
+#define KHT_TIMEOUT_PROPERTY "ro.khungtask.timeout"
+#define LLK_D_TIMEOUT_MS_PROPERTY "ro.llk.D.timeout_ms"
+#define LLK_Z_TIMEOUT_MS_PROPERTY "ro.llk.Z.timeout_ms"
+#define LLK_CHECK_MS_PROPERTY "ro.llk.check_ms"
+/* LLK_CHECK_MS_DEFAULT = actual timeout_ms / LLK_CHECKS_PER_TIMEOUT_DEFAULT */
+#define LLK_CHECKS_PER_TIMEOUT_DEFAULT 5
+/* Blacklists are comma separated lists of ids and/or names to skip. */
+#define LLK_BLACKLIST_PROCESS_PROPERTY "ro.llk.blacklist.process"
+#define LLK_BLACKLIST_PROCESS_DEFAULT \
+ "0,1,2,init,[kthreadd],[khungtaskd],lmkd,lmkd.llkd,llkd,watchdogd,[watchdogd],[watchdogd/0]"
+#define LLK_BLACKLIST_PARENT_PROPERTY "ro.llk.blacklist.parent"
+#define LLK_BLACKLIST_PARENT_DEFAULT "0,2,[kthreadd]"
+#define LLK_BLACKLIST_UID_PROPERTY "ro.llk.blacklist.uid"
+#define LLK_BLACKLIST_UID_DEFAULT ""
+/* clang-format on */
+
+__END_DECLS
+
+#ifdef __cplusplus
+extern "C++" { /* In case this included wrapped with __BEGIN_DECLS */
+
+#include <chrono>
+
+__BEGIN_DECLS
+/* C++ code allowed to not specify threadname argument for this C linkage */
+bool llkInit(const char* threadname = nullptr);
+__END_DECLS
+/*
+ * C++ check entry point; the result is the wait before the next call.
+ * Returns milliseconds::max() when llkd is disabled or when checkRunning does
+ * not match whether the llkd thread is running.
+ */
+std::chrono::milliseconds llkCheck(bool checkRunning = false);
+
+/* clang-format off */
+/*
+ * Timeout bounds as std::chrono::milliseconds. Every name is fully qualified
+ * so the macros expand correctly even where the including file has no
+ * using-directive for std::chrono.
+ */
+#define LLK_TIMEOUT_MS_DEFAULT std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::minutes(10))
+#define LLK_TIMEOUT_MS_MINIMUM std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::seconds(10))
+#define LLK_CHECK_MS_MINIMUM std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::seconds(1))
+/* clang-format on */
+
+} /* extern "C++" */
+#endif /* __cplusplus */
+
+#endif /* _LLKD_H_ */
diff --git a/llkd/libllkd.cpp b/llkd/libllkd.cpp
new file mode 100644
index 0000000..b25eb06
--- /dev/null
+++ b/llkd/libllkd.cpp
@@ -0,0 +1,1159 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "llkd.h"
+
+#include <ctype.h>
+#include <dirent.h> // opendir() and readdir()
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <pwd.h> // getpwuid()
+#include <signal.h>
+#include <stdint.h>
+#include <sys/cdefs.h> // ___STRING, __predict_true() and __predict_false()
+#include <sys/mman.h> // mlockall()
+#include <sys/prctl.h>
+#include <sys/stat.h> // lstat()
+#include <sys/syscall.h> // __NR_getdents64
+#include <sys/sysinfo.h> // get_nprocs_conf()
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <chrono>
+#include <ios>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+
+#include <android-base/file.h>
+#include <android-base/logging.h>
+#include <android-base/parseint.h>
+#include <android-base/properties.h>
+#include <android-base/strings.h>
+#include <cutils/android_get_control_file.h>
+#include <log/log_main.h>
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+#define TASK_COMM_LEN 16 // internal kernel, not uapi, from .../linux/include/linux/sched.h
+
+using namespace std::chrono_literals;
+using namespace std::chrono;
+
+namespace {
+
+// Well-known process ids: kernel (0), init (1) and [kthreadd] (2).
+constexpr pid_t kernelPid = 0;
+constexpr pid_t initPid = 1;
+constexpr pid_t kthreaddPid = 2;
+
+// Root of procfs; keeps its trailing slash so a pid can be appended directly.
+constexpr char procdir[] = "/proc/";
+
+// Configuration
+milliseconds llkUpdate; // last check ms signature
+milliseconds llkCycle; // ms to next thread check
+bool llkEnable = LLK_ENABLE_DEFAULT; // llk daemon enabled
+bool llkRunning = false; // thread is running
+bool llkMlockall = LLK_MLOCKALL_DEFAULT; // run mlocked
+milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT; // default timeout
+enum { llkStateD, llkStateZ, llkNumStates }; // state indexes
+milliseconds llkStateTimeoutMs[llkNumStates]; // timeout override for each detection state
+milliseconds llkCheckMs; // checking interval to inspect any
+ // persistent live-locked states
+bool llkLowRam; // ro.config.low_ram
+bool khtEnable = LLK_ENABLE_DEFAULT; // [khungtaskd] panic
+// [khungtaskd] should have a timeout beyond the granularity of llkTimeoutMs.
+// Provides a wide angle of margin b/c khtTimeout is also its granularity.
+// Initial value is llkTimeoutMs scaled by (N + 1) / N, where N is
+// LLK_CHECKS_PER_TIMEOUT_DEFAULT, truncated to whole seconds.
+seconds khtTimeout = duration_cast<seconds>(llkTimeoutMs * (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT) /
+ LLK_CHECKS_PER_TIMEOUT_DEFAULT);
+
+// Blacklist variables, initialized with comma separated lists of high false
+// positive and/or dangerous references, e.g. without self restart, for pid,
+// ppid, name and uid:
+
+// list of pids, or tids or names to skip. kernel pid (0), init pid (1),
+// [kthreadd] pid (2), ourselves, "init", "[kthreadd]", "lmkd", "llkd" or
+// combinations of watchdogd in kernel and user space.
+// Entries are stored as strings, so numeric ids are matched in decimal text form.
+std::unordered_set<std::string> llkBlacklistProcess;
+// list of parent pids, comm or cmdline names to skip. default:
+// kernel pid (0), [kthreadd] (2), or ourselves, enforced and implied
+std::unordered_set<std::string> llkBlacklistParent;
+// list of uids, and uid names, to skip, default nothing
+std::unordered_set<std::string> llkBlacklistUid;
+
+// Minimal directory reader that wraps a directory file descriptor and pulls
+// entries with the raw getdents64 syscall into a static buffer, one buffer per
+// level (proc or task). llkTopDirectory, declared together with the class, is
+// the namespace-scope instance.
+class dir {
+ public:
+ // Selects which static buffer a read uses.
+ enum level { proc, task, numLevels };
+
+ private:
+ int fd; // open directory descriptor, or -1 when closed
+ size_t available_bytes; // unread bytes remaining in the buffer
+ dirent* next; // next unread entry within the buffer
+ // each directory level picked to be just north of 4K in size
+ static constexpr size_t buffEntries = 15;
+ static dirent buff[numLevels][buffEntries];
+
+ // Refill buff[index] from the kernel once all buffered bytes are consumed.
+ // Returns false for an out-of-range level, a closed descriptor, or when
+ // getdents64 returns <= 0 (end of directory or error).
+ bool fill(enum level index) {
+ if (index >= numLevels) return false;
+ if (available_bytes != 0) return true;
+ if (__predict_false(fd < 0)) return false;
+ // getdents64 has no libc wrapper
+ auto rc = TEMP_FAILURE_RETRY(syscall(__NR_getdents64, fd, buff[index], sizeof(buff[0]), 0));
+ if (rc <= 0) return false;
+ available_bytes = rc;
+ next = buff[index];
+ return true;
+ }
+
+ public:
+ // Closed reader; reset(directory) opens one later.
+ dir() : fd(-1), available_bytes(0), next(nullptr) {}
+
+ // Open the named directory; a null name leaves the reader closed.
+ explicit dir(const char* directory)
+ : fd(__predict_true(directory != nullptr)
+ ? ::open(directory, O_CLOEXEC | O_DIRECTORY | O_RDONLY)
+ : -1),
+ available_bytes(0),
+ next(nullptr) {}
+
+ explicit dir(const std::string&& directory)
+ : fd(::open(directory.c_str(), O_CLOEXEC | O_DIRECTORY | O_RDONLY)),
+ available_bytes(0),
+ next(nullptr) {}
+
+ explicit dir(const std::string& directory)
+ : fd(::open(directory.c_str(), O_CLOEXEC | O_DIRECTORY | O_RDONLY)),
+ available_bytes(0),
+ next(nullptr) {}
+
+ // Don't need any copy or move constructors.
+ explicit dir(const dir& c) = delete;
+ explicit dir(dir& c) = delete;
+ explicit dir(dir&& c) = delete;
+
+ // Closes the descriptor if one is open.
+ ~dir() {
+ if (fd >= 0) {
+ ::close(fd);
+ }
+ }
+
+ // True while a directory is open.
+ operator bool() const { return fd >= 0; }
+
+ // Close the directory (if open) and discard any buffered entries.
+ void reset(void) {
+ if (fd >= 0) {
+ ::close(fd);
+ fd = -1;
+ available_bytes = 0;
+ next = nullptr;
+ }
+ }
+
+ // Close the current directory and open another; returns *this so the
+ // result can be tested through operator bool.
+ dir& reset(const char* directory) {
+ reset();
+ // available_bytes will _always_ be zero here as its value is
+ // intimately tied to fd < 0 or not.
+ fd = ::open(directory, O_CLOEXEC | O_DIRECTORY | O_RDONLY);
+ return *this;
+ }
+
+ // Seek back to the first entry and drop buffered data.
+ void rewind(void) {
+ if (fd >= 0) {
+ ::lseek(fd, off_t(0), SEEK_SET);
+ available_bytes = 0;
+ next = nullptr;
+ }
+ }
+
+ // Return the next entry, or def once fill() reports nothing more to read.
+ // The cursor advances by the entry's d_reclen.
+ dirent* read(enum level index = proc, dirent* def = nullptr) {
+ if (!fill(index)) return def;
+ auto ret = next;
+ available_bytes -= next->d_reclen;
+ next = reinterpret_cast<dirent*>(reinterpret_cast<char*>(next) + next->d_reclen);
+ return ret;
+ }
+} llkTopDirectory;
+
+// Storage for the static per-level entry buffers declared in class dir.
+dirent dir::buff[dir::numLevels][dir::buffEntries];
+
+// helper functions
+
+// Reports whether /proc/<tid>/exe is absent, i.e. readlink() fails with ENOENT.
+bool llkIsMissingExeLink(pid_t tid) {
+    const std::string exePath = procdir + std::to_string(tid) + "/exe";
+    char scratch;
+    // CAP_SYS_PTRACE is required to prevent a -1 result, but ENOENT is the signal
+    if (::readlink(exePath.c_str(), &scratch, sizeof(scratch)) != -1) {
+        return false;
+    }
+    return errno == ENOENT;
+}
+
+// Reads a whole file for callers that treat empty content the same as an
+// error, which spares them from reporting read failures individually.
+// Returns the file content, or an empty string when the read fails.
+std::string ReadFile(std::string&& path) {
+    std::string data;
+    if (android::base::ReadFileToString(path, &data)) {
+        return data;
+    }
+    PLOG(DEBUG) << "Read " << path << " failed";
+    return std::string();
+}
+
+// Returns the first token of /proc/<tid><node> ("/cmdline" by default), cut at
+// the first blank, tab, carriage return, newline or nul character.
+std::string llkProcGetName(pid_t tid, const char* node = "/cmdline") {
+    // sizeof() deliberately counts the trailing nul so it is a terminator too
+    static constexpr char terminators[] = " \t\r\n";
+    std::string name = ReadFile(procdir + std::to_string(tid) + node);
+    const auto cut = name.find_first_of(terminators, 0, sizeof(terminators));
+    if (cut != std::string::npos) {
+        name.resize(cut);
+    }
+    return name;
+}
+
+// Extracts the first number of the "Uid:" line in /proc/<tid>/status.
+// Returns -1 (as uid_t) when the line is missing or malformed.
+uid_t llkProcGetUid(pid_t tid) {
+    // Get the process' uid. The following read from /status is admittedly
+    // racy, prone to corruption due to shape-changes. The consequences are
+    // not catastrophic as we sample a few times before taking action.
+    //
+    // If /loginuid worked on reliably, or on Android (all tasks report -1)...
+    // Android lmkd causes /cgroup to contain memory:/<dom>/uid_<uid>/pid_<pid>
+    // which is tighter, but also not reliable.
+    const std::string status = ReadFile(procdir + std::to_string(tid) + "/status");
+    static constexpr char uidTag[] = "\nUid:";
+    const auto tagAt = status.find(uidTag);
+    if (tagAt == std::string::npos) {
+        return -1;
+    }
+
+    // Skip the blanks between the tag and the first number.
+    size_t first = tagAt + ::strlen(uidTag);
+    while ((first < status.size()) && ::isblank(status[first])) {
+        ++first;
+    }
+    // Walk over the digits of the first number.
+    size_t last = first;
+    while ((last < status.size()) && ::isdigit(status[last])) {
+        ++last;
+    }
+    // Content of form 'Uid: 0 0 0 0', newline is error
+    if ((last >= status.size()) || !::isblank(status[last])) {
+        return -1;
+    }
+
+    uid_t parsed;
+    if (!android::base::ParseInt(status.substr(first, last - first), &parsed, uid_t(0))) {
+        return -1;
+    }
+    return parsed;
+}
+
+// Cached bookkeeping for one monitored thread. Name, command line and uid are
+// read from procfs lazily and remembered until reset().
+struct proc {
+    pid_t tid;                // monitored thread id (in Z or D state).
+    nanoseconds schedUpdate;  // /proc/<tid>/sched "se.avg.lastUpdateTime",
+    uint64_t nrSwitches;      // /proc/<tid>/sched "nr_switches" for
+                              // refined ABA problem detection, determine
+                              // forward scheduling progress.
+    milliseconds update;      // llkUpdate millisecond signature of last.
+    milliseconds count;       // duration in state.
+    pid_t pid;                // /proc/<pid> before iterating through
+                              // /proc/<pid>/task/<tid> for threads.
+    pid_t ppid;               // /proc/<tid>/stat field 4 parent pid.
+    uid_t uid;                // /proc/<tid>/status Uid: field.
+    unsigned time;            // sum of /proc/<tid>/stat field 14 utime &
+                              // 15 stime for coarse ABA problem detection.
+    std::string cmdline;      // cached /cmdline content
+    char state;               // /proc/<tid>/stat field 3: Z or D
+                              // (others we do not monitor: S, R, T or ?)
+    char comm[TASK_COMM_LEN + 3];  // space for adding '[' and ']'
+    bool exeMissingValid;     // exeMissing has been cached
+    bool cmdlineValid;        // cmdline has been cached
+    bool updated;             // cleared before monitoring pass.
+    bool killed;              // sent a kill to this thread, next panic...
+
+    // Stores the name at comm + 1; comm[0] is reserved for an optional '['.
+    void setComm(const char* _comm) { strncpy(comm + 1, _comm, sizeof(comm) - 2); }
+
+    // Starts tracking a thread: counters are zeroed, the uid is unknown (-1)
+    // and the update signature is stamped with the current llkUpdate.
+    proc(pid_t tid, pid_t pid, pid_t ppid, const char* _comm, int time, char state)
+        : tid(tid),
+          schedUpdate(0),
+          nrSwitches(0),
+          update(llkUpdate),
+          count(0),
+          pid(pid),
+          ppid(ppid),
+          uid(-1),
+          time(time),
+          state(state),
+          exeMissingValid(false),
+          cmdlineValid(false),
+          updated(true),
+          killed(false) {
+        // Zero fill first so the strncpy() in setComm() is always terminated.
+        memset(comm, '\0', sizeof(comm));
+        setComm(_comm);
+    }
+
+    // Returns the thread name. It is wrapped in '[' and ']' when the thread
+    // has no /proc/<tid>/exe link, and stripped of a trailing ']' otherwise.
+    const char* getComm(void) {
+        if (comm[1] == '\0') {  // comm Valid?
+            strncpy(comm + 1, llkProcGetName(tid, "/comm").c_str(), sizeof(comm) - 2);
+        }
+        if (!exeMissingValid) {
+            if (llkIsMissingExeLink(tid)) {
+                comm[0] = '[';
+            }
+            exeMissingValid = true;
+        }
+        // The name starts at comm + 1, so comm[len] is its last character.
+        size_t len = strlen(comm + 1);
+        if (__predict_true(len < (sizeof(comm) - 1))) {
+            if (comm[0] == '[') {
+                if ((comm[len] != ']') && __predict_true(len < (sizeof(comm) - 2))) {
+                    comm[++len] = ']';
+                    comm[++len] = '\0';
+                }
+            } else {
+                if (comm[len] == ']') {
+                    comm[len] = '\0';
+                }
+            }
+        }
+        // Skip the reserved first byte unless it holds the opening bracket.
+        return &comm[comm[0] != '['];
+    }
+
+    // Returns the cached first token of /proc/<tid>/cmdline, reading it once.
+    const char* getCmdline(void) {
+        if (!cmdlineValid) {
+            cmdline = llkProcGetName(tid);
+            cmdlineValid = true;
+        }
+        return cmdline.c_str();
+    }
+
+    // Returns the thread's uid, reading /proc/<tid>/status when needed.
+    uid_t getUid(void) {
+        // The "unknown" marker -1 must be tested explicitly: on an unsigned
+        // uid_t a "uid <= 0" test only matches 0 and would never fetch the
+        // real uid. Also churn on root user, because most likely to setuid().
+        if ((uid == 0) || (uid == static_cast<uid_t>(-1))) {
+            uid = llkProcGetUid(tid);
+        }
+        return uid;
+    }
+
+    // NOTE(review): only the '[' marker in comm[0] is cleared here; the name
+    // cached at comm + 1 is kept. Confirm callers refresh it via setComm().
+    void reset(void) {  // reset cache, if we detected pid rollover
+        uid = -1;
+        state = '?';
+        cmdline = "";
+        comm[0] = '\0';
+        exeMissingValid = false;
+        cmdlineValid = false;
+    }
+};
+
+// Every thread currently tracked, keyed by thread id; accessed through
+// llkTidLookup(), llkTidAlloc() and llkTidRemove().
+std::unordered_map<pid_t, proc> tids;
+
+// Range-check the timing configuration and fill in defaults, in order of
+// propagation:
+//     llkTimeoutMs
+//     llkCheckMs
+//     ...
+// KISS: self-contained so it can be called repeatedly while parameters are
+// being interpreted, leaving the defaults, llkCheckMs and llkCycle coherent.
+void llkValidate() {
+    // Overall timeout: default when unset, never below the minimum.
+    if (llkTimeoutMs == 0ms) {
+        llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT;
+    }
+    if (llkTimeoutMs < LLK_TIMEOUT_MS_MINIMUM) {
+        llkTimeoutMs = LLK_TIMEOUT_MS_MINIMUM;
+    }
+
+    // Check interval: a fraction of the timeout when unset, never above it.
+    if (llkCheckMs == 0ms) {
+        llkCheckMs = llkTimeoutMs / LLK_CHECKS_PER_TIMEOUT_DEFAULT;
+    }
+    if (llkCheckMs > llkTimeoutMs) {
+        llkCheckMs = llkTimeoutMs;
+    }
+
+    // Per-state timeouts inherit the overall one when unset, are clamped to
+    // [minimum, overall timeout], and cap the check interval.
+    for (auto& stateTimeout : llkStateTimeoutMs) {
+        if (stateTimeout == 0ms) {
+            stateTimeout = llkTimeoutMs;
+        }
+        if (stateTimeout < LLK_TIMEOUT_MS_MINIMUM) {
+            stateTimeout = LLK_TIMEOUT_MS_MINIMUM;
+        }
+        if (stateTimeout > llkTimeoutMs) {
+            stateTimeout = llkTimeoutMs;
+        }
+        if (llkCheckMs > stateTimeout) {
+            llkCheckMs = stateTimeout;
+        }
+    }
+
+    if (llkCheckMs < LLK_CHECK_MS_MINIMUM) {
+        llkCheckMs = LLK_CHECK_MS_MINIMUM;
+    }
+    // The cycle never exceeds the check interval.
+    if ((llkCycle == 0ms) || (llkCycle > llkCheckMs)) {
+        llkCycle = llkCheckMs;
+    }
+}
+
+// Returns *to - *from in milliseconds; the seconds and nanoseconds parts are
+// each truncated to whole milliseconds before being added.
+milliseconds llkGetTimespecDiffMs(timespec* from, timespec* to) {
+    const seconds wholeSeconds(to->tv_sec - from->tv_sec);
+    const nanoseconds remainder(to->tv_nsec - from->tv_nsec);
+    return duration_cast<milliseconds>(wholeSeconds) + duration_cast<milliseconds>(remainder);
+}
+
+// Picks a display name for tid: the supplied cmdline if non-empty, else the
+// supplied comm if non-empty, else whatever procfs reports right now
+// (/cmdline first, then /comm, bracketed when the exe link is missing).
+std::string llkProcGetName(pid_t tid, const char* comm, const char* cmdline) {
+    const bool haveCmdline = (cmdline != nullptr) && (*cmdline != '\0');
+    if (haveCmdline) {
+        return cmdline;
+    }
+    const bool haveComm = (comm != nullptr) && (*comm != '\0');
+    if (haveComm) {
+        return comm;
+    }
+
+    // UNLIKELY! Here because killed before we kill it?
+    // Assume change is afoot, do not call llkTidAlloc
+
+    // cmdline ?
+    std::string name = llkProcGetName(tid);
+    if (!name.empty()) {
+        return name;
+    }
+    // Comm instead?
+    name = llkProcGetName(tid, "/comm");
+    if (!llkIsMissingExeLink(tid) || name.empty()) {
+        return name;
+    }
+    return '[' + name + ']';
+}
+
+// Logs the intent and sends SIGKILL to pid on behalf of the stuck thread tid.
+// The optional comm/cmdline strings are only used to name the thread (t*) and
+// the process (p*) in the log. Returns the kill() result.
+int llkKillOneProcess(pid_t pid, char state, pid_t tid, const char* tcomm = nullptr,
+                      const char* tcmdline = nullptr, const char* pcomm = nullptr,
+                      const char* pcmdline = nullptr) {
+    // Only mention the thread when it is not the process being killed.
+    std::string threadNote;
+    if (pid != tid) {
+        threadNote =
+            " for '" + llkProcGetName(tid, tcomm, tcmdline) + "' (" + std::to_string(tid) + ")";
+    }
+    LOG(INFO) << "Killing '" << llkProcGetName(pid, pcomm, pcmdline) << "' (" << pid
+              << ") to check forward scheduling progress in " << state << " state" << threadNote;
+    // CAP_KILL required
+    errno = 0;
+    const auto rc = ::kill(pid, SIGKILL);
+    if (rc != 0) {
+        PLOG(ERROR) << "kill(" << pid << ")=" << rc << ' ';
+    }
+
+    return rc;
+}
+
+// Kill process pid on behalf of the stuck thread described by tprocp.
+int llkKillOneProcess(pid_t pid, proc* tprocp) {
+    const char* threadComm = tprocp->getComm();
+    const char* threadCmdline = tprocp->getCmdline();
+    return llkKillOneProcess(pid, tprocp->state, tprocp->tid, threadComm, threadCmdline);
+}
+
+// Kill the process described by kprocp on behalf of the stuck thread tprocp.
+// Returns -2 without doing anything when kprocp is null.
+int llkKillOneProcess(proc* kprocp, proc* tprocp) {
+    if (kprocp == nullptr) {
+        return -2;
+    }
+
+    const char* threadComm = tprocp->getComm();
+    const char* threadCmdline = tprocp->getCmdline();
+    const char* victimComm = kprocp->getComm();
+    const char* victimCmdline = kprocp->getCmdline();
+    return llkKillOneProcess(kprocp->tid, tprocp->state, tprocp->tid, threadComm, threadCmdline,
+                             victimComm, victimCmdline);
+}
+
+// Returns a writable descriptor for file: taken from the environment when
+// available, otherwise opened here and remembered for later calls.
+// NB: the cache is unnecessary in our current context, pedantically
+//     required to prevent leakage of file descriptors in the future.
+int llkFileToWriteFd(const std::string& file) {
+    static std::unordered_map<std::string, int> opened;
+    const auto hit = opened.find(file);
+    if (hit != opened.end()) {
+        return hit->second;
+    }
+    auto fd = android_get_control_file(file.c_str());
+    if (fd < 0) {
+        fd = TEMP_FAILURE_RETRY(::open(file.c_str(), O_WRONLY | O_CLOEXEC));
+        // Only descriptors opened here are cached.
+        if (fd >= 0) {
+            opened.emplace(file, fd);
+        }
+    }
+    return fd;
+}
+
+// Like android::base::WriteStringToFile, but the descriptor comes from
+// llkFileToWriteFd() so android_get_control_file is honoured.
+bool llkWriteStringToFile(const std::string& string, const std::string& file) {
+    const auto fd = llkFileToWriteFd(file);
+    return (fd >= 0) && android::base::WriteStringToFd(string, fd);
+}
+
+// Writes string to file, then reads the file back. When the read-back works
+// the result is whether the trimmed content equals string; otherwise it is
+// the outcome of the write itself.
+bool llkWriteStringToFileConfirm(const std::string& string, const std::string& file) {
+    const auto fd = llkFileToWriteFd(file);
+    bool written = false;
+    if (fd >= 0) {
+        written = android::base::WriteStringToFd(string, fd);
+    }
+    std::string readBack;
+    if (android::base::ReadFileToString(file, &readBack)) {
+        return android::base::Trim(readBack) == string;
+    }
+    return written;
+}
+
+// Panic the kernel through /proc/sysrq-trigger and never return.
+// dump: when true, first request a dump of held locks ('d') and, unless on a
+// low ram device, of all threads ('t'), then pause briefly.
+// tid: thread blamed in the log message of the fallback path.
+// If the trigger can not be opened, or writing 'c' returns, fall back to
+// sending SIGKILL to init and exiting with status 42.
+void llkPanicKernel(bool dump, pid_t tid) __noreturn;
+void llkPanicKernel(bool dump, pid_t tid) {
+ auto sysrqTriggerFd = llkFileToWriteFd("/proc/sysrq-trigger");
+ if (sysrqTriggerFd < 0) {
+ // DYB
+ llkKillOneProcess(initPid, 'R', tid);
+ // The answer to life, the universe and everything
+ ::exit(42);
+ // NOTREACHED
+ }
+ // Flush filesystems before the kernel goes down.
+ ::sync();
+ if (dump) {
+ // Show all locks that are held
+ android::base::WriteStringToFd("d", sysrqTriggerFd);
+ // This can trigger hardware watchdog, that is somewhat _ok_.
+ // But useless if pstore configured for <256KB, low ram devices ...
+ if (!llkLowRam) {
+ android::base::WriteStringToFd("t", sysrqTriggerFd);
+ }
+ ::usleep(200000); // let everything settle
+ }
+ // Request the crash itself.
+ android::base::WriteStringToFd("c", sysrqTriggerFd);
+ // NOTREACHED
+ // DYB
+ llkKillOneProcess(initPid, 'R', tid);
+ // I sat at my desk, stared into the garden and thought '42 will do'.
+ // I typed it out. End of story
+ ::exit(42);
+ // NOTREACHED
+}
+
+// Signal handler: panics the kernel without a dump, blaming our own pid.
+void llkAlarmHandler(int) {
+    const pid_t self = ::getpid();
+    llkPanicKernel(false, self);
+}
+
+// Reads an unsigned property as a millisecond duration; def is the fallback
+// and the largest representable duration is the upper bound.
+milliseconds GetUintProperty(const std::string& key, milliseconds def) {
+    const auto fallback = static_cast<uint64_t>(def.count());
+    const auto ceiling = static_cast<uint64_t>(def.max().count());
+    return milliseconds(android::base::GetUintProperty(key, fallback, ceiling));
+}
+
+// Reads an unsigned property as a duration in seconds; def is the fallback
+// and the largest representable duration is the upper bound.
+seconds GetUintProperty(const std::string& key, seconds def) {
+    const auto fallback = static_cast<uint64_t>(def.count());
+    const auto ceiling = static_cast<uint64_t>(def.max().count());
+    return seconds(android::base::GetUintProperty(key, fallback, ceiling));
+}
+
+// Returns the tracked entry for tid, or nullptr when tid is not tracked.
+proc* llkTidLookup(pid_t tid) {
+    const auto found = tids.find(tid);
+    return (found != tids.end()) ? &found->second : nullptr;
+}
+
+// Stops tracking tid; does nothing when tid is not tracked.
+void llkTidRemove(pid_t tid) {
+    const auto found = tids.find(tid);
+    if (found != tids.end()) {
+        tids.erase(found);
+    }
+}
+
+// Starts tracking tid and returns its entry. If tid is already tracked the
+// existing entry is returned untouched.
+proc* llkTidAlloc(pid_t tid, pid_t pid, pid_t ppid, const char* comm, int time, char state) {
+    proc entry(tid, pid, ppid, comm, time, state);
+    const auto inserted = tids.emplace(tid, std::move(entry));
+    return &inserted.first->second;
+}
+
+// Formats a duration as "<seconds>.<milliseconds>s", with the millisecond
+// part zero padded to three columns, e.g. 1500ms becomes "1.500s".
+std::string llkFormat(milliseconds ms) {
+    const auto whole = duration_cast<seconds>(ms);
+    std::string fraction = std::to_string((ms - whole).count());
+    // right-align the remainder in a field of three, filling with '0'
+    if (fraction.size() < 3) {
+        fraction.insert(0, 3 - fraction.size(), '0');
+    }
+    return std::to_string(whole.count()) + '.' + fraction + 's';
+}
+
+// Formats a whole number of seconds followed by 's', e.g. "720s".
+std::string llkFormat(seconds s) {
+    std::string text = std::to_string(s.count());
+    text.push_back('s');
+    return text;
+}
+
+// Formats a flag as "true" or "false".
+std::string llkFormat(bool flag) {
+    if (flag) {
+        return "true";
+    }
+    return "false";
+}
+
+// Joins the blacklist entries with commas. The order follows the set's
+// iteration order, which is unspecified. An empty set gives an empty string.
+std::string llkFormat(const std::unordered_set<std::string>& blacklist) {
+    std::string ret;
+    // Iterate by reference: there is no need to copy each entry.
+    for (const auto& entry : blacklist) {
+        if (!ret.empty()) {
+            ret += ",";
+        }
+        ret += entry;
+    }
+    return ret;
+}
+
+// Splits s into a set of tokens. Only commas are officially supported, but
+// people take liberties and should not be punished for it, so blanks, tabs
+// and colons separate tokens too. Adjacent delimiters and an empty input
+// produce an empty-string token.
+std::unordered_set<std::string> llkSplit(const std::string& s,
+                                         const std::string& delimiters = ", \t:") {
+    std::unordered_set<std::string> tokens;
+
+    size_t start = 0;
+    for (;;) {
+        const size_t stop = s.find_first_of(delimiters, start);
+        if (stop == s.npos) {
+            // last token runs to the end of the string
+            tokens.emplace(s.substr(start));
+            break;
+        }
+        tokens.emplace(s.substr(start, stop - start));
+        start = stop + 1;
+    }
+    return tokens;
+}
+
+// True when name is listed in blacklist (the process blacklist by default).
+// An empty name or an empty blacklist never matches.
+bool llkSkipName(const std::string& name,
+                 const std::unordered_set<std::string>& blacklist = llkBlacklistProcess) {
+    if (name.empty() || blacklist.empty()) {
+        return false;
+    }
+
+    return blacklist.count(name) != 0;
+}
+
+// True when pid, as decimal text, is in the process blacklist.
+bool llkSkipPid(pid_t pid) {
+    const std::string key = std::to_string(pid);
+    return llkSkipName(key, llkBlacklistProcess);
+}
+
+// True when ppid, as decimal text, is in the parent blacklist.
+bool llkSkipPpid(pid_t ppid) {
+    const std::string key = std::to_string(ppid);
+    return llkSkipName(key, llkBlacklistParent);
+}
+
+// True when uid is in the uid blacklist, either as a number or as the user
+// name that getpwuid() reports for it.
+bool llkSkipUid(uid_t uid) {
+    // Match by number?
+    if (llkSkipName(std::to_string(uid), llkBlacklistUid)) {
+        return true;
+    }
+
+    // Match by name?
+    const auto* entry = ::getpwuid(uid);
+    if (entry == nullptr) {
+        return false;
+    }
+    if (!__predict_true(entry->pw_name != nullptr)) {
+        return false;
+    }
+    if (!__predict_true(entry->pw_name[0] != '\0')) {
+        return false;
+    }
+    return llkSkipName(entry->pw_name, llkBlacklistUid);
+}
+
+// Decides whether a procfs entry names a pid/tid directory.
+// dp:     directory entry to inspect.
+// piddir: receives "/proc/<name>" when the entry is accepted; it is also
+//         written when the type is unknown and lstat() has to decide.
+// Returns true when the name starts with a digit and the entry is a directory.
+bool getValidTidDir(dirent* dp, std::string* piddir) {
+    if (!::isdigit(dp->d_name[0])) {
+        return false;
+    }
+
+    const bool isDir = dp->d_type == DT_DIR;
+    // Corner case can not happen in reality b/c of above ::isdigit check
+    if (__predict_false(!isDir) && (dp->d_type != DT_UNKNOWN)) {
+        return false;
+    }
+
+    *piddir = procdir;
+    *piddir += dp->d_name;
+    if (__predict_true(isDir)) {
+        return true;
+    }
+
+    // Type not reported (can't b/c procfs): ask lstat(). S_ISDIR compares the
+    // whole file type field; a plain "& S_IFDIR" would also accept block
+    // devices because their type bits include the directory bit.
+    struct stat st;
+    return (lstat(piddir->c_str(), &st) == 0) && S_ISDIR(st.st_mode);
+}
+
+// True for the two thread states that are monitored: 'Z' and 'D'.
+bool llkIsMonitorState(char state) {
+    switch (state) {
+        case 'Z':
+        case 'D':
+            return true;
+        default:
+            return false;
+    }
+}
+
+// Looks up "key ... : value" in the text of a /proc/<tid>/sched file.
+// schedString: full content of the file.
+// key:         text to search for; callers prefix it with '\n' to anchor it
+//              at the start of a line.
+// Returns the non-negative integer value, or -1 if the key or its ':' is not
+// found or the value is not a plain non-negative integer.
+long long getSchedValue(const std::string& schedString, const char* key) {
+    auto pos = schedString.find(key);
+    if (pos == std::string::npos) {
+        return -1;
+    }
+    pos = schedString.find(':', pos);
+    if (__predict_false(pos == std::string::npos)) {
+        return -1;
+    }
+    // Step past the ':' and any blanks in front of the value.
+    while ((++pos < schedString.size()) && ::isblank(schedString[pos])) {
+        ;
+    }
+    // ParseInt rejects input with trailing characters, so hand it only the
+    // value token rather than everything up to the end of the file.
+    const auto end = schedString.find_first_of(" \t\r\n", pos);
+    const auto token =
+        schedString.substr(pos, (end == std::string::npos) ? std::string::npos : end - pos);
+    long long ret;
+    if (!android::base::ParseInt(token, &ret, static_cast<long long>(0))) {
+        return -1;
+    }
+    return ret;
+}
+
+// Primary ABA mitigation watching last time schedule activity happened.
+//
+// Reads <piddir>/sched, or <piddir>/schedstat when that comes back empty.
+// Whenever the last-update timestamp or the switch count differs from the
+// value cached in procp, the new value is stored and the thread's time in
+// state (count) and killed flag are cleared.
+//
+// procp:  tracked thread to update.
+// piddir: procfs directory of the thread, without a trailing slash.
+void llkCheckSchedUpdate(proc* procp, const std::string& piddir) {
+    // Audit finds /proc/<tid>/sched is just over 1K, and
+    // is rarely larger than 2K, even less on Android.
+    // For example, the "se.avg.lastUpdateTime" field we are
+    // interested in typically within the primary set in
+    // the first 1K.
+    //
+    // Proc entries can not be read >1K atomically via libbase,
+    // but if there are problems we assume at least a few
+    // samples of reads occur before we take any real action.
+    std::string schedString = ReadFile(piddir + "/sched");
+    if (schedString.size() == 0) {
+        // /schedstat is not as standardized, but in 3.1+
+        // Android devices, the third field is nr_switches
+        // from /sched:
+        schedString = ReadFile(piddir + "/schedstat");
+        if (schedString.size() == 0) {
+            return;
+        }
+        auto val = static_cast<unsigned long long>(-1);
+        if (((::sscanf(schedString.c_str(), "%*d %*d %llu", &val)) == 1) &&
+            (val != static_cast<unsigned long long>(-1)) && (val != 0) &&
+            (val != procp->nrSwitches)) {
+            procp->nrSwitches = val;
+            procp->count = 0ms;
+            procp->killed = false;
+        }
+        return;
+    }
+
+    auto val = getSchedValue(schedString, "\nse.avg.lastUpdateTime");
+    if (val == -1) {
+        // Alternate spelling of the same field. It was previously looked up
+        // as "se.svg...", which can not match an "se.avg..." line.
+        val = getSchedValue(schedString, "\nse.avg.last_update_time");
+    }
+    if (val != -1) {
+        auto schedUpdate = nanoseconds(val);
+        if (schedUpdate != procp->schedUpdate) {
+            procp->schedUpdate = schedUpdate;
+            procp->count = 0ms;
+            procp->killed = false;
+        }
+    }
+
+    val = getSchedValue(schedString, "\nnr_switches");
+    if (val != -1) {
+        if (static_cast<uint64_t>(val) != procp->nrSwitches) {
+            procp->nrSwitches = val;
+            procp->count = 0ms;
+            procp->killed = false;
+        }
+    }
+}
+
+// Logs the effective configuration as one INFO message, one property=value
+// pair per line, using the property names from llkd.h.
+void llkLogConfig(void) {
+ LOG(INFO) << "ro.config.low_ram=" << llkFormat(llkLowRam) << "\n"
+ << LLK_ENABLE_PROPERTY "=" << llkFormat(llkEnable) << "\n"
+ << KHT_ENABLE_PROPERTY "=" << llkFormat(khtEnable) << "\n"
+ << LLK_MLOCKALL_PROPERTY "=" << llkFormat(llkMlockall) << "\n"
+ << KHT_TIMEOUT_PROPERTY "=" << llkFormat(khtTimeout) << "\n"
+ << LLK_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkTimeoutMs) << "\n"
+ << LLK_D_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateD]) << "\n"
+ << LLK_Z_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateZ]) << "\n"
+ << LLK_CHECK_MS_PROPERTY "=" << llkFormat(llkCheckMs) << "\n"
+ << LLK_BLACKLIST_PROCESS_PROPERTY "=" << llkFormat(llkBlacklistProcess) << "\n"
+ << LLK_BLACKLIST_PARENT_PROPERTY "=" << llkFormat(llkBlacklistParent) << "\n"
+ << LLK_BLACKLIST_UID_PROPERTY "=" << llkFormat(llkBlacklistUid);
+}
+
+// pthread entry point for the optional llkd worker thread.
+//
+// obj is the thread name as a NUL-terminated C string.  The thread makes
+// sure both its tid and its name are in llkBlacklistProcess, renames itself,
+// flags llkRunning, logs the configuration and then keeps calling
+// llkCheck(true), sleeping for whatever interval each call returns.
+// Always returns nullptr.
+void* llkThread(void* obj) {
+    LOG(INFO) << "started";
+
+    // Blacklist ourselves by thread id ...
+    const std::string tidName = std::to_string(::gettid());
+    if (!llkSkipName(tidName)) {
+        llkBlacklistProcess.emplace(tidName);
+    }
+
+    // ... and by the name this thread is about to take on.
+    const std::string threadName(static_cast<const char*>(obj));
+    prctl(PR_SET_NAME, threadName.c_str());
+    if (__predict_false(!llkSkipName(threadName))) {
+        llkBlacklistProcess.insert(threadName);
+    }
+
+    // No longer modifying llkBlacklistProcess.
+    llkRunning = true;
+    llkLogConfig();
+    while (llkRunning) {
+        const auto pause = duration_cast<microseconds>(llkCheck(true));
+        ::usleep(pause.count());
+    }
+    // NOTREACHED
+    LOG(INFO) << "exiting";
+    return nullptr;
+}
+
+} // namespace
+
+// Run one live-lock detection pass and report how long to wait before the
+// next one.
+//
+// checkRunning must match llkRunning (true when called from llkThread, false
+// when driven from a caller's own loop); on mismatch, or when llkd is
+// disabled, nothing is done and milliseconds::max() is returned.
+//
+// Each call re-arms the SIGALRM watchdog.  If less than llkCycle has elapsed
+// since the previous scan the remaining time is returned immediately.
+// Otherwise every entry under procdir and its task/ directory is sampled via
+// its stat file, the per-thread records in tids are refreshed, and any
+// thread that stayed in a monitored state for at least the configured state
+// timeout without being filtered out is first sent a test kill and, if that
+// is not possible or it was already tried, escalated to llkPanicKernel().
+// Records for threads that were not seen in this pass are discarded.
+//
+// Returns the delay until the next check: llkTimeoutMs if procdir can not be
+// opened, otherwise the check period less the amount we overslept, but never
+// less than one second.
+milliseconds llkCheck(bool checkRunning) {
+ if (!llkEnable || (checkRunning != llkRunning)) {
+ return milliseconds::max();
+ }
+
+ // Reset internal watchdog, which is a healthy engineering margin of
+ // double the maximum wait or cycle time for the mainloop that calls us.
+ //
+ // This alarm is effectively the live lock detection of llkd, as
+ // we understandably can not monitor ourselves otherwise.
+ ::alarm(duration_cast<seconds>(llkTimeoutMs * 2).count());
+
+ // kernel jiffy precision fastest acquisition
+ static timespec last;
+ timespec now;
+ ::clock_gettime(CLOCK_MONOTONIC_COARSE, &now);
+ auto ms = llkGetTimespecDiffMs(&last, &now);
+ // Called too early: report the remainder of the cycle and do no work.
+ if (ms < llkCycle) {
+ return llkCycle - ms;
+ }
+ last = now;
+
+ LOG(VERBOSE) << "opendir(\"" << procdir << "\")";
+ if (__predict_false(!llkTopDirectory)) {
+ // gid containing AID_READPROC required
+ llkTopDirectory.reset(procdir);
+ if (__predict_false(!llkTopDirectory)) {
+ // Most likely reason we could be here is a resource limit.
+ // Keep our processing down to a minimum, but not so low that
+ // we do not recover in a timely manner should the issue be
+ // transitory.
+ LOG(DEBUG) << "opendir(\"" << procdir << "\") failed";
+ return llkTimeoutMs;
+ }
+ }
+
+ // Mark every known record stale; records still stale after the scan are
+ // garbage collected below.
+ for (auto& it : tids) {
+ it.second.updated = false;
+ }
+
+ auto prevUpdate = llkUpdate;
+ llkUpdate += ms;
+ // From here on ms is how far past the intended cycle we were called; it
+ // shortens the sleep interval returned at the end.
+ ms -= llkCycle;
+ auto myPid = ::getpid();
+ auto myTid = ::gettid();
+ for (auto dp = llkTopDirectory.read(); dp != nullptr; dp = llkTopDirectory.read()) {
+ std::string piddir;
+
+ if (!getValidTidDir(dp, &piddir)) {
+ continue;
+ }
+
+ // Get the process tasks
+ std::string taskdir = piddir + "/task/";
+ int pid = -1;
+ LOG(VERBOSE) << "+opendir(\"" << taskdir << "\")";
+ dir taskDirectory(taskdir);
+ if (__predict_false(!taskDirectory)) {
+ LOG(DEBUG) << "+opendir(\"" << taskdir << "\") failed";
+ }
+ for (auto tp = taskDirectory.read(dir::task, dp); tp != nullptr;
+ tp = taskDirectory.read(dir::task)) {
+ if (!getValidTidDir(tp, &piddir)) {
+ continue;
+ }
+
+ // Get the process stat
+ std::string stat = ReadFile(piddir + "/stat");
+ if (stat.size() == 0) {
+ continue;
+ }
+ unsigned tid = -1;
+ char pdir[TASK_COMM_LEN + 1];
+ char state = '?';
+ unsigned ppid = -1;
+ unsigned utime = -1;
+ unsigned stime = -1;
+ int dummy;
+ pdir[0] = '\0';
+ // tid should not change value
+ auto match = ::sscanf(
+ stat.c_str(),
+ "%u (%" ___STRING(
+ TASK_COMM_LEN) "[^)]) %c %u %*d %*d %*d %*d %*d %*d %*d %*d %*d %u %u %d",
+ &tid, pdir, &state, &ppid, &utime, &stime, &dummy);
+ // The first tid parsed within this task directory is taken as the pid.
+ if (pid == -1) {
+ pid = tid;
+ }
+ LOG(VERBOSE) << "match " << match << ' ' << tid << " (" << pdir << ") " << state << ' '
+ << ppid << " ... " << utime << ' ' << stime << ' ' << dummy;
+ // All seven conversions must succeed for the sample to be usable.
+ if (match != 7) {
+ continue;
+ }
+
+ auto procp = llkTidLookup(tid);
+ if (procp == nullptr) {
+ procp = llkTidAlloc(tid, pid, ppid, pdir, utime + stime, state);
+ } else {
+ // comm can change ...
+ procp->setComm(pdir);
+ procp->updated = true;
+ // pid/ppid/tid wrap?
+ if (((procp->update != prevUpdate) && (procp->update != llkUpdate)) ||
+ (procp->ppid != ppid) || (procp->pid != pid)) {
+ procp->reset();
+ } else if (procp->time != (utime + stime)) { // secondary ABA.
+ // watching utime+stime granularity jiffy
+ procp->state = '?';
+ }
+ procp->update = llkUpdate;
+ procp->pid = pid;
+ procp->ppid = ppid;
+ procp->time = utime + stime;
+ // A state change restarts the stall timer, otherwise one more
+ // cycle is accumulated in the same state.
+ if (procp->state != state) {
+ procp->count = 0ms;
+ procp->killed = false;
+ procp->state = state;
+ } else {
+ procp->count += llkCycle;
+ }
+ }
+
+ // Filter checks in intuitive order of CPU cost to evaluate
+ // If tid unique continue, if ppid or pid unique break
+
+ if (pid == myPid) {
+ break;
+ }
+ if (!llkIsMonitorState(state)) {
+ continue;
+ }
+ if ((tid == myTid) || llkSkipPid(tid)) {
+ continue;
+ }
+ if (llkSkipPpid(ppid)) {
+ break;
+ }
+
+ if (llkSkipName(procp->getComm())) {
+ continue;
+ }
+ if (llkSkipName(procp->getCmdline())) {
+ break;
+ }
+
+ // Make sure there is a record for the parent so that its names can
+ // be checked against the parent blacklist.
+ auto pprocp = llkTidLookup(ppid);
+ if (pprocp == nullptr) {
+ pprocp = llkTidAlloc(ppid, ppid, 0, "", 0, '?');
+ }
+ if ((pprocp != nullptr) && (llkSkipName(pprocp->getComm(), llkBlacklistParent) ||
+ llkSkipName(pprocp->getCmdline(), llkBlacklistParent))) {
+ break;
+ }
+
+ if ((llkBlacklistUid.size() != 0) && llkSkipUid(procp->getUid())) {
+ continue;
+ }
+
+ // ABA mitigation watching last time schedule activity happened
+ llkCheckSchedUpdate(procp, piddir);
+
+ // Can only fall through to here if registered D or Z state !!!
+ if (procp->count < llkStateTimeoutMs[(state == 'Z') ? llkStateZ : llkStateD]) {
+ LOG(VERBOSE) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->"
+ << pid << "->" << tid << ' ' << procp->getComm();
+ continue;
+ }
+
+ // We have to kill it to determine difference between live lock
+ // and persistent state blocked on a resource. Is there something
+ // wrong with a process that has no forward scheduling progress in
+ // Z or D? Yes, generally means improper accounting in the
+ // process, but not always ...
+ //
+ // Whomever we hit with a test kill must accept the Android
+ // Aphorism that everything can be burned to the ground and
+ // must survive.
+ if (procp->killed == false) {
+ procp->killed = true;
+ // confirm: re-read uid before committing to a panic.
+ procp->uid = -1;
+ switch (state) {
+ case 'Z': // kill ppid to free up a Zombie
+ // Killing init will kernel panic without diagnostics
+ // so skip right to controlled kernel panic with
+ // diagnostics.
+ if (ppid == initPid) {
+ break;
+ }
+ LOG(WARNING) << "Z " << llkFormat(procp->count) << ' ' << ppid << "->"
+ << pid << "->" << tid << ' ' << procp->getComm() << " [kill]";
+ if ((llkKillOneProcess(pprocp, procp) >= 0) ||
+ (llkKillOneProcess(ppid, procp) >= 0)) {
+ continue;
+ }
+ break;
+
+ case 'D': // kill tid to free up an uninterruptible D
+ // If ABA is doing its job, we would not need or
+ // want the following. Test kill is a Hail Mary
+ // to make absolutely sure there is no forward
+ // scheduling progress. The cost when ABA is
+ // not working is we kill a process that likes to
+ // stay in 'D' state, instead of panicking the
+ // kernel (worse).
+ LOG(WARNING) << "D " << llkFormat(procp->count) << ' ' << pid << "->" << tid
+ << ' ' << procp->getComm() << " [kill]";
+ if ((llkKillOneProcess(llkTidLookup(pid), procp) >= 0) ||
+ (llkKillOneProcess(pid, 'D', tid) >= 0) ||
+ (llkKillOneProcess(procp, procp) >= 0) ||
+ (llkKillOneProcess(tid, 'D', tid) >= 0)) {
+ continue;
+ }
+ break;
+ }
+ }
+ // We are here because we have confirmed kernel live-lock
+ LOG(ERROR) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->" << pid
+ << "->" << tid << ' ' << procp->getComm() << " [panic]";
+ llkPanicKernel(true, tid);
+ }
+ LOG(VERBOSE) << "+closedir()";
+ }
+ llkTopDirectory.rewind();
+ LOG(VERBOSE) << "closedir()";
+
+ // garbage collection of old process references
+ for (auto p = tids.begin(); p != tids.end();) {
+ if (!p->second.updated) {
+ // The names are only looked up when verbose logging is enabled.
+ IF_ALOG(LOG_VERBOSE, LOG_TAG) {
+ std::string ppidCmdline = llkProcGetName(p->second.ppid, nullptr, nullptr);
+ if (ppidCmdline.size()) {
+ ppidCmdline = "(" + ppidCmdline + ")";
+ }
+ std::string pidCmdline;
+ if (p->second.pid != p->second.tid) {
+ pidCmdline = llkProcGetName(p->second.pid, nullptr, p->second.getCmdline());
+ if (pidCmdline.size()) {
+ pidCmdline = "(" + pidCmdline + ")";
+ }
+ }
+ std::string tidCmdline =
+ llkProcGetName(p->second.tid, p->second.getComm(), p->second.getCmdline());
+ if (tidCmdline.size()) {
+ tidCmdline = "(" + tidCmdline + ")";
+ }
+ LOG(VERBOSE) << "thread " << p->second.ppid << ppidCmdline << "->" << p->second.pid
+ << pidCmdline << "->" << p->second.tid << tidCmdline << " removed";
+ }
+ p = tids.erase(p);
+ } else {
+ ++p;
+ }
+ }
+ // Nothing at all was tracked: drop the directory handle so that the next
+ // pass reopens procdir.
+ if (__predict_false(tids.empty())) {
+ llkTopDirectory.reset();
+ }
+
+ llkCycle = llkCheckMs;
+
+ // Report how long the scan itself took, raising the log severity as it
+ // gets slower.
+ timespec end;
+ ::clock_gettime(CLOCK_MONOTONIC_COARSE, &end);
+ auto milli = llkGetTimespecDiffMs(&now, &end);
+ LOG((milli > 10s) ? ERROR : (milli > 1s) ? WARNING : VERBOSE) << "sample " << llkFormat(milli);
+
+ // cap to minimum sleep for 1 second since last cycle
+ if (llkCycle < (ms + 1s)) {
+ return 1s;
+ }
+ return llkCycle - ms;
+}
+
+// Convenience wrapper around llkCheck() for callers that do not use
+// std::chrono: performs one check with the default argument and returns the
+// recommended delay before the next call as a plain millisecond count.
+unsigned llkCheckMilliseconds() {
+    const auto untilNextCheck = duration_cast<milliseconds>(llkCheck());
+    return untilNextCheck.count();
+}
+
+// Configure the live-lock daemon from system properties and arm it.
+//
+// Reads the llkd and [khungtaskd] tunables, builds the process, parent and
+// uid blacklists, installs the SIGALRM watchdog handler and, when khtEnable
+// is set, tries to program the kernel hung task detector through
+// /proc/sys/kernel/.  When llkd is enabled and threadname is non-null a
+// detached thread running llkThread() is spawned under that name (waiting up
+// to one second for it to come up); with a null threadname the caller is
+// expected to drive llkCheck() itself.
+//
+// Returns whether llkd is enabled once initialization is done, which is
+// false when the enable property is off or procdir could not be opened.
+bool llkInit(const char* threadname) {
+    llkLowRam = android::base::GetBoolProperty("ro.config.low_ram", false);
+    llkEnable = android::base::GetBoolProperty(LLK_ENABLE_PROPERTY, llkEnable);
+    if (llkEnable && !llkTopDirectory.reset(procdir)) {
+        // Most likely reason we could be here is llkd was started
+        // incorrectly without the readproc permissions.  Keep our
+        // processing down to a minimum.
+        llkEnable = false;
+    }
+    khtEnable = android::base::GetBoolProperty(KHT_ENABLE_PROPERTY, khtEnable);
+    llkMlockall = android::base::GetBoolProperty(LLK_MLOCKALL_PROPERTY, llkMlockall);
+    // if LLK_TIMEOUT_MS_PROPERTY was not set, we will use a set
+    // KHT_TIMEOUT_PROPERTY as co-operative guidance for the default value.
+    khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout);
+    if (khtTimeout == 0s) {
+        khtTimeout = duration_cast<seconds>(llkTimeoutMs * (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT) /
+                                            LLK_CHECKS_PER_TIMEOUT_DEFAULT);
+    }
+    llkTimeoutMs =
+        khtTimeout * LLK_CHECKS_PER_TIMEOUT_DEFAULT / (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT);
+    llkTimeoutMs = GetUintProperty(LLK_TIMEOUT_MS_PROPERTY, llkTimeoutMs);
+    llkValidate();  // validate llkTimeoutMs, llkCheckMs and llkCycle
+    llkStateTimeoutMs[llkStateD] = GetUintProperty(LLK_D_TIMEOUT_MS_PROPERTY, llkTimeoutMs);
+    llkStateTimeoutMs[llkStateZ] = GetUintProperty(LLK_Z_TIMEOUT_MS_PROPERTY, llkTimeoutMs);
+    llkCheckMs = GetUintProperty(LLK_CHECK_MS_PROPERTY, llkCheckMs);
+    llkValidate();  // validate all (effectively minus llkTimeoutMs)
+
+    // Default process blacklist: the well known kernel/init/kthreadd pids,
+    // ourselves (by pid, tid and optional thread name), the built-in list
+    // and one watchdog entry for every configured cpu after the first.
+    std::string defaultBlacklistProcess(
+        std::to_string(kernelPid) + "," + std::to_string(initPid) + "," +
+        std::to_string(kthreaddPid) + "," + std::to_string(::getpid()) + "," +
+        std::to_string(::gettid()) + "," LLK_BLACKLIST_PROCESS_DEFAULT);
+    if (threadname) {
+        defaultBlacklistProcess += std::string(",") + threadname;
+    }
+    const int cpuCount = get_nprocs_conf();
+    for (int cpu = 1; cpu < cpuCount; ++cpu) {
+        defaultBlacklistProcess += ",[watchdog/" + std::to_string(cpu) + "]";
+    }
+    defaultBlacklistProcess =
+        android::base::GetProperty(LLK_BLACKLIST_PROCESS_PROPERTY, defaultBlacklistProcess);
+    llkBlacklistProcess = llkSplit(defaultBlacklistProcess);
+    if (!llkSkipName("[khungtaskd]")) {  // ALWAYS ignore as special
+        llkBlacklistProcess.emplace("[khungtaskd]");
+    }
+    llkBlacklistParent = llkSplit(android::base::GetProperty(
+        LLK_BLACKLIST_PARENT_PROPERTY, std::to_string(kernelPid) + "," +
+                                           std::to_string(kthreaddPid) +
+                                           "," LLK_BLACKLIST_PARENT_DEFAULT));
+    llkBlacklistUid =
+        llkSplit(android::base::GetProperty(LLK_BLACKLIST_UID_PROPERTY, LLK_BLACKLIST_UID_DEFAULT));
+
+    // internal watchdog
+    ::signal(SIGALRM, llkAlarmHandler);
+
+    // kernel hung task configuration? Otherwise leave it as-is
+    if (khtEnable) {
+        // EUID must be AID_ROOT to write to /proc/sys/kernel/ nodes, there
+        // are no capability overrides.  For security reasons we do not want
+        // to run as AID_ROOT.  We may not be able to write them successfully,
+        // we will try, but the least we can do is read the values back to
+        // confirm expectations and report whether configured or not.
+        auto configured = llkWriteStringToFileConfirm(std::to_string(khtTimeout.count()),
+                                                      "/proc/sys/kernel/hung_task_timeout_secs");
+        if (configured) {
+            llkWriteStringToFile("65535", "/proc/sys/kernel/hung_task_warnings");
+            llkWriteStringToFile("65535", "/proc/sys/kernel/hung_task_check_count");
+            configured = llkWriteStringToFileConfirm("1", "/proc/sys/kernel/hung_task_panic");
+        }
+        if (configured) {
+            LOG(INFO) << "[khungtaskd] configured";
+        } else {
+            LOG(WARNING) << "[khungtaskd] not configurable";
+        }
+    }
+
+    bool logConfig = true;
+    if (llkEnable) {
+        // MCL_ONFAULT pins pages as they fault instead of loading
+        // everything immediately all at once.  (Which would be bad,
+        // because as of this writing, we have a lot of mapped pages we
+        // never use.)  Old kernels will see MCL_ONFAULT and fail with
+        // EINVAL; we ignore this failure.
+        //
+        // N.B. read the man page for mlockall.  MCL_CURRENT | MCL_ONFAULT
+        // pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault
+        // in pages.
+        //
+        // CAP_IPC_LOCK required
+        if (llkMlockall && mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) &&
+            (errno != EINVAL)) {
+            PLOG(WARNING) << "mlockall failed ";
+        }
+
+        if (threadname) {
+            pthread_attr_t attr;
+
+            if (!pthread_attr_init(&attr)) {
+                sched_param param;
+
+                // Zeroed scheduling parameters with the SCHED_BATCH policy.
+                memset(&param, 0, sizeof(param));
+                pthread_attr_setschedparam(&attr, &param);
+                pthread_attr_setschedpolicy(&attr, SCHED_BATCH);
+                if (!pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED)) {
+                    pthread_t thread;
+                    if (!pthread_create(&thread, &attr, llkThread, const_cast<char*>(threadname))) {
+                        // wait a second for thread to start
+                        for (auto retry = 50; retry && !llkRunning; --retry) {
+                            ::usleep(20000);
+                        }
+                        logConfig = !llkRunning;  // printed in llkd context?
+                    } else {
+                        LOG(ERROR) << "failed to spawn llkd thread";
+                    }
+                } else {
+                    LOG(ERROR) << "failed to detach llkd thread";
+                }
+                pthread_attr_destroy(&attr);
+            } else {
+                LOG(ERROR) << "failed to allocate attibutes for llkd thread";
+            }
+        }
+    } else {
+        // NOTE(review): this message is tied to llkEnable being false, not to
+        // khtEnable -- confirm that is the intended pairing.
+        LOG(DEBUG) << "[khungtaskd] left unconfigured";
+    }
+    if (logConfig) {
+        llkLogConfig();
+    }
+
+    return llkEnable;
+}
diff --git a/llkd/llkd.cpp b/llkd/llkd.cpp
new file mode 100644
index 0000000..f10253d
--- /dev/null
+++ b/llkd/llkd.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "llkd.h"
+
+#include <sched.h>
+#include <unistd.h>
+
+#include <chrono>
+
+#include <android-base/logging.h>
+
+using namespace std::chrono;
+
+// Stand-alone llkd daemon: initialize the library without a worker thread,
+// drop to the SCHED_BATCH policy and then run checks forever, sleeping for
+// whatever interval llkCheck() asks for.  When llkInit() reports llkd as
+// disabled the process simply parks in pause().
+int main(int, char**) {
+    LOG(INFO) << "started";
+
+    bool enabled = llkInit();
+
+    // Would like this policy to be automatic as part of libllkd,
+    // but that would be presumptuous and bad side-effect.
+    //
+    // Value-initialization zeroes the parameters without relying on memset(),
+    // whose header this file does not include.
+    struct sched_param param = {};
+    sched_setscheduler(0, SCHED_BATCH, &param);
+
+    while (true) {
+        if (enabled) {
+            ::usleep(duration_cast<microseconds>(llkCheck()).count());
+        } else {
+            ::pause();
+        }
+    }
+    // NOTREACHED
+
+    LOG(INFO) << "exiting";
+    return 0;
+}
diff --git a/llkd/llkd.rc b/llkd/llkd.rc
new file mode 100644
index 0000000..a257e76
--- /dev/null
+++ b/llkd/llkd.rc
@@ -0,0 +1,18 @@
+# Configure [khungtaskd]
+#
+# When ro.khungtask.enable is true, program the kernel hung task detector
+# from init.  The timeout comes from ro.khungtask.timeout and falls back to
+# 720 when that property is not set.
+on property:ro.khungtask.enable=true
+ write /proc/sys/kernel/hung_task_timeout_secs ${ro.khungtask.timeout:-720}
+ write /proc/sys/kernel/hung_task_warnings 65535
+ write /proc/sys/kernel/hung_task_check_count 65535
+ write /proc/sys/kernel/hung_task_panic 1
+
+# The service below is marked disabled, so it only runs once ro.llk.enable
+# is true.
+on property:ro.llk.enable=true
+ start llkd
+
+# The live-lock daemon itself.  It runs as the llkd user rather than root;
+# the readproc group is what lets it scan /proc, KILL is used for the test
+# kills and IPC_LOCK for mlockall().  The sysrq-trigger file is opened for
+# writing on the daemon's behalf (presumably for the controlled kernel
+# panic -- confirm against llkPanicKernel).
+service llkd /system/bin/llkd
+ class late_start
+ disabled
+ user llkd
+ group llkd readproc
+ capabilities KILL IPC_LOCK
+ file /proc/sysrq-trigger w
+ writepid /dev/cpuset/system-background/tasks
diff --git a/llkd/tests/Android.bp b/llkd/tests/Android.bp
new file mode 100644
index 0000000..6dd5938
--- /dev/null
+++ b/llkd/tests/Android.bp
@@ -0,0 +1,41 @@
+// Copyright (C) 2018 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// On-device unit test for llkd.  It only pulls in the llkd headers via
+// llkd_headers and does not list libllkd among its libraries.
+cc_test {
+ name: "llkd_unit_test",
+
+ shared_libs: [
+ "libbase",
+ "liblog",
+ ],
+ header_libs: [
+ "llkd_headers",
+ ],
+
+ // The test source is only listed for the android target.
+ target: {
+ android: {
+ srcs: [
+ "llkd_test.cpp",
+ ],
+ },
+ },
+
+ cflags: [
+ "-Wall",
+ "-Wextra",
+ "-Werror",
+ ],
+
+ // presumably restricts the build to the primary ABI -- confirm against
+ // the Soong documentation.
+ compile_multilib: "first",
+}
diff --git a/llkd/tests/llkd_test.cpp b/llkd/tests/llkd_test.cpp
new file mode 100644
index 0000000..e3c95eb
--- /dev/null
+++ b/llkd/tests/llkd_test.cpp
@@ -0,0 +1,243 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <signal.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <chrono>
+#include <iostream>
+#include <string>
+
+#include <android-base/properties.h>
+#include <gtest/gtest.h>
+#include <log/log_time.h> // for MS_PER_SEC and US_PER_SEC
+
+#include "llkd.h"
+
+using namespace std::chrono;
+using namespace std::chrono_literals;
+
+namespace {
+
+// Read an unsigned property as a millisecond duration, using def when the
+// property is not usable and bounding it by the largest representable
+// millisecond count.
+milliseconds GetUintProperty(const std::string& key, milliseconds def) {
+    const auto fallback = static_cast<uint64_t>(def.count());
+    const auto ceiling = static_cast<uint64_t>(milliseconds::max().count());
+    return milliseconds(android::base::GetUintProperty(key, fallback, ceiling));
+}
+
+// Read an unsigned property as a duration in seconds, using def when the
+// property is not usable and bounding it by the largest representable
+// second count.
+seconds GetUintProperty(const std::string& key, seconds def) {
+    const auto fallback = static_cast<uint64_t>(def.count());
+    const auto ceiling = static_cast<uint64_t>(seconds::max().count());
+    return seconds(android::base::GetUintProperty(key, fallback, ceiling));
+}
+
+// GTEST_LOG_(WARNING) output is fugly, this has much less noise
+// ToDo: look into fixing googletest to produce output that matches style of
+// all the other status messages, and can switch off __line__ and
+// __function__ noise
+#define GTEST_LOG_WARNING std::cerr << "[ WARNING ] "
+#define GTEST_LOG_INFO std::cerr << "[ INFO ] "
+
+// Properties is _not_ a high performance ABI!  Pause for a fifth of a
+// second so that a property or service change has time to take effect.
+void rest() {
+    usleep(duration_cast<microseconds>(200ms).count());
+}
+
+// Run a shell command with root privileges.  When already root the command
+// is run directly; when not root, or when the direct attempt reports a
+// failure, it is run through "su root" instead.
+void execute(const char* command) {
+    const bool directlyOk = !getuid() && !system(command);
+    if (directlyOk) {
+        return;
+    }
+    const std::string viaSu = std::string("su root ") + command;
+    system(viaSu.c_str());
+}
+
+// Put llkd into a known configuration and work out how long a test has to
+// keep a stuck process around for llkd to act on it.
+//
+// If the timeout / check properties do not read back as 120s / 10s, llkd is
+// stopped and the properties are rewritten; llkd is then started whenever
+// the enable property reads true.  The reaction time is derived from the
+// same properties llkd consults, using the 'Z' timeout when state is 'Z' and
+// the 'D' timeout otherwise, plus one check period.  The value returned
+// carries a 33% margin on top of that.
+seconds llkdSleepPeriod(char state) {
+    auto enabled = android::base::GetBoolProperty(LLK_ENABLE_PROPERTY, LLK_ENABLE_DEFAULT);
+    if (android::base::GetProperty(LLK_ENABLE_PROPERTY, "nothing") == "nothing") {
+        GTEST_LOG_INFO << LLK_ENABLE_PROPERTY " defaults to " << (enabled ? "true" : "false")
+                       << "\n";
+    }
+
+    // Hail Mary hope is unconfigured.
+    const bool reconfigure =
+        (GetUintProperty(LLK_TIMEOUT_MS_PROPERTY, LLK_TIMEOUT_MS_DEFAULT) !=
+         duration_cast<milliseconds>(120s)) ||
+        (GetUintProperty(LLK_CHECK_MS_PROPERTY,
+                         LLK_TIMEOUT_MS_DEFAULT / LLK_CHECKS_PER_TIMEOUT_DEFAULT) !=
+         duration_cast<milliseconds>(10s));
+    if (reconfigure) {
+        // Each property write is followed by a pause to let it settle.
+        const auto setprop = [](const char* property, const char* spaceAndValue) {
+            execute((std::string("setprop ") + property + spaceAndValue).c_str());
+            rest();
+        };
+        execute("stop llkd");
+        rest();
+        setprop(LLK_TIMEOUT_MS_PROPERTY, " 120000");
+        setprop(KHT_TIMEOUT_PROPERTY, " 130");
+        setprop(LLK_CHECK_MS_PROPERTY, " 10000");
+        setprop(LLK_ENABLE_PROPERTY, " true");
+    }
+
+    enabled = android::base::GetBoolProperty(LLK_ENABLE_PROPERTY, false);
+    if (!enabled) {
+        GTEST_LOG_WARNING << "llkd disabled\n";
+    } else {
+        execute("start llkd");
+        rest();
+        GTEST_LOG_INFO << "llkd enabled\n";
+    }
+
+    /* KISS mirror of the timeout derivation in llkInit() */
+    milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT;
+    seconds khtTimeout = duration_cast<seconds>(
+        llkTimeoutMs * (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT) / LLK_CHECKS_PER_TIMEOUT_DEFAULT);
+    khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout);
+    llkTimeoutMs =
+        khtTimeout * LLK_CHECKS_PER_TIMEOUT_DEFAULT / (1 + LLK_CHECKS_PER_TIMEOUT_DEFAULT);
+    llkTimeoutMs = GetUintProperty(LLK_TIMEOUT_MS_PROPERTY, llkTimeoutMs);
+    if (llkTimeoutMs < LLK_TIMEOUT_MS_MINIMUM) {
+        llkTimeoutMs = LLK_TIMEOUT_MS_MINIMUM;
+    }
+    milliseconds checkPeriod = llkTimeoutMs / LLK_CHECKS_PER_TIMEOUT_DEFAULT;
+
+    // Timeout for the state under test, defaulting to the general timeout.
+    const char* stateProperty =
+        (state == 'Z') ? LLK_Z_TIMEOUT_MS_PROPERTY : LLK_D_TIMEOUT_MS_PROPERTY;
+    auto stateTimeout = GetUintProperty(stateProperty, llkTimeoutMs);
+    if (stateTimeout < LLK_TIMEOUT_MS_MINIMUM) {
+        stateTimeout = LLK_TIMEOUT_MS_MINIMUM;
+    }
+
+    if (checkPeriod > stateTimeout) {
+        checkPeriod = stateTimeout;
+    }
+    checkPeriod = GetUintProperty(LLK_CHECK_MS_PROPERTY, checkPeriod);
+    stateTimeout += checkPeriod;
+
+    auto reaction = duration_cast<seconds>(stateTimeout);
+    if (reaction == 0s) {
+        reaction = 1s;
+    } else if (reaction > 59s) {
+        GTEST_LOG_WARNING << "llkd is configured for about "
+                          << duration_cast<minutes>(reaction).count() << " minutes to react\n";
+    }
+
+    // 33% margin for the test to naturally timeout waiting for llkd to respond
+    return (reaction * 4 + 2s) / 3;
+}
+
+// Reap child_pid and verify that it was terminated by SIGKILL rather than
+// exiting on its own; a normal exit is reported together with its exit
+// status.
+inline void waitForPid(pid_t child_pid) {
+ int wstatus;
+ ASSERT_LE(0, waitpid(child_pid, &wstatus, 0));
+ EXPECT_FALSE(WIFEXITED(wstatus)) << "[ INFO ] exit=" << WEXITSTATUS(wstatus);
+ ASSERT_TRUE(WIFSIGNALED(wstatus));
+ ASSERT_EQ(WTERMSIG(wstatus), SIGKILL);
+}
+
+} // namespace
+
+// The tests that use this helper are to simulate processes stuck in 'D'
+// state that are experiencing forward scheduled progress. As such the
+// expectation is that llkd will _not_ perform any mitigations. The sleepfor
+// argument helps us set the amount of forward scheduler progress.
+// Drive a forked child through repeated vfork() cycles, each holding the
+// child in 'D' state for sleepfor, for roughly one llkd reaction period in
+// total.  Because the child keeps making scheduling progress between the
+// cycles, llkd is expected to leave it alone: the child must exit normally
+// with status 0 and must not be signalled.  The test is skipped with a
+// warning when the llkd reaction period is not longer than sleepfor.
+static void llkd_driver_ABA(const microseconds sleepfor) {
+    const auto period = llkdSleepPeriod('D');
+    if (period <= sleepfor) {
+        GTEST_LOG_WARNING << "llkd configuration too short for "
+                          << duration_cast<milliseconds>(sleepfor).count() << "ms work cycle\n";
+        return;
+    }
+
+    auto child_pid = fork();
+    ASSERT_LE(0, child_pid);
+    int wstatus;
+    if (!child_pid) {
+        auto ratio = period / sleepfor;
+        ASSERT_LT(0, ratio);
+        // vfork() parent is uninterruptible D state waiting for child to exec()
+        while (--ratio > 0) {
+            auto driver_pid = vfork();
+            ASSERT_LE(0, driver_pid);
+            if (driver_pid) {  // parent
+                waitpid(driver_pid, &wstatus, 0);
+                if (!WIFEXITED(wstatus)) {
+                    exit(42);
+                }
+                if (WEXITSTATUS(wstatus) != 42) {
+                    exit(42);
+                }
+            } else {
+                usleep(sleepfor.count());
+                // A vfork() child shares its parent's address space, so it
+                // must leave through _exit(): exit() would run the atexit
+                // handlers and flush stdio on behalf of the parent.
+                _exit(42);
+            }
+        }
+        exit(0);
+    }
+    ASSERT_LE(0, waitpid(child_pid, &wstatus, 0));
+    EXPECT_TRUE(WIFEXITED(wstatus));
+    if (WIFEXITED(wstatus)) {
+        EXPECT_EQ(0, WEXITSTATUS(wstatus));
+    }
+    ASSERT_FALSE(WIFSIGNALED(wstatus)) << "[ INFO ] signo=" << WTERMSIG(wstatus);
+}
+
+// 'D' state bursts of 5ms each: llkd must not intervene.
+TEST(llkd, driver_ABA_fast) {
+ llkd_driver_ABA(5ms);
+}
+
+// 'D' state bursts of one second each: llkd must not intervene.
+TEST(llkd, driver_ABA_slow) {
+ llkd_driver_ABA(1s);
+}
+
+// 'D' state bursts of one minute each: llkd must not intervene.  Skipped by
+// the helper when llkd is configured to react in a minute or less.
+TEST(llkd, driver_ABA_glacial) {
+ llkd_driver_ABA(1min);
+}
+
+// Following tests must be last in this file to capture possible errant
+// kernel_panic mitigation failure.
+
+// The following tests simulate processes stuck in 'Z' or 'D' state with
+// no forward scheduling progress, but interruptible. As such the expectation
+// is that llkd will perform kill mitigation and not progress to kernel_panic.
+
+// A child forks a grandchild that exits after one second and is never
+// reaped, leaving a zombie behind while the child sleeps for the whole llkd
+// reaction period.  The child is expected to be terminated by SIGKILL
+// before it can reach its own exit(42).
+TEST(llkd, zombie) {
+ const auto period = llkdSleepPeriod('Z');
+
+ /* Create a Persistent Zombie Process */
+ pid_t child_pid = fork();
+ ASSERT_LE(0, child_pid);
+ if (!child_pid) {
+ auto zombie_pid = fork();
+ ASSERT_LE(0, zombie_pid);
+ if (!zombie_pid) {
+ // grandchild: exit shortly and stay unreaped
+ sleep(1);
+ exit(0);
+ }
+ // child: never waits for the grandchild
+ sleep(period.count());
+ exit(42);
+ }
+
+ waitForPid(child_pid);
+}
+
+// A child is held in 'D' state by a vfork() whose own child just sleeps for
+// the whole llkd reaction period instead of calling exec().  With no forward
+// scheduling progress llkd is expected to terminate the process with
+// SIGKILL, which waitForPid() verifies.
+TEST(llkd, driver) {
+    const auto period = llkdSleepPeriod('D');
+
+    /* Create a Persistent Device Process */
+    auto child_pid = fork();
+    ASSERT_LE(0, child_pid);
+    if (!child_pid) {
+        // vfork() parent is uninterruptible D state waiting for child to exec()
+        auto driver_pid = vfork();
+        ASSERT_LE(0, driver_pid);
+        sleep(period.count());
+        if (!driver_pid) {
+            // A vfork() child shares its parent's address space, so it must
+            // leave through _exit(): exit() would run the atexit handlers and
+            // flush stdio on behalf of the parent.
+            _exit(0);
+        }
+        exit(42);
+    }
+
+    waitForPid(child_pid);
+}