Merge "llkd: bootstat: propagate detailed livelock canonical boot reason"
am: 481a8125a6
Change-Id: I161063e3c33209585d3bbafbd7ee0d1a0c5f151f
diff --git a/bootstat/bootstat.cpp b/bootstat/bootstat.cpp
index c2688e9..8ce9dfc 100644
--- a/bootstat/bootstat.cpp
+++ b/bootstat/bootstat.cpp
@@ -303,6 +303,9 @@
{"kernel_panic,init", 158},
{"kernel_panic,oom", 159},
{"kernel_panic,stack", 160},
+ {"kernel_panic,sysrq,livelock,alarm", 161}, // llkd
+ {"kernel_panic,sysrq,livelock,driver", 162}, // llkd
+ {"kernel_panic,sysrq,livelock,zombie", 163}, // llkd
};
// Converts a string value representing the reason the system booted to an
diff --git a/llkd/README.md b/llkd/README.md
index 71319c8..b2ba2a2 100644
--- a/llkd/README.md
+++ b/llkd/README.md
@@ -53,7 +53,9 @@
conditions. If the test can, it will reconfigure llkd to expedite the test
duration by adjusting the ro.llk.* Android properties. Tests run the D state
with some scheduling progress to ensure that ABA checking prevents false
-triggers.
+triggers. If ABA checking is 100% reliable on the platform, ro.llk.killtest
+can be set to false; however, this will cause some of the unit tests to panic
+the kernel instead of exercising the more graceful kill operation.
Android Properties
------------------
@@ -108,13 +110,6 @@
Architectural Concerns
----------------------
-- Figure out how to communicate the kernel panic better to bootstat canonical
- boot reason determination. This may require an alteration to bootstat, or
- some logging from llkd. Would like to see boot reason to be
- watchdog,livelock as a minimum requirement. Or more specifically would want
- watchdog,livelock,device or watchdog,livelock,zombie be reported.
- Currently reports panic,sysrq (user requested panic) or panic depending on
- system support of pstore.
- Create kernel module and associated gTest to actually test panic.
- Create gTest to test out blacklist (ro.llk.blacklist.<properties> generally
not be inputs). Could require more test-only interfaces to libllkd.
diff --git a/llkd/include/llkd.h b/llkd/include/llkd.h
index bd0739b..e3ae4bb 100644
--- a/llkd/include/llkd.h
+++ b/llkd/include/llkd.h
@@ -37,6 +37,8 @@
#define KHT_ENABLE_PROPERTY "ro." KHT_ENABLE_WRITEABLE_PROPERTY
#define LLK_MLOCKALL_PROPERTY "ro.llk.mlockall"
#define LLK_MLOCKALL_DEFAULT true
+#define LLK_KILLTEST_PROPERTY "ro.llk.killtest"
+#define LLK_KILLTEST_DEFAULT true
#define LLK_TIMEOUT_MS_PROPERTY "ro.llk.timeout_ms"
#define KHT_TIMEOUT_PROPERTY "ro.khungtask.timeout"
#define LLK_D_TIMEOUT_MS_PROPERTY "ro.llk.D.timeout_ms"
diff --git a/llkd/libllkd.cpp b/llkd/libllkd.cpp
index d828105..f357cc2 100644
--- a/llkd/libllkd.cpp
+++ b/llkd/libllkd.cpp
@@ -70,6 +70,7 @@
bool llkEnable = LLK_ENABLE_DEFAULT; // llk daemon enabled
bool llkRunning = false; // thread is running
bool llkMlockall = LLK_MLOCKALL_DEFAULT; // run mlocked
+bool llkTestWithKill = LLK_KILLTEST_DEFAULT; // issue test kills
milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT; // default timeout
enum { llkStateD, llkStateZ, llkNumStates }; // state indexes
milliseconds llkStateTimeoutMs[llkNumStates]; // timeout override for each detection state
@@ -292,7 +293,7 @@
exeMissingValid(false),
cmdlineValid(false),
updated(true),
- killed(false) {
+ killed(!llkTestWithKill) {
memset(comm, '\0', sizeof(comm));
setComm(_comm);
}
@@ -475,8 +476,8 @@
return android::base::Trim(content) == string;
}
-void llkPanicKernel(bool dump, pid_t tid) __noreturn;
-void llkPanicKernel(bool dump, pid_t tid) {
+void llkPanicKernel(bool dump, pid_t tid, const char* state) __noreturn;
+void llkPanicKernel(bool dump, pid_t tid, const char* state) {
auto sysrqTriggerFd = llkFileToWriteFd("/proc/sysrq-trigger");
if (sysrqTriggerFd < 0) {
// DYB
@@ -496,6 +497,8 @@
}
::usleep(200000); // let everything settle
}
+ llkWriteStringToFile(std::string("SysRq : Trigger a crash : 'livelock,") + state + "'\n",
+ "/dev/kmsg");
android::base::WriteStringToFd("c", sysrqTriggerFd);
// NOTREACHED
// DYB
@@ -507,7 +510,7 @@
}
void llkAlarmHandler(int) {
- llkPanicKernel(false, ::getpid());
+ llkPanicKernel(false, ::getpid(), "alarm");
}
milliseconds GetUintProperty(const std::string& key, milliseconds def) {
@@ -686,7 +689,7 @@
(val != procp->nrSwitches)) {
procp->nrSwitches = val;
procp->count = 0ms;
- procp->killed = false;
+ procp->killed = !llkTestWithKill;
}
return;
}
@@ -700,7 +703,7 @@
if (schedUpdate != procp->schedUpdate) {
procp->schedUpdate = schedUpdate;
procp->count = 0ms;
- procp->killed = false;
+ procp->killed = !llkTestWithKill;
}
}
@@ -709,7 +712,7 @@
if (static_cast<uint64_t>(val) != procp->nrSwitches) {
procp->nrSwitches = val;
procp->count = 0ms;
- procp->killed = false;
+ procp->killed = !llkTestWithKill;
}
}
}
@@ -719,6 +722,7 @@
<< LLK_ENABLE_PROPERTY "=" << llkFormat(llkEnable) << "\n"
<< KHT_ENABLE_PROPERTY "=" << llkFormat(khtEnable) << "\n"
<< LLK_MLOCKALL_PROPERTY "=" << llkFormat(llkMlockall) << "\n"
+ << LLK_KILLTEST_PROPERTY "=" << llkFormat(llkTestWithKill) << "\n"
<< KHT_TIMEOUT_PROPERTY "=" << llkFormat(khtTimeout) << "\n"
<< LLK_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkTimeoutMs) << "\n"
<< LLK_D_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateD]) << "\n"
@@ -869,7 +873,7 @@
procp->time = utime + stime;
if (procp->state != state) {
procp->count = 0ms;
- procp->killed = false;
+ procp->killed = !llkTestWithKill;
procp->state = state;
} else {
procp->count += llkCycle;
@@ -973,7 +977,7 @@
// We are here because we have confirmed kernel live-lock
LOG(ERROR) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->" << pid
<< "->" << tid << ' ' << procp->getComm() << " [panic]";
- llkPanicKernel(true, tid);
+ llkPanicKernel(true, tid, (state == 'Z') ? "zombie" : "driver");
}
LOG(VERBOSE) << "+closedir()";
}
@@ -1045,6 +1049,7 @@
}
khtEnable = android::base::GetBoolProperty(KHT_ENABLE_PROPERTY, khtEnable);
llkMlockall = android::base::GetBoolProperty(LLK_MLOCKALL_PROPERTY, llkMlockall);
+ llkTestWithKill = android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, llkTestWithKill);
// if LLK_TIMOUT_MS_PROPERTY was not set, we will use a set
// KHT_TIMEOUT_PROPERTY as co-operative guidance for the default value.
khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout);
diff --git a/llkd/llkd.rc b/llkd/llkd.rc
index f762a5c..e538cdb 100644
--- a/llkd/llkd.rc
+++ b/llkd/llkd.rc
@@ -44,5 +44,6 @@
user llkd
group llkd readproc
capabilities KILL IPC_LOCK
+ file /dev/kmsg w
file /proc/sysrq-trigger w
writepid /dev/cpuset/system-background/tasks
diff --git a/llkd/tests/llkd_test.cpp b/llkd/tests/llkd_test.cpp
index 2de1820..3a15ff1 100644
--- a/llkd/tests/llkd_test.cpp
+++ b/llkd/tests/llkd_test.cpp
@@ -154,6 +154,27 @@
ASSERT_EQ(WTERMSIG(wstatus), SIGKILL);
}
+bool checkKill(const char* reason) {  // returns true when the calling test should return early
+ if (android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, LLK_KILLTEST_DEFAULT)) {
+ return false;  // ro.llk.killtest is true (or unset, default true): run the test normally
+ }
+ auto bootreason = android::base::GetProperty("sys.boot.reason", "nothing");
+ if (bootreason == reason) {  // sys.boot.reason already equals the expected canonical reason
+ GTEST_LOG_INFO << "Expected test result confirmed " << reason << "\n";
+ return true;
+ }
+ GTEST_LOG_WARNING << "Expected test result is " << reason << "\n";
+
+ // apct adjustment if needed (set LLK_KILLTEST_PROPERTY to "off" to allow test)
+ //
+ // if (android::base::GetProperty(LLK_KILLTEST_PROPERTY, "") == "false") {
+ // GTEST_LOG_WARNING << "Bypassing test\n";
+ // return true;
+ // }
+
+ return false;  // boot reason did not match: the caller continues into its test body
+}
+
} // namespace
// The tests that use this helper are to simulate processes stuck in 'D'
@@ -221,6 +242,10 @@
// is that llkd will perform kill mitigation and not progress to kernel_panic.
TEST(llkd, zombie) {
+ if (checkKill("kernel_panic,sysrq,livelock,zombie")) {
+ return;
+ }
+
const auto period = llkdSleepPeriod('Z');
/* Create a Persistent Zombie Process */
@@ -241,6 +266,10 @@
}
TEST(llkd, driver) {
+ if (checkKill("kernel_panic,sysrq,livelock,driver")) {
+ return;
+ }
+
const auto period = llkdSleepPeriod('D');
/* Create a Persistent Device Process */