lmkd: Introduce kill strategy based on zone watermarks, swap and thrashing
Add new kill strategy which makes kill decisions based on which zone
watermark is breached, how much free swap space is still available and
what percentage of the file-backed page cache has been refaulted. This mode
is designed to be used only with PSI signals. It kills unconditionally when
a critical pressure event is received, therefore PSI stall for that event
should be set to a value representing a truly non-responding system
(currently set to 700ms out of 1sec spent in complete stall). The new event
handler also controls the polling interval based on current memory conditions.
Bug: 132642304
Test: lmkd_unit_test, ACT memory pressure tests
Change-Id: Ia213ef2bb06b245d651ebf2d813e944b4ae7565f
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
diff --git a/lmkd/README.md b/lmkd/README.md
index 656a6ea..a735955 100644
--- a/lmkd/README.md
+++ b/lmkd/README.md
@@ -60,6 +60,23 @@
any eligible task (fast decision). Default = false
ro.lmk.kill_timeout_ms: duration in ms after a kill when no additional
- kill will be done, Default = 0 (disabled)
+ kill will be done. Default = 0 (disabled)
ro.lmk.debug: enable lmkd debug logs, Default = false
+
+ ro.lmk.swap_free_low_percentage: level of free swap as a percentage of the
+ total swap space used as a threshold to consider
+ the system as swap space starved. Default for
+ low-RAM devices = 10, for high-end devices = 20
+
+ ro.lmk.thrashing_limit: number of workingset refaults as a percentage of
+ the file-backed pagecache size used as a threshold
+ to consider the system to be thrashing its pagecache.
+ Default for low-RAM devices = 30, for high-end
+ devices = 100
+
+ ro.lmk.thrashing_limit_decay: thrashing threshold decay expressed as a
+ percentage of the original threshold used to lower
+ the threshold when the system does not recover even
+ after a kill. Default for low-RAM devices = 50,
+ for high-end devices = 10
diff --git a/lmkd/lmkd.c b/lmkd/lmkd.c
index 04662fe..221fbc7 100644
--- a/lmkd/lmkd.c
+++ b/lmkd/lmkd.c
@@ -79,6 +79,7 @@
#define MEMCG_MEMORYSW_USAGE "/dev/memcg/memory.memsw.usage_in_bytes"
#define ZONEINFO_PATH "/proc/zoneinfo"
#define MEMINFO_PATH "/proc/meminfo"
+#define VMSTAT_PATH "/proc/vmstat"
#define PROC_STATUS_TGID_FIELD "Tgid:"
#define LINE_MAX 128
@@ -110,13 +111,29 @@
* PSI_WINDOW_SIZE_MS after the event happens.
*/
#define PSI_WINDOW_SIZE_MS 1000
-/* Polling period after initial PSI signal */
-#define PSI_POLL_PERIOD_MS 10
+/* Polling period after PSI signal when pressure is high */
+#define PSI_POLL_PERIOD_SHORT_MS 10
+/* Polling period after PSI signal when pressure is low */
+#define PSI_POLL_PERIOD_LONG_MS 100
#define min(a, b) (((a) < (b)) ? (a) : (b))
+#define max(a, b) (((a) > (b)) ? (a) : (b))
#define FAIL_REPORT_RLIMIT_MS 1000
+/*
+ * System property defaults
+ */
+/* ro.lmk.swap_free_low_percentage property defaults */
+#define DEF_LOW_SWAP_LOWRAM 10
+#define DEF_LOW_SWAP 20
+/* ro.lmk.thrashing_limit property defaults */
+#define DEF_THRASHING_LOWRAM 30
+#define DEF_THRASHING 100
+/* ro.lmk.thrashing_limit_decay property defaults */
+#define DEF_THRASHING_DECAY_LOWRAM 50
+#define DEF_THRASHING_DECAY 10
+
/* default to old in-kernel interface if no memory pressure events */
static bool use_inkernel_interface = true;
static bool has_inkernel_module;
@@ -157,6 +174,8 @@
static bool use_minfree_levels;
static bool per_app_memcg;
static int swap_free_low_percentage;
+static int thrashing_limit_pct;
+static int thrashing_limit_decay_pct;
static bool use_psi_monitors = false;
static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = {
{ PSI_SOME, 70 }, /* 70ms out of 1sec for partial stall */
@@ -390,6 +409,41 @@
int64_t arr[MI_FIELD_COUNT];
};
+/*
+ * Fields to parse in /proc/vmstat. Enumerator order must match both the
+ * entries in vmstat_field_names[] and the members of union vmstat, since
+ * parsed values are stored by index into vmstat.arr[].
+ */
+enum vmstat_field {
+    VS_FREE_PAGES,
+    VS_INACTIVE_FILE,
+    VS_ACTIVE_FILE,
+    VS_WORKINGSET_REFAULT,
+    VS_PGSCAN_KSWAPD,
+    VS_PGSCAN_DIRECT,
+    VS_PGSCAN_DIRECT_THROTTLE,
+    /* keep last: count of the fields above */
+    VS_FIELD_COUNT
+};
+
+/*
+ * Names of the /proc/vmstat fields we parse, indexed by enum vmstat_field.
+ * Fix: the array was sized MI_FIELD_COUNT (the /proc/meminfo field count)
+ * instead of VS_FIELD_COUNT; that only compiles while the two counts happen
+ * to be compatible and would silently misbehave if either enum changes.
+ */
+static const char* const vmstat_field_names[VS_FIELD_COUNT] = {
+    "nr_free_pages",
+    "nr_inactive_file",
+    "nr_active_file",
+    "workingset_refault",
+    "pgscan_kswapd",
+    "pgscan_direct",
+    "pgscan_direct_throttle",
+};
+
+/*
+ * Snapshot of the /proc/vmstat counters tracked by lmkd. arr[] aliases the
+ * named struct members so vmstat_parse_line() can store values by
+ * enum vmstat_field index; member order must match that enum.
+ */
+union vmstat {
+    struct {
+        int64_t nr_free_pages;
+        int64_t nr_inactive_file;
+        int64_t nr_active_file;
+        int64_t workingset_refault;
+        int64_t pgscan_kswapd;
+        int64_t pgscan_direct;
+        int64_t pgscan_direct_throttle;
+    } field;
+    int64_t arr[VS_FIELD_COUNT];
+};
+
enum field_match_result {
NO_MATCH,
PARSE_FAIL,
@@ -445,6 +499,10 @@
static char* proc_get_name(int pid);
static void poll_kernel();
+/* Constrain value to the inclusive range [low, high]. */
+static int clamp(int low, int high, int value) {
+    int capped = min(value, high);
+    return max(capped, low);
+}
+
static bool parse_int64(const char* str, int64_t* ret) {
char* endptr;
long long val = strtoll(str, &endptr, 10);
@@ -1248,7 +1306,7 @@
#endif
/*
- * /prop/zoneinfo parsing routines
+ * /proc/zoneinfo parsing routines
* Expected file format is:
*
* Node <node_id>, zone <zone_name>
@@ -1442,7 +1500,7 @@
return 0;
}
-/* /prop/meminfo parsing routines */
+/* /proc/meminfo parsing routines */
static bool meminfo_parse_line(char *line, union meminfo *mi) {
char *cp = line;
char *ap;
@@ -1497,6 +1555,59 @@
return 0;
}
+/* /proc/vmstat parsing routines */
+static bool vmstat_parse_line(char *line, union vmstat *vs) {
+ char *cp;
+ char *ap;
+ char *save_ptr;
+ int64_t val;
+ int field_idx;
+ enum field_match_result match_res;
+
+ cp = strtok_r(line, " ", &save_ptr);
+ if (!cp) {
+ return false;
+ }
+
+ ap = strtok_r(NULL, " ", &save_ptr);
+ if (!ap) {
+ return false;
+ }
+
+ match_res = match_field(cp, ap, vmstat_field_names, VS_FIELD_COUNT,
+ &val, &field_idx);
+ if (match_res == PARSE_SUCCESS) {
+ vs->arr[field_idx] = val;
+ }
+ return (match_res != PARSE_FAIL);
+}
+
+/*
+ * Snapshot /proc/vmstat into *vs. Fields not present in the file stay
+ * zeroed (memset below). The static reread_data keeps the file descriptor
+ * open across calls to avoid reopening on every polling cycle.
+ * Returns 0 on success, -1 on read or parse failure.
+ */
+static int vmstat_parse(union vmstat *vs) {
+    static struct reread_data file_data = {
+        .filename = VMSTAT_PATH,
+        .fd = -1,
+    };
+    char *buf;
+    char *save_ptr;
+    char *line;
+
+    memset(vs, 0, sizeof(union vmstat));
+
+    if ((buf = reread_file(&file_data)) == NULL) {
+        return -1;
+    }
+
+    for (line = strtok_r(buf, "\n", &save_ptr); line;
+         line = strtok_r(NULL, "\n", &save_ptr)) {
+        if (!vmstat_parse_line(line, vs)) {
+            ALOGE("%s parse error", file_data.filename);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
static void meminfo_log(union meminfo *mi) {
for (int field_idx = 0; field_idx < MI_FIELD_COUNT; field_idx++) {
android_log_write_int32(ctx, (int32_t)min(mi->arr[field_idx] * page_k, INT32_MAX));
@@ -1833,6 +1944,219 @@
return false;
}
+/*
+ * Zone watermark breach levels, ordered from most severe (WMARK_MIN)
+ * to least severe (WMARK_NONE). The numeric ordering is relied upon by
+ * comparisons such as "wmark < WMARK_LOW" in mp_event_psi().
+ */
+enum zone_watermark {
+    WMARK_MIN = 0,
+    WMARK_LOW,
+    WMARK_HIGH,
+    WMARK_NONE
+};
+
+/*
+ * Returns the lowest (most severe) breached watermark across all zones of
+ * all nodes, or WMARK_NONE when every present zone is above its high
+ * watermark. Free CMA pages are excluded from usable free memory, and each
+ * watermark is raised by the zone's max_protection (lowmem reserve).
+ */
+static enum zone_watermark get_lowest_watermark(struct zoneinfo *zi)
+{
+    enum zone_watermark wmark = WMARK_NONE;
+
+    for (int node_idx = 0; node_idx < zi->node_count; node_idx++) {
+        struct zoneinfo_node *node = &zi->nodes[node_idx];
+
+        for (int zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
+            struct zoneinfo_zone *zone = &node->zones[zone_idx];
+            /*
+             * NOTE(review): plain int may truncate if the zoneinfo fields are
+             * 64-bit; per-zone page counts presumably fit in int on supported
+             * devices — confirm.
+             */
+            int zone_free_mem;
+
+            if (!zone->fields.field.present) {
+                continue;
+            }
+
+            zone_free_mem = zone->fields.field.nr_free_pages - zone->fields.field.nr_free_cma;
+            if (zone_free_mem > zone->max_protection + zone->fields.field.high) {
+                /* zone is above its high watermark */
+                continue;
+            }
+            if (zone_free_mem > zone->max_protection + zone->fields.field.low) {
+                /* high watermark breached; keep only if more severe than current */
+                if (wmark > WMARK_HIGH) wmark = WMARK_HIGH;
+                continue;
+            }
+            if (zone_free_mem > zone->max_protection + zone->fields.field.min) {
+                /* low watermark breached */
+                if (wmark > WMARK_LOW) wmark = WMARK_LOW;
+                continue;
+            }
+            /* min watermark breached: most severe, no need to look further down */
+            wmark = WMARK_MIN;
+        }
+    }
+
+    return wmark;
+}
+
+/*
+ * PSI event/polling handler implementing the kill strategy based on zone
+ * watermarks, free swap and pagecache thrashing. data carries the
+ * vmpressure level; events is nonzero when invoked for a fresh PSI event
+ * (presumably zero on subsequent polling cycles — see mp_event_common).
+ * Static locals preserve reclaim baselines and kill state between calls.
+ * Also adjusts poll_params to control the polling cadence.
+ */
+static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_params) {
+    enum kill_reasons {
+        NONE = -1, /* To denote no kill condition */
+        PRESSURE_AFTER_KILL = 0,
+        NOT_RESPONDING,
+        LOW_SWAP_AND_THRASHING,
+        LOW_MEM_AND_SWAP,
+        LOW_MEM_AND_THRASHING,
+        DIRECT_RECL_AND_THRASHING,
+        KILL_REASON_COUNT
+    };
+    enum reclaim_state {
+        NO_RECLAIM = 0,
+        KSWAPD_RECLAIM,
+        DIRECT_RECLAIM,
+    };
+    /* Baselines and state carried across invocations */
+    static int64_t init_ws_refault;
+    static int64_t base_file_lru;
+    static int64_t init_pgscan_kswapd;
+    static int64_t init_pgscan_direct;
+    static int64_t swap_low_threshold;
+    static bool killing;
+    static int thrashing_limit;
+    static bool in_reclaim;
+
+    union meminfo mi;
+    union vmstat vs;
+    struct zoneinfo zi;
+    struct timespec curr_tm;
+    int64_t thrashing = 0;
+    bool swap_is_low = false;
+    enum vmpressure_level level = (enum vmpressure_level)data;
+    enum kill_reasons kill_reason = NONE;
+    bool cycle_after_kill = false;
+    enum reclaim_state reclaim = NO_RECLAIM;
+    enum zone_watermark wmark = WMARK_NONE;
+
+    /* Skip while still killing a process */
+    if (is_kill_pending()) {
+        /* TODO: replace this quick polling with pidfd polling if kernel supports */
+        goto no_kill;
+    }
+
+    /* NOTE(review): curr_tm is captured but never read in this function —
+     * confirm whether it is needed or can be removed. */
+    if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
+        ALOGE("Failed to get current time");
+        return;
+    }
+
+    if (vmstat_parse(&vs) < 0) {
+        ALOGE("Failed to parse vmstat!");
+        return;
+    }
+
+    if (meminfo_parse(&mi) < 0) {
+        ALOGE("Failed to parse meminfo!");
+        return;
+    }
+
+    /* Reset states after process got killed */
+    if (killing) {
+        killing = false;
+        cycle_after_kill = true;
+        /* Reset file-backed pagecache size and refault amounts after a kill */
+        base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
+        init_ws_refault = vs.field.workingset_refault;
+    }
+
+    /* Check free swap levels */
+    if (swap_free_low_percentage) {
+        if (!swap_low_threshold) {
+            /* Computed once; assumes total swap size does not change at runtime */
+            swap_low_threshold = mi.field.total_swap * swap_free_low_percentage / 100;
+        }
+        swap_is_low = mi.field.free_swap < swap_low_threshold;
+    }
+
+    /* Identify reclaim state from pgscan counter deltas */
+    if (vs.field.pgscan_direct > init_pgscan_direct) {
+        init_pgscan_direct = vs.field.pgscan_direct;
+        init_pgscan_kswapd = vs.field.pgscan_kswapd;
+        reclaim = DIRECT_RECLAIM;
+    } else if (vs.field.pgscan_kswapd > init_pgscan_kswapd) {
+        init_pgscan_kswapd = vs.field.pgscan_kswapd;
+        reclaim = KSWAPD_RECLAIM;
+    } else {
+        in_reclaim = false;
+        /* Skip if system is not reclaiming */
+        goto no_kill;
+    }
+
+    if (!in_reclaim) {
+        /* Record file-backed pagecache size when entering reclaim cycle */
+        base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
+        init_ws_refault = vs.field.workingset_refault;
+        thrashing_limit = thrashing_limit_pct;
+    } else {
+        /* Calculate what % of the file-backed pagecache refaulted so far */
+        /*
+         * NOTE(review): divides by base_file_lru, which is 0 if the file LRU
+         * was empty when the reclaim cycle started — confirm this is
+         * unreachable or add a guard.
+         */
+        thrashing = (vs.field.workingset_refault - init_ws_refault) * 100 / base_file_lru;
+    }
+    in_reclaim = true;
+
+    /* Find out which watermark is breached if any */
+    if (zoneinfo_parse(&zi) < 0) {
+        ALOGE("Failed to parse zoneinfo!");
+        return;
+    }
+    wmark = get_lowest_watermark(&zi);
+
+    /*
+     * TODO: move this logic into a separate function
+     * Decide if killing a process is necessary and record the reason
+     */
+    if (cycle_after_kill && wmark < WMARK_LOW) {
+        /*
+         * Prevent kills not freeing enough memory which might lead to OOM kill.
+         * This might happen when a process is consuming memory faster than reclaim can
+         * free even after a kill. Mostly happens when running memory stress tests.
+         */
+        kill_reason = PRESSURE_AFTER_KILL;
+    } else if (level == VMPRESS_LEVEL_CRITICAL && events != 0) {
+        /*
+         * Device is too busy reclaiming memory which might lead to ANR.
+         * Critical level is triggered when PSI complete stall (all tasks are blocked because
+         * of the memory congestion) breaches the configured threshold.
+         */
+        kill_reason = NOT_RESPONDING;
+    } else if (swap_is_low && thrashing > thrashing_limit_pct) {
+        /* Page cache is thrashing while swap is low */
+        /*
+         * NOTE(review): compares against the undecayed thrashing_limit_pct
+         * while the branches below use the decayed thrashing_limit — confirm
+         * the asymmetry is intentional.
+         */
+        kill_reason = LOW_SWAP_AND_THRASHING;
+    } else if (swap_is_low && wmark < WMARK_HIGH) {
+        /* Both free memory and swap are low */
+        kill_reason = LOW_MEM_AND_SWAP;
+    } else if (wmark < WMARK_HIGH && thrashing > thrashing_limit) {
+        /* Page cache is thrashing while memory is low */
+        /* Lower the threshold for subsequent cycles so repeated kills can trigger */
+        thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
+        kill_reason = LOW_MEM_AND_THRASHING;
+    } else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) {
+        /* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */
+        thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
+        kill_reason = DIRECT_RECL_AND_THRASHING;
+    }
+
+    /* Kill a process if necessary */
+    if (kill_reason != NONE) {
+        int pages_freed = find_and_kill_process(0);
+        /* killing stays set until the next cycle observes the kill finished */
+        killing = (pages_freed > 0);
+        meminfo_log(&mi);
+    }
+
+no_kill:
+    /*
+     * Start polling after initial PSI event;
+     * extend polling while device is in direct reclaim or process is being killed;
+     * do not extend when kswapd reclaims because that might go on for a long time
+     * without causing memory pressure
+     */
+    if (events || killing || reclaim == DIRECT_RECLAIM) {
+        poll_params->update = POLLING_START;
+    }
+
+    /* Decide the polling interval */
+    if (swap_is_low || killing) {
+        /* Fast polling during and after a kill or when swap is low */
+        poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
+    } else {
+        /* By default use long intervals */
+        poll_params->polling_interval_ms = PSI_POLL_PERIOD_LONG_MS;
+    }
+}
+
static void mp_event_common(int data, uint32_t events, struct polling_params *poll_params) {
int ret;
unsigned long long evcount;
@@ -1881,7 +2205,7 @@
if (use_psi_monitors && events) {
/* Override polling params only if current event is more critical */
if (!poll_params->poll_handler || data > poll_params->poll_handler->data) {
- poll_params->polling_interval_ms = PSI_POLL_PERIOD_MS;
+ poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
poll_params->update = POLLING_START;
}
}
@@ -2483,8 +2807,12 @@
property_get_bool("ro.lmk.use_minfree_levels", false);
per_app_memcg =
property_get_bool("ro.config.per_app_memcg", low_ram_device);
- swap_free_low_percentage =
- property_get_int32("ro.lmk.swap_free_low_percentage", 10);
+ swap_free_low_percentage = clamp(0, 100, property_get_int32("ro.lmk.swap_free_low_percentage",
+ low_ram_device ? DEF_LOW_SWAP_LOWRAM : DEF_LOW_SWAP));
+ thrashing_limit_pct = max(0, property_get_int32("ro.lmk.thrashing_limit",
+ low_ram_device ? DEF_THRASHING_LOWRAM : DEF_THRASHING));
+ thrashing_limit_decay_pct = clamp(0, 100, property_get_int32("ro.lmk.thrashing_limit_decay",
+ low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY));
ctx = create_android_logger(MEMINFO_LOG_TAG);