lmkd: Introduce kill strategy based on zone watermarks, swap and thrashing am: 561cfd9478 am: ad233c87b0
am: 390b7d739e

Change-Id: I28820107bdf7f681a5875fe3b8107e86b5cde199
diff --git a/lmkd/README.md b/lmkd/README.md
index 656a6ea..a735955 100644
--- a/lmkd/README.md
+++ b/lmkd/README.md
@@ -60,6 +60,23 @@
                              any eligible task (fast decision). Default = false
 
   ro.lmk.kill_timeout_ms:    duration in ms after a kill when no additional
-                             kill will be done, Default = 0 (disabled)
+                             kill will be done. Default = 0 (disabled)
 
   ro.lmk.debug:              enable lmkd debug logs, Default = false
+
+  ro.lmk.swap_free_low_percentage: level of free swap as a percentage of the
+                             total swap space used as a threshold to consider
+                             the system as swap space starved. Default for
+                             low-RAM devices = 10, for high-end devices = 20
+
+  ro.lmk.thrashing_limit:    number of workingset refaults as a percentage of
+                             the file-backed pagecache size used as a threshold
+                             to consider system thrashing its pagecache.
+                             Default for low-RAM devices = 30, for high-end
+                             devices = 100
+
+  ro.lmk.thrashing_limit_decay: thrashing threshold decay expressed as a
+                             percentage of the original threshold used to lower
+                             the threshold when system does not recover even
+                             after a kill. Default for low-RAM devices = 50,
+                             for high-end devices = 10
diff --git a/lmkd/lmkd.c b/lmkd/lmkd.c
index 04662fe..221fbc7 100644
--- a/lmkd/lmkd.c
+++ b/lmkd/lmkd.c
@@ -79,6 +79,7 @@
 #define MEMCG_MEMORYSW_USAGE "/dev/memcg/memory.memsw.usage_in_bytes"
 #define ZONEINFO_PATH "/proc/zoneinfo"
 #define MEMINFO_PATH "/proc/meminfo"
+#define VMSTAT_PATH "/proc/vmstat"
 #define PROC_STATUS_TGID_FIELD "Tgid:"
 #define LINE_MAX 128
 
@@ -110,13 +111,29 @@
  * PSI_WINDOW_SIZE_MS after the event happens.
  */
 #define PSI_WINDOW_SIZE_MS 1000
-/* Polling period after initial PSI signal */
-#define PSI_POLL_PERIOD_MS 10
+/* Polling period after PSI signal when pressure is high */
+#define PSI_POLL_PERIOD_SHORT_MS 10
+/* Polling period after PSI signal when pressure is low */
+#define PSI_POLL_PERIOD_LONG_MS 100
 
 #define min(a, b) (((a) < (b)) ? (a) : (b))
+#define max(a, b) (((a) > (b)) ? (a) : (b))
 
 #define FAIL_REPORT_RLIMIT_MS 1000
 
+/*
+ * System property defaults
+ */
+/* ro.lmk.swap_free_low_percentage property defaults */
+#define DEF_LOW_SWAP_LOWRAM 10
+#define DEF_LOW_SWAP 20
+/* ro.lmk.thrashing_limit property defaults */
+#define DEF_THRASHING_LOWRAM 30
+#define DEF_THRASHING 100
+/* ro.lmk.thrashing_limit_decay property defaults */
+#define DEF_THRASHING_DECAY_LOWRAM 50
+#define DEF_THRASHING_DECAY 10
+
 /* default to old in-kernel interface if no memory pressure events */
 static bool use_inkernel_interface = true;
 static bool has_inkernel_module;
@@ -157,6 +174,8 @@
 static bool use_minfree_levels;
 static bool per_app_memcg;
 static int swap_free_low_percentage;
+static int thrashing_limit_pct;
+static int thrashing_limit_decay_pct;
 static bool use_psi_monitors = false;
 static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = {
     { PSI_SOME, 70 },    /* 70ms out of 1sec for partial stall */
@@ -390,6 +409,41 @@
     int64_t arr[MI_FIELD_COUNT];
 };
 
+/* Fields to parse in /proc/vmstat */
+enum vmstat_field {
+    VS_FREE_PAGES,
+    VS_INACTIVE_FILE,
+    VS_ACTIVE_FILE,
+    VS_WORKINGSET_REFAULT,
+    VS_PGSCAN_KSWAPD,
+    VS_PGSCAN_DIRECT,
+    VS_PGSCAN_DIRECT_THROTTLE,
+    VS_FIELD_COUNT
+};
+
+static const char* const vmstat_field_names[MI_FIELD_COUNT] = {
+    "nr_free_pages",
+    "nr_inactive_file",
+    "nr_active_file",
+    "workingset_refault",
+    "pgscan_kswapd",
+    "pgscan_direct",
+    "pgscan_direct_throttle",
+};
+
+union vmstat {
+    struct {
+        int64_t nr_free_pages;
+        int64_t nr_inactive_file;
+        int64_t nr_active_file;
+        int64_t workingset_refault;
+        int64_t pgscan_kswapd;
+        int64_t pgscan_direct;
+        int64_t pgscan_direct_throttle;
+    } field;
+    int64_t arr[VS_FIELD_COUNT];
+};
+
 enum field_match_result {
     NO_MATCH,
     PARSE_FAIL,
@@ -445,6 +499,10 @@
 static char* proc_get_name(int pid);
 static void poll_kernel();
 
+static int clamp(int low, int high, int value) {
+    return max(min(value, high), low);
+}
+
 static bool parse_int64(const char* str, int64_t* ret) {
     char* endptr;
     long long val = strtoll(str, &endptr, 10);
@@ -1248,7 +1306,7 @@
 #endif
 
 /*
- * /prop/zoneinfo parsing routines
+ * /proc/zoneinfo parsing routines
  * Expected file format is:
  *
  *   Node <node_id>, zone   <zone_name>
@@ -1442,7 +1500,7 @@
     return 0;
 }
 
-/* /prop/meminfo parsing routines */
+/* /proc/meminfo parsing routines */
 static bool meminfo_parse_line(char *line, union meminfo *mi) {
     char *cp = line;
     char *ap;
@@ -1497,6 +1555,59 @@
     return 0;
 }
 
+/* /proc/vmstat parsing routines */
+static bool vmstat_parse_line(char *line, union vmstat *vs) {
+    char *cp;
+    char *ap;
+    char *save_ptr;
+    int64_t val;
+    int field_idx;
+    enum field_match_result match_res;
+
+    cp = strtok_r(line, " ", &save_ptr);
+    if (!cp) {
+        return false;
+    }
+
+    ap = strtok_r(NULL, " ", &save_ptr);
+    if (!ap) {
+        return false;
+    }
+
+    match_res = match_field(cp, ap, vmstat_field_names, VS_FIELD_COUNT,
+        &val, &field_idx);
+    if (match_res == PARSE_SUCCESS) {
+        vs->arr[field_idx] = val;
+    }
+    return (match_res != PARSE_FAIL);
+}
+
+static int vmstat_parse(union vmstat *vs) {
+    static struct reread_data file_data = {
+        .filename = VMSTAT_PATH,
+        .fd = -1,
+    };
+    char *buf;
+    char *save_ptr;
+    char *line;
+
+    memset(vs, 0, sizeof(union vmstat));
+
+    if ((buf = reread_file(&file_data)) == NULL) {
+        return -1;
+    }
+
+    for (line = strtok_r(buf, "\n", &save_ptr); line;
+         line = strtok_r(NULL, "\n", &save_ptr)) {
+        if (!vmstat_parse_line(line, vs)) {
+            ALOGE("%s parse error", file_data.filename);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
 static void meminfo_log(union meminfo *mi) {
     for (int field_idx = 0; field_idx < MI_FIELD_COUNT; field_idx++) {
         android_log_write_int32(ctx, (int32_t)min(mi->arr[field_idx] * page_k, INT32_MAX));
@@ -1833,6 +1944,219 @@
     return false;
 }
 
+enum zone_watermark {
+    WMARK_MIN = 0,
+    WMARK_LOW,
+    WMARK_HIGH,
+    WMARK_NONE
+};
+
+/*
+ * Returns lowest breached watermark or WMARK_NONE.
+ */
+static enum zone_watermark get_lowest_watermark(struct zoneinfo *zi)
+{
+    enum zone_watermark wmark = WMARK_NONE;
+
+    for (int node_idx = 0; node_idx < zi->node_count; node_idx++) {
+        struct zoneinfo_node *node = &zi->nodes[node_idx];
+
+        for (int zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
+            struct zoneinfo_zone *zone = &node->zones[zone_idx];
+            int zone_free_mem;
+
+            if (!zone->fields.field.present) {
+                continue;
+            }
+
+            zone_free_mem = zone->fields.field.nr_free_pages - zone->fields.field.nr_free_cma;
+            if (zone_free_mem > zone->max_protection + zone->fields.field.high) {
+                continue;
+            }
+            if (zone_free_mem > zone->max_protection + zone->fields.field.low) {
+                if (wmark > WMARK_HIGH) wmark = WMARK_HIGH;
+                continue;
+            }
+            if (zone_free_mem > zone->max_protection + zone->fields.field.min) {
+                if (wmark > WMARK_LOW) wmark = WMARK_LOW;
+                continue;
+            }
+            wmark = WMARK_MIN;
+        }
+    }
+
+    return wmark;
+}
+
+static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_params) {
+    enum kill_reasons {
+        NONE = -1, /* To denote no kill condition */
+        PRESSURE_AFTER_KILL = 0,
+        NOT_RESPONDING,
+        LOW_SWAP_AND_THRASHING,
+        LOW_MEM_AND_SWAP,
+        LOW_MEM_AND_THRASHING,
+        DIRECT_RECL_AND_THRASHING,
+        KILL_REASON_COUNT
+    };
+    enum reclaim_state {
+        NO_RECLAIM = 0,
+        KSWAPD_RECLAIM,
+        DIRECT_RECLAIM,
+    };
+    static int64_t init_ws_refault;
+    static int64_t base_file_lru;
+    static int64_t init_pgscan_kswapd;
+    static int64_t init_pgscan_direct;
+    static int64_t swap_low_threshold;
+    static bool killing;
+    static int thrashing_limit;
+    static bool in_reclaim;
+
+    union meminfo mi;
+    union vmstat vs;
+    struct zoneinfo zi;
+    struct timespec curr_tm;
+    int64_t thrashing = 0;
+    bool swap_is_low = false;
+    enum vmpressure_level level = (enum vmpressure_level)data;
+    enum kill_reasons kill_reason = NONE;
+    bool cycle_after_kill = false;
+    enum reclaim_state reclaim = NO_RECLAIM;
+    enum zone_watermark wmark = WMARK_NONE;
+
+    /* Skip while still killing a process */
+    if (is_kill_pending()) {
+        /* TODO: replace this quick polling with pidfd polling if kernel supports */
+        goto no_kill;
+    }
+
+    if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
+        ALOGE("Failed to get current time");
+        return;
+    }
+
+    if (vmstat_parse(&vs) < 0) {
+        ALOGE("Failed to parse vmstat!");
+        return;
+    }
+
+    if (meminfo_parse(&mi) < 0) {
+        ALOGE("Failed to parse meminfo!");
+        return;
+    }
+
+    /* Reset states after process got killed */
+    if (killing) {
+        killing = false;
+        cycle_after_kill = true;
+        /* Reset file-backed pagecache size and refault amounts after a kill */
+        base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
+        init_ws_refault = vs.field.workingset_refault;
+    }
+
+    /* Check free swap levels */
+    if (swap_free_low_percentage) {
+        if (!swap_low_threshold) {
+            swap_low_threshold = mi.field.total_swap * swap_free_low_percentage / 100;
+        }
+        swap_is_low = mi.field.free_swap < swap_low_threshold;
+    }
+
+    /* Identify reclaim state */
+    if (vs.field.pgscan_direct > init_pgscan_direct) {
+        init_pgscan_direct = vs.field.pgscan_direct;
+        init_pgscan_kswapd = vs.field.pgscan_kswapd;
+        reclaim = DIRECT_RECLAIM;
+    } else if (vs.field.pgscan_kswapd > init_pgscan_kswapd) {
+        init_pgscan_kswapd = vs.field.pgscan_kswapd;
+        reclaim = KSWAPD_RECLAIM;
+    } else {
+        in_reclaim = false;
+        /* Skip if system is not reclaiming */
+        goto no_kill;
+    }
+
+    if (!in_reclaim) {
+        /* Record file-backed pagecache size when entering reclaim cycle */
+        base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
+        init_ws_refault = vs.field.workingset_refault;
+        thrashing_limit = thrashing_limit_pct;
+    } else {
+        /* Calculate what % of the file-backed pagecache refaulted so far */
+        thrashing = (vs.field.workingset_refault - init_ws_refault) * 100 / base_file_lru;
+    }
+    in_reclaim = true;
+
+    /* Find out which watermark is breached if any */
+    if (zoneinfo_parse(&zi) < 0) {
+        ALOGE("Failed to parse zoneinfo!");
+        return;
+    }
+    wmark = get_lowest_watermark(&zi);
+
+    /*
+     * TODO: move this logic into a separate function
+     * Decide if killing a process is necessary and record the reason
+     */
+    if (cycle_after_kill && wmark < WMARK_LOW) {
+        /*
+         * Prevent kills not freeing enough memory which might lead to OOM kill.
+         * This might happen when a process is consuming memory faster than reclaim can
+         * free even after a kill. Mostly happens when running memory stress tests.
+         */
+        kill_reason = PRESSURE_AFTER_KILL;
+    } else if (level == VMPRESS_LEVEL_CRITICAL && events != 0) {
+        /*
+         * Device is too busy reclaiming memory which might lead to ANR.
+         * Critical level is triggered when PSI complete stall (all tasks are blocked because
+         * of the memory congestion) breaches the configured threshold.
+         */
+        kill_reason = NOT_RESPONDING;
+    } else if (swap_is_low && thrashing > thrashing_limit_pct) {
+        /* Page cache is thrashing while swap is low */
+        kill_reason = LOW_SWAP_AND_THRASHING;
+    } else if (swap_is_low && wmark < WMARK_HIGH) {
+        /* Both free memory and swap are low */
+        kill_reason = LOW_MEM_AND_SWAP;
+    } else if (wmark < WMARK_HIGH && thrashing > thrashing_limit) {
+        /* Page cache is thrashing while memory is low */
+        thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
+        kill_reason = LOW_MEM_AND_THRASHING;
+    } else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) {
+        /* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */
+        thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
+        kill_reason = DIRECT_RECL_AND_THRASHING;
+    }
+
+    /* Kill a process if necessary */
+    if (kill_reason != NONE) {
+        int pages_freed = find_and_kill_process(0);
+        killing = (pages_freed > 0);
+        meminfo_log(&mi);
+    }
+
+no_kill:
+    /*
+     * Start polling after initial PSI event;
+     * extend polling while device is in direct reclaim or process is being killed;
+     * do not extend when kswapd reclaims because that might go on for a long time
+     * without causing memory pressure
+     */
+    if (events || killing || reclaim == DIRECT_RECLAIM) {
+        poll_params->update = POLLING_START;
+    }
+
+    /* Decide the polling interval */
+    if (swap_is_low || killing) {
+        /* Fast polling during and after a kill or when swap is low */
+        poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
+    } else {
+        /* By default use long intervals */
+        poll_params->polling_interval_ms = PSI_POLL_PERIOD_LONG_MS;
+    }
+}
+
 static void mp_event_common(int data, uint32_t events, struct polling_params *poll_params) {
     int ret;
     unsigned long long evcount;
@@ -1881,7 +2205,7 @@
     if (use_psi_monitors && events) {
         /* Override polling params only if current event is more critical */
         if (!poll_params->poll_handler || data > poll_params->poll_handler->data) {
-            poll_params->polling_interval_ms = PSI_POLL_PERIOD_MS;
+            poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
             poll_params->update = POLLING_START;
         }
     }
@@ -2483,8 +2807,12 @@
         property_get_bool("ro.lmk.use_minfree_levels", false);
     per_app_memcg =
         property_get_bool("ro.config.per_app_memcg", low_ram_device);
-    swap_free_low_percentage =
-        property_get_int32("ro.lmk.swap_free_low_percentage", 10);
+    swap_free_low_percentage = clamp(0, 100, property_get_int32("ro.lmk.swap_free_low_percentage",
+        low_ram_device ? DEF_LOW_SWAP_LOWRAM : DEF_LOW_SWAP));
+    thrashing_limit_pct = max(0, property_get_int32("ro.lmk.thrashing_limit",
+        low_ram_device ? DEF_THRASHING_LOWRAM : DEF_THRASHING));
+    thrashing_limit_decay_pct = clamp(0, 100, property_get_int32("ro.lmk.thrashing_limit_decay",
+        low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY));
 
     ctx = create_android_logger(MEMINFO_LOG_TAG);