lmkd: rate-limit and cleanup failed kill reports

Excessive number of failed kill reports when lmkd can't find an eligible
process to kill or frees not enough memory pollutes logs and bugreports.
Cleanup kill reports to remove duplicate information and rate limit failed
kill attempts at 1 report per sec. The number of suppressed failed kills
will be reported in the next lmkd report.

Bug: 113864581
Test: Verified using lmkd_unit_test
Change-Id: I67fa1fec97613f136c7582115edcbc56b1503c9c
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
diff --git a/lmkd/lmkd.c b/lmkd/lmkd.c
index ce2421e..1980dc6 100644
--- a/lmkd/lmkd.c
+++ b/lmkd/lmkd.c
@@ -97,6 +97,8 @@
 
 #define min(a, b) (((a) < (b)) ? (a) : (b))
 
+#define FAIL_REPORT_RLIMIT_MS 1000
+
 /* default to old in-kernel interface if no memory pressure events */
 static bool use_inkernel_interface = true;
 static bool has_inkernel_module;
@@ -1097,8 +1099,7 @@
 }
 
 /* Kill one process specified by procp.  Returns the size of the process killed */
-static int kill_one_process(struct proc* procp, int min_score_adj,
-                            enum vmpressure_level level) {
+static int kill_one_process(struct proc* procp) {
     int pid = procp->pid;
     uid_t uid = procp->uid;
     char *taskname;
@@ -1132,11 +1133,8 @@
 
     /* CAP_KILL required */
     r = kill(pid, SIGKILL);
-    ALOGI(
-        "Killing '%s' (%d), uid %d, adj %d\n"
-        "   to free %ldkB because system is under %s memory pressure (min_oom_adj=%d)\n",
-        taskname, pid, uid, procp->oomadj, tasksize * page_k,
-        level_name[level], min_score_adj);
+    ALOGI("Kill '%s' (%d), uid %d, oom_adj %d to free %ldkB",
+        taskname, pid, uid, procp->oomadj, tasksize * page_k);
     pid_remove(pid);
 
     TRACE_KILL_END();
@@ -1163,8 +1161,7 @@
  * If pages_to_free is set to 0 only one process will be killed.
  * Returns the size of the killed processes.
  */
-static int find_and_kill_processes(enum vmpressure_level level,
-                                   int min_score_adj, int pages_to_free) {
+static int find_and_kill_processes(int min_score_adj, int pages_to_free) {
     int i;
     int killed_size;
     int pages_freed = 0;
@@ -1183,7 +1180,7 @@
             if (!procp)
                 break;
 
-            killed_size = kill_one_process(procp, min_score_adj, level);
+            killed_size = kill_one_process(procp);
             if (killed_size >= 0) {
 #ifdef LMKD_LOG_STATS
                 if (enable_stats_log && !lmk_state_change_start) {
@@ -1282,6 +1279,7 @@
     enum vmpressure_level lvl;
     union meminfo mi;
     union zoneinfo zi;
+    struct timespec curr_tm;
     static struct timespec last_kill_tm;
     static unsigned long kill_skip_count = 0;
     enum vmpressure_level level = (enum vmpressure_level)data;
@@ -1312,14 +1310,12 @@
         }
     }
 
+    if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
+        ALOGE("Failed to get current time");
+        return;
+    }
+
     if (kill_timeout_ms) {
-        struct timespec curr_tm;
-
-        if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
-            ALOGE("Failed to get current time");
-            return;
-        }
-
         if (get_time_diff_ms(&last_kill_tm, &curr_tm) < kill_timeout_ms) {
             kill_skip_count++;
             return;
@@ -1425,7 +1421,7 @@
 do_kill:
     if (low_ram_device) {
         /* For Go devices kill only one task */
-        if (find_and_kill_processes(level, level_oomadj[level], 0) == 0) {
+        if (find_and_kill_processes(level_oomadj[level], 0) == 0) {
             if (debug_process_killing) {
                 ALOGI("Nothing to kill");
             }
@@ -1434,6 +1430,8 @@
         }
     } else {
         int pages_freed;
+        static struct timespec last_report_tm;
+        static unsigned long report_skip_count = 0;
 
         if (!use_minfree_levels) {
             /* If pressure level is less than critical and enough free swap then ignore */
@@ -1461,30 +1459,41 @@
             min_score_adj = level_oomadj[level];
         }
 
-        pages_freed = find_and_kill_processes(level, min_score_adj, pages_to_free);
+        pages_freed = find_and_kill_processes(min_score_adj, pages_to_free);
 
-        if (use_minfree_levels) {
-            ALOGI("Killing because cache %ldkB is below "
-                  "limit %ldkB for oom_adj %d\n"
-                  "   Free memory is %ldkB %s reserved",
-                  other_file * page_k, minfree * page_k, min_score_adj,
-                  other_free * page_k, other_free >= 0 ? "above" : "below");
-        }
-
-        if (pages_freed < pages_to_free) {
-            ALOGI("Unable to free enough memory (pages to free=%d, pages freed=%d)",
-                  pages_to_free, pages_freed);
-        } else {
-            ALOGI("Reclaimed enough memory (pages to free=%d, pages freed=%d)",
-                  pages_to_free, pages_freed);
-            if (clock_gettime(CLOCK_MONOTONIC_COARSE, &last_kill_tm) != 0) {
-                ALOGE("Failed to get current time");
+        if (pages_freed == 0) {
+            /* Rate limit kill reports when nothing was reclaimed */
+            if (get_time_diff_ms(&last_report_tm, &curr_tm) < FAIL_REPORT_RLIMIT_MS) {
+                report_skip_count++;
                 return;
             }
         }
-        if (pages_freed > 0) {
-            meminfo_log(&mi);
+
+        /* Log meminfo whenever we kill or when report rate limit allows */
+        meminfo_log(&mi);
+        if (pages_freed >= pages_to_free) {
+            /* Reset kill time only if reclaimed enough memory */
+            last_kill_tm = curr_tm;
         }
+
+        if (use_minfree_levels) {
+            ALOGI("Killing to reclaim %ldkB, reclaimed %ldkB, cache(%ldkB) and "
+                "free(%" PRId64 "kB)-reserved(%" PRId64 "kB) below min(%ldkB) for oom_adj %d",
+                pages_to_free * page_k, pages_freed * page_k,
+                other_file * page_k, mi.field.nr_free_pages * page_k,
+                zi.field.totalreserve_pages * page_k,
+                minfree * page_k, min_score_adj);
+        } else {
+            ALOGI("Killing to reclaim %ldkB, reclaimed %ldkB at oom_adj %d",
+                pages_to_free * page_k, pages_freed * page_k, min_score_adj);
+        }
+
+        if (report_skip_count > 0) {
+            ALOGI("Suppressed %lu failed kill reports", report_skip_count);
+            report_skip_count = 0;
+        }
+
+        last_report_tm = curr_tm;
     }
 }
 
diff --git a/lmkd/tests/lmkd_test.cpp b/lmkd/tests/lmkd_test.cpp
index 1996bae..f54b25c 100644
--- a/lmkd/tests/lmkd_test.cpp
+++ b/lmkd/tests/lmkd_test.cpp
@@ -39,7 +39,7 @@
 #define LMKDTEST_RESPAWN_FLAG "LMKDTEST_RESPAWN"
 
 #define LMKD_LOGCAT_MARKER "lowmemorykiller"
-#define LMKD_KILL_MARKER_TEMPLATE LMKD_LOGCAT_MARKER ": Killing '%s'"
+#define LMKD_KILL_MARKER_TEMPLATE LMKD_LOGCAT_MARKER ": Kill '%s'"
 #define OOM_MARKER "Out of memory"
 #define OOM_KILL_MARKER "Killed process"
 #define MIN_LOG_SIZE 100