Use hung thread stack for watchdog panics.

With this change the crash reporter tries to determine if the last stack
dump in the log file is from the hung thread watchdog, and in that case
creates a crash signature that's based on the previous stack dump.

When the previous stack is used because the watchdog function was detected,
prepend "(HANG)" to the function name or kernel message in the signature.

Also use an earlier stack signature if all functions in the last one are
uncertain.  This happens in many cases (about 1/2).  It's likely a
bug in the kernel but until we fix it this workaround will be helpful.

I tested this manually with the --generate_kernel_signature flag and
with the provided unit test.  I also added two unit tests for the two
new cases.  Finally, I tried to run the KernelCrashServer autotests,
but it doesn't run correctly in developer mode, even before my change.

BUG=chromium-os:24632
TEST=manually tested, unit tested, and autotested.  No further tests required.

Change-Id: Ib65177b06c207675cb7f6e33f1df8fa79dde6090
Reviewed-on: https://gerrit.chromium.org/gerrit/13541
Reviewed-by: Luigi Semenzato <semenzato@chromium.org>
Tested-by: Luigi Semenzato <semenzato@chromium.org>
Commit-Ready: Luigi Semenzato <semenzato@chromium.org>
diff --git a/crash_reporter/kernel_collector.cc b/crash_reporter/kernel_collector.cc
index bdd1dd2..c600cec 100644
--- a/crash_reporter/kernel_collector.cc
+++ b/crash_reporter/kernel_collector.cc
@@ -250,37 +250,47 @@
     pcrecpp::StringPiece kernel_dump,
     bool print_diagnostics,
     unsigned *hash,
-    float *last_stack_timestamp) {
+    float *last_stack_timestamp,
+    bool *is_watchdog_crash) {
   pcrecpp::RE line_re("(.+)", pcrecpp::MULTILINE());
   pcrecpp::RE stack_trace_start_re(kTimestampRegex +
         " (Call Trace|Backtrace):$");
 
+  // Match lines such as the following and grab out "function_name".
+  // The ? may or may not be present.
+  //
   // For ARM:
-  // <4>[ 3498.731164] [<c0057220>] (__bug+0x20/0x2c) from [<c018062c>]
-  // (write_breakme+0xdc/0x1bc)
+  // <4>[ 3498.731164] [<c0057220>] ? (function_name+0x20/0x2c) from
+  // [<c018062c>] (foo_bar+0xdc/0x1bc)
   //
   // For X86:
-  // Match lines such as the following and grab out "error_code".
-  // <4>[ 6066.849504]  [<7937bcee>] ? error_code+0x66/0x6c
-  // The ? may or may not be present
+  // <4>[ 6066.849504]  [<7937bcee>] ? function_name+0x66/0x6c
+  //
   pcrecpp::RE stack_entry_re(kTimestampRegex +
     "\\s+\\[<[[:xdigit:]]+>\\]"      // Matches "  [<7937bcee>]"
     "([\\s\\?(]+)"                   // Matches " ? (" (ARM) or " ? " (X86)
     "([^\\+ )]+)");                  // Matches until delimiter reached
   std::string line;
   std::string hashable;
+  std::string previous_hashable;
+  bool is_watchdog = false;
 
   *hash = 0;
   *last_stack_timestamp = 0;
 
+  // Find the last and second-to-last stack traces.  The latter is used when
+  // the panic is from a watchdog timeout.
   while (line_re.FindAndConsume(&kernel_dump, &line)) {
     std::string certainty;
     std::string function_name;
     if (stack_trace_start_re.PartialMatch(line, last_stack_timestamp)) {
       if (print_diagnostics) {
-        printf("Stack trace starting. Clearing any prior traces.\n");
+        printf("Stack trace starting.%s\n",
+               hashable.empty() ? "" : "  Saving prior trace.");
       }
+      previous_hashable = hashable;
       hashable.clear();
+      is_watchdog = false;
     } else if (stack_entry_re.PartialMatch(line,
                                            last_stack_timestamp,
                                            &certainty,
@@ -297,11 +307,28 @@
         continue;
       if (!hashable.empty())
         hashable.append("|");
+      if (function_name == "watchdog_timer_fn" ||
+          function_name == "watchdog") {
+        is_watchdog = true;
+      }
       hashable.append(function_name);
     }
   }
 
+  // If the last stack trace contains a watchdog function we assume the panic
+  // is from the watchdog timer, and we hash the previous stack trace rather
+  // than the last one, assuming that the previous stack is that of the hung
+  // thread.
+  //
+  // In addition, if the hashable is empty (meaning all frames are uncertain,
+  // for whatever reason) also use the previous frame, as it cannot be any
+  // worse.
+  if (is_watchdog || hashable.empty()) {
+    hashable = previous_hashable;
+  }
+
   *hash = HashString(hashable);
+  *is_watchdog_crash = is_watchdog;
 
   if (print_diagnostics) {
     printf("Hash based on stack trace: \"%s\" at %f.\n",
@@ -396,11 +423,13 @@
   unsigned stack_hash = 0;
   float last_stack_timestamp = 0;
   std::string human_string;
+  bool is_watchdog_crash;
 
   ProcessStackTrace(kernel_dump,
                     print_diagnostics,
                     &stack_hash,
-                    &last_stack_timestamp);
+                    &last_stack_timestamp,
+                    &is_watchdog_crash);
 
   if (!FindCrashingFunction(kernel_dump,
                             print_diagnostics,
@@ -422,8 +451,9 @@
   }
 
   human_string = human_string.substr(0, kMaxHumanStringLength);
-  *kernel_signature = StringPrintf("%s-%s-%08X",
+  *kernel_signature = StringPrintf("%s-%s%s-%08X",
                                    kKernelExecName,
+                                   (is_watchdog_crash ? "(HANG)-" : ""),
                                    human_string.c_str(),
                                    stack_hash);
   return true;
diff --git a/crash_reporter/kernel_collector.h b/crash_reporter/kernel_collector.h
index e8698a7..abbdee9 100644
--- a/crash_reporter/kernel_collector.h
+++ b/crash_reporter/kernel_collector.h
@@ -83,7 +83,8 @@
   void ProcessStackTrace(pcrecpp::StringPiece kernel_dump,
                          bool print_diagnostics,
                          unsigned *hash,
-                         float *last_stack_timestamp);
+                         float *last_stack_timestamp,
+                         bool *is_watchdog_crash);
   bool FindCrashingFunction(pcrecpp::StringPiece kernel_dump,
                             bool print_diagnostics,
                             float stack_trace_timestamp,
diff --git a/crash_reporter/kernel_collector_test.cc b/crash_reporter/kernel_collector_test.cc
index 10e9e36..87c6f3d 100644
--- a/crash_reporter/kernel_collector_test.cc
+++ b/crash_reporter/kernel_collector_test.cc
@@ -444,6 +444,163 @@
                                              &signature,
                                              false));
   EXPECT_EQ("kernel-Testing panic-E0FC3552", signature);
+
+  // Panic from hung task.
+  const char kHungTaskBreakMe[] =
+      "<3>[  720.459157] INFO: task bash:2287 blocked blah blah\n"
+      "<5>[  720.459282] Call Trace:\n"
+      "<5>[  720.459307]  [<810a457b>] ? __dentry_open+0x186/0x23e\n"
+      "<5>[  720.459323]  [<810b9c71>] ? mntput_no_expire+0x29/0xe2\n"
+      "<5>[  720.459336]  [<810b9d48>] ? mntput+0x1e/0x20\n"
+      "<5>[  720.459350]  [<810ad135>] ? path_put+0x1a/0x1d\n"
+      "<5>[  720.459366]  [<8137cacc>] schedule+0x4d/0x4f\n"
+      "<5>[  720.459379]  [<8137ccfb>] schedule_timeout+0x26/0xaf\n"
+      "<5>[  720.459394]  [<8102127e>] ? should_resched+0xd/0x27\n"
+      "<5>[  720.459409]  [<81174d1f>] ? _copy_from_user+0x3c/0x50\n"
+      "<5>[  720.459423]  [<8137cd9e>] "
+      "schedule_timeout_uninterruptible+0x1a/0x1c\n"
+      "<5>[  720.459438]  [<810dee63>] write_breakme+0xb3/0x178\n"
+      "<5>[  720.459453]  [<810dedb0>] ? meminfo_proc_show+0x2f2/0x2f2\n"
+      "<5>[  720.459467]  [<810d94ae>] proc_reg_write+0x6d/0x87\n"
+      "<5>[  720.459481]  [<810d9441>] ? proc_reg_poll+0x76/0x76\n"
+      "<5>[  720.459493]  [<810a5e9e>] vfs_write+0x79/0xa5\n"
+      "<5>[  720.459505]  [<810a6011>] sys_write+0x40/0x65\n"
+      "<5>[  720.459519]  [<8137e677>] sysenter_do_call+0x12/0x26\n"
+      "<0>[  720.459530] Kernel panic - not syncing: hung_task: blocked tasks\n"
+      "<5>[  720.459768] Pid: 31, comm: khungtaskd Tainted: "
+      "G         C  3.0.8 #1\n"
+      "<5>[  720.459998] Call Trace:\n"
+      "<5>[  720.460140]  [<81378a35>] panic+0x53/0x14a\n"
+      "<5>[  720.460312]  [<8105f875>] watchdog+0x15b/0x1a0\n"
+      "<5>[  720.460495]  [<8105f71a>] ? hung_task_panic+0x16/0x16\n"
+      "<5>[  720.460693]  [<81043af3>] kthread+0x67/0x6c\n"
+      "<5>[  720.460862]  [<81043a8c>] ? __init_kthread_worker+0x2d/0x2d\n"
+      "<5>[  720.461106]  [<8137eb9e>] kernel_thread_helper+0x6/0x10\n";
+
+  EXPECT_TRUE(
+      collector_.ComputeKernelStackSignature(kHungTaskBreakMe,
+                                             &signature,
+                                             false));
+
+  EXPECT_EQ("kernel-(HANG)-hung_task: blocked tasks-600B37EA", signature);
+
+  // Panic with all question marks in the last stack trace.
+  const char kUncertainStackTrace[] =
+      "<0>[56279.689669] ------------[ cut here ]------------\n"
+      "<2>[56279.689677] kernel BUG at /build/x86-alex/tmp/portage/"
+      "sys-kernel/chromeos-kernel-0.0.1-r516/work/chromeos-kernel-0.0.1/"
+      "kernel/timer.c:844!\n"
+      "<0>[56279.689683] invalid opcode: 0000 [#1] SMP \n"
+      "<0>[56279.689688] last sysfs file: /sys/power/state\n"
+      "<5>[56279.689692] Modules linked in: nls_iso8859_1 nls_cp437 vfat fat "
+      "gobi usbnet tsl2583(C) industrialio(C) snd_hda_codec_realtek "
+      "snd_hda_intel i2c_dev snd_hda_codec snd_hwdep qcserial snd_pcm usb_wwan "
+      "i2c_i801 snd_timer nm10_gpio snd_page_alloc rtc_cmos fuse "
+      "nf_conntrack_ipv6 nf_defrag_ipv6 uvcvideo videodev ip6table_filter "
+      "ath9k ip6_tables ipv6 mac80211 ath9k_common ath9k_hw ath cfg80211 "
+      "xt_mark\n"
+      "<5>[56279.689731] \n"
+      "<5>[56279.689738] Pid: 24607, comm: powerd_suspend Tainted: G        "
+      "WC  2.6.38.3+ #1 SAMSUNG ELECTRONICS CO., LTD. Alex/G100          \n"
+      "<5>[56279.689748] EIP: 0060:[<8103e3ea>] EFLAGS: 00210286 CPU: 3\n"
+      "<5>[56279.689758] EIP is at add_timer+0xd/0x1b\n"
+      "<5>[56279.689762] EAX: f5e00684 EBX: f5e003c0 ECX: 00000002 EDX: "
+      "00200246\n"
+      "<5>[56279.689767] ESI: f5e003c0 EDI: d28bc03c EBP: d2be5e40 ESP: "
+      "d2be5e40\n"
+      "<5>[56279.689772]  DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068\n"
+      "<0>[56279.689778] Process powerd_suspend (pid: 24607, ti=d2be4000 "
+      "task=f5dc9b60 task.ti=d2be4000)\n"
+      "<0>[56279.689782] Stack:\n"
+      "<5>[56279.689785]  d2be5e4c f8dccced f4ac02c0 d2be5e70 f8ddc752 "
+      "f5e003c0 f4ac0458 f4ac092c\n"
+      "<5>[56279.689797]  f4ac043c f4ac02c0 f4ac0000 f4ac007c d2be5e7c "
+      "f8dd4a33 f4ac0164 d2be5e94\n"
+      "<5>[56279.689809]  f87e0304 f69ff0cc f4ac0164 f87e02a4 f4ac0164 "
+      "d2be5eb0 81248968 00000000\n"
+      "<0>[56279.689821] Call Trace:\n"
+      "<5>[56279.689840]  [<f8dccced>] ieee80211_sta_restart+0x25/0x8c "
+      "[mac80211]\n"
+      "<5>[56279.689854]  [<f8ddc752>] ieee80211_reconfig+0x2e9/0x339 "
+      "[mac80211]\n"
+      "<5>[56279.689869]  [<f8dd4a33>] ieee80211_aes_cmac+0x182d/0x184e "
+      "[mac80211]\n"
+      "<5>[56279.689883]  [<f87e0304>] cfg80211_get_dev_from_info+0x29b/0x2c0 "
+      "[cfg80211]\n"
+      "<5>[56279.689895]  [<f87e02a4>] ? "
+      "cfg80211_get_dev_from_info+0x23b/0x2c0 [cfg80211]\n"
+      "<5>[56279.689904]  [<81248968>] legacy_resume+0x25/0x5d\n"
+      "<5>[56279.689910]  [<812490ae>] device_resume+0xdd/0x110\n"
+      "<5>[56279.689917]  [<812491c2>] dpm_resume_end+0xe1/0x271\n"
+      "<5>[56279.689925]  [<81060481>] suspend_devices_and_enter+0x18b/0x1de\n"
+      "<5>[56279.689932]  [<810605ba>] enter_state+0xe6/0x132\n"
+      "<5>[56279.689939]  [<8105fd4b>] state_store+0x91/0x9d\n"
+      "<5>[56279.689945]  [<8105fcba>] ? state_store+0x0/0x9d\n"
+      "<5>[56279.689953]  [<81178fb1>] kobj_attr_store+0x16/0x22\n"
+      "<5>[56279.689961]  [<810eea5e>] sysfs_write_file+0xc1/0xec\n"
+      "<5>[56279.689969]  [<810af443>] vfs_write+0x8f/0x101\n"
+      "<5>[56279.689975]  [<810ee99d>] ? sysfs_write_file+0x0/0xec\n"
+      "<5>[56279.689982]  [<810af556>] sys_write+0x40/0x65\n"
+      "<5>[56279.689989]  [<81002d57>] sysenter_do_call+0x12/0x26\n"
+      "<0>[56279.689993] Code: c1 d3 e2 4a 89 55 f4 f7 d2 21 f2 6a 00 31 c9 89 "
+      "d8 e8 6e fd ff ff 5a 8d 65 f8 5b 5e 5d c3 55 89 e5 3e 8d 74 26 00 83 38 "
+      "00 74 04 <0f> 0b eb fe 8b 50 08 e8 6f ff ff ff 5d c3 55 89 e5 3e 8d 74 "
+      "26 \n"
+      "<0>[56279.690009] EIP: [<8103e3ea>] add_timer+0xd/0x1b SS:ESP "
+      "0068:d2be5e40\n"
+      "<4>[56279.690113] ---[ end trace b71141bb67c6032a ]---\n"
+      "<7>[56279.694069] wlan0: deauthenticated from 00:00:00:00:00:01 "
+      "(Reason: 6)\n"
+      "<0>[56279.703465] Kernel panic - not syncing: Fatal exception\n"
+      "<5>[56279.703471] Pid: 24607, comm: powerd_suspend Tainted: G      D "
+      "WC  2.6.38.3+ #1\n"
+      "<5>[56279.703475] Call Trace:\n"
+      "<5>[56279.703483]  [<8136648c>] ? panic+0x55/0x152\n"
+      "<5>[56279.703491]  [<810057fa>] ? oops_end+0x73/0x81\n"
+      "<5>[56279.703497]  [<81005a44>] ? die+0xed/0xf5\n"
+      "<5>[56279.703503]  [<810033cb>] ? do_trap+0x7a/0x80\n"
+      "<5>[56279.703509]  [<8100369b>] ? do_invalid_op+0x0/0x80\n"
+      "<5>[56279.703515]  [<81003711>] ? do_invalid_op+0x76/0x80\n"
+      "<5>[56279.703522]  [<8103e3ea>] ? add_timer+0xd/0x1b\n"
+      "<5>[56279.703529]  [<81025e23>] ? check_preempt_curr+0x2e/0x69\n"
+      "<5>[56279.703536]  [<8102ef28>] ? ttwu_post_activation+0x5a/0x11b\n"
+      "<5>[56279.703543]  [<8102fa8d>] ? try_to_wake_up+0x213/0x21d\n"
+      "<5>[56279.703550]  [<81368b7f>] ? error_code+0x67/0x6c\n"
+      "<5>[56279.703557]  [<8103e3ea>] ? add_timer+0xd/0x1b\n"
+      "<5>[56279.703577]  [<f8dccced>] ? ieee80211_sta_restart+0x25/0x8c "
+      "[mac80211]\n"
+      "<5>[56279.703591]  [<f8ddc752>] ? ieee80211_reconfig+0x2e9/0x339 "
+      "[mac80211]\n"
+      "<5>[56279.703605]  [<f8dd4a33>] ? ieee80211_aes_cmac+0x182d/0x184e "
+      "[mac80211]\n"
+      "<5>[56279.703618]  [<f87e0304>] ? "
+      "cfg80211_get_dev_from_info+0x29b/0x2c0 [cfg80211]\n"
+      "<5>[56279.703630]  [<f87e02a4>] ? "
+      "cfg80211_get_dev_from_info+0x23b/0x2c0 [cfg80211]\n"
+      "<5>[56279.703637]  [<81248968>] ? legacy_resume+0x25/0x5d\n"
+      "<5>[56279.703643]  [<812490ae>] ? device_resume+0xdd/0x110\n"
+      "<5>[56279.703649]  [<812491c2>] ? dpm_resume_end+0xe1/0x271\n"
+      "<5>[56279.703657]  [<81060481>] ? "
+      "suspend_devices_and_enter+0x18b/0x1de\n"
+      "<5>[56279.703663]  [<810605ba>] ? enter_state+0xe6/0x132\n"
+      "<5>[56279.703670]  [<8105fd4b>] ? state_store+0x91/0x9d\n"
+      "<5>[56279.703676]  [<8105fcba>] ? state_store+0x0/0x9d\n"
+      "<5>[56279.703683]  [<81178fb1>] ? kobj_attr_store+0x16/0x22\n"
+      "<5>[56279.703690]  [<810eea5e>] ? sysfs_write_file+0xc1/0xec\n"
+      "<5>[56279.703697]  [<810af443>] ? vfs_write+0x8f/0x101\n"
+      "<5>[56279.703703]  [<810ee99d>] ? sysfs_write_file+0x0/0xec\n"
+      "<5>[56279.703709]  [<810af556>] ? sys_write+0x40/0x65\n"
+      "<5>[56279.703716]  [<81002d57>] ? sysenter_do_call+0x12/0x26\n";
+
+  EXPECT_TRUE(
+      collector_.ComputeKernelStackSignature(kUncertainStackTrace,
+                                             &signature,
+                                             false));
+  // The first trace contains only uncertain entries and its hash is 00000000,
+  // so, if we used that, the signature would be kernel-add_timer-00000000.
+  // Instead we use the second-to-last trace for the hash.
+  EXPECT_EQ("kernel-add_timer-B5178878", signature);
+
   ComputeKernelStackSignatureCommon();
 }