Strip MAC addresses from dmesg sent w/ kcrash files.

BUG=chromium-os:13579
TEST=Ran the kernel_collector_test

rm -f /build/x86-mario/tmp/portage/chromeos-base/crash-reporter-9999/.tested; cros_workon_make --test --board=x86-mario crash-reporter

[ RUN      ] KernelCollectorTest.StripSensitiveDataBasic
[       OK ] KernelCollectorTest.StripSensitiveDataBasic (1 ms)
[ RUN      ] KernelCollectorTest.StripSensitiveDataBulk
[       OK ] KernelCollectorTest.StripSensitiveDataBulk (2 ms)
[ RUN      ] KernelCollectorTest.StripSensitiveDataSample
[       OK ] KernelCollectorTest.StripSensitiveDataSample (0 ms)

Review URL: http://codereview.chromium.org/6777001

Change-Id: Ie2cd3d007f9ee2fb877d28280cfe62748c108bd2
diff --git a/crash_reporter/kernel_collector.cc b/crash_reporter/kernel_collector.cc
index 56fa301..529e35f 100644
--- a/crash_reporter/kernel_collector.cc
+++ b/crash_reporter/kernel_collector.cc
@@ -68,6 +68,81 @@
   return true;
 }
 
+void KernelCollector::StripSensitiveData(std::string *kernel_dump) {
+  // Strip any data that the user might not want sent up to the crash servers.
+  // We'll read in from kernel_dump and also place our output there.
+  //
+  // At the moment, the only sensitive data we strip is MAC addresses.
+
+  // Get rid of things that look like MAC addresses, since they could possibly
+  // give information about where someone has been.  This is strings that look
+  // like this: 11:22:33:44:55:66
+  // Complications:
+  // - Within a given kernel_dump, want to be able to tell when the same MAC
+  //   was used more than once.  Thus, we'll consistently replace the first
+  //   MAC found with 00:00:00:00:00:01, the second with ...:02, etc.
+  // - ACPI commands look like MAC addresses.  We'll specifically avoid getting
+  //   rid of those.
+  std::ostringstream result;
+  std::string pre_mac_str;
+  std::string mac_str;
+  std::map<std::string, std::string> mac_map;
+  pcrecpp::StringPiece input(*kernel_dump);
+
+  // This RE will find the next MAC address and can return us the data preceding
+  // the MAC and the MAC itself.
+  pcrecpp::RE mac_re("(.*?)("
+                     "[0-9a-fA-F][0-9a-fA-F]:"
+                     "[0-9a-fA-F][0-9a-fA-F]:"
+                     "[0-9a-fA-F][0-9a-fA-F]:"
+                     "[0-9a-fA-F][0-9a-fA-F]:"
+                     "[0-9a-fA-F][0-9a-fA-F]:"
+                     "[0-9a-fA-F][0-9a-fA-F])",
+                     pcrecpp::RE_Options()
+                       .set_multiline(true)
+                       .set_dotall(true));
+
+  // This RE will identify when the 'pre_mac_str' shows that the MAC address
+  // was really an ACPI cmd.  The full string looks like this:
+  //   ata1.00: ACPI cmd ef/10:03:00:00:00:a0 (SET FEATURES) filtered out
+  pcrecpp::RE acpi_re("ACPI cmd ef/$",
+                      pcrecpp::RE_Options()
+                        .set_multiline(true)
+                        .set_dotall(true));
+
+  // Keep consuming, building up a result string as we go.
+  while (mac_re.Consume(&input, &pre_mac_str, &mac_str)) {
+    if (acpi_re.PartialMatch(pre_mac_str)) {
+      // We really saw an ACPI command; add to result w/ no stripping.
+      result << pre_mac_str << mac_str;
+    } else {
+      // Found a MAC address; look up in our hash for the mapping.
+      std::string replacement_mac = mac_map[mac_str];
+      if (replacement_mac == "") {
+        // It wasn't present, so build up a replacement string.
+        int mac_id = mac_map.size();
+
+        // Handle up to 2^32 unique MAC address; overkill, but doesn't hurt.
+        replacement_mac = StringPrintf("00:00:%02x:%02x:%02x:%02x",
+                                       (mac_id & 0xff000000) >> 24,
+                                       (mac_id & 0x00ff0000) >> 16,
+                                       (mac_id & 0x0000ff00) >> 8,
+                                       (mac_id & 0x000000ff));
+        mac_map[mac_str] = replacement_mac;
+      }
+
+      // Dump the string before the MAC and the fake MAC address into result.
+      result << pre_mac_str << replacement_mac;
+    }
+  }
+
+  // One last bit of data might still be in the input.
+  result << input;
+
+  // We'll just assign right back to kernel_dump.
+  *kernel_dump = result.str();
+}
+
 bool KernelCollector::Enable() {
   if (arch_ == archUnknown || arch_ >= archCount ||
       s_pc_regex[arch_] == NULL) {
@@ -300,6 +375,7 @@
   if (!LoadPreservedDump(&kernel_dump)) {
     return false;
   }
+  StripSensitiveData(&kernel_dump);
   if (kernel_dump.empty()) {
     return false;
   }
diff --git a/crash_reporter/kernel_collector.h b/crash_reporter/kernel_collector.h
index 04f3bdd..1c86f40 100644
--- a/crash_reporter/kernel_collector.h
+++ b/crash_reporter/kernel_collector.h
@@ -58,10 +58,14 @@
   friend class KernelCollectorTest;
   FRIEND_TEST(KernelCollectorTest, ClearPreservedDump);
   FRIEND_TEST(KernelCollectorTest, LoadPreservedDump);
+  FRIEND_TEST(KernelCollectorTest, StripSensitiveDataBasic);
+  FRIEND_TEST(KernelCollectorTest, StripSensitiveDataBulk);
+  FRIEND_TEST(KernelCollectorTest, StripSensitiveDataSample);
   FRIEND_TEST(KernelCollectorTest, CollectOK);
 
   bool LoadPreservedDump(std::string *contents);
   bool ClearPreservedDump();
+  void StripSensitiveData(std::string *kernel_dump);
 
   void ProcessStackTrace(pcrecpp::StringPiece kernel_dump,
                          bool print_diagnostics,
diff --git a/crash_reporter/kernel_collector_test.cc b/crash_reporter/kernel_collector_test.cc
index 08e3169..bfa2f04 100644
--- a/crash_reporter/kernel_collector_test.cc
+++ b/crash_reporter/kernel_collector_test.cc
@@ -100,6 +100,123 @@
   ASSERT_EQ(KernelCollector::kClearingSequence, dump);
 }
 
+TEST_F(KernelCollectorTest, StripSensitiveDataBasic) {
+  // Basic tests of StripSensitiveData...
+
+  // Make sure we work OK with a string w/ no MAC addresses.
+  const std::string kCrashWithNoMacsOrig =
+      "<7>[111566.131728] PM: Entering mem sleep\n";
+  std::string crash_with_no_macs(kCrashWithNoMacsOrig);
+  collector_.StripSensitiveData(&crash_with_no_macs);
+  EXPECT_EQ(kCrashWithNoMacsOrig, crash_with_no_macs);
+
+  // Make sure that we handle the case where there's nothing before/after the
+  // MAC address.
+  const std::string kJustAMacOrig =
+      "11:22:33:44:55:66";
+  const std::string kJustAMacStripped =
+      "00:00:00:00:00:01";
+  std::string just_a_mac(kJustAMacOrig);
+  collector_.StripSensitiveData(&just_a_mac);
+  EXPECT_EQ(kJustAMacStripped, just_a_mac);
+
+  // Test MAC addresses crammed together to make sure it gets both of them.
+  //
+  // I'm not sure that the code does ideal on these two test cases (they don't
+  // look like two MAC addresses to me), but since we don't see them I think
+  // it's OK to behave as shown here.
+  const std::string kCrammedMacs1Orig =
+      "11:22:33:44:55:66:11:22:33:44:55:66";
+  const std::string kCrammedMacs1Stripped =
+      "00:00:00:00:00:01:00:00:00:00:00:01";
+  std::string crammed_macs_1(kCrammedMacs1Orig);
+  collector_.StripSensitiveData(&crammed_macs_1);
+  EXPECT_EQ(kCrammedMacs1Stripped, crammed_macs_1);
+
+  const std::string kCrammedMacs2Orig =
+      "11:22:33:44:55:6611:22:33:44:55:66";
+  const std::string kCrammedMacs2Stripped =
+      "00:00:00:00:00:0100:00:00:00:00:01";
+  std::string crammed_macs_2(kCrammedMacs2Orig);
+  collector_.StripSensitiveData(&crammed_macs_2);
+  EXPECT_EQ(kCrammedMacs2Stripped, crammed_macs_2);
+
+  // Test case-sensitiveness (we shouldn't be case-senstive).
+  const std::string kCapsMacOrig =
+      "AA:BB:CC:DD:EE:FF";
+  const std::string kCapsMacStripped =
+      "00:00:00:00:00:01";
+  std::string caps_mac(kCapsMacOrig);
+  collector_.StripSensitiveData(&caps_mac);
+  EXPECT_EQ(kCapsMacStripped, caps_mac);
+
+  const std::string kLowerMacOrig =
+      "aa:bb:cc:dd:ee:ff";
+  const std::string kLowerMacStripped =
+      "00:00:00:00:00:01";
+  std::string lower_mac(kLowerMacOrig);
+  collector_.StripSensitiveData(&lower_mac);
+  EXPECT_EQ(kLowerMacStripped, lower_mac);
+}
+
+TEST_F(KernelCollectorTest, StripSensitiveDataBulk) {
+  // Test calling StripSensitiveData w/ lots of MAC addresses in the "log".
+
+  // Test that stripping code handles more than 256 unique MAC addresses, since
+  // that overflows past the last byte...
+  // We'll write up some code that generates 258 unique MAC addresses.  Sorta
+  // cheating since the code is very similar to the current code in
+  // StripSensitiveData(), but would catch if someone changed that later.
+  std::string lotsa_macs_orig;
+  std::string lotsa_macs_stripped;
+  int i;
+  for (i = 0; i < 258; i++) {
+    lotsa_macs_orig += StringPrintf(" 11:11:11:11:%02X:%02x",
+                                  (i & 0xff00) >> 8, i & 0x00ff);
+    lotsa_macs_stripped += StringPrintf(" 00:00:00:00:%02X:%02x",
+                                     ((i+1) & 0xff00) >> 8, (i+1) & 0x00ff);
+  }
+  std::string lotsa_macs(lotsa_macs_orig);
+  collector_.StripSensitiveData(&lotsa_macs);
+  EXPECT_EQ(lotsa_macs_stripped, lotsa_macs);
+}
+
+TEST_F(KernelCollectorTest, StripSensitiveDataSample) {
+  // Test calling StripSensitiveData w/ some actual lines from a real crash;
+  // included two MAC addresses (though replaced them with some bogusness).
+  const std::string kCrashWithMacsOrig =
+      "<6>[111567.195339] ata1.00: ACPI cmd ef/10:03:00:00:00:a0 (SET FEATURES)"
+        " filtered out\n"
+      "<7>[108539.540144] wlan0: authenticate with 11:22:33:44:55:66 (try 1)\n"
+      "<7>[108539.554973] wlan0: associate with 11:22:33:44:55:66 (try 1)\n"
+      "<6>[110136.587583] usb0: register 'QCUSBNet2k' at usb-0000:00:1d.7-2,"
+        " QCUSBNet Ethernet Device, 99:88:77:66:55:44\n"
+      "<7>[110964.314648] wlan0: deauthenticated from 11:22:33:44:55:66"
+        " (Reason: 6)\n"
+      "<7>[110964.325057] phy0: Removed STA 11:22:33:44:55:66\n"
+      "<7>[110964.325115] phy0: Destroyed STA 11:22:33:44:55:66\n"
+      "<6>[110969.219172] usb0: register 'QCUSBNet2k' at usb-0000:00:1d.7-2,"
+        " QCUSBNet Ethernet Device, 99:88:77:66:55:44\n"
+      "<7>[111566.131728] PM: Entering mem sleep\n";
+  const std::string kCrashWithMacsStripped =
+      "<6>[111567.195339] ata1.00: ACPI cmd ef/10:03:00:00:00:a0 (SET FEATURES)"
+        " filtered out\n"
+      "<7>[108539.540144] wlan0: authenticate with 00:00:00:00:00:01 (try 1)\n"
+      "<7>[108539.554973] wlan0: associate with 00:00:00:00:00:01 (try 1)\n"
+      "<6>[110136.587583] usb0: register 'QCUSBNet2k' at usb-0000:00:1d.7-2,"
+        " QCUSBNet Ethernet Device, 00:00:00:00:00:02\n"
+      "<7>[110964.314648] wlan0: deauthenticated from 00:00:00:00:00:01"
+        " (Reason: 6)\n"
+      "<7>[110964.325057] phy0: Removed STA 00:00:00:00:00:01\n"
+      "<7>[110964.325115] phy0: Destroyed STA 00:00:00:00:00:01\n"
+      "<6>[110969.219172] usb0: register 'QCUSBNet2k' at usb-0000:00:1d.7-2,"
+        " QCUSBNet Ethernet Device, 00:00:00:00:00:02\n"
+      "<7>[111566.131728] PM: Entering mem sleep\n";
+  std::string crash_with_macs(kCrashWithMacsOrig);
+  collector_.StripSensitiveData(&crash_with_macs);
+  EXPECT_EQ(kCrashWithMacsStripped, crash_with_macs);
+}
+
 TEST_F(KernelCollectorTest, CollectPreservedFileMissing) {
   ASSERT_FALSE(collector_.Collect());
   ASSERT_TRUE(FindLog("Unable to read test/kcrash"));