metricsd: Persist the report to disk if an upload fails.

If the metrics server is unreachable, we may need to resend a metrics
report later. Instead of keeping the staged report in memory, save it to
disk to avoid loosing data if the system restarts or crashes.

BUG: 23033262
TEST: unit tests.

Change-Id: Idd14964e40f022952469f47d675d8cda9586d7cd
diff --git a/metricsd/constants.h b/metricsd/constants.h
index 021bc73..717e5d2 100644
--- a/metricsd/constants.h
+++ b/metricsd/constants.h
@@ -23,6 +23,8 @@
 static const char kMetricsGUIDFileName[] = "Sysinfo.GUID";
 static const char kMetricsServer[] = "https://clients4.google.com/uma/v2";
 static const char kConsentFileName[] = "enabled";
+static const char kStagedLogName[] = "staged_log";
+static const char kFailedUploadCountName[] = "failed_upload_count";
 static const char kDefaultVersion[] = "0.0.0.0";
 
 // System properties used.
diff --git a/metricsd/uploader/upload_service.cc b/metricsd/uploader/upload_service.cc
index 6d7fd54..b630cec 100644
--- a/metricsd/uploader/upload_service.cc
+++ b/metricsd/uploader/upload_service.cc
@@ -19,6 +19,7 @@
 #include <string>
 
 #include <base/bind.h>
+#include <base/files/file_util.h>
 #include <base/logging.h>
 #include <base/memory/scoped_vector.h>
 #include <base/message_loop/message_loop.h>
@@ -45,6 +46,7 @@
       metrics_lib_(metrics_lib),
       histogram_snapshot_manager_(this),
       sender_(new HttpSender(server)),
+      failed_upload_count_(metrics::kFailedUploadCountName),
       testing_(false) {
 }
 
@@ -60,6 +62,7 @@
                          const base::FilePath& metrics_directory) {
   base::StatisticsRecorder::Initialize();
   metrics_file_ = metrics_directory.Append(metrics::kMetricsEventsFileName);
+  staged_log_path_ = metrics_directory.Append(metrics::kStagedLogName);
 
   if (!testing_) {
     base::MessageLoop::current()->PostDelayedTask(FROM_HERE,
@@ -71,8 +74,8 @@
 }
 
 void UploadService::StartNewLog() {
-  CHECK(!staged_log_) << "the staged log should be discarded before starting "
-                         "a new metrics log";
+  CHECK(!HasStagedLog()) << "the staged log should be discarded before "
+                         << "starting a new metrics log";
   MetricsLog* log = new MetricsLog();
   current_log_.reset(log);
 }
@@ -88,7 +91,11 @@
 }
 
 void UploadService::UploadEvent() {
-  if (staged_log_) {
+  // If the system shutdown or crashed while uploading a report, we may not have
+  // deleted an old log.
+  RemoveFailedLog();
+
+  if (HasStagedLog()) {
     // Previous upload failed, retry sending the logs.
     SendStagedLog();
     return;
@@ -100,47 +107,44 @@
   StageCurrentLog();
 
   // If a log is available for upload, upload it.
-  if (staged_log_) {
+  if (HasStagedLog()) {
     SendStagedLog();
   }
 }
 
 void UploadService::SendStagedLog() {
-  CHECK(staged_log_) << "staged_log_ must exist to be sent";
-
   // If metrics are not enabled, discard the log and exit.
   if (!metrics_lib_->AreMetricsEnabled()) {
     LOG(INFO) << "Metrics disabled. Don't upload metrics samples.";
-    staged_log_.reset();
+    base::DeleteFile(staged_log_path_, false);
     return;
   }
 
-  std::string log_text;
-  staged_log_->GetEncodedLog(&log_text);
-  if (!sender_->Send(log_text, base::SHA1HashString(log_text))) {
-    ++failed_upload_count_;
-    if (failed_upload_count_ <= kMaxFailedUpload) {
-      LOG(WARNING) << "log upload failed " << failed_upload_count_
-                   << " times. It will be retried later.";
-      return;
-    }
-    LOG(WARNING) << "log failed more than " << kMaxFailedUpload << " times.";
+  std::string staged_log;
+  CHECK(base::ReadFileToString(staged_log_path_, &staged_log));
+
+  // Increase the failed count in case the daemon crashes while sending the log.
+  failed_upload_count_.Add(1);
+
+  if (!sender_->Send(staged_log, base::SHA1HashString(staged_log))) {
+    LOG(WARNING) << "log failed to upload";
   } else {
-    LOG(INFO) << "uploaded " << log_text.length() << " bytes";
+    VLOG(1) << "uploaded " << staged_log.length() << " bytes";
+    base::DeleteFile(staged_log_path_, false);
   }
-  // Discard staged log.
-  staged_log_.reset();
+
+  RemoveFailedLog();
 }
 
 void UploadService::Reset() {
-  staged_log_.reset();
+  base::DeleteFile(staged_log_path_, false);
   current_log_.reset();
-  failed_upload_count_ = 0;
+  failed_upload_count_.Set(0);
 }
 
 void UploadService::ReadMetrics() {
-  CHECK(!staged_log_)
-      << "cannot read metrics until the old logs have been discarded";
+  CHECK(!HasStagedLog()) << "cannot read metrics until the old logs have been "
+                         << "discarded";
 
   ScopedVector<metrics::MetricSample> vector;
   metrics::SerializationUtils::ReadAndTruncateMetricsFromFile(
@@ -153,7 +157,7 @@
     AddSample(*sample);
     i++;
   }
-  DLOG(INFO) << i << " samples read";
+  VLOG(1) << i << " samples read";
 }
 
 void UploadService::AddSample(const metrics::MetricSample& sample) {
@@ -217,19 +221,27 @@
 }
 
 void UploadService::StageCurrentLog() {
-  CHECK(!staged_log_)
-      << "staged logs must be discarded before another log can be staged";
+  // If we haven't logged anything since the last upload, don't upload an empty
+  // report.
+  if (!current_log_)
+    return;
 
-  if (!current_log_) return;
-
-  staged_log_.swap(current_log_);
-  staged_log_->CloseLog();
-  if (!staged_log_->PopulateSystemProfile(system_profile_setter_.get())) {
+  scoped_ptr<MetricsLog> staged_log;
+  staged_log.swap(current_log_);
+  staged_log->CloseLog();
+  if (!staged_log->PopulateSystemProfile(system_profile_setter_.get())) {
     LOG(WARNING) << "Error while adding metadata to the log. Discarding the "
                  << "log.";
-    staged_log_.reset();
+    return;
   }
-  failed_upload_count_ = 0;
+  std::string encoded_log;
+  staged_log->GetEncodedLog(&encoded_log);
+
+  failed_upload_count_.Set(0);
+  if (static_cast<int>(encoded_log.size()) != base::WriteFile(
+      staged_log_path_, encoded_log.data(), encoded_log.size())) {
+    LOG(ERROR) << "failed to persist to " << staged_log_path_.value();
+  }
 }
 
 MetricsLog* UploadService::GetOrCreateCurrentLog() {
@@ -238,3 +250,16 @@
   }
   return current_log_.get();
 }
+
+bool UploadService::HasStagedLog() {
+  return base::PathExists(staged_log_path_);
+}
+
+void UploadService::RemoveFailedLog() {
+  if (failed_upload_count_.Get() > kMaxFailedUpload) {
+    LOG(INFO) << "log failed more than " << kMaxFailedUpload << " times.";
+    CHECK(base::DeleteFile(staged_log_path_, false))
+        << "failed to delete staged log at " << staged_log_path_.value();
+    failed_upload_count_.Set(0);
+  }
+}
diff --git a/metricsd/uploader/upload_service.h b/metricsd/uploader/upload_service.h
index d9c690d..ea93d21 100644
--- a/metricsd/uploader/upload_service.h
+++ b/metricsd/uploader/upload_service.h
@@ -24,6 +24,7 @@
 #include "base/metrics/histogram_snapshot_manager.h"
 
 #include "metrics/metrics_library.h"
+#include "persistent_integer.h"
 #include "uploader/metrics_log.h"
 #include "uploader/sender.h"
 #include "uploader/system_profile_cache.h"
@@ -146,6 +147,12 @@
   // system information.
   void StageCurrentLog();
 
+  // Returns true iff a log is staged.
+  bool HasStagedLog();
+
+  // Remove the staged log iff the upload failed more than |kMaxFailedUpload|.
+  void RemoveFailedLog();
+
   // Returns the current log. If there is no current log, creates it first.
   MetricsLog* GetOrCreateCurrentLog();
 
@@ -153,11 +160,11 @@
   MetricsLibraryInterface* metrics_lib_;
   base::HistogramSnapshotManager histogram_snapshot_manager_;
   scoped_ptr<Sender> sender_;
-  int failed_upload_count_;
+  chromeos_metrics::PersistentInteger failed_upload_count_;
   scoped_ptr<MetricsLog> current_log_;
-  scoped_ptr<MetricsLog> staged_log_;
 
   base::FilePath metrics_file_;
+  base::FilePath staged_log_path_;
 
   bool testing_;
 };
diff --git a/metricsd/uploader/upload_service_test.cc b/metricsd/uploader/upload_service_test.cc
index d3b5289..fd8248a 100644
--- a/metricsd/uploader/upload_service_test.cc
+++ b/metricsd/uploader/upload_service_test.cc
@@ -24,6 +24,7 @@
 
 #include "constants.h"
 #include "metrics_library_mock.h"
+#include "persistent_integer.h"
 #include "serialization/metric_sample.h"
 #include "uploader/metrics_log.h"
 #include "uploader/mock/mock_system_profile_setter.h"
@@ -38,6 +39,8 @@
  protected:
   virtual void SetUp() {
     CHECK(dir_.CreateUniqueTempDir());
+    chromeos_metrics::PersistentInteger::SetMetricsDirectory(
+        dir_.path().value());
     upload_service_.reset(new UploadService(new MockSystemProfileSetter(),
                                             &metrics_lib_, "", true));
 
@@ -45,9 +48,6 @@
     upload_service_->Init(base::TimeDelta::FromMinutes(30), dir_.path());
     upload_service_->GatherHistograms();
     upload_service_->Reset();
-
-    chromeos_metrics::PersistentInteger::SetMetricsDirectory(
-        dir_.path().value());
   }
 
   scoped_ptr<metrics::MetricSample> Crash(const std::string& name) {
@@ -132,9 +132,17 @@
     upload_service_->UploadEvent();
   }
 
-  EXPECT_TRUE(upload_service_->staged_log_);
+  EXPECT_TRUE(upload_service_->HasStagedLog());
   upload_service_->UploadEvent();
-  EXPECT_FALSE(upload_service_->staged_log_);
+  EXPECT_FALSE(upload_service_->HasStagedLog());
+
+  // Log a new sample. The failed upload counter should be reset.
+  upload_service_->AddSample(*Crash("user"));
+  for (int i = 0; i < UploadService::kMaxFailedUpload; i++) {
+    upload_service_->UploadEvent();
+  }
+  // The log is not discarded after multiple failed uploads.
+  EXPECT_TRUE(upload_service_->HasStagedLog());
 }
 
 TEST_F(UploadServiceTest, EmptyLogsAreNotSent) {