update_engine: call res_init and retry one extra time on unresolved host
libcurl error
Based on https://curl.haxx.se/docs/todo.html#updated_DNS_server_while_running:
"If /etc/resolv.conf gets updated while a program using libcurl is running, it
may cause name resolves to fail unless res_init() is called. We should
consider calling res_init() + retry once unconditionally on all name resolve
failures to mitigate against this."
This CL added following behavior:
On libcurl returns CURLE_COULDNT_RESOLVE_HOST error code:
1. we increase the max retry count by 1 for the first time it happens in the
lifetime of an LibcurlHttpFetcher object.
2. we call res_init unconditionally.
We also add UMA metrics to measure whether calling res_init helps
mitigate the unresolved host problem. WIP CL: https://chromium-review.googlesource.com/c/chromium/src/+/1698722
BUG=chromium:982813
TEST=FEATURES="test" emerge-kefka update_engine, tested on a device
Change-Id: Ia894eae93b3a0adbac1a831e657b75cba835dfa0
diff --git a/libcurl_http_fetcher.cc b/libcurl_http_fetcher.cc
index 06722fd..247327a 100644
--- a/libcurl_http_fetcher.cc
+++ b/libcurl_http_fetcher.cc
@@ -16,6 +16,8 @@
#include "update_engine/libcurl_http_fetcher.h"
+#include <netinet/in.h>
+#include <resolv.h>
#include <sys/types.h>
#include <unistd.h>
@@ -480,14 +482,45 @@
if (http_response_code_) {
LOG(INFO) << "HTTP response code: " << http_response_code_;
no_network_retry_count_ = 0;
+ unresolved_host_state_machine_.UpdateState(false);
} else {
LOG(ERROR) << "Unable to get http response code.";
- LogCurlHandleInfo();
+ CURLcode curl_code = GetCurlCode();
+ LOG(ERROR) << "Return code for the transfer: " << curl_code;
+ if (curl_code == CURLE_COULDNT_RESOLVE_HOST) {
+ LOG(ERROR) << "libcurl can not resolve host.";
+ unresolved_host_state_machine_.UpdateState(true);
+ if (delegate_) {
+ delegate_->ReportUpdateCheckMetrics(
+ metrics::CheckResult::kUnset,
+ metrics::CheckReaction::kUnset,
+ metrics::DownloadErrorCode::kUnresolvedHost);
+ }
+ }
}
// we're done!
CleanUp();
+ if (unresolved_host_state_machine_.getState() ==
+ UnresolvedHostStateMachine::State::kRetry) {
+ // Based on
+ // https://curl.haxx.se/docs/todo.html#updated_DNS_server_while_running,
+ // update_engine process should call res_init() and unconditionally retry.
+ res_init();
+ no_network_max_retries_++;
+ LOG(INFO) << "Will retry after reloading resolv.conf because last attempt "
+ "failed to resolve host.";
+ } else if (unresolved_host_state_machine_.getState() ==
+ UnresolvedHostStateMachine::State::kRetriedSuccess) {
+ if (delegate_) {
+ delegate_->ReportUpdateCheckMetrics(
+ metrics::CheckResult::kUnset,
+ metrics::CheckReaction::kUnset,
+ metrics::DownloadErrorCode::kUnresolvedHostRecovered);
+ }
+ }
+
// TODO(petkov): This temporary code tries to deal with the case where the
// update engine performs an update check while the network is not ready
// (e.g., right after resume). Longer term, we should check if the network
@@ -813,7 +846,8 @@
}
}
-void LibcurlHttpFetcher::LogCurlHandleInfo() {
+CURLcode LibcurlHttpFetcher::GetCurlCode() {
+ CURLcode curl_code = CURLE_OK;
while (true) {
// Repeated calls to |curl_multi_info_read| will return a new struct each
// time, until a NULL is returned as a signal that there is no more to get
@@ -831,7 +865,7 @@
CHECK_EQ(curl_handle_, curl_msg->easy_handle);
// Transfer return code reference:
// https://curl.haxx.se/libcurl/c/libcurl-errors.html
- LOG(ERROR) << "Return code for the transfer: " << curl_msg->data.result;
+ curl_code = curl_msg->data.result;
}
}
@@ -842,6 +876,32 @@
if (res == CURLE_OK && connect_error) {
LOG(ERROR) << "Connect error code from the OS: " << connect_error;
}
+
+ return curl_code;
+}
+
+void UnresolvedHostStateMachine::UpdateState(bool failed_to_resolve_host) {
+ switch (state_) {
+ case State::kInit:
+ if (failed_to_resolve_host) {
+ state_ = State::kRetry;
+ }
+ break;
+ case State::kRetry:
+ if (failed_to_resolve_host) {
+ state_ = State::kNotRetry;
+ } else {
+ state_ = State::kRetriedSuccess;
+ }
+ break;
+ case State::kNotRetry:
+ break;
+ case State::kRetriedSuccess:
+ break;
+ default:
+ NOTREACHED();
+ break;
+ }
}
} // namespace chromeos_update_engine
diff --git a/libcurl_http_fetcher.h b/libcurl_http_fetcher.h
index 3978b70..cdd489d 100644
--- a/libcurl_http_fetcher.h
+++ b/libcurl_http_fetcher.h
@@ -37,6 +37,48 @@
namespace chromeos_update_engine {
+// |UnresolvedHostStateMachine| is a representation of internal state machine of
+// |LibcurlHttpFetcher|.
+class UnresolvedHostStateMachine {
+ public:
+ UnresolvedHostStateMachine() = default;
+ enum class State {
+ kInit = 0,
+ kRetry = 1,
+ kRetriedSuccess = 2,
+ kNotRetry = 3,
+ };
+
+ State getState() { return state_; }
+
+ // Updates the following internal state machine:
+ //
+ // |kInit|
+ // |
+ // |
+ // \/
+ // (Try, host Unresolved)
+ // |
+ // |
+ // \/
+ // |kRetry| --> (Retry, host resolved)
+ // | |
+ // | |
+ // \/ \/
+ // (Retry, host Unresolved) |kRetriedSuccess|
+ // |
+ // |
+ // \/
+ // |kNotRetry|
+ //
+ void UpdateState(bool failed_to_resolve_host);
+
+ private:
+ State state_ = {State::kInit};
+
+ DISALLOW_COPY_AND_ASSIGN(UnresolvedHostStateMachine);
+};
+
class LibcurlHttpFetcher : public HttpFetcher {
public:
LibcurlHttpFetcher(ProxyResolver* proxy_resolver,
@@ -88,6 +130,8 @@
no_network_max_retries_ = retries;
}
+ int get_no_network_max_retries() { return no_network_max_retries_; }
+
void set_server_to_check(ServerToCheck server_to_check) {
server_to_check_ = server_to_check;
}
@@ -125,10 +169,8 @@
// Asks libcurl for the http response code and stores it in the object.
void GetHttpResponseCode();
- // Logs curl handle info.
- // This can be called only when an http request failed to avoid spamming the
- // logs. This must be called after |ResumeTransfer| and before |CleanUp|.
- void LogCurlHandleInfo();
+ // Returns the last |CURLcode|.
+ CURLcode GetCurlCode();
// Checks whether stored HTTP response is within the success range.
inline bool IsHttpResponseSuccess() {
@@ -280,6 +322,9 @@
// True if this object is for update check.
bool is_update_check_{false};
+ // Internal state machine.
+ UnresolvedHostStateMachine unresolved_host_state_machine_;
+
int low_speed_limit_bps_{kDownloadLowSpeedLimitBps};
int low_speed_time_seconds_{kDownloadLowSpeedTimeSeconds};
int connect_timeout_seconds_{kDownloadConnectTimeoutSeconds};
diff --git a/libcurl_http_fetcher_unittest.cc b/libcurl_http_fetcher_unittest.cc
index 88e48fa..7f00dae 100644
--- a/libcurl_http_fetcher_unittest.cc
+++ b/libcurl_http_fetcher_unittest.cc
@@ -43,6 +43,7 @@
brillo::FakeMessageLoop loop_{nullptr};
FakeHardware fake_hardware_;
LibcurlHttpFetcher libcurl_fetcher_{nullptr, &fake_hardware_};
+ UnresolvedHostStateMachine state_machine_;
};
TEST_F(LibcurlHttpFetcherTest, GetEmptyHeaderValueTest) {
@@ -78,4 +79,60 @@
EXPECT_EQ(header_value, actual_header_value);
}
+TEST_F(LibcurlHttpFetcherTest, InvalidURLTest) {
+ int no_network_max_retries = 1;
+ libcurl_fetcher_.set_no_network_max_retries(no_network_max_retries);
+
+ libcurl_fetcher_.BeginTransfer("not-an-URL");
+ while (loop_.PendingTasks()) {
+ loop_.RunOnce(true);
+ }
+
+ EXPECT_EQ(libcurl_fetcher_.get_no_network_max_retries(),
+ no_network_max_retries);
+}
+
+TEST_F(LibcurlHttpFetcherTest, CouldntResolveHostTest) {
+ int no_network_max_retries = 1;
+ libcurl_fetcher_.set_no_network_max_retries(no_network_max_retries);
+
+ // This test actually sends request to internet but according to
+ // https://tools.ietf.org/html/rfc2606#section-2, .invalid domain names are
+ // reserved and sure to be invalid. Ideally we should mock libcurl or
+ // reorganize LibcurlHttpFetcher so the part that sends request can be mocked
+ // easily.
+ // TODO(xiaochu) Refactor LibcurlHttpFetcher (and its relates) so it's
+ // easier to mock the part that depends on internet connectivity.
+ libcurl_fetcher_.BeginTransfer("https://An-uNres0lvable-uRl.invalid");
+ while (loop_.PendingTasks()) {
+ loop_.RunOnce(true);
+ }
+
+ // If libcurl fails to resolve the name, we call res_init() to reload
+ // resolv.conf and retry exactly once more. See crbug.com/982813 for details.
+ EXPECT_EQ(libcurl_fetcher_.get_no_network_max_retries(),
+ no_network_max_retries + 1);
+}
+
+TEST_F(LibcurlHttpFetcherTest, HttpFetcherStateMachineRetryFailedTest) {
+ state_machine_.UpdateState(true);
+ state_machine_.UpdateState(true);
+ EXPECT_EQ(state_machine_.getState(),
+ UnresolvedHostStateMachine::State::kNotRetry);
+}
+
+TEST_F(LibcurlHttpFetcherTest, HttpFetcherStateMachineRetrySucceedTest) {
+ state_machine_.UpdateState(true);
+ state_machine_.UpdateState(false);
+ EXPECT_EQ(state_machine_.getState(),
+ UnresolvedHostStateMachine::State::kRetriedSuccess);
+}
+
+TEST_F(LibcurlHttpFetcherTest, HttpFetcherStateMachineNoRetryTest) {
+ state_machine_.UpdateState(false);
+ state_machine_.UpdateState(false);
+ EXPECT_EQ(state_machine_.getState(),
+ UnresolvedHostStateMachine::State::kInit);
+}
+
} // namespace chromeos_update_engine
diff --git a/metrics_constants.h b/metrics_constants.h
index b3833a3..161d585 100644
--- a/metrics_constants.h
+++ b/metrics_constants.h
@@ -60,6 +60,11 @@
// above block and before the kInputMalformed field. This
// is to ensure that error codes are not reordered.
+ // This error is reported when libcurl returns CURLE_COULDNT_RESOLVE_HOST and
+ // calling res_init() can recover.
+ kUnresolvedHostRecovered = 97,
+ // This error is reported when libcurl returns CURLE_COULDNT_RESOLVE_HOST.
+ kUnresolvedHost = 98,
// This error is reported when libcurl has an internal error that
// update_engine can't recover from.
kInternalError = 99,