Clean up update state when snapshots are interrupted or cancelled.

This patch addresses two edge cases with interrupted updates:
 (1) If the device reverts to the old slot *before* merging, snapshots
     must be removed.
 (2) If, during a merge, a snapshot is found to be invalid (because its
     partition was reflashed), that snapshot must be removed.

To encapsulate this logic, WaitForMerge has been renamed to
ProcessUpdateState. It is still intended to be called after /data is
mounted, or after a merge is initiated.

Bug: 139154795
Test: libsnapshot_test gtest
Change-Id: I37a25ca722f30ae9548894dcfbd70cb64330e416
diff --git a/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h b/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h
index b982b4b..e3fc4f6 100644
--- a/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h
+++ b/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h
@@ -66,7 +66,11 @@
     MergeCompleted,
 
     // Merging failed due to an unrecoverable error.
-    MergeFailed
+    MergeFailed,
+
+    // The update was implicitly cancelled, either by a rollback or a flash
+    // operation via fastboot. This state can only be returned by ProcessUpdateState.
+    Cancelled
 };
 
 class SnapshotManager final {
@@ -82,6 +86,7 @@
         virtual std::string GetGsidDir() const = 0;
         virtual std::string GetMetadataDir() const = 0;
         virtual std::string GetSlotSuffix() const = 0;
+        virtual std::string GetSuperDevice(uint32_t slot) const = 0;
         virtual const IPartitionOpener& GetPartitionOpener() const = 0;
     };
 
@@ -117,12 +122,15 @@
     // update has been marked successful after booting.
     bool InitiateMerge();
 
-    // Wait for the current merge to finish, then perform cleanup when it
-    // completes. It is necessary to call this after InitiateMerge(), or when
-    // a merge state is detected during boot.
+    // Perform any necessary post-boot actions. This should be run soon after
+    // /data is mounted.
     //
-    // Note that after calling WaitForMerge(), GetUpdateState() may still return
-    // that a merge is in progress:
+    // If a merge is in progress, this function will block until the merge is
+    // completed. If a merge or update was cancelled, this will clean up any
+    // update artifacts and return.
+    //
+    // Note that after calling this, GetUpdateState() may still return that a
+    // merge is in progress:
     //   MergeFailed indicates that a fatal error occurred. WaitForMerge() may
     //   called any number of times again to attempt to make more progress, but
     //   we do not expect it to succeed if a catastrophic error occurred.
@@ -135,7 +143,7 @@
     //
     //   MergeCompleted indicates that the update has fully completed.
     //   GetUpdateState will return None, and a new update can begin.
-    UpdateState WaitForMerge();
+    UpdateState ProcessUpdateState();
 
     // Find the status of the current update, if any.
     //
@@ -158,6 +166,7 @@
     FRIEND_TEST(SnapshotTest, CreateSnapshot);
     FRIEND_TEST(SnapshotTest, FirstStageMountAfterRollback);
     FRIEND_TEST(SnapshotTest, FirstStageMountAndMerge);
+    FRIEND_TEST(SnapshotTest, FlashSuperDuringMerge);
     FRIEND_TEST(SnapshotTest, FlashSuperDuringUpdate);
     FRIEND_TEST(SnapshotTest, MapPartialSnapshot);
     FRIEND_TEST(SnapshotTest, MapSnapshot);
@@ -245,6 +254,14 @@
     // List the known snapshot names.
     bool ListSnapshots(LockedFile* lock, std::vector<std::string>* snapshots);
 
+    // Check for a cancelled or rolled back update, returning true if such a
+    // condition was detected and handled.
+    bool HandleCancelledUpdate(LockedFile* lock);
+
+    // Remove artifacts created by the update process, such as snapshots, and
+    // set the update state to None.
+    bool RemoveAllUpdateState(LockedFile* lock);
+
     // Interact with /metadata/ota/state.
     std::unique_ptr<LockedFile> OpenStateFile(int open_flags, int lock_flags);
     std::unique_ptr<LockedFile> LockShared();
@@ -272,6 +289,7 @@
     bool MarkSnapshotMergeCompleted(LockedFile* snapshot_lock, const std::string& snapshot_name);
     void AcknowledgeMergeSuccess(LockedFile* lock);
     void AcknowledgeMergeFailure();
+    bool IsCancelledSnapshot(const std::string& snapshot_name);
 
     // Note that these require the name of the device containing the snapshot,
     // which may be the "inner" device. Use GetsnapshotDeviecName().
diff --git a/fs_mgr/libsnapshot/snapshot.cpp b/fs_mgr/libsnapshot/snapshot.cpp
index ab1157b..1f621cd 100644
--- a/fs_mgr/libsnapshot/snapshot.cpp
+++ b/fs_mgr/libsnapshot/snapshot.cpp
@@ -28,6 +28,7 @@
 #include <android-base/strings.h>
 #include <android-base/unique_fd.h>
 #include <ext4_utils/ext4_utils.h>
+#include <fs_mgr.h>
 #include <fs_mgr_dm_linear.h>
 #include <fstab/fstab.h>
 #include <libdm/dm.h>
@@ -64,6 +65,9 @@
     std::string GetMetadataDir() const override { return "/metadata/ota"s; }
     std::string GetSlotSuffix() const override { return fs_mgr_get_slot_suffix(); }
     const android::fs_mgr::IPartitionOpener& GetPartitionOpener() const { return opener_; }
+    std::string GetSuperDevice(uint32_t slot) const override {
+        return fs_mgr_get_super_partition_name(slot);
+    }
 
   private:
     android::fs_mgr::PartitionOpener opener_;
@@ -123,17 +127,20 @@
         LOG(ERROR) << "Cannot cancel update after it has completed or started merging";
         return false;
     }
+    return RemoveAllUpdateState(file.get());
+}
 
-    if (!RemoveAllSnapshots(file.get())) {
+bool SnapshotManager::RemoveAllUpdateState(LockedFile* lock) {
+    if (!RemoveAllSnapshots(lock)) {
         LOG(ERROR) << "Could not remove all snapshots";
         return false;
     }
 
-    if (!WriteUpdateState(file.get(), UpdateState::None)) {
-        LOG(ERROR) << "Could not write new update state";
-        return false;
-    }
-    return true;
+    RemoveSnapshotBootIndicator();
+
+    // If this fails, we'll keep trying to remove the update state (as the
+    // device reboots or starts a new update) until it finally succeeds.
+    return WriteUpdateState(lock, UpdateState::None);
 }
 
 bool SnapshotManager::FinishedSnapshotWrites() {
@@ -362,14 +369,13 @@
     if (!EnsureImageManager()) return false;
 
     auto cow_name = GetCowName(name);
-    if (!images_->BackingImageExists(cow_name)) {
-        return true;
-    }
-    if (images_->IsImageMapped(cow_name) && !images_->UnmapImageDevice(cow_name)) {
-        return false;
-    }
-    if (!images_->DeleteBackingImage(cow_name)) {
-        return false;
+    if (images_->BackingImageExists(cow_name)) {
+        if (images_->IsImageMapped(cow_name) && !images_->UnmapImageDevice(cow_name)) {
+            return false;
+        }
+        if (!images_->DeleteBackingImage(cow_name)) {
+            return false;
+        }
     }
 
     std::string error;
@@ -575,9 +581,12 @@
 // Note that when a merge fails, we will *always* try again to complete the
 // merge each time the device boots. There is no harm in doing so, and if
 // the problem was transient, we might manage to get a new outcome.
-UpdateState SnapshotManager::WaitForMerge() {
+UpdateState SnapshotManager::ProcessUpdateState() {
     while (true) {
         UpdateState state = CheckMergeState();
+        if (state == UpdateState::MergeFailed) {
+            AcknowledgeMergeFailure();
+        }
         if (state != UpdateState::Merging) {
             // Either there is no merge, or the merge was finished, so no need
             // to keep waiting.
@@ -593,15 +602,16 @@
 UpdateState SnapshotManager::CheckMergeState() {
     auto lock = LockExclusive();
     if (!lock) {
-        AcknowledgeMergeFailure();
         return UpdateState::MergeFailed;
     }
 
-    auto state = CheckMergeState(lock.get());
+    UpdateState state = CheckMergeState(lock.get());
     if (state == UpdateState::MergeCompleted) {
+        // Do this inside the same lock. Failures get acknowledged without the
+        // lock, because flock() might have failed.
         AcknowledgeMergeSuccess(lock.get());
-    } else if (state == UpdateState::MergeFailed) {
-        AcknowledgeMergeFailure();
+    } else if (state == UpdateState::Cancelled) {
+        RemoveAllUpdateState(lock.get());
     }
     return state;
 }
@@ -623,10 +633,17 @@
             // run.
             break;
 
+        case UpdateState::Unverified:
+            // This is an edge case. Normally cancelled updates are detected
+            // via the merge poll below, but if we never started a merge, we
+            // need to also check here.
+            if (HandleCancelledUpdate(lock)) {
+                return UpdateState::Cancelled;
+            }
+            return state;
+
         default:
-            LOG(ERROR) << "No merge exists, cannot wait. Update state: "
-                       << static_cast<uint32_t>(state);
-            return UpdateState::None;
+            return state;
     }
 
     std::vector<std::string> snapshots;
@@ -634,6 +651,7 @@
         return UpdateState::MergeFailed;
     }
 
+    bool cancelled = false;
     bool failed = false;
     bool merging = false;
     bool needs_reboot = false;
@@ -651,6 +669,9 @@
                 break;
             case UpdateState::MergeCompleted:
                 break;
+            case UpdateState::Cancelled:
+                cancelled = true;
+                break;
             default:
                 LOG(ERROR) << "Unknown merge status: " << static_cast<uint32_t>(snapshot_state);
                 failed = true;
@@ -673,6 +694,14 @@
         WriteUpdateState(lock, UpdateState::MergeNeedsReboot);
         return UpdateState::MergeNeedsReboot;
     }
+    if (cancelled) {
+        // This is an edge case that we handle as correctly as we sensibly can.
+        // The underlying partition has changed behind update_engine, and we've
+        // removed the snapshot as a result. The exact state of the update is
+        // undefined now, but this can only happen on an unlocked device where
+        // partitions can be flashed without wiping userdata.
+        return UpdateState::Cancelled;
+    }
     return UpdateState::MergeCompleted;
 }
 
@@ -684,17 +713,30 @@
 
     std::string dm_name = GetSnapshotDeviceName(name, snapshot_status);
 
-    // During a check, we decided the merge was complete, but we were unable to
-    // collapse the device-mapper stack and perform COW cleanup. If we haven't
-    // rebooted after this check, the device will still be a snapshot-merge
-    // target. If the have rebooted, the device will now be a linear target,
-    // and we can try cleanup again.
-    if (snapshot_status.state == SnapshotState::MergeCompleted && !IsSnapshotDevice(dm_name)) {
-        // NB: It's okay if this fails now, we gave cleanup our best effort.
-        OnSnapshotMergeComplete(lock, name, snapshot_status);
-        return UpdateState::MergeCompleted;
+    if (!IsSnapshotDevice(dm_name)) {
+        if (IsCancelledSnapshot(name)) {
+            DeleteSnapshot(lock, name);
+            return UpdateState::Cancelled;
+        }
+
+        // During a check, we decided the merge was complete, but we were unable to
+        // collapse the device-mapper stack and perform COW cleanup. If we haven't
+        // rebooted after this check, the device will still be a snapshot-merge
+        // target. If we have rebooted, the device will now be a linear target,
+        // and we can try cleanup again.
+        if (snapshot_status.state == SnapshotState::MergeCompleted) {
+            // NB: It's okay if this fails now, we gave cleanup our best effort.
+            OnSnapshotMergeComplete(lock, name, snapshot_status);
+            return UpdateState::MergeCompleted;
+        }
+
+        LOG(ERROR) << "Expected snapshot or snapshot-merge for device: " << dm_name;
+        return UpdateState::MergeFailed;
     }
 
+    // This check is expensive so it is only enabled for debugging.
+    DCHECK(!IsCancelledSnapshot(name));
+
     std::string target_type;
     DmTargetSnapshot::Status status;
     if (!QuerySnapshotStatus(dm_name, &target_type, &status)) {
@@ -750,12 +792,7 @@
 }
 
 void SnapshotManager::AcknowledgeMergeSuccess(LockedFile* lock) {
-    RemoveSnapshotBootIndicator();
-
-    if (!WriteUpdateState(lock, UpdateState::None)) {
-        // We'll try again next reboot, ad infinitum.
-        return;
-    }
+    RemoveAllUpdateState(lock);
 }
 
 void SnapshotManager::AcknowledgeMergeFailure() {
@@ -906,6 +943,44 @@
     return true;
 }
 
+bool SnapshotManager::HandleCancelledUpdate(LockedFile* lock) {
+    std::string old_slot;
+    auto boot_file = GetSnapshotBootIndicatorPath();
+    if (!android::base::ReadFileToString(boot_file, &old_slot)) {
+        PLOG(ERROR) << "Unable to read the snapshot indicator file: " << boot_file;
+        return false;
+    }
+    if (device_->GetSlotSuffix() != old_slot) {
+        // We're booted into the target slot, which means we just rebooted
+        // after applying the update.
+        return false;
+    }
+
+    // The only way we can get here is if:
+    //  (1) The device rolled back to the previous slot.
+    //  (2) This function was called prematurely before rebooting the device.
+    //  (3) fastboot set_active was used.
+    //
+    // In any case, delete the snapshots. It may be worth using the boot_control
+    // HAL to differentiate case (2).
+    RemoveAllUpdateState(lock);
+    return true;
+}
+
+bool SnapshotManager::IsCancelledSnapshot(const std::string& snapshot_name) {
+    const auto& opener = device_->GetPartitionOpener();
+    uint32_t slot = SlotNumberForSlotSuffix(device_->GetSlotSuffix());
+    auto super_device = device_->GetSuperDevice(slot);
+    auto metadata = android::fs_mgr::ReadMetadata(opener, super_device, slot);
+    if (!metadata) {
+        LOG(ERROR) << "Could not read dynamic partition metadata for device: " << super_device;
+        return false;
+    }
+    auto partition = android::fs_mgr::FindPartition(*metadata.get(), snapshot_name);
+    if (!partition) return false;
+    return (partition->attributes & LP_PARTITION_ATTR_UPDATED) == 0;
+}
+
 bool SnapshotManager::RemoveAllSnapshots(LockedFile* lock) {
     std::vector<std::string> snapshots;
     if (!ListSnapshots(lock, &snapshots)) {
diff --git a/fs_mgr/libsnapshot/snapshot_test.cpp b/fs_mgr/libsnapshot/snapshot_test.cpp
index acffe8c..f3678d1 100644
--- a/fs_mgr/libsnapshot/snapshot_test.cpp
+++ b/fs_mgr/libsnapshot/snapshot_test.cpp
@@ -347,7 +347,7 @@
     // We should not be able to cancel an update now.
     ASSERT_FALSE(sm->CancelUpdate());
 
-    ASSERT_EQ(sm->WaitForMerge(), UpdateState::MergeCompleted);
+    ASSERT_EQ(sm->ProcessUpdateState(), UpdateState::MergeCompleted);
     ASSERT_EQ(sm->GetUpdateState(), UpdateState::None);
 
     // The device should no longer be a snapshot or snapshot-merge.
@@ -388,7 +388,7 @@
     ASSERT_TRUE(sm->InitiateMerge());
 
     // COW cannot be removed due to open fd, so expect a soft failure.
-    ASSERT_EQ(sm->WaitForMerge(), UpdateState::MergeNeedsReboot);
+    ASSERT_EQ(sm->ProcessUpdateState(), UpdateState::MergeNeedsReboot);
 
     // Forcefully delete the snapshot device, so it looks like we just rebooted.
     DeleteSnapshotDevice("test-snapshot");
@@ -401,7 +401,7 @@
     fd = {};
     lock_ = nullptr;
 
-    ASSERT_EQ(sm->WaitForMerge(), UpdateState::MergeCompleted);
+    ASSERT_EQ(sm->ProcessUpdateState(), UpdateState::MergeCompleted);
 }
 
 TEST_F(SnapshotTest, FirstStageMountAndMerge) {
@@ -482,6 +482,52 @@
     DeviceMapper::TargetInfo target;
     auto dm_name = init->GetSnapshotDeviceName("test_partition_b", status);
     ASSERT_FALSE(init->IsSnapshotDevice(dm_name, &target));
+
+    // We should see a cancelled update as well.
+    lock_ = nullptr;
+    ASSERT_EQ(sm->ProcessUpdateState(), UpdateState::Cancelled);
+}
+
+TEST_F(SnapshotTest, FlashSuperDuringMerge) {
+    ON_CALL(*GetMockedPropertyFetcher(), GetBoolProperty("ro.virtual_ab.enabled", _))
+            .WillByDefault(Return(true));
+
+    ASSERT_TRUE(AcquireLock());
+
+    static const uint64_t kDeviceSize = 1024 * 1024;
+
+    ASSERT_TRUE(CreatePartition("test_partition_a", kDeviceSize));
+    ASSERT_TRUE(MapUpdatePartitions());
+    ASSERT_TRUE(sm->CreateSnapshot(lock_.get(), "test_partition_b", kDeviceSize, kDeviceSize,
+                                   kDeviceSize));
+
+    // Simulate a reboot into the new slot.
+    lock_ = nullptr;
+    ASSERT_TRUE(sm->FinishedSnapshotWrites());
+    ASSERT_TRUE(DestroyLogicalPartition("test_partition_b"));
+
+    auto rebooted = new TestDeviceInfo(fake_super);
+    rebooted->set_slot_suffix("_b");
+
+    auto init = SnapshotManager::NewForFirstStageMount(rebooted);
+    ASSERT_NE(init, nullptr);
+    ASSERT_TRUE(init->NeedSnapshotsInFirstStageMount());
+    ASSERT_TRUE(init->CreateLogicalAndSnapshotPartitions("super"));
+    ASSERT_TRUE(init->InitiateMerge());
+
+    // Now, reflash super. Note that we haven't called ProcessUpdateState, so the
+    // status is still Merging.
+    DeleteSnapshotDevice("test_partition_b");
+    ASSERT_TRUE(init->image_manager()->UnmapImageDevice("test_partition_b-cow"));
+    FormatFakeSuper();
+    ASSERT_TRUE(CreatePartition("test_partition_b", kDeviceSize));
+    ASSERT_TRUE(init->NeedSnapshotsInFirstStageMount());
+    ASSERT_TRUE(init->CreateLogicalAndSnapshotPartitions("super"));
+
+    // Because the status is Merging, we must call ProcessUpdateState, which should
+    // detect a cancelled update.
+    ASSERT_EQ(sm->ProcessUpdateState(), UpdateState::Cancelled);
+    ASSERT_EQ(sm->GetUpdateState(), UpdateState::None);
 }
 
 }  // namespace snapshot
diff --git a/fs_mgr/libsnapshot/test_helpers.h b/fs_mgr/libsnapshot/test_helpers.h
index c87f118..9f582d9 100644
--- a/fs_mgr/libsnapshot/test_helpers.h
+++ b/fs_mgr/libsnapshot/test_helpers.h
@@ -47,6 +47,7 @@
     std::string GetGsidDir() const override { return "ota/test"s; }
     std::string GetMetadataDir() const override { return "/metadata/ota/test"s; }
     std::string GetSlotSuffix() const override { return slot_suffix_; }
+    std::string GetSuperDevice([[maybe_unused]] uint32_t slot) const override { return "super"; }
     const android::fs_mgr::IPartitionOpener& GetPartitionOpener() const override {
         return *opener_.get();
     }