From cb45a3f511132925b0ea76ecbcf14e872d4854c7 Mon Sep 17 00:00:00 2001
From: Lawrence Forooghian <lawrence@forooghian.com>
Date: Thu, 31 Oct 2024 16:35:40 -0300
Subject: [PATCH] Implement RETRY room lifecycle operation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Based on the spec referenced in 20f21c7. The RETRY part of this spec was
quite unclear, and I asked quite a few questions on the PR to understand
it better, so the behaviour implemented here is based on the spec plus
Andy’s answers to my questions (I’ve linked to the discussions in the
code and / or tests). Recently (i.e. after most of this commit was
already implemented), Andy updated the spec with answers to these
questions but, in the interests of not dragging out the current task,
I’ll incorporate these updates in #66.

The internal triggering of the RETRY operation (as specified by
CHA-RL1h3 and CHA-RL4b9) will come in #50.

Resolves #51.
---
 Sources/AblyChat/RoomLifecycleManager.swift   | 170 +++++++++-
 .../DefaultRoomLifecycleManagerTests.swift    | 310 +++++++++++++++++-
 .../MockRoomLifecycleContributorChannel.swift |  18 +-
 3 files changed, 478 insertions(+), 20 deletions(-)

diff --git a/Sources/AblyChat/RoomLifecycleManager.swift b/Sources/AblyChat/RoomLifecycleManager.swift
index a6dc72bc..f950a473 100644
--- a/Sources/AblyChat/RoomLifecycleManager.swift
+++ b/Sources/AblyChat/RoomLifecycleManager.swift
@@ -181,11 +181,12 @@ internal actor DefaultRoomLifecycleManager<Contributor: RoomLifecycleContributor
     internal enum Status: Equatable {
         case initialized
         case attachingDueToAttachOperation(attachOperationID: UUID)
+        case attachingDueToRetryOperation(retryOperationID: UUID)
         case attachingDueToContributorStateChange(error: ARTErrorInfo?)
         case attached
         case detaching(detachOperationID: UUID)
         case detached
-        case suspended(error: ARTErrorInfo)
+        case suspended(retryOperationID: UUID, error: ARTErrorInfo)
         case failed(error: ARTErrorInfo)
         case releasing(releaseOperationID: UUID)
         case released
@@ -196,6 +197,8 @@ internal actor DefaultRoomLifecycleManager<Contributor: RoomLifecycleContributor
                 .initialized
             case .attachingDueToAttachOperation:
                 .attaching(error: nil)
+            case .attachingDueToRetryOperation:
+                .attaching(error: nil)
             case let .attachingDueToContributorStateChange(error: error):
                 .attaching(error: error)
             case .attached:
@@ -204,7 +207,7 @@ internal actor DefaultRoomLifecycleManager<Contributor: RoomLifecycleContributor
                 .detaching
             case .detached:
                 .detached
-            case let .suspended(error):
+            case let .suspended(_, error):
                 .suspended(error: error)
             case let .failed(error):
                 .failed(error: error)
@@ -219,12 +222,15 @@ internal actor DefaultRoomLifecycleManager<Contributor: RoomLifecycleContributor
             switch self {
             case let .attachingDueToAttachOperation(attachOperationID):
                 attachOperationID
+            case let .attachingDueToRetryOperation(retryOperationID):
+                retryOperationID
             case let .detaching(detachOperationID):
                 detachOperationID
             case let .releasing(releaseOperationID):
                 releaseOperationID
-            case .suspended,
-                 .initialized,
+            case let .suspended(retryOperationID, _):
+                retryOperationID
+            case .initialized,
                  .attached,
                  .detached,
                  .failed,
@@ -462,7 +468,7 @@ internal actor DefaultRoomLifecycleManager<Contributor: RoomLifecycleContributor
 
                 clearTransientDisconnectTimeouts()
 
-                changeStatus(to: .suspended(error: reason))
+                changeStatus(to: .suspended(retryOperationID: UUID(), error: reason))
             }
         case .attaching:
             if !hasOperationInProgress, !contributorAnnotations[contributor].hasTransientDisconnectTimeout {
@@ -532,7 +538,7 @@ internal actor DefaultRoomLifecycleManager<Contributor: RoomLifecycleContributor
 
     /// Whether the room lifecycle manager currently has a room lifecycle operation in progress.
     ///
-    /// - Warning: I haven’t yet figured out the exact meaning of “has an operation in progress” — at what point is an operation considered to be no longer in progress? Is it the point at which the operation has updated the manager’s status to one that no longer indicates an in-progress operation (this is the meaning currently used by `hasOperationInProgress`)? Or is it the point at which the `bodyOf*Operation` method for that operation exits (i.e. the point at which ``performAnOperation(_:)`` considers the operation to have completed)? Does it matter? I’ve chosen to not think about this very much right now, but might need to revisit. See TODO against `emitPendingDiscontinuityEvents` in `bodyOfDetachOperation` for an example of something where these two notions of “has an operation in progress” are not equivalent.
+    /// - Warning: I haven’t yet figured out the exact meaning of “has an operation in progress” — at what point is an operation considered to be no longer in progress? Is it the point at which the operation has updated the manager’s status to one that no longer indicates an in-progress operation (this is the meaning currently used by `hasOperationInProgress`)? Or is it the point at which the `bodyOf*Operation` method for that operation exits (i.e. the point at which ``performAnOperation(_:)`` considers the operation to have completed)? Does it matter? I’ve chosen to not think about this very much right now, but might need to revisit. See TODO against `emitPendingDiscontinuityEvents` in `performAttachmentCycle` for an example of something where these two notions of “has an operation in progress” are not equivalent.
     private var hasOperationInProgress: Bool {
         status.operationID != nil
     }
@@ -675,7 +681,7 @@ internal actor DefaultRoomLifecycleManager<Contributor: RoomLifecycleContributor
         case .released:
             // CHA-RL1c
             throw ARTErrorInfo(chatError: .roomIsReleased)
-        case .initialized, .suspended, .attachingDueToAttachOperation, .attachingDueToContributorStateChange, .detached, .detaching, .failed:
+        case .initialized, .suspended, .attachingDueToAttachOperation, .attachingDueToRetryOperation, .attachingDueToContributorStateChange, .detached, .detaching, .failed:
             break
         }
 
@@ -684,8 +690,39 @@ internal actor DefaultRoomLifecycleManager<Contributor: RoomLifecycleContributor
             try? await waitForCompletionOfOperationWithID(currentOperationID, waitingOperationID: operationID)
         }
 
+        try await performAttachmentCycle(trigger: .attachOperation(id: operationID))
+    }
+
+    private enum AttachmentCycleTrigger { // Identifies which operation is running the attachment cycle, carrying that operation's ID
+        case attachOperation(id: UUID) // The cycle is being run on behalf of an ATTACH operation
+        case retryOperation(id: UUID) // The cycle is being run on behalf of a RETRY operation
+
+        var attachingStatus: Status { // The ATTACHING status variant that the cycle enters for this trigger
+            switch self {
+            case let .attachOperation(id):
+                .attachingDueToAttachOperation(attachOperationID: id)
+            case let .retryOperation(id):
+                .attachingDueToRetryOperation(retryOperationID: id)
+            }
+        }
+
+        // Returns the operation ID to store in a `.suspended` status entered during the attachment cycle: a fresh ID when triggered by ATTACH, or the existing RETRY operation's ID when triggered by RETRY
+        func createRetryOperationID() -> UUID {
+            switch self {
+            case .attachOperation:
+                // i.e. generate a new ID
+                UUID()
+            case let .retryOperation(id):
+                // i.e. keep the ID of the existing RETRY operation
+                id
+            }
+        }
+    }
+
+    // TODO: explain what this is, and what I’ve guessed
+    private func performAttachmentCycle(trigger: AttachmentCycleTrigger) async throws {
         // CHA-RL1e
-        changeStatus(to: .attachingDueToAttachOperation(attachOperationID: operationID))
+        changeStatus(to: trigger.attachingStatus)
 
         // CHA-RL1f
         for contributor in contributors {
@@ -701,10 +738,20 @@ internal actor DefaultRoomLifecycleManager<Contributor: RoomLifecycleContributor
                 case .suspended:
                     // CHA-RL1h2
                     let error = ARTErrorInfo(chatError: .attachmentFailed(feature: contributor.feature, underlyingError: contributorAttachError))
-                    changeStatus(to: .suspended(error: error))
-
-                    // CHA-RL1h3
-                    throw error
+                    let retryOperationID = trigger.createRetryOperationID()
+                    changeStatus(to: .suspended(retryOperationID: retryOperationID, error: error))
+
+                    switch trigger {
+                    case .attachOperation:
+                        // CHA-RL1h3
+                        throw error
+                    case .retryOperation:
+                        // CHA-RL5f3 (TODO confirm that the idea is continue the existing RETRY operation, not to schedule a new one)
+                        // TODO: it’s not great that there’s recursion here, could have stack overflow; would be better as a loop
+                        logger.log(message: "Attachment cycle will continue existing RETRY operation with ID \(retryOperationID)", level: .debug)
+                        await bodyOfRetryOperation(operationID: retryOperationID, triggeredByContributor: contributor)
+                        return
+                    }
                 case .failed:
                     // CHA-RL1h4
                     let error = ARTErrorInfo(chatError: .attachmentFailed(feature: contributor.feature, underlyingError: contributorAttachError))
@@ -712,7 +759,13 @@ internal actor DefaultRoomLifecycleManager<Contributor: RoomLifecycleContributor
 
                     // CHA-RL1h5
                     // TODO: Implement the "asynchronously with respect to CHA-RL1h4" part of CHA-RL1h5 (https://github.com/ably-labs/ably-chat-swift/issues/50)
-                    await detachNonFailedContributors()
+                    switch trigger {
+                    case .attachOperation:
+                        await detachNonFailedContributors()
+                    case .retryOperation:
+                        // TODO: _are_ we meant to do the CHA-RL1h5 in the CHA-RL5f case? (https://github.com/ably/specification/pull/200/files#r1829964805)
+                        break
+                    }
 
                     throw error
                 default:
@@ -728,8 +781,8 @@ internal actor DefaultRoomLifecycleManager<Contributor: RoomLifecycleContributor
         // CHA-RL1g1
         changeStatus(to: .attached)
 
-        // CHA-RL1g2
-        // TODO: It’s not clear to me whether this is considered to be part of the ATTACH operation or not; see the note on the ``hasOperationInProgress`` property
+        // CHA-RL1g2, CHA-RL5f1
+        // TODO: It’s not clear to me whether this is considered to be part of the ATTACH / RETRY operation or not; see the note on the ``hasOperationInProgress`` property
         await emitPendingDiscontinuityEvents()
     }
 
@@ -798,7 +851,7 @@ internal actor DefaultRoomLifecycleManager<Contributor: RoomLifecycleContributor
         case .failed:
             // CHA-RL2d
             throw ARTErrorInfo(chatError: .roomInFailedState)
-        case .initialized, .suspended, .attachingDueToAttachOperation, .attachingDueToContributorStateChange, .attached, .detaching:
+        case .initialized, .suspended, .attachingDueToAttachOperation, .attachingDueToRetryOperation, .attachingDueToContributorStateChange, .attached, .detaching:
             break
         }
 
@@ -897,7 +950,7 @@ internal actor DefaultRoomLifecycleManager<Contributor: RoomLifecycleContributor
             // See note on waitForCompletionOfOperationWithID for the current need for this force try
             // swiftlint:disable:next force_try
             return try! await waitForCompletionOfOperationWithID(releaseOperationID, waitingOperationID: operationID)
-        case .initialized, .attached, .attachingDueToAttachOperation, .attachingDueToContributorStateChange, .detaching, .suspended, .failed:
+        case .initialized, .attached, .attachingDueToAttachOperation, .attachingDueToRetryOperation, .attachingDueToContributorStateChange, .detaching, .suspended, .failed:
             break
         }
 
@@ -935,4 +988,87 @@ internal actor DefaultRoomLifecycleManager<Contributor: RoomLifecycleContributor
         // CHA-RL3g
         changeStatus(to: .released)
     }
+
+    // MARK: - RETRY operation
+
+    /// Implements CHA-RL5’s RETRY operation.
+    ///
+    /// - Parameters:
+    ///   - forcedOperationID: Allows tests to force the operation to have a given ID. In combination with the ``testsOnly_subscribeToOperationWaitEvents`` API, this allows tests to verify that one test-initiated operation is waiting for another test-initiated operation.
+    ///   - triggeringContributor: This is, in the language of CHA-RL5d, “the channel that caused the retry loop”.
+    internal func performRetryOperation(testsOnly_forcingOperationID forcedOperationID: UUID? = nil, triggeredByContributor triggeringContributor: Contributor) async {
+        await performAnOperation(forcingOperationID: forcedOperationID) { operationID in
+            await bodyOfRetryOperation(operationID: operationID, triggeredByContributor: triggeringContributor)
+        }
+    }
+
+    private func bodyOfRetryOperation(operationID: UUID, triggeredByContributor triggeringContributor: Contributor) async { // Steps: detach the other contributors (CHA-RL5a to CHA-RL5c), wait for the triggering contributor to become ATTACHED or FAILED (CHA-RL5d, CHA-RL5e), then run the attachment cycle (CHA-RL5f)
+        detachAllContributorsLoop: while true {
+            // CHA-RL5a - This is actually what’s written in the spec plus Andy’s clarification from https://github.com/ably/specification/pull/200/files#r1794116352 that "the implementation is supposed to be that all channels _except_ the one that entered suspended should be sent to detached" (TODO remove this comment once spec updated)
+            for contributor in contributors where contributor.id != triggeringContributor.id {
+                do {
+                    logger.log(message: "RETRY operation will detach contributor \(contributor)", level: .debug)
+                    try await contributor.channel.detach()
+                } catch {
+                    let contributorState = await contributor.channel.state
+                    if contributorState == .failed {
+                        // CHA-RL5c
+                        guard let contributorErrorReason = await contributor.channel.errorReason else {
+                            // TODO: We assume this will be populated, but working in a multi-threaded environment means it might not be (https://github.com/ably-labs/ably-chat-swift/issues/49)
+                            preconditionFailure("Contributor entered FAILED but its errorReason is not set")
+                        }
+                        // TODO: which error to use? https://github.com/ably/specification/pull/200/files#r1827819961
+                        changeStatus(to: .failed(error: contributorErrorReason))
+                        return
+                    } else {
+                        // CHA-RL5b - This is actually what’s written in the spec plus Andy’s clarification from https://github.com/ably/specification/pull/200/files#r1794116352 that this means "If the operation above fails because a channel has entered a state other than FAILED" (TODO remove this comment once spec updated)
+                        // TODO: This duration is a guess, find out correct duration (https://github.com/ably/specification/pull/200/files#r1825086288)
+                        let waitDuration = 0.5
+                        logger.log(message: "Got error \(error) attempting to detach contributor \(contributor); will retry detaching contributors in \(waitDuration)s", level: .debug)
+                        // TODO: what to do if cancelled? NOTE(review): the `try!` below crashes if the sleep throws (presumably on task cancellation; confirm against the clock implementation)
+                        try! await clock.sleep(timeInterval: waitDuration)
+                        // TODO: this is a bit messy; the labelled `continue` restarts the whole detach pass from the first non-triggering contributor
+                        continue detachAllContributorsLoop
+                    }
+                }
+            }
+
+            break // Reached only when every `detach()` call in this pass returned without throwing
+        }
+
+        // TODO: what if this had already happened sometime during the above detaches?
+        logger.log(message: "RETRY waiting for \(triggeringContributor) to enter ATTACHED", level: .debug)
+        waitForAttached: for await stateChange in await triggeringContributor.channel.subscribeToState() {
+            switch stateChange.current {
+            // CHA-RL5d
+            case .attached:
+                logger.log(message: "RETRY completed waiting for \(triggeringContributor) to enter ATTACHED", level: .debug)
+                break waitForAttached
+            // CHA-RL5e
+            case .failed:
+                guard let contributorErrorReason = stateChange.reason else {
+                    preconditionFailure("Contributor entered FAILED but state change’s reason is not set")
+                }
+                logger.log(message: "RETRY failed waiting for \(triggeringContributor) to enter ATTACHED, since it entered FAILED with error \(contributorErrorReason)", level: .debug)
+
+                // TODO: which error to use? https://github.com/ably/specification/pull/200/files#r1829344042
+                changeStatus(to: .failed(error: contributorErrorReason))
+                return
+            case .attaching, .detached, .detaching, .initialized, .suspended:
+                break
+            @unknown default:
+                break
+            }
+        }
+
+        // CHA-RL5f
+        // TODO: put spec points for the other CHA-RL5f1* points once they’re clarified
+        do {
+            try await performAttachmentCycle(trigger: .retryOperation(id: operationID))
+        } catch {
+            // CHA-RL5f2
+            // TODO: document the circumstances in which the attachment cycle throws an error (it should only be FAILED)
+            logger.log(message: "RETRY operation attachment cycle resulted in \(error); ending RETRY", level: .debug)
+        }
+    }
 }
diff --git a/Tests/AblyChatTests/DefaultRoomLifecycleManagerTests.swift b/Tests/AblyChatTests/DefaultRoomLifecycleManagerTests.swift
index c8e46fbb..fb405f6f 100644
--- a/Tests/AblyChatTests/DefaultRoomLifecycleManagerTests.swift
+++ b/Tests/AblyChatTests/DefaultRoomLifecycleManagerTests.swift
@@ -73,14 +73,16 @@ struct DefaultRoomLifecycleManagerTests {
         initialState: ARTRealtimeChannelState = .initialized,
         feature: RoomFeature = .messages, // Arbitrarily chosen, its value only matters in test cases where we check which error is thrown
         attachBehavior: MockRoomLifecycleContributorChannel.AttachOrDetachBehavior? = nil,
-        detachBehavior: MockRoomLifecycleContributorChannel.AttachOrDetachBehavior? = nil
+        detachBehavior: MockRoomLifecycleContributorChannel.AttachOrDetachBehavior? = nil,
+        subscribeToStateBehavior: MockRoomLifecycleContributorChannel.SubscribeToStateBehavior? = nil
     ) -> MockRoomLifecycleContributor {
         .init(
             feature: feature,
             channel: .init(
                 initialState: initialState,
                 attachBehavior: attachBehavior,
-                detachBehavior: detachBehavior
+                detachBehavior: detachBehavior,
+                subscribeToStateBehavior: subscribeToStateBehavior
             )
         )
     }
@@ -881,6 +883,310 @@ struct DefaultRoomLifecycleManagerTests {
         #expect(await manager.roomStatus == .released)
     }
 
+    // MARK: - RETRY operation
+
+    // @spec CHA-RL5a - This is actually what’s written in the spec plus Andy’s clarification from https://github.com/ably/specification/pull/200/files#r1794116352 that "the implementation is supposed to be that all channels _except_ the one that entered suspended should be sent to detached" (TODO remove this comment once spec updated)
+    @Test
+    func retry_detachesAllContributorsExceptForTriggering() async throws {
+        // Given: A RoomLifecycleManager
+        let contributors = [
+            createContributor(attachBehavior: .success /* TODO: best way to handle? this is so that we get through the subsequent CHA-RL5f attachment cycle */, subscribeToStateBehavior: .addSubscriptionAndEmitStateChange(.init(current: .attached, previous: .attaching, event: .attached, reason: .createUnknownError() /* arbitrary, but non-nil so that the CHA-RL4b1 code doesn’t crash TODO decide right thing to do  */ )) /* TODO: so that RETRY completes per CHA-RL5d wait, this is a bit hard to read though */ ),
+            createContributor(attachBehavior: .success /* TODO: best way to handle? this is so that we get through the subsequent CHA-RL5f attachment cycle */, detachBehavior: .success /* so that RETRY completes */ ),
+            createContributor(attachBehavior: .success /* TODO: best way to handle? this is so that we get through the subsequent CHA-RL5f attachment cycle */, detachBehavior: .success /* so that RETRY completes */ ),
+        ]
+
+        let manager = await createManager(contributors: contributors)
+
+        // When: `performRetryOperation(triggeredByContributor:)` is called on the manager
+        await manager.performRetryOperation(triggeredByContributor: contributors[0])
+
+        // Then: The manager calls `detach` on all contributors except that which triggered the RETRY
+        #expect(await contributors[0].channel.detachCallCount == 0)
+        #expect(await contributors[1].channel.detachCallCount == 1)
+        #expect(await contributors[2].channel.detachCallCount == 1)
+    }
+
+    // TODO:
+    // @spec CHA-RL5b - This is actually what’s written in the spec plus Andy’s clarification from https://github.com/ably/specification/pull/200/files#r1794116352 that this means "If the operation above fails because a channel has entered a state other than FAILED" (TODO remove this comment once spec updated)
+    @Test
+    func retry_ifDetachFailsDueToNonFailedChannelState_retries() async throws {
+        // Given: A RoomLifecycleManager, whose contributor at index 1 throws an error upon `detach()` being called on it, and which is in a non-FAILED state after this `detach()` call fails, and on which calling `detach()` succeeds the second time it is called
+        let detachImpl = { @Sendable (callCount: Int) -> MockRoomLifecycleContributorChannel.AttachOrDetachBehavior in
+            .complete(
+                callCount == 1 ? .failure(.createUnknownError() /* arbitrary */ )
+                    : .success // otherwise, succeed, so that RETRY completes
+            )
+        }
+
+        let contributors = [
+            createContributor(
+                attachBehavior: .success /* TODO: best way to handle? this is so that we get through the subsequent CHA-RL5f attachment cycle */,
+                subscribeToStateBehavior: .addSubscriptionAndEmitStateChange(.init(current: .attached, previous: .attaching, event: .attached, reason: .createUnknownError() /* arbitrary, but non-nil so that the CHA-RL4b1 code doesn’t crash TODO decide right thing to do  */ )) /* TODO: so that RETRY completes per CHA-RL5d wait, this is a bit hard to read though */
+            ),
+            createContributor(
+                // TODO: mimic the completeAndChangeState from elsewhere
+                initialState: .suspended, // arbitrary non-FAILED state; ideally the contributor would only go into this state _after_ its detach fails, but for the sake of keeping the mock simple let’s do it beforehand and assume that the manager won’t check it beforehand
+                attachBehavior: .success /* TODO: best way to handle? this is so that we get through the subsequent CHA-RL5f attachment cycle */,
+                detachBehavior: .fromFunction(detachImpl)
+            ),
+            createContributor(
+                attachBehavior: .success /* TODO: best way to handle? this is so that we get through the subsequent CHA-RL5f attachment cycle */,
+                detachBehavior: .success /* so that RETRY completes */
+            ),
+        ]
+
+        let clock = MockSimpleClock()
+
+        let manager = await createManager(contributors: contributors, clock: clock)
+
+        // When: `performRetryOperation(triggeredByContributor:)` is called on the manager, triggered by a contributor that isn’t that at index 1
+        await manager.performRetryOperation(triggeredByContributor: contributors[0])
+
+        // Then: The manager calls `detach` in sequence on all contributors except that which triggered the RETRY, stopping upon one of these `detach` calls throwing an error, then sleeps for 0.5s, then performs these `detach` calls again
+
+        // (Note that for simplicity of the test I’m not actually making assertions about the sequence in which events happen here)
+        #expect(await contributors[0].channel.detachCallCount == 0)
+        #expect(await contributors[1].channel.detachCallCount == 2)
+        #expect(await contributors[2].channel.detachCallCount == 1)
+        #expect(await clock.sleepCallArguments == [0.5])
+    }
+
+    // @spec CHA-RL5c
+    @Test
+    func retry_ifDetachFailsDueToFailedChannelState_transitionsToFailed() async throws {
+        // Given: A RoomLifecycleManager, whose contributor at index 1 throws an error upon `detach()` being called on it, and which is in the FAILED state after this `detach()` call fails
+        let contributorFailedStateError = ARTErrorInfo.createUnknownError() // arbitrary
+
+        let contributors = [
+            createContributor(),
+            createContributor(
+                detachBehavior: .completeAndChangeState(.failure(contributorFailedStateError), newState: .failed)
+            ),
+            createContributor(),
+        ]
+
+        let manager = await createManager(contributors: contributors)
+
+        let roomStatusSubscription = await manager.onChange(bufferingPolicy: .unbounded)
+        async let failedStatusChange = roomStatusSubscription.failedElements().first { _ in true }
+
+        // When: `performRetryOperation(triggeredByContributor:)` is called on the manager, triggered by the contributor at index 0
+        await manager.performRetryOperation(triggeredByContributor: contributors[0])
+
+        // Then: The manager calls `detach` in sequence on all contributors except that which triggered the RETRY, stopping upon one of these `detach` calls throwing an error, then enters the FAILED status, the associated error for this status being that associated with the contributor’s FAILED status (TODO: clarify which error is meant to be used here; https://github.com/ably/specification/pull/200/files#r1827819961)
+        #expect(await contributors[0].channel.detachCallCount == 0)
+        #expect(await contributors[1].channel.detachCallCount == 1)
+        #expect(await contributors[2].channel.detachCallCount == 0)
+
+        _ = try #require(await failedStatusChange)
+        #expect(await manager.current == .failed(error: contributorFailedStateError))
+    }
+
+    // @spec CHA-RL5d
+    // @specPartial CHA-RL5f - Tests that the room transitions to ATTACHED, but not any of the side effects of the "attachment cycle"
+    @Test
+    func retry_afterDetach_waitsForTriggeringContributorToBecomeAttached() async throws {
+        // TODO: should this be a separate test or bolted on to the happy-path CHA-RL5a test?
+        // TODO: what about it already being in that state?
+        // Given: A RoomLifecycleManager, with a contributor that emits a state change to ATTACHED after subscribeToState() is called on it
+        let contributorAttachedStateChangeReason = ARTErrorInfo.createUnknownError()
+
+        let contributorAttachedStateChange = ARTChannelStateChange(
+            current: .attached,
+            previous: .attaching, // arbitrary
+            event: .attached,
+            reason: contributorAttachedStateChangeReason // arbitrary, but non-nil so that the CHA-RL4b1 code doesn’t crash TODO decide right thing to do
+        )
+
+        let contributor = createContributor(
+            attachBehavior: .success /* TODO: best way to handle? this is so that we get through the subsequent CHA-RL5f attachment cycle */,
+            subscribeToStateBehavior: .addSubscriptionAndEmitStateChange(contributorAttachedStateChange)
+        )
+
+        let manager = await createManager(
+            contributors: [contributor]
+        )
+
+        let roomStatusSubscription = await manager.onChange(bufferingPolicy: .unbounded)
+        async let maybeAttachingStatusChange = roomStatusSubscription.attachingElements().first { _ in true }
+
+        // When: `performRetryOperation(triggeredByContributor:)` is called on the manager, triggered by the aforementioned contributor
+        // TODO: this is not a great test, and there’s no easy way to check that it would do something else if it were a non-ATTACHED; let’s perhaps do that in the test for FAILED
+        await manager.performRetryOperation(triggeredByContributor: contributor)
+
+        // Then: The room transitions to ATTACHING, and the RETRY operation completes
+        let attachingStatusChange = try #require(await maybeAttachingStatusChange)
+        // TODO: check this one — spec isn’t clear
+        #expect(attachingStatusChange.error == nil)
+    }
+
+    // @spec CHA-RL5e
+    @Test
+    func retry_afterDetach_whenTriggeringContributorBecomesFailed() async throws {
+        // TODO: what about it already being in that state?
+        // Given: A RoomLifecycleManager, with a contributor that emits a state change to FAILED after subscribeToState() is called on it
+        let contributorFailedStateError = ARTErrorInfo.createUnknownError() // arbitrary
+        let contributorFailedStateChange = ARTChannelStateChange(
+            current: .failed,
+            previous: .attaching, // arbitrary
+            event: .failed,
+            reason: contributorFailedStateError
+        )
+
+        let contributor = createContributor(
+            subscribeToStateBehavior: .addSubscriptionAndEmitStateChange(contributorFailedStateChange)
+        )
+
+        let manager = await createManager(
+            // TODO: figure out right thing to do here (should RETRY be the one who transitions us to SUSPENDED?) — this is currently just there so that the manager believes it has an operation in progress and hence doesn’t try to detach contributors, causing a crash
+            forTestingWhatHappensWhenCurrentlyIn: .suspended(retryOperationID: UUID(), error: .createUnknownError()),
+            contributors: [contributor]
+        )
+
+        let roomStatusSubscription = await manager.onChange(bufferingPolicy: .unbounded)
+        async let failedStatusChange = roomStatusSubscription.failedElements().first { _ in true }
+
+        // When: `performRetryOperation(triggeredByContributor:)` is called on the manager, triggered by the aforementioned contributor
+        await manager.performRetryOperation(triggeredByContributor: contributor)
+
+        // Then: The RETRY operation completes and the room enters the FAILED status, the associated error for this status being that associated with the contributor’s FAILED state change (TODO: clarify which error is meant to be used here; https://github.com/ably/specification/pull/200/files#r1829344042)
+        _ = try #require(await failedStatusChange)
+        #expect(await manager.current == .failed(error: contributorFailedStateError))
+    }
+
+    // @spec CHA-RL5f
+    // @spec CHA-RL5f1 - TODO all of CHA-RL5 is kind of best-of-ability guesses
+    @Test
+    func retry_attachment_success() async throws {
+        // TODO: DRY up and tidy up these tests, and figure out their relation to the ATTACH operation
+        // Given: (what is, I think, the setup needed to slide us into the CHA-RL5 cases) a manager in the SUSPENDED status, whose first contributor emits an ATTACHED state change upon subscription, and all of whose contributors attach successfully
+        let attachedStateChange = ARTChannelStateChange(current: .attached, previous: .attaching, event: .attached, reason: .createUnknownError())
+
+        let contributors = [
+            createContributor(attachBehavior: .success, subscribeToStateBehavior: .addSubscriptionAndEmitStateChange(attachedStateChange)),
+            createContributor(attachBehavior: .success, detachBehavior: .complete(.success)),
+            createContributor(attachBehavior: .success, detachBehavior: .complete(.success)),
+        ]
+
+        let manager = await createManager(
+            forTestingWhatHappensWhenCurrentlyIn: .suspended(retryOperationID: UUID(), error: .createUnknownError()), // again, this is just for disabling the lifecycle stuff, i.e. so that the manager believes it has an operation in progress
+            contributors: contributors
+        )
+
+        // TODO: the per-test attach behaviour has to be set up above, because that’s where the contributors are created (so let’s try and parameterise that afterwards)
+
+        let roomStatusSubscription = await manager.onChange(bufferingPolicy: .unbounded)
+        async let attachedStatusChange = roomStatusSubscription.first { $0.current == .attached }
+        // When: `performRetryOperation(triggeredByContributor:)` is called on the manager, triggered by the first contributor
+        await manager.performRetryOperation(triggeredByContributor: contributors[0])
+
+        // Then: all contributors get attached (exactly one attach call each), we transition to ATTACHED, and RETRY op completes
+        // TODO: discontinuity errors are broadcast to subscribers — will fill this in after
+        for contributor in contributors {
+            #expect(await contributor.channel.attachCallCount == 1)
+        }
+
+        _ = try #require(await attachedStatusChange)
+    }
+
+    // @spec CHA-RL5f2
+    @Test
+    func retry_attachment_failed() async throws {
+        // Given: (what is, I think, the setup needed to slide us into the CHA-RL5 cases) a manager in the SUSPENDED status, whose first contributor emits an ATTACHED state change upon subscription, and whose second contributor fails to attach and moves to the FAILED state
+        let attachedStateChange = ARTChannelStateChange(current: .attached, previous: .attaching, event: .attached, reason: .createUnknownError())
+
+        let failedError = ARTErrorInfo.createUnknownError()
+
+        let contributors = [
+            createContributor(attachBehavior: .success, subscribeToStateBehavior: .addSubscriptionAndEmitStateChange(attachedStateChange)),
+            createContributor(attachBehavior: .completeAndChangeState(.failure(failedError), newState: .failed), detachBehavior: .complete(.success)),
+            createContributor(attachBehavior: .success, detachBehavior: .complete(.success)),
+        ]
+
+        let manager = await createManager(
+            forTestingWhatHappensWhenCurrentlyIn: .suspended(retryOperationID: UUID(), error: .createUnknownError()), // again, this is just for disabling the lifecycle stuff, i.e. so that the manager believes it has an operation in progress
+            contributors: contributors
+        )
+
+        // TODO: the per-test attach behaviour has to be set up above, because that’s where the contributors are created (so let’s try and parameterise that afterwards)
+
+        let roomStatusSubscription = await manager.onChange(bufferingPolicy: .unbounded)
+        async let maybeFailedStatusChange = roomStatusSubscription.failedElements().first { _ in true }
+        // When: `performRetryOperation(triggeredByContributor:)` is called on the manager, triggered by the first contributor
+        await manager.performRetryOperation(triggeredByContributor: contributors[0])
+
+        // Then: all contributors get attached until the one that fails (so the third is never asked to attach), we transition to FAILED, and RETRY op completes
+        #expect(await contributors[0].channel.attachCallCount == 1)
+        #expect(await contributors[1].channel.attachCallCount == 1)
+        #expect(await contributors[2].channel.attachCallCount == 0)
+
+        let failedStatusChange = try #require(await maybeFailedStatusChange)
+        // TODO: confirm that this is the error that the FAILED status change is meant to carry (for now: a `messagesAttachmentFailed` chat error whose cause is the contributor’s attach error)
+        #expect(isChatError(failedStatusChange.error, withCode: .messagesAttachmentFailed, cause: failedError))
+    }
+
+    // @spec CHA-RL5f3
+    @Test
+    func retry_attachment_suspended() async throws {
+        // Given: (what is, I think, the setup needed to slide us into the CHA-RL5 cases) a manager in the SUSPENDED status, whose first two contributors emit an ATTACHED state change upon subscription, and whose second contributor’s first attach call fails, moving it to the SUSPENDED state
+        let attachedStateChange = ARTChannelStateChange(current: .attached, previous: .attaching, event: .attached, reason: .createUnknownError())
+
+        let suspendedError = ARTErrorInfo.createUnknownError()
+
+        let contributor1AttachImpl = { @Sendable (callCount: Int) -> MockRoomLifecycleContributorChannel.AttachOrDetachBehavior in
+            if callCount == 1 {
+                .completeAndChangeState(.failure(suspendedError), newState: .suspended)
+            } else {
+                .complete(.success)
+            }
+        }
+
+        let contributors = [
+            createContributor(attachBehavior: .success, detachBehavior: .complete(.success), subscribeToStateBehavior: .addSubscriptionAndEmitStateChange(attachedStateChange)),
+            createContributor(attachBehavior: .fromFunction(contributor1AttachImpl), detachBehavior: .complete(.success), subscribeToStateBehavior: .addSubscriptionAndEmitStateChange(attachedStateChange)),
+            createContributor(attachBehavior: .success, detachBehavior: .complete(.success)),
+        ]
+
+        let manager = await createManager(
+            forTestingWhatHappensWhenCurrentlyIn: .suspended(retryOperationID: UUID(), error: .createUnknownError()), // again, this is just for disabling the lifecycle stuff, i.e. so that the manager believes it has an operation in progress
+            contributors: contributors
+        )
+
+        // TODO: the per-test attach behaviour has to be set up above, because that’s where the contributors are created (so let’s try and parameterise that afterwards)
+
+        // TODO: to note: `contributor1AttachImpl` is there so that the first "attachment cycle" fails (contributor 1’s first attach call) and the second succeeds
+        // TODO: to note: the contributor 0 detachBehavior is because we also anticipate that one being detached (presumably by the second RETRY, which is triggered by contributor 1 — confirm)
+        // TODO: to note: the contributor 1 subscribeToStateBehavior is so that the second RETRY proceeds
+
+        let roomStatusSubscription = await manager.onChange(bufferingPolicy: .unbounded)
+        async let maybeFirstFourRoomStatusChanges = roomStatusSubscription.prefix(4)
+        // When: `performRetryOperation(triggeredByContributor:)` is called on the manager, triggered by the first contributor
+        await manager.performRetryOperation(triggeredByContributor: contributors[0])
+
+        // Then: all contributors get attached until the one that fails, we transition to SUSPENDED, and RETRY happens again, the second time round the attachment cycle succeeds and we transition to ATTACHED per CHA-RL5f
+        // i.e. we get ATTACHING (first attachment cycle), SUSPENDED (because attachment cycle failed), ATTACHING (second attachment cycle), ATTACHED
+        let firstFourRoomStatusChanges = await Array(maybeFirstFourRoomStatusChanges)
+
+        // TODO: confirm what error (if any) the ATTACHING status is meant to carry here; for now we expect none
+        #expect(firstFourRoomStatusChanges[0].current == .attaching(error: nil))
+
+        let suspendedStatusChange = try #require(Subscription<RoomStatusChange>.StatusChangeWithError(maybeSuspendedStatusChange: firstFourRoomStatusChanges[1]))
+        // TODO: confirm that this is the error that the SUSPENDED status change is meant to carry (for now: a `messagesAttachmentFailed` chat error whose cause is the contributor’s attach error)
+        #expect(isChatError(suspendedStatusChange.error, withCode: .messagesAttachmentFailed, cause: suspendedError))
+
+        #expect(firstFourRoomStatusChanges[2].current == .attaching(error: nil))
+
+        #expect(firstFourRoomStatusChanges[3].current == .attached)
+
+        #expect(await contributors[0].channel.attachCallCount == 2) // because of first and second attachment cycle
+        #expect(await contributors[1].channel.attachCallCount == 2) // because of first and second attachment cycle
+        #expect(await contributors[2].channel.attachCallCount == 1) // because of second attachment cycle
+
+        #expect(await contributors[0].channel.detachCallCount == 1) // because of second RETRY (it triggers the first RETRY, which hence does not detach it)
+        #expect(await contributors[1].channel.detachCallCount == 1) // because of first RETRY (presumably it triggers the second RETRY, which hence does not detach it — confirm)
+        #expect(await contributors[2].channel.detachCallCount == 2) // because of first and second RETRY
+    }
+
     // MARK: - Handling contributor UPDATE events
 
     // @spec CHA-RL4a1
diff --git a/Tests/AblyChatTests/Mocks/MockRoomLifecycleContributorChannel.swift b/Tests/AblyChatTests/Mocks/MockRoomLifecycleContributorChannel.swift
index 28f6723e..b363f875 100644
--- a/Tests/AblyChatTests/Mocks/MockRoomLifecycleContributorChannel.swift
+++ b/Tests/AblyChatTests/Mocks/MockRoomLifecycleContributorChannel.swift
@@ -4,6 +4,7 @@
 final actor MockRoomLifecycleContributorChannel: RoomLifecycleContributorChannel {
     private let attachBehavior: AttachOrDetachBehavior?
     private let detachBehavior: AttachOrDetachBehavior?
+    private let subscribeToStateBehavior: SubscribeToStateBehavior
 
     var state: ARTRealtimeChannelState
     var errorReason: ARTErrorInfo?
@@ -16,11 +17,13 @@ final actor MockRoomLifecycleContributorChannel: RoomLifecycleContributorChannel
     init(
         initialState: ARTRealtimeChannelState,
         attachBehavior: AttachOrDetachBehavior?,
-        detachBehavior: AttachOrDetachBehavior?
+        detachBehavior: AttachOrDetachBehavior?,
+        subscribeToStateBehavior: SubscribeToStateBehavior?
     ) {
         state = initialState
         self.attachBehavior = attachBehavior
         self.detachBehavior = detachBehavior
+        self.subscribeToStateBehavior = subscribeToStateBehavior ?? .justAddSubscription // passing nil means that `subscribeToState()` just adds the subscription, without emitting a state change
     }
 
     enum AttachOrDetachResult {
@@ -52,6 +55,11 @@ final actor MockRoomLifecycleContributorChannel: RoomLifecycleContributorChannel
         }
     }
 
+    enum SubscribeToStateBehavior { // what `subscribeToState()` does in addition to creating and storing the new subscription
+        case justAddSubscription // nothing further
+        case addSubscriptionAndEmitStateChange(ARTChannelStateChange) // additionally passes the given state change to `emitStateChange(_:)` before returning
+    }
+
     func attach() async throws(ARTErrorInfo) {
         attachCallCount += 1
 
@@ -100,6 +108,14 @@ final actor MockRoomLifecycleContributorChannel: RoomLifecycleContributorChannel
     func subscribeToState() -> Subscription<ARTChannelStateChange> {
         let subscription = Subscription<ARTChannelStateChange>(bufferingPolicy: .unbounded)
         subscriptions.append(subscription)
+
+        switch subscribeToStateBehavior { // set at init; decides whether a state change is emitted as part of subscribing
+        case .justAddSubscription:
+            break
+        case let .addSubscriptionAndEmitStateChange(stateChange):
+            emitStateChange(stateChange) // called only after the new subscription is in `subscriptions`, presumably so that it receives this state change too (`emitStateChange` is not shown here — confirm)
+        }
+
         return subscription
     }