From d4cfc19fe34fb9705e22f9cb7499999222178652 Mon Sep 17 00:00:00 2001 From: sacOO7 Date: Mon, 9 Dec 2024 17:13:45 +0530 Subject: [PATCH] [ECO-5095][CHA-RL*] Implement logging for RoomLifecycleManager 1. Added private property logger with thread specific dynamic context 2. Added trace, debug, error logging for private doRetry method 3. Added trace, debug, error logging for attach method 4. Added trace, debug, error logging for detach method --- .../com/ably/chat/RoomLifecycleManager.kt | 123 ++++++++++++------ 1 file changed, 85 insertions(+), 38 deletions(-) diff --git a/chat-android/src/main/java/com/ably/chat/RoomLifecycleManager.kt b/chat-android/src/main/java/com/ably/chat/RoomLifecycleManager.kt index 4da631bc..c3ead4d8 100644 --- a/chat-android/src/main/java/com/ably/chat/RoomLifecycleManager.kt +++ b/chat-android/src/main/java/com/ably/chat/RoomLifecycleManager.kt @@ -117,8 +117,12 @@ internal class RoomLifecycleManager( private val roomScope: CoroutineScope, private val statusLifecycle: DefaultRoomLifecycle, private val contributors: List, - private val logger: Logger, + roomLogger: Logger, ) { + private val logger = roomLogger.withContext( + "RoomLifecycleManager", + dynamicContext = mapOf("scope" to { Thread.currentThread().name }), + ) /** * AtomicCoroutineScope makes sure all operations are atomic and run with given priority. @@ -183,63 +187,84 @@ internal class RoomLifecycleManager( * @returns Returns when the room is attached, or the room enters a failed state. * Spec: CHA-RL5 */ - @Suppress("CognitiveComplexMethod", "ThrowsCount") + @Suppress("CognitiveComplexMethod", "ThrowsCount", "LongMethod") private suspend fun doRetry(contributor: ContributesToRoomLifecycle) { + logger.trace("doRetry();") + // Handle the channel wind-down. + logger.debug("doRetry(); winding down channels except for feature: ${contributor.featureName}") // CHA-RL5a - Handle the channel wind-down for other channels var result = kotlin.runCatching { doChannelWindDown(contributor) } while (result.isFailure) { // CHA-RL5c - If in doing the wind down, we've entered failed state, then it's game over anyway if (this.statusLifecycle.status === RoomStatus.Failed) { + logger.error("doRetry(); channels wind down failed, RoomStatus is FAILED", result.exceptionOrNull()) throw result.exceptionOrNull() ?: IllegalStateException("room is in a failed state") } + logger.warn("doRetry(); channels wind down failed, retrying in $retryDurationInMs ms", result.exceptionOrNull()) delay(retryDurationInMs) result = kotlin.runCatching { doChannelWindDown(contributor) } } + logger.debug("doRetry(); channels wind down successfully completed") // A helper that allows us to retry the attach operation val doAttachWithRetry: suspend () -> Unit = { + logger.trace("doRetry.doAttachWithRetry();") coroutineScope { + logger.debug("doRetry.doAttachWithRetry(); attempting to attach all feature channels") statusLifecycle.setStatus(RoomStatus.Attaching) val attachmentResult = doAttach() // CHA-RL5c - If we're in failed, then we should wind down all the channels, eventually - but we're done here if (attachmentResult.status === RoomStatus.Failed) { + logger.error("doRetry.doAttachWithRetry(); failed to attach channels, RoomStatus is Failed", attachmentResult.exception) atomicCoroutineScope.async(LifecycleOperationPrecedence.Internal.priority) { runDownChannelsOnFailedAttach() } + logger.warn("doRetry.doAttachWithRetry(); started winding down all channels internally") return@coroutineScope } // If we're in suspended, then we should wait for the channel to reattach, but wait for it to do so if (attachmentResult.status === RoomStatus.Suspended) { - val failedFeature = attachmentResult.failedFeature - ?: throw lifeCycleException( - "no failed feature in doRetry", - ErrorCode.RoomLifecycleError, - ) + logger.warn( + "doRetry.doAttachWithRetry(); failed to attach channels, RoomStatus is Suspended", + attachmentResult.exception, + ) + val failedFeature = attachmentResult.failedFeature ?: throw lifeCycleException( + "no failed feature in doRetry", + ErrorCode.RoomLifecycleError, + ) + logger.warn("doRetry.doAttachWithRetry(); Retrying attach due to suspended feature: ${failedFeature.featureName}") // No need to catch errors, rather they should propagate to caller method return@coroutineScope doRetry(failedFeature) } + // We attached, huzzah! + logger.debug("doRetry.doAttachWithRetry(); Room ATTACH success, RoomStatus: ${statusLifecycle.status}") } } // If given suspended contributor channel has reattached, then we can retry the attach if (contributor.channel.state == ChannelState.attached) { + logger.debug("doRetry(); feature: ${contributor.featureName} reattached, retrying attach for others") return doAttachWithRetry() } // CHA-RL5d - Otherwise, wait for our suspended contributor channel to re-attach and try again try { + logger.debug("doRetry(); waiting to channel ATTACHED for feature: ${contributor.featureName}") listenToChannelAttachOrFailure(contributor) + logger.debug("doRetry(); waiting success, feature: ${contributor.featureName} ATTACHED, retrying attach for others") delay(retryDurationInMs) // Let other channels get into ATTACHING state - // Attach successful - return doAttachWithRetry() - } catch (ex: AblyException) { + } catch (ex: Throwable) { // CHA-RL5c - Channel attach failed - statusLifecycle.setStatus(RoomStatus.Failed, ex.errorInfo) + logger.error("doRetry(); waiting failed for channel attach, feature: ${contributor.featureName}", ex) + // No need for channels wind down, since it's taken care at the start of doRetry() + statusLifecycle.setStatus(RoomStatus.Failed, (ex as? AblyException)?.errorInfo) throw ex } + // Attach successful, try attach for other channels + return doAttachWithRetry() } /** @@ -292,59 +317,60 @@ internal class RoomLifecycleManager( */ @Suppress("ThrowsCount") internal suspend fun attach() { + logger.trace("attach();") val deferredAttach = atomicCoroutineScope.async(LifecycleOperationPrecedence.AttachOrDetach.priority) { // CHA-RL1d if (statusLifecycle.status == RoomStatus.Attached) { // CHA-RL1a + logger.debug("attach(); room is already attached") return@async } if (statusLifecycle.status == RoomStatus.Releasing) { // CHA-RL1b - throw lifeCycleException( - "unable to attach room; room is releasing", - ErrorCode.RoomIsReleasing, - ) + logger.error("attach(); attach failed, room is in releasing state") + throw lifeCycleException("unable to attach room; room is releasing", ErrorCode.RoomIsReleasing) } if (statusLifecycle.status == RoomStatus.Released) { // CHA-RL1c - throw lifeCycleException( - "unable to attach room; room is released", - ErrorCode.RoomIsReleased, - ) + logger.error("attach(); attach failed, room is in released state") + throw lifeCycleException("unable to attach room; room is released", ErrorCode.RoomIsReleased) } // At this point, we force the room status to be attaching clearAllTransientDetachTimeouts() operationInProgress = true + logger.debug("attach(); transitioned room into ATTACHING state") statusLifecycle.setStatus(RoomStatus.Attaching) // CHA-RL1e val attachResult = doAttach() // CHA-RL1h4 - If we're in a failed state, then we should wind down all the channels, eventually if (attachResult.status === RoomStatus.Failed) { + logger.error("attach(); failed to attach channels, RoomStatus is Failed", attachResult.exception) // CHA-RL1h5 - detach all remaining channels atomicCoroutineScope.async(LifecycleOperationPrecedence.Internal.priority) { runDownChannelsOnFailedAttach() } + logger.warn("attach(); started winding down all channels internally") throw attachResult.exception // CHA-RL1h1 } // CHA-RL1h1, CHA-RL1h2 - If we're in suspended, then this attach should fail, but we'll retry after a short delay async if (attachResult.status === RoomStatus.Suspended) { + logger.warn("attach(); failed to attach channels, RoomStatus is Suspended", attachResult.exception) if (attachResult.failedFeature == null) { - throw lifeCycleException( - "no failed feature in attach", - ErrorCode.RoomLifecycleError, - ) + throw lifeCycleException("no failed feature in attach", ErrorCode.RoomLifecycleError) } attachResult.failedFeature?.let { // CHA-RL1h3 - Enter recovery for failed room feature/contributor atomicCoroutineScope.async(LifecycleOperationPrecedence.Internal.priority) { doRetry(it) } + logger.warn("attach(); retrying attach due to suspended feature: ${it.featureName}") } throw attachResult.exception // CHA-RL1h1 } // We attached, finally! + logger.debug("attach(): room attached successfully") } deferredAttach.await() @@ -357,12 +383,17 @@ internal class RoomLifecycleManager( * Spec: CHA-RL1f, CHA-RL1g, CHA-RL1h */ private suspend fun doAttach(): RoomAttachmentResult { + logger.trace("doAttach();") val attachResult = DefaultRoomAttachmentResult() + logger.debug("doAttach(); trying to attach all features: ${contributors.joinToString { it.featureName }}") for (feature in contributors) { // CHA-RL1f - attach each feature sequentially try { + logger.debug("doAttach(); attaching feature: ${feature.featureName}") feature.channel.attachCoroutine() firstAttachesCompleted[feature] = true + logger.debug("doAttach(); attached feature: ${feature.featureName}") } catch (ex: Throwable) { // CHA-RL1h - handle channel attach failure + logger.error("doAttach(); attach failed for feature: ${feature.featureName}", ex) attachResult.throwable = ex attachResult.failedFeatureField = feature attachResult.errorField = lifeCycleErrorInfo( @@ -387,17 +418,21 @@ internal class RoomLifecycleManager( // Regardless of whether we're suspended or failed, run-down the other channels // The wind-down procedure will take Precedence over any user-driven actions + logger.warn("doAttach(); transitioning room into ${attachResult.status.name} state") statusLifecycle.setStatus(attachResult) return attachResult } } // CHA-RL1g, We successfully attached all the channels - set our status to attached, start listening changes in channel status + logger.debug("doAttach(); attach success for all features: ${contributors.joinToString { it.featureName }}") + logger.debug("doAttach(); transitioning room to ATTACHED state") this.statusLifecycle.setStatus(attachResult) this.operationInProgress = false // Iterate the pending discontinuity events and trigger them for ((contributor, error) in pendingDiscontinuityEvents) { + logger.warn("doAttach(); emitting discontinuity event for feature: ${contributor.featureName}") contributor.discontinuityDetected(error) } pendingDiscontinuityEvents.clear() @@ -407,9 +442,11 @@ internal class RoomLifecycleManager( /** * If we've failed to attach, then we're in the failed state and all that is left to do is to detach all the channels. * Spec: CHA-RL1h5, CHA-RL1h6 - * @returns Returns only when all channels are detached. Doesn't throw exception. + * @returns Returns only when all channels are either detached or failed. Doesn't throw exception. */ private suspend fun runDownChannelsOnFailedAttach() { + logger.trace("runDownChannelsOnFailedAttach();") + logger.debug("runDownChannelsOnFailedAttach(); attempting to detach all channels") // At this point, we have control over the channel lifecycle, so we can hold onto it until things are resolved // Keep trying to detach the channels until they're all detached. var channelWindDown = kotlin.runCatching { doChannelWindDown() } @@ -419,6 +456,7 @@ internal class RoomLifecycleManager( delay(retryDurationInMs) channelWindDown = kotlin.runCatching { doChannelWindDown() } } + logger.debug("runDownChannelsOnFailedAttach(); success, all channels are either detached or failed") } /** @@ -431,11 +469,13 @@ internal class RoomLifecycleManager( */ @Suppress("CognitiveComplexMethod", "ComplexCondition") private suspend fun doChannelWindDown(except: ContributesToRoomLifecycle? = null) = coroutineScope { + logger.trace("doChannelWindDown();") contributors.map { contributor: ContributesToRoomLifecycle -> async { // CHA-RL5a1 - If its the contributor we want to wait for a conclusion on, then we should not detach it // Unless we're in a failed state, in which case we should detach it if (contributor.channel === except?.channel && statusLifecycle.status !== RoomStatus.Failed) { + logger.debug("doChannelWindDown(); ignoring feature: ${contributor.featureName} as per except param") return@async } // If the room's already in the failed state, or it's releasing, we should not detach a failed channel @@ -446,12 +486,15 @@ internal class RoomLifecycleManager( ) && contributor.channel.state === ChannelState.failed ) { + logger.debug("doChannelWindDown(); ignoring feature: ${contributor.featureName} since it's already in failed state") return@async } try { + logger.debug("doChannelWindDown(); detaching feature: ${contributor.featureName}") contributor.channel.detachCoroutine() } catch (throwable: Throwable) { + logger.warn("doChannelWindDown(); failed to detach feature: ${contributor.featureName}", throwable) // CHA-RL2h2 - If the contributor is in a failed state and we're not ignoring failed states, we should fail the room if ( contributor.channel.state === ChannelState.failed && @@ -482,38 +525,35 @@ internal class RoomLifecycleManager( */ @Suppress("ThrowsCount") internal suspend fun detach() { + logger.trace("detach();") val deferredDetach = atomicCoroutineScope.async(LifecycleOperationPrecedence.AttachOrDetach.priority) { // CHA-RL2i // CHA-RL2a - If we're already detached, this is a no-op if (statusLifecycle.status === RoomStatus.Detached) { + logger.debug("detach(); room is already detached") return@async } // CHA-RL2c - If the room is released, we can't detach if (statusLifecycle.status === RoomStatus.Released) { - throw lifeCycleException( - "unable to detach room; room is released", - ErrorCode.RoomIsReleased, - ) + logger.error("detach(); detach failed, room is in released state") + throw lifeCycleException("unable to detach room; room is released", ErrorCode.RoomIsReleased) } // CHA-RL2b - If the room is releasing, we can't detach if (statusLifecycle.status === RoomStatus.Releasing) { - throw lifeCycleException( - "unable to detach room; room is releasing", - ErrorCode.RoomIsReleasing, - ) + logger.error("detach(); detach failed, room is in releasing state") + throw lifeCycleException("unable to detach room; room is releasing", ErrorCode.RoomIsReleasing) } // CHA-RL2d - If we're in failed, we should not attempt to detach if (statusLifecycle.status === RoomStatus.Failed) { - throw lifeCycleException( - "unable to detach room; room has failed", - ErrorCode.RoomInFailedState, - ) + logger.error("detach(); detach failed, room is in failed state") + throw lifeCycleException("unable to detach room; room has failed", ErrorCode.RoomInFailedState) } // CHA-RL2e - We force the room status to be detaching operationInProgress = true clearAllTransientDetachTimeouts() + logger.debug("detach(); transitioned room into DETACHING state") statusLifecycle.setStatus(RoomStatus.Detaching) // CHA-RL2f - We now perform an all-channel wind down. @@ -524,16 +564,18 @@ internal class RoomLifecycleManager( } /** - * Perform a detach. - * If detaching a channel fails, we should retry until every channel is either in the detached state, or in the failed state. + * Performs detach until every channel gets into either detached or failed state. * Spec: CHA-RL2f */ private suspend fun doDetach() { + logger.trace("doDetach();") + logger.debug("doDetach(); detaching all features: ${contributors.joinToString { it.featureName }}") var channelWindDown = kotlin.runCatching { doChannelWindDown() } var firstContributorFailedError: AblyException? = null while (channelWindDown.isFailure) { // CHA-RL2h val err = channelWindDown.exceptionOrNull() if (err is AblyException && err.errorInfo?.code != -1 && firstContributorFailedError == null) { + logger.error("doDetach(); detach failed for one of the feature, detaching others", err) firstContributorFailedError = err // CHA-RL2h1- First failed contributor error is captured } delay(retryDurationInMs) @@ -542,6 +584,8 @@ internal class RoomLifecycleManager( // CHA-RL2g - If we aren't in the failed state, then we're detached if (statusLifecycle.status !== RoomStatus.Failed) { + logger.debug("doDetach(); successfully detached all features: ${contributors.joinToString { it.featureName }}") + logger.debug("doDetach(); transitioning room to DETACHED state") statusLifecycle.setStatus(RoomStatus.Detached) return } @@ -564,6 +608,7 @@ internal class RoomLifecycleManager( * Spec: CHA-RL3 */ internal suspend fun release() { + logger.trace("release();") val deferredRelease = atomicCoroutineScope.async(LifecycleOperationPrecedence.Release.priority) { // CHA-RL3k // CHA-RL3a - If we're already released, this is a no-op if (statusLifecycle.status === RoomStatus.Released) { @@ -595,6 +640,7 @@ internal class RoomLifecycleManager( * Spec: CHA-RL3f, CHA-RL3d */ private suspend fun releaseChannels() { + logger.trace("releaseChannels();") var contributorsReleased = kotlin.runCatching { doRelease() } while (contributorsReleased.isFailure) { // Wait a short period and then try again @@ -610,6 +656,7 @@ internal class RoomLifecycleManager( */ @Suppress("RethrowCaughtException") private suspend fun doRelease() = coroutineScope { + logger.trace("doRelease();") contributors.map { contributor: ContributesToRoomLifecycle -> async { // CHA-RL3e - Failed channels, we can ignore