using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using Svrnty.CQRS.Events.ConsumerGroups.Abstractions;

namespace Svrnty.CQRS.Events.ConsumerGroups.Monitoring;

/// <summary>
/// Background service that monitors consumer health and cleans up stale consumers.
/// Periodically checks for consumers that haven't sent heartbeats within the session timeout.
/// </summary>
public class ConsumerHealthMonitor : BackgroundService
{
    private readonly IConsumerOffsetStore _offsetStore;
    private readonly ConsumerHealthMonitorOptions _options;
    private readonly ILogger<ConsumerHealthMonitor> _logger;

    /// <summary>
    /// Creates the monitor and validates its options.
    /// </summary>
    /// <param name="offsetStore">Store that is asked to clean up stale consumers.</param>
    /// <param name="options">Monitor options (enabled flag, cleanup interval, session timeout).</param>
    /// <param name="logger">Logger used for all monitor output.</param>
    /// <exception cref="ArgumentNullException">
    /// Thrown when any argument is null, or when <paramref name="options"/> has a null value.
    /// </exception>
    public ConsumerHealthMonitor(
        IConsumerOffsetStore offsetStore,
        IOptions<ConsumerHealthMonitorOptions> options,
        ILogger<ConsumerHealthMonitor> logger)
    {
        _offsetStore = offsetStore ?? throw new ArgumentNullException(nameof(offsetStore));
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));

        // NOTE(review): Validate() is defined on the options type outside this file;
        // presumably it throws on invalid settings so misconfiguration fails at construction.
        _options.Validate();
    }

    /// <summary>
    /// Runs the cleanup loop: one cleanup pass per timer tick until the host requests shutdown.
    /// Returns immediately when the monitor is disabled in the options.
    /// </summary>
    /// <param name="stoppingToken">Signalled by the host when the service should stop.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Enabled)
        {
            _logger.LogInformation("Consumer health monitor is disabled");
            return;
        }

        _logger.LogInformation(
            "Consumer health monitor started. Cleanup interval: {CleanupInterval}, Session timeout: {SessionTimeout}",
            _options.CleanupInterval,
            _options.SessionTimeout);

        using var timer = new PeriodicTimer(_options.CleanupInterval);

        try
        {
            while (await timer.WaitForNextTickAsync(stoppingToken))
            {
                await CleanupStaleConsumersAsync(stoppingToken);
            }
        }
        catch (OperationCanceledException)
        {
            // Normal shutdown path: the stopping token cancelled the wait or the cleanup call.
            _logger.LogInformation("Consumer health monitor stopping");
        }
        catch (Exception ex)
        {
            _logger.LogCritical(ex, "Consumer health monitor encountered a fatal error");
            throw;
        }
    }

    /// <summary>
    /// Performs one cleanup pass: asks the offset store to remove consumers considered stale
    /// for the configured session timeout and logs each removed consumer.
    /// </summary>
    /// <param name="cancellationToken">Token forwarded to the offset store call.</param>
    /// <exception cref="OperationCanceledException">
    /// Propagated only when <paramref name="cancellationToken"/> has been cancelled;
    /// every other failure is logged and swallowed so the monitor keeps running.
    /// </exception>
    private async Task CleanupStaleConsumersAsync(CancellationToken cancellationToken)
    {
        try
        {
            _logger.LogTrace("Checking for stale consumers");

            var removedConsumers = await _offsetStore.CleanupStaleConsumersAsync(
                _options.SessionTimeout,
                cancellationToken);

            if (removedConsumers.Count > 0)
            {
                _logger.LogWarning(
                    "Cleaned up {Count} stale consumer(s) with session timeout {SessionTimeout}",
                    removedConsumers.Count,
                    _options.SessionTimeout);

                foreach (var consumer in removedConsumers)
                {
                    _logger.LogInformation(
                        "Removed stale consumer: {ConsumerId} from group {GroupId} (last heartbeat: {LastHeartbeat})",
                        consumer.ConsumerId,
                        consumer.GroupId,
                        consumer.LastHeartbeat);
                }
            }
            else
            {
                _logger.LogTrace("No stale consumers found");
            }
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Shutdown was requested mid-cleanup. That is not a cleanup failure, so do not
            // log it as an error; let ExecuteAsync handle it as a normal stop.
            throw;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to cleanup stale consumers");
            // Don't rethrow - we want the monitor to continue running
        }
    }

    /// <summary>
    /// Logs that the monitor is stopping, then delegates to the base implementation.
    /// </summary>
    /// <param name="cancellationToken">Token passed through to <see cref="BackgroundService.StopAsync"/>.</param>
    public override async Task StopAsync(CancellationToken cancellationToken)
    {
        _logger.LogInformation("Consumer health monitor is stopping");
        await base.StopAsync(cancellationToken);
    }
}