using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using Svrnty.CQRS.Events.ConsumerGroups.Abstractions;
namespace Svrnty.CQRS.Events.ConsumerGroups.Monitoring;
/// <summary>
/// Background service that periodically asks the consumer offset store to clean up stale consumers.
/// On every tick of the configured cleanup interval it calls
/// <see cref="IConsumerOffsetStore.CleanupStaleConsumersAsync"/> with the configured session timeout
/// and logs whatever the store reports as removed.
/// </summary>
public class ConsumerHealthMonitor : BackgroundService
{
    private readonly IConsumerOffsetStore _offsetStore;
    private readonly ConsumerHealthMonitorOptions _options;
    private readonly ILogger<ConsumerHealthMonitor> _logger;

    /// <summary>
    /// Creates the monitor and validates its options.
    /// </summary>
    /// <param name="offsetStore">Store that performs the stale-consumer cleanup.</param>
    /// <param name="options">Monitor options (enabled flag, cleanup interval, session timeout).</param>
    /// <param name="logger">Logger used for lifecycle and cleanup messages.</param>
    /// <exception cref="ArgumentNullException">
    /// Thrown when any argument is <c>null</c>, or when <paramref name="options"/> wraps a <c>null</c> value.
    /// </exception>
    /// <remarks>
    /// <c>ConsumerHealthMonitorOptions.Validate()</c> is invoked here, so whatever it throws for an
    /// invalid configuration surfaces at construction time rather than on the first tick.
    /// </remarks>
    public ConsumerHealthMonitor(
        IConsumerOffsetStore offsetStore,
        IOptions<ConsumerHealthMonitorOptions> options,
        ILogger<ConsumerHealthMonitor> logger)
    {
        _offsetStore = offsetStore ?? throw new ArgumentNullException(nameof(offsetStore));
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _options.Validate();
    }

    /// <summary>
    /// Runs the cleanup loop until <paramref name="stoppingToken"/> is cancelled.
    /// Returns immediately when the monitor is disabled in the options.
    /// </summary>
    /// <param name="stoppingToken">Token signalled by the host when the service should stop.</param>
    /// <returns>A task that completes when the loop has ended.</returns>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Enabled)
        {
            _logger.LogInformation("Consumer health monitor is disabled");
            return;
        }

        _logger.LogInformation(
            "Consumer health monitor started. Cleanup interval: {CleanupInterval}, Session timeout: {SessionTimeout}",
            _options.CleanupInterval, _options.SessionTimeout);

        // PeriodicTimer does not tick immediately: the first cleanup runs one interval after start.
        using var timer = new PeriodicTimer(_options.CleanupInterval);

        try
        {
            while (await timer.WaitForNextTickAsync(stoppingToken))
            {
                await CleanupStaleConsumersAsync(stoppingToken);
            }
        }
        catch (OperationCanceledException)
        {
            // Normal shutdown path: either the timer wait or an in-flight cleanup was cancelled.
            _logger.LogInformation("Consumer health monitor stopping");
        }
        catch (Exception ex)
        {
            _logger.LogCritical(ex, "Consumer health monitor encountered a fatal error");
            throw;
        }
    }

    /// <summary>
    /// Performs one cleanup pass and logs the consumers the store reports as removed.
    /// </summary>
    /// <param name="cancellationToken">Token forwarded to the offset store call.</param>
    /// <returns>A task that completes when the pass has finished.</returns>
    /// <exception cref="OperationCanceledException">
    /// Rethrown only when <paramref name="cancellationToken"/> has been cancelled, so that a host
    /// shutdown is not reported as a cleanup failure. Every other exception is logged and swallowed.
    /// </exception>
    private async Task CleanupStaleConsumersAsync(CancellationToken cancellationToken)
    {
        try
        {
            _logger.LogTrace("Checking for stale consumers");

            var removedConsumers = await _offsetStore.CleanupStaleConsumersAsync(
                _options.SessionTimeout, cancellationToken);

            if (removedConsumers.Count > 0)
            {
                _logger.LogWarning(
                    "Cleaned up {Count} stale consumer(s) with session timeout {SessionTimeout}",
                    removedConsumers.Count, _options.SessionTimeout);

                foreach (var consumer in removedConsumers)
                {
                    _logger.LogInformation(
                        "Removed stale consumer: {ConsumerId} from group {GroupId} (last heartbeat: {LastHeartbeat})",
                        consumer.ConsumerId, consumer.GroupId, consumer.LastHeartbeat);
                }
            }
            else
            {
                _logger.LogTrace("No stale consumers found");
            }
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Shutdown was requested while the store call was in flight. Let the caller handle it
            // as a normal stop instead of logging it as an error below.
            throw;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to cleanup stale consumers");
            // Don't rethrow - we want the monitor to continue running
        }
    }

    /// <summary>
    /// Logs that the monitor is stopping, then delegates to the base implementation.
    /// </summary>
    /// <param name="cancellationToken">Token passed through to <see cref="BackgroundService.StopAsync"/>.</param>
    /// <returns>A task that completes when the base stop logic has finished.</returns>
    public override async Task StopAsync(CancellationToken cancellationToken)
    {
        _logger.LogInformation("Consumer health monitor is stopping");
        await base.StopAsync(cancellationToken);
    }
}