105 lines
3.6 KiB
C#
105 lines
3.6 KiB
C#
using System;
|
|
using System.Threading;
|
|
using System.Threading.Tasks;
|
|
using Microsoft.Extensions.Hosting;
|
|
using Microsoft.Extensions.Logging;
|
|
using Microsoft.Extensions.Options;
|
|
using Svrnty.CQRS.Events.ConsumerGroups.Abstractions;
|
|
|
|
namespace Svrnty.CQRS.Events.ConsumerGroups.Monitoring;
|
|
|
|
/// <summary>
/// Background service that monitors consumer health and cleans up stale consumers.
/// On every <see cref="ConsumerHealthMonitorOptions.CleanupInterval"/> tick it asks the
/// <see cref="IConsumerOffsetStore"/> to clean up stale consumers, passing
/// <see cref="ConsumerHealthMonitorOptions.SessionTimeout"/> as the timeout, and logs
/// every consumer the store reports as removed.
/// </summary>
public class ConsumerHealthMonitor : BackgroundService
{
    private readonly IConsumerOffsetStore _offsetStore;
    private readonly ConsumerHealthMonitorOptions _options;
    private readonly ILogger<ConsumerHealthMonitor> _logger;

    /// <summary>
    /// Creates the monitor and validates its options eagerly.
    /// </summary>
    /// <param name="offsetStore">Store that performs the actual stale-consumer cleanup.</param>
    /// <param name="options">Monitor configuration (enabled flag, cleanup interval, session timeout).</param>
    /// <param name="logger">Logger used for lifecycle and cleanup messages.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="offsetStore"/>, <paramref name="options"/> (or its <c>Value</c>),
    /// or <paramref name="logger"/> is <c>null</c>.
    /// </exception>
    /// <remarks>
    /// Any exception thrown by <c>ConsumerHealthMonitorOptions.Validate()</c> propagates to the caller.
    /// </remarks>
    public ConsumerHealthMonitor(
        IConsumerOffsetStore offsetStore,
        IOptions<ConsumerHealthMonitorOptions> options,
        ILogger<ConsumerHealthMonitor> logger)
    {
        _offsetStore = offsetStore ?? throw new ArgumentNullException(nameof(offsetStore));
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));

        _options.Validate();
    }

    /// <summary>
    /// Runs the periodic cleanup loop until <paramref name="stoppingToken"/> is cancelled.
    /// Returns immediately when the monitor is disabled via options.
    /// </summary>
    /// <param name="stoppingToken">Token signalled by the host when the service must stop.</param>
    /// <returns>A task that completes when the loop has exited.</returns>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Enabled)
        {
            _logger.LogInformation("Consumer health monitor is disabled");
            return;
        }

        _logger.LogInformation(
            "Consumer health monitor started. Cleanup interval: {CleanupInterval}, Session timeout: {SessionTimeout}",
            _options.CleanupInterval, _options.SessionTimeout);

        // The first cleanup happens after one full interval, not at startup.
        using var timer = new PeriodicTimer(_options.CleanupInterval);

        try
        {
            while (await timer.WaitForNextTickAsync(stoppingToken))
            {
                await CleanupStaleConsumersAsync(stoppingToken);
            }
        }
        catch (OperationCanceledException)
        {
            // Raised by the timer wait, or rethrown by the cleanup pass, once shutdown is requested.
            _logger.LogInformation("Consumer health monitor stopping");
        }
        catch (Exception ex)
        {
            _logger.LogCritical(ex, "Consumer health monitor encountered a fatal error");
            throw;
        }
    }

    /// <summary>
    /// Performs a single cleanup pass against the offset store and logs the outcome.
    /// </summary>
    /// <param name="cancellationToken">Token forwarded to the store call.</param>
    /// <returns>A task that completes when the pass has finished or failed.</returns>
    /// <exception cref="OperationCanceledException">
    /// The pass was interrupted because <paramref name="cancellationToken"/> was cancelled.
    /// All other failures are logged and swallowed so the monitor keeps running.
    /// </exception>
    private async Task CleanupStaleConsumersAsync(CancellationToken cancellationToken)
    {
        try
        {
            _logger.LogTrace("Checking for stale consumers");

            var removedConsumers = await _offsetStore.CleanupStaleConsumersAsync(
                _options.SessionTimeout, cancellationToken);

            if (removedConsumers.Count > 0)
            {
                _logger.LogWarning(
                    "Cleaned up {Count} stale consumer(s) with session timeout {SessionTimeout}",
                    removedConsumers.Count, _options.SessionTimeout);

                foreach (var consumer in removedConsumers)
                {
                    _logger.LogInformation(
                        "Removed stale consumer: {ConsumerId} from group {GroupId} (last heartbeat: {LastHeartbeat})",
                        consumer.ConsumerId, consumer.GroupId, consumer.LastHeartbeat);
                }
            }
            else
            {
                _logger.LogTrace("No stale consumers found");
            }
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Shutdown was requested mid-cleanup. This is not a failure, so let the
            // caller treat it as a graceful stop instead of logging it as an error.
            throw;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to cleanup stale consumers");
            // Don't rethrow - we want the monitor to continue running
        }
    }

    /// <summary>
    /// Logs that the monitor is stopping and then delegates to the base implementation.
    /// </summary>
    /// <param name="cancellationToken">Token passed through to <c>BackgroundService.StopAsync</c>.</param>
    /// <returns>A task that completes when the base stop logic has finished.</returns>
    public override async Task StopAsync(CancellationToken cancellationToken)
    {
        _logger.LogInformation("Consumer health monitor is stopping");
        await base.StopAsync(cancellationToken);
    }
}
|