dotnet-cqrs/docs/observability/health-checks/README.md

491 lines
14 KiB
Markdown

# Health Checks
Monitor stream and subscription health with ASP.NET Core health checks.
## Overview
Health checks provide visibility into system health:
- **Stream Health** - Detect unhealthy event streams
- **Consumer Health** - Monitor consumer lag and stalls
- **Subscription Health** - Track subscription status
- **ASP.NET Core Integration** - Built-in health check support
**Key Features:**
- **Stream Monitoring** - Check stream availability and status
- **Consumer Lag Detection** - Identify lagging consumers
- **Stall Detection** - Detect consumers with no progress
- **Configurable Thresholds** - Define degraded/unhealthy limits
- **Health Check UI** - Visual dashboard support
- **Kubernetes Ready** - /health endpoint for liveness/readiness probes
## Quick Start
```csharp
using Svrnty.CQRS.Events;
using Svrnty.CQRS.Events.Monitoring;
var builder = WebApplication.CreateBuilder(args);

// Thresholds that decide when a consumer is reported as degraded or unhealthy.
builder.Services.AddStreamHealthChecks(thresholds =>
{
    // Lag thresholds are expressed as a number of events.
    thresholds.DegradedConsumerLagThreshold = 1000;                   // Warning at 1000 events
    thresholds.UnhealthyConsumerLagThreshold = 10000;                 // Error at 10000 events

    // Stall thresholds are expressed as time without progress.
    thresholds.DegradedStalledThreshold = TimeSpan.FromMinutes(5);    // Warning after 5 min
    thresholds.UnhealthyStalledThreshold = TimeSpan.FromMinutes(15);  // Error after 15 min
});

// Plug the stream and consumer checks into ASP.NET Core health checks.
var healthChecks = builder.Services.AddHealthChecks();
healthChecks.AddCheck<StreamHealthCheck>("event-streams");
healthChecks.AddCheck<ConsumerHealthCheck>("consumers");

var app = builder.Build();

// Expose the aggregated health status over HTTP.
app.MapHealthChecks("/health");

app.Run();
```
## Health Check Components
### Stream Health Check
Monitor overall stream health:
```csharp
/// <summary>
/// Health check that folds the status of all event streams into a single result.
/// </summary>
public class StreamHealthCheck : IHealthCheck
{
    // NOTE(review): presumably injected through a constructor that is omitted from this excerpt.
    private readonly IStreamHealthService _healthService;

    /// <summary>
    /// Reports Unhealthy if any stream is unhealthy, Degraded if any stream is degraded,
    /// and Healthy otherwise.
    /// </summary>
    /// <param name="context">Health check context supplied by the framework (not used here).</param>
    /// <param name="ct">Token forwarded to the stream health service.</param>
    /// <returns>The aggregated result, always carrying the healthy/degraded/unhealthy counters.</returns>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken ct = default)
    {
        var result = await _healthService.CheckAllStreamsAsync(ct);

        // Every outcome carries the same three counters so the response payload has a
        // stable shape regardless of status (a healthy response includes them too).
        var data = new Dictionary<string, object>
        {
            ["healthy"] = result.HealthyCount,
            ["degraded"] = result.DegradedCount,
            ["unhealthy"] = result.UnhealthyCount
        };

        if (result.UnhealthyCount > 0)
        {
            return HealthCheckResult.Unhealthy(
                $"{result.UnhealthyCount} unhealthy streams",
                data: data);
        }

        if (result.DegradedCount > 0)
        {
            return HealthCheckResult.Degraded(
                $"{result.DegradedCount} degraded streams",
                data: data);
        }

        return HealthCheckResult.Healthy(
            $"{result.HealthyCount} healthy streams",
            data: data);
    }
}
```
### Consumer Health Check
Monitor consumer lag and stalls:
```csharp
/// <summary>
/// Health check that classifies every consumer by stall time and lag.
/// </summary>
public class ConsumerHealthCheck : IHealthCheck
{
    // NOTE(review): both fields are presumably injected through a constructor
    // that is omitted from this excerpt.
    private readonly IConsumerHealthService _healthService;
    private readonly HealthCheckOptions _options;

    /// <summary>
    /// Reports Unhealthy if any consumer exceeds an unhealthy threshold, Degraded if any
    /// consumer exceeds a degraded threshold, and Healthy otherwise.
    /// </summary>
    /// <param name="context">Health check context supplied by the framework (not used here).</param>
    /// <param name="ct">Token forwarded to the consumer health service.</param>
    /// <returns>The aggregated result, listing the affected consumers in its data.</returns>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken ct = default)
    {
        var consumers = await _healthService.GetAllConsumersAsync(ct);
        var unhealthyConsumers = new List<string>();
        var degradedConsumers = new List<string>();

        foreach (var consumer in consumers)
        {
            var lag = consumer.Lag;
            var timeSinceUpdate = DateTimeOffset.UtcNow - consumer.LastUpdated;

            // Both unhealthy conditions are tested before either degraded condition so
            // that a mildly stalled consumer with a critical lag is still reported as
            // unhealthy. Each consumer is listed once, under its worst status.
            if (timeSinceUpdate > _options.UnhealthyStalledThreshold)
            {
                unhealthyConsumers.Add($"{consumer.ConsumerId} (stalled {timeSinceUpdate.TotalMinutes:F0}m)");
            }
            else if (lag > _options.UnhealthyConsumerLagThreshold)
            {
                unhealthyConsumers.Add($"{consumer.ConsumerId} (lag {lag})");
            }
            else if (timeSinceUpdate > _options.DegradedStalledThreshold)
            {
                degradedConsumers.Add($"{consumer.ConsumerId} (stalled {timeSinceUpdate.TotalMinutes:F0}m)");
            }
            else if (lag > _options.DegradedConsumerLagThreshold)
            {
                degradedConsumers.Add($"{consumer.ConsumerId} (lag {lag})");
            }
        }

        if (unhealthyConsumers.Any())
        {
            return HealthCheckResult.Unhealthy(
                $"{unhealthyConsumers.Count} unhealthy consumers",
                data: new Dictionary<string, object>
                {
                    ["unhealthy_consumers"] = unhealthyConsumers,
                    ["degraded_consumers"] = degradedConsumers
                });
        }

        if (degradedConsumers.Any())
        {
            return HealthCheckResult.Degraded(
                $"{degradedConsumers.Count} degraded consumers",
                data: new Dictionary<string, object>
                {
                    ["degraded_consumers"] = degradedConsumers
                });
        }

        return HealthCheckResult.Healthy($"{consumers.Count} healthy consumers");
    }
}
```
## Configuration Options
```csharp
/// <summary>
/// Thresholds used by the stream and consumer health checks.
/// </summary>
/// <remarks>
/// NOTE(review): this type shares its simple name with the ASP.NET Core
/// <c>HealthCheckOptions</c> that is passed to <c>MapHealthChecks</c>; a file that
/// references both needs an alias or a fully qualified name.
/// </remarks>
public class HealthCheckOptions
{
    /// <summary>Consumer lag (event count) above which a consumer is degraded.</summary>
    public long DegradedConsumerLagThreshold { get; set; } = 1_000;

    /// <summary>Consumer lag (event count) above which a consumer is unhealthy.</summary>
    public long UnhealthyConsumerLagThreshold { get; set; } = 10_000;

    /// <summary>Time without progress after which a consumer is degraded.</summary>
    public TimeSpan DegradedStalledThreshold { get; set; } = TimeSpan.FromMinutes(5);

    /// <summary>Time without progress after which a consumer is unhealthy.</summary>
    public TimeSpan UnhealthyStalledThreshold { get; set; } = TimeSpan.FromMinutes(15);

    /// <summary>Stream health threshold, in errors per minute.</summary>
    public int MaxErrorRate { get; set; } = 5;

    /// <summary>Stream health threshold: timeout for an unresponsive stream.</summary>
    public TimeSpan StreamUnresponsiveTimeout { get; set; } = TimeSpan.FromMinutes(5);
}
```
## Health Check Endpoints
### Basic Health Endpoint
```csharp
app.MapHealthChecks("/health");
// Returns (default ASP.NET Core status code mapping):
// 200 OK: Healthy or Degraded
// 503 Service Unavailable: Unhealthy
```
### Detailed Health Endpoint
```csharp
// Writes the full health report (overall status plus one entry per check) as JSON.
var detailOptions = new HealthCheckOptions
{
    ResponseWriter = async (httpContext, healthReport) =>
    {
        httpContext.Response.ContentType = "application/json";

        // One element per registered check.
        var checks = healthReport.Entries.Select(entry => new
        {
            name = entry.Key,
            status = entry.Value.Status.ToString(),
            duration = entry.Value.Duration.TotalMilliseconds,
            description = entry.Value.Description,
            data = entry.Value.Data,
            exception = entry.Value.Exception?.Message
        });

        var result = new
        {
            status = healthReport.Status.ToString(),
            totalDuration = healthReport.TotalDuration.TotalMilliseconds,
            checks
        };

        await httpContext.Response.WriteAsJsonAsync(result);
    }
};

app.MapHealthChecks("/health/detail", detailOptions);

// Example of the detailed JSON response:
// {
//   "status": "Healthy",
//   "totalDuration": 45.2,
//   "checks": [
//     {
//       "name": "event-streams",
//       "status": "Healthy",
//       "duration": 23.1,
//       "description": "5 healthy streams",
//       "data": { "healthy": 5, "degraded": 0, "unhealthy": 0 }
//     }
//   ]
// }
```
### Liveness vs Readiness
```csharp
// Registration with tags.
// This must run before builder.Build(): the service collection cannot be
// modified once the application has been built.
builder.Services.AddHealthChecks()
    .AddCheck<StreamHealthCheck>("event-streams", tags: new[] { "ready" })
    .AddCheck<ConsumerHealthCheck>("consumers", tags: new[] { "ready" })
    .AddCheck("self", () => HealthCheckResult.Healthy(), tags: new[] { "live" });

var app = builder.Build();

// Liveness - is the app running? Only checks tagged "live" are evaluated.
app.MapHealthChecks("/health/live", new HealthCheckOptions
{
    Predicate = check => check.Tags.Contains("live")
});

// Readiness - can the app serve traffic? Only checks tagged "ready" are evaluated.
app.MapHealthChecks("/health/ready", new HealthCheckOptions
{
    Predicate = check => check.Tags.Contains("ready")
});
```
## Kubernetes Integration
```yaml
# deployment.yaml
# Probe configuration excerpt: liveness and readiness use separate endpoints.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: event-processor
spec:
  template:
    spec:
      containers:
      - name: event-processor
        image: event-processor:latest
        ports:
        - containerPort: 80
        livenessProbe:
          httpGet:
            path: /health/live
            port: 80
          initialDelaySeconds: 10
          periodSeconds: 10
          timeoutSeconds: 5
          failureThreshold: 3
        readinessProbe:
          httpGet:
            path: /health/ready
            port: 80
          initialDelaySeconds: 5
          periodSeconds: 5
          timeoutSeconds: 3
          failureThreshold: 3
```
## Health Check UI
```csharp
// Required NuGet packages:
//   dotnet add package AspNetCore.HealthChecks.UI
//   dotnet add package AspNetCore.HealthChecks.UI.Client
//   dotnet add package AspNetCore.HealthChecks.UI.InMemory.Storage

// Register the checks that the dashboard will display.
var healthChecks = builder.Services.AddHealthChecks();
healthChecks.AddCheck<StreamHealthCheck>("event-streams");
healthChecks.AddCheck<ConsumerHealthCheck>("consumers");

// Register the UI with in-memory storage.
builder.Services
    .AddHealthChecksUI()
    .AddInMemoryStorage();

var app = builder.Build();

// Serve /health using the response writer from the UI client package.
var uiResponseOptions = new HealthCheckOptions
{
    ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
};
app.MapHealthChecks("/health", uiResponseOptions);

app.MapHealthChecksUI(ui => ui.UIPath = "/health-ui");

// Access UI at: http://localhost:5000/health-ui
```
## Custom Health Checks
### Projection Health Check
```csharp
/// <summary>
/// Health check that compares the "order-summary" checkpoint with the head of the
/// "orders" stream and reports how far behind the projection is.
/// </summary>
public class ProjectionHealthCheck : IHealthCheck
{
    // NOTE(review): both stores are presumably injected through a constructor
    // that is omitted from this excerpt.
    private readonly ICheckpointStore _checkpointStore;
    private readonly IEventStreamStore _eventStore;

    /// <summary>
    /// Reports Unhealthy above 10000 events of lag, Degraded above 1000, Healthy otherwise.
    /// </summary>
    /// <param name="context">Health check context supplied by the framework (not used here).</param>
    /// <param name="ct">Token forwarded to both store calls.</param>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken ct = default)
    {
        var checkpoint = await _checkpointStore.GetCheckpointAsync("order-summary", ct);
        var streamHead = await _eventStore.GetStreamHeadAsync("orders", ct);
        var lag = streamHead - checkpoint;

        // Diagnostic payload shared by the degraded and unhealthy results.
        Dictionary<string, object> BuildData()
        {
            return new Dictionary<string, object>
            {
                ["checkpoint"] = checkpoint,
                ["stream_head"] = streamHead,
                ["lag"] = lag
            };
        }

        if (lag > 10000)
        {
            var message = $"Projection critically lagging: {lag} events behind";
            return HealthCheckResult.Unhealthy(message, data: BuildData());
        }

        if (lag > 1000)
        {
            var message = $"Projection lagging: {lag} events behind";
            return HealthCheckResult.Degraded(message, data: BuildData());
        }

        return HealthCheckResult.Healthy($"Projection up-to-date (lag: {lag})");
    }
}
```
### Database Health Check
```csharp
/// <summary>
/// Health check that pings the event store and reports whether the call succeeded.
/// </summary>
public class PostgresHealthCheck : IHealthCheck
{
    // NOTE(review): presumably injected through a constructor that is omitted from this excerpt.
    private readonly IEventStreamStore _eventStore;

    /// <summary>
    /// Pings the event store with a 5 second time limit.
    /// </summary>
    /// <param name="context">Health check context supplied by the framework (not used here).</param>
    /// <param name="ct">Caller's token; cancelling it also cancels the ping.</param>
    /// <returns>Healthy if the ping completes, Unhealthy (with the exception) otherwise.</returns>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken ct = default)
    {
        try
        {
            // Link the timeout to the caller's token so the ping stops on whichever
            // comes first: caller cancellation or the 5 second limit.
            using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
            timeoutCts.CancelAfter(TimeSpan.FromSeconds(5));

            await _eventStore.PingAsync(timeoutCts.Token);
            return HealthCheckResult.Healthy("PostgreSQL connection healthy");
        }
        catch (Exception ex)
        {
            // Any failure, including a timeout, is reported as Unhealthy rather than thrown.
            return HealthCheckResult.Unhealthy(
                "PostgreSQL connection failed",
                exception: ex);
        }
    }
}
```
## Monitoring and Alerting
### Prometheus Integration
```csharp
// Export health check status as metrics
/// <summary>
/// Publishes each health check entry as a gauge: 1 = healthy, 0.5 = degraded, 0 = unhealthy.
/// </summary>
public class HealthCheckMetricsPublisher : IHealthCheckPublisher
{
    // NOTE(review): presumably injected through a constructor that is omitted from this excerpt.
    private readonly IMetrics _metrics;

    /// <summary>
    /// Records one gauge per entry in <paramref name="report"/>.
    /// </summary>
    /// <param name="report">The health report to export.</param>
    /// <param name="ct">Cancellation token (not used; recording is synchronous here).</param>
    public Task PublishAsync(HealthReport report, CancellationToken ct)
    {
        foreach (var entry in report.Entries)
        {
            double status = entry.Value.Status switch
            {
                HealthStatus.Healthy => 1.0,
                HealthStatus.Degraded => 0.5,
                HealthStatus.Unhealthy => 0.0,
                _ => 0.0
            };

            _metrics.RecordGauge(ToMetricName(entry.Key), status);
        }

        return Task.CompletedTask;
    }

    // Builds "health_check_<name>", replacing every character outside [a-zA-Z0-9_] with '_'.
    // Check names such as "event-streams" contain hyphens, which are not valid in
    // conventional Prometheus metric names.
    private static string ToMetricName(string checkName)
    {
        var chars = checkName.ToCharArray();
        for (var i = 0; i < chars.Length; i++)
        {
            var c = chars[i];
            var isAllowed =
                (c >= 'a' && c <= 'z') ||
                (c >= 'A' && c <= 'Z') ||
                (c >= '0' && c <= '9') ||
                c == '_';

            if (!isAllowed)
            {
                chars[i] = '_';
            }
        }

        return "health_check_" + new string(chars);
    }
}

// Register publisher
builder.Services.AddSingleton<IHealthCheckPublisher, HealthCheckMetricsPublisher>();
```
### Alert on Unhealthy
```csharp
/// <summary>
/// Sends a critical alert whenever the overall health report is Unhealthy.
/// </summary>
public class HealthCheckAlertPublisher : IHealthCheckPublisher
{
    // NOTE(review): presumably injected through a constructor that is omitted from this excerpt.
    private readonly IAlertService _alertService;

    /// <summary>
    /// Sends an alert naming every unhealthy check; does nothing for other statuses.
    /// </summary>
    /// <param name="report">The health report to inspect.</param>
    /// <param name="ct">Cancellation token (not forwarded to the alert service here).</param>
    public async Task PublishAsync(HealthReport report, CancellationToken ct)
    {
        if (report.Status != HealthStatus.Unhealthy)
        {
            return;
        }

        // Names of the checks that caused the unhealthy status.
        var failedChecks = report.Entries
            .Where(entry => entry.Value.Status == HealthStatus.Unhealthy)
            .Select(entry => entry.Key);

        var alert = new Alert
        {
            Severity = AlertSeverity.Critical,
            Title = "System Unhealthy",
            Description = $"Health check failed: {string.Join(", ", failedChecks)}",
            Timestamp = DateTimeOffset.UtcNow
        };

        await _alertService.SendAsync(alert);
    }
}
```
## Best Practices
### ✅ DO
- Configure health checks for production
- Use appropriate thresholds for your workload
- Separate liveness and readiness probes
- Monitor health check metrics
- Set up alerts for unhealthy status
- Include health checks in deployment strategy
- Test health check behavior
- Document health check meanings
### ❌ DON'T
- Don't use the same thresholds for all systems
- Don't ignore degraded status
- Don't skip health checks in production
- Don't make health checks too slow (> 5s)
- Don't forget to handle timeouts
- Don't expose sensitive data in health responses
- Don't use health checks for business logic
- Don't forget to test failure scenarios
## See Also
- [Observability Overview](../README.md)
- [Stream Health](stream-health.md)
- [Consumer Health](consumer-health.md)
- [ASP.NET Core Integration](aspnetcore-integration.md)
- [Health Thresholds](health-thresholds.md)
- [Metrics](../metrics/README.md)