svrnty-mcp-gateway/src/Svrnty.MCP.Gateway.Infrastructure/Health/PassiveHealthTracker.cs
Svrnty a4a1dd2e38 docs: comprehensive AI coding assistant research and MCP-first implementation plan
Research conducted on modern AI coding assistants (Cursor, GitHub Copilot, Cline,
Aider, Windsurf, Replit Agent) to understand architecture patterns, context management,
code editing workflows, and tool use protocols.

Key Decision: Pivoted from building full CLI (40-50h) to validation-driven MCP-first
approach (10-15h). Build 5 core CODEX MCP tools that work with ANY coding assistant,
validate adoption over 2-4 weeks, then decide on full CLI if demand proven.

Files:
- research/ai-systems/modern-coding-assistants-architecture.md (comprehensive research)
- research/ai-systems/codex-coding-assistant-implementation-plan.md (original CLI plan, preserved)
- research/ai-systems/codex-mcp-tools-implementation-plan.md (approved MCP-first plan)
- ideas/registry.json (updated with approved MCP tools proposal)

Architech Validation: APPROVED with pivot to MCP-first approach
Human Decision: Approved (pragmatic validation-driven development)

Next: Begin Phase 1 implementation (10-15 hours, 5 core MCP tools)

🤖 Generated with CODEX Research System

Co-Authored-By: The Archivist <archivist@codex.svrnty.io>
Co-Authored-By: The Architech <architech@codex.svrnty.io>
Co-Authored-By: Mathias Beaulieu-Duncan <mat@svrnty.io>
2025-10-22 21:00:34 -04:00

135 lines
4.4 KiB
C#

using System.Collections.Concurrent;
using OpenHarbor.MCP.Gateway.Core.Models;
namespace OpenHarbor.MCP.Gateway.Infrastructure.Health;
/// <summary>
/// Passive health tracker that monitors server health based on actual request patterns.
/// Tracks success/failure rates and response times without active probing.
/// </summary>
public class PassiveHealthTracker
{
    /// <summary>
    /// Number of response-time samples retained per server for the rolling average.
    /// </summary>
    private const int ResponseTimeWindowSize = 10;

    // Per-server mutable state. The ConcurrentDictionary handles concurrent
    // add/lookup; each ServerHealthData instance is guarded by locking the
    // instance itself, since success/failure counters must be updated atomically.
    private readonly ConcurrentDictionary<string, ServerHealthData> _healthData = new();

    /// <summary>
    /// Number of consecutive failures before marking server as unhealthy. Default is 5.
    /// </summary>
    public int UnhealthyThreshold { get; set; } = 5;

    /// <summary>
    /// Number of consecutive successes before marking server as healthy again. Default is 3.
    /// </summary>
    public int HealthyThreshold { get; set; } = 3;

    /// <summary>
    /// Response time threshold for marking requests as slow. Default is 5 seconds.
    /// </summary>
    /// <remarks>
    /// NOTE(review): this property is not currently read by any method of this
    /// class — presumably intended for slow-request classification; confirm
    /// callers before removing or wiring it up.
    /// </remarks>
    public TimeSpan SlowResponseThreshold { get; set; } = TimeSpan.FromSeconds(5);

    /// <summary>
    /// Records a successful request to a server. Resets the consecutive-failure
    /// counter, appends the response time to the rolling window, and marks the
    /// server healthy once <see cref="HealthyThreshold"/> consecutive successes
    /// have been observed.
    /// </summary>
    /// <param name="serverId">Unique identifier of the server that responded.</param>
    /// <param name="responseTime">How long the request took to complete.</param>
    public void RecordSuccess(string serverId, TimeSpan responseTime)
    {
        var data = _healthData.GetOrAdd(serverId, _ => new ServerHealthData { ServerId = serverId });
        lock (data)
        {
            data.ConsecutiveSuccesses++;
            data.ConsecutiveFailures = 0;
            data.LastResponseTime = responseTime;
            data.LastCheck = DateTime.UtcNow;
            data.LastErrorMessage = null;

            // Maintain a bounded rolling window of recent response times.
            data.ResponseTimes.Enqueue(responseTime);
            if (data.ResponseTimes.Count > ResponseTimeWindowSize)
            {
                data.ResponseTimes.Dequeue();
            }

            // Flip back to healthy only after a sustained run of successes,
            // so a single lucky response doesn't mask an outage.
            if (data.ConsecutiveSuccesses >= HealthyThreshold)
            {
                data.IsHealthy = true;
            }
        }
    }

    /// <summary>
    /// Records a failed request to a server. Resets the consecutive-success
    /// counter and marks the server unhealthy once
    /// <see cref="UnhealthyThreshold"/> consecutive failures have been observed.
    /// </summary>
    /// <param name="serverId">Unique identifier of the server that failed.</param>
    /// <param name="errorMessage">Human-readable description of the failure.</param>
    public void RecordFailure(string serverId, string errorMessage)
    {
        var data = _healthData.GetOrAdd(serverId, _ => new ServerHealthData { ServerId = serverId });
        lock (data)
        {
            data.ConsecutiveFailures++;
            data.ConsecutiveSuccesses = 0;
            data.LastCheck = DateTime.UtcNow;
            data.LastErrorMessage = errorMessage;

            // Require a sustained run of failures before declaring the server
            // down, so transient blips don't trigger failover.
            if (data.ConsecutiveFailures >= UnhealthyThreshold)
            {
                data.IsHealthy = false;
            }
        }
    }

    /// <summary>
    /// Gets the current health status for a specific server.
    /// </summary>
    /// <param name="serverId">Unique identifier of the server to look up.</param>
    /// <returns>
    /// A snapshot of the server's health, or <c>null</c> if no requests have
    /// been recorded for <paramref name="serverId"/>.
    /// </returns>
    public ServerHealthStatus? GetServerHealth(string serverId)
    {
        if (!_healthData.TryGetValue(serverId, out var data))
        {
            return null;
        }
        lock (data)
        {
            return new ServerHealthStatus
            {
                ServerId = data.ServerId,
                ServerName = serverId, // Default to ID if name not set
                IsHealthy = data.IsHealthy,
                LastCheck = data.LastCheck,
                // Report the rolling average when samples exist; otherwise fall
                // back to the last observed response time (null if none yet).
                ResponseTime = data.ResponseTimes.Count > 0
                    ? TimeSpan.FromMilliseconds(data.ResponseTimes.Average(t => t.TotalMilliseconds))
                    : data.LastResponseTime,
                ErrorMessage = data.LastErrorMessage
            };
        }
    }

    /// <summary>
    /// Gets health status for all tracked servers.
    /// </summary>
    /// <returns>
    /// A materialized snapshot of every tracked server's health, taken at the
    /// moment of the call. (Snapshotting avoids re-acquiring per-server locks
    /// and recomputing averages on every enumeration of a deferred sequence.)
    /// </returns>
    public IEnumerable<ServerHealthStatus> GetAllServerHealth()
    {
        var snapshot = new List<ServerHealthStatus>();
        foreach (var serverId in _healthData.Keys)
        {
            // A server could be removed between Keys enumeration and lookup;
            // GetServerHealth returns null in that case, so skip it.
            var health = GetServerHealth(serverId);
            if (health is not null)
            {
                snapshot.Add(health);
            }
        }
        return snapshot;
    }

    /// <summary>
    /// Resets all tracked health data, forgetting every server.
    /// </summary>
    public void Reset()
    {
        _healthData.Clear();
    }

    /// <summary>
    /// Internal per-server accumulator. Instances are mutated only while the
    /// caller holds a lock on the instance.
    /// </summary>
    private class ServerHealthData
    {
        public string ServerId { get; set; } = string.Empty;
        public bool IsHealthy { get; set; } = true; // Start as healthy
        public int ConsecutiveSuccesses { get; set; }
        public int ConsecutiveFailures { get; set; }
        public DateTime LastCheck { get; set; }
        public TimeSpan? LastResponseTime { get; set; }
        public string? LastErrorMessage { get; set; }
        // Bounded FIFO of the most recent response times (rolling-average window).
        public Queue<TimeSpan> ResponseTimes { get; set; } = new Queue<TimeSpan>();
    }
}