Merge pull request #247 from naturallaw777/copilot/fix-tile-health-discrepancy

Align `/api/services` tile health with full domain diagnostics via background reachability cache
This commit is contained in:
Sovran_Systems
2026-04-15 11:15:02 -05:00
committed by GitHub

View File

@@ -4,6 +4,7 @@ from __future__ import annotations
import asyncio import asyncio
import base64 import base64
import contextlib
import hashlib import hashlib
import hmac import hmac
import json import json
@@ -20,6 +21,7 @@ import time
import urllib.error import urllib.error
import urllib.parse import urllib.parse
import urllib.request import urllib.request
from threading import Lock
from fastapi import FastAPI, HTTPException from fastapi import FastAPI, HTTPException
from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
@@ -54,6 +56,12 @@ REBUILD_UNIT = "sovran-hub-rebuild.service"
# even when the frontend's offset is pointing past the pre-restart content. # even when the frontend's offset is pointing past the pre-restart content.
_update_recovery_happened: bool = False _update_recovery_happened: bool = False
_cached_external_ip: str = "unavailable" _cached_external_ip: str = "unavailable"
# Latest reachability result per domain, written by the background checker.
# Each value is the dict returned by _check_domain_reachable with a
# "checked_at" timestamp added; readers consult its "reachable" field.
_domain_reachability_cache: dict[str, dict] = {}
# Guards _domain_reachability_cache. A threading lock (not an asyncio one) so
# the synchronous reader and the async writer can both take it.
_domain_reachability_cache_lock = Lock()
# Seconds the background checker sleeps between refresh cycles.
_DOMAIN_REACHABILITY_TTL = 60
# Seconds the background checker waits after startup before its first cycle.
_DOMAIN_REACHABILITY_STARTUP_DELAY = 5
# Handle of the running background checker task, or None when not started.
_domain_reachability_task: asyncio.Task | None = None
# Serializes the startup/shutdown handlers that create and cancel the task.
_domain_reachability_task_lock = asyncio.Lock()
BACKUP_LOG = "/var/log/sovran-hub-backup.log" BACKUP_LOG = "/var/log/sovran-hub-backup.log"
BACKUP_STATUS = "/var/log/sovran-hub-backup.status" BACKUP_STATUS = "/var/log/sovran-hub-backup.status"
@@ -970,6 +978,15 @@ def _check_domain_health_fast(domain: str | None, external_ip: str) -> bool:
return resolved_ip != external_ip return resolved_ip != external_ip
def _is_domain_reachable_cached(domain: str) -> bool | None:
    """Look up the most recently cached reachability verdict for *domain*.

    Returns ``None`` when the cache holds no entry for the domain. Otherwise
    returns the truthiness of the entry's ``"reachable"`` field, treating a
    missing field as ``False``.
    """
    # Hold the lock only for the dictionary lookup itself.
    with _domain_reachability_cache_lock:
        cached = _domain_reachability_cache.get(domain)
    if cached is not None:
        return bool(cached.get("reachable", False))
    return None
def _evaluate_domain_checklist(domain: str | None, external_ip: str, internal_ip: str | None = None) -> dict: def _evaluate_domain_checklist(domain: str | None, external_ip: str, internal_ip: str | None = None) -> dict:
"""Evaluate sequential domain diagnostics and return UI-ready checklist data.""" """Evaluate sequential domain diagnostics and return UI-ready checklist data."""
steps: list[dict] = [] steps: list[dict] = []
@@ -2391,6 +2408,10 @@ async def api_services():
domain, domain,
_cached_external_ip, _cached_external_ip,
) )
if not has_domain_issues and domain:
cached_reachable = _is_domain_reachable_cached(domain)
if cached_reachable is False:
has_domain_issues = True
health = "needs_attention" if (has_port_issues or has_domain_issues) else "healthy" health = "needs_attention" if (has_port_issues or has_domain_issues) else "healthy"
# Check Bitcoin IBD state # Check Bitcoin IBD state
if unit == "bitcoind.service" and enabled: if unit == "bitcoind.service" and enabled:
@@ -4333,3 +4354,96 @@ async def _startup_recover_stale_status():
if corrected: if corrected:
_update_recovery_happened = True _update_recovery_happened = True
await loop.run_in_executor(None, _recover_stale_status, REBUILD_STATUS, REBUILD_LOG, REBUILD_UNIT) await loop.run_in_executor(None, _recover_stale_status, REBUILD_STATUS, REBUILD_LOG, REBUILD_UNIT)
def _collect_reachability_domains(services: list[dict], overrides: dict) -> list[str]:
    """Return the domains of all enabled services, de-duplicated in first-seen order.

    A service's ``enabled`` flag is replaced by the hub override for its
    feature when one exists. Services that are disabled, have no entry in
    ``SERVICE_DOMAIN_MAP``, or whose domain file is missing, unreadable,
    undecodable or empty are skipped.
    """
    unit_to_feature = {
        unit: feat_id
        for feat_id, unit in FEATURE_SERVICE_MAP.items()
        if unit is not None
    }
    domains: list[str] = []
    for entry in services:
        unit = entry.get("unit", "")
        # Resolve the feature by unit first, falling back to the icon.
        feat_id = unit_to_feature.get(unit)
        if feat_id is None:
            feat_id = FEATURE_ICON_MAP.get(entry.get("icon", ""))
        enabled = entry.get("enabled", True)
        if feat_id is not None and feat_id in overrides:
            enabled = overrides[feat_id]
        if not enabled:
            continue
        domain_key = SERVICE_DOMAIN_MAP.get(unit)
        if not domain_key:
            continue
        try:
            with open(os.path.join(DOMAINS_DIR, domain_key), "r") as f:
                domain = f.read(512).strip()
        except (OSError, UnicodeError):
            # A corrupt (undecodable) domain file must not abort the whole
            # cycle any more than a missing one does.
            continue
        if domain:
            domains.append(domain)
    # dict.fromkeys preserves order while dropping duplicates.
    return list(dict.fromkeys(domains))


async def _background_domain_reachability_checker():
    """Periodically curl configured domains and cache reachability results.

    Waits ``_DOMAIN_REACHABILITY_STARTUP_DELAY`` seconds, then loops forever:
    collects the domains of enabled services, checks each one in the default
    executor, stores the results (stamped with ``checked_at``) in
    ``_domain_reachability_cache``, and sleeps ``_DOMAIN_REACHABILITY_TTL``
    seconds. A failed check for one domain is logged and leaves that domain's
    previous cache entry in place; the other domains are still updated.

    Raises:
        asyncio.CancelledError: propagated so the task can be cancelled.
    """
    await asyncio.sleep(_DOMAIN_REACHABILITY_STARTUP_DELAY)
    consecutive_failures = 0
    while True:
        try:
            loop = asyncio.get_running_loop()
            cfg = load_config()
            overrides, *_ = await loop.run_in_executor(None, _read_hub_overrides)
            # The domain scan reads files, so keep it off the event loop.
            unique_domains = await loop.run_in_executor(
                None,
                _collect_reachability_domains,
                cfg.get("services", []),
                overrides,
            )
            if unique_domains:
                # return_exceptions=True: one failing check must not discard
                # the results of every other domain in this cycle.
                results = await asyncio.gather(
                    *(
                        loop.run_in_executor(None, _check_domain_reachable, domain)
                        for domain in unique_domains
                    ),
                    return_exceptions=True,
                )
                checked_at = time.time()
                fresh: dict[str, dict] = {}
                for domain, result in zip(unique_domains, results):
                    if isinstance(result, BaseException):
                        logger.warning(
                            "Domain reachability check failed for %s: %r",
                            domain,
                            result,
                        )
                        continue
                    result["checked_at"] = checked_at
                    fresh[domain] = result
                with _domain_reachability_cache_lock:
                    _domain_reachability_cache.update(fresh)
            consecutive_failures = 0
        except asyncio.CancelledError:
            raise
        except Exception:
            consecutive_failures += 1
            logger.exception("Background domain reachability checker error")
            if consecutive_failures >= 3:
                logger.warning(
                    "Background domain reachability checker has failed %d consecutive times",
                    consecutive_failures,
                )
        await asyncio.sleep(_DOMAIN_REACHABILITY_TTL)
@app.on_event("startup")
async def _startup_domain_reachability():
    """Launch the background reachability checker unless one is already running."""
    global _domain_reachability_task
    async with _domain_reachability_task_lock:
        current = _domain_reachability_task
        # A live (not yet finished) task means there is nothing to do.
        if current is not None and not current.done():
            return
        checker = _background_domain_reachability_checker()
        _domain_reachability_task = asyncio.create_task(checker)
@app.on_event("shutdown")
async def _shutdown_domain_reachability():
    """Cancel the background reachability checker and wait for it to finish."""
    global _domain_reachability_task
    async with _domain_reachability_task_lock:
        # Detach the handle so a later startup creates a fresh task.
        task, _domain_reachability_task = _domain_reachability_task, None
    if task is None or task.done():
        return
    task.cancel()
    try:
        await task
    except asyncio.CancelledError:
        # Expected outcome of the cancellation requested above.
        pass