Merge pull request #247 from naturallaw777/copilot/fix-tile-health-discrepancy

Align `/api/services` tile health with full domain diagnostics via background reachability cache
This commit is contained in:
Sovran_Systems
2026-04-15 11:15:02 -05:00
committed by GitHub

View File

@@ -4,6 +4,7 @@ from __future__ import annotations
import asyncio
import base64
import contextlib
import hashlib
import hmac
import json
@@ -20,6 +21,7 @@ import time
import urllib.error
import urllib.parse
import urllib.request
from threading import Lock
from fastapi import FastAPI, HTTPException
from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
@@ -54,6 +56,12 @@ REBUILD_UNIT = "sovran-hub-rebuild.service"
# even when the frontend's offset is pointing past the pre-restart content.
_update_recovery_happened: bool = False
_cached_external_ip: str = "unavailable"
# Last reachability result per domain, written by the background checker.
# Each value is the dict returned by ``_check_domain_reachable`` plus a
# ``"checked_at"`` timestamp; readers look at its ``"reachable"`` field.
_domain_reachability_cache: dict[str, dict] = {}
# Guards every read and write of ``_domain_reachability_cache``.
_domain_reachability_cache_lock = Lock()
# Seconds the background checker sleeps between passes.
# NOTE(review): despite the name, the visible readers do not expire entries
# by age — confirm whether staleness should be enforced on lookup.
_DOMAIN_REACHABILITY_TTL = 60
# Seconds the background checker waits before its first pass.
_DOMAIN_REACHABILITY_STARTUP_DELAY = 5
# Handle of the running background checker task (``None`` when not started).
_domain_reachability_task: asyncio.Task | None = None
# Serializes the startup/shutdown handlers that create or clear the task handle.
_domain_reachability_task_lock = asyncio.Lock()
BACKUP_LOG = "/var/log/sovran-hub-backup.log"
BACKUP_STATUS = "/var/log/sovran-hub-backup.status"
@@ -970,6 +978,15 @@ def _check_domain_health_fast(domain: str | None, external_ip: str) -> bool:
return resolved_ip != external_ip
def _is_domain_reachable_cached(domain: str) -> bool | None:
    """Look up the most recent cached reachability verdict for *domain*.

    Returns ``None`` when the cache holds no entry for the domain; otherwise
    the truthiness of the entry's ``"reachable"`` field (a missing field is
    treated as ``False``).
    """
    # Hold the lock only for the dictionary lookup itself.
    with _domain_reachability_cache_lock:
        cached = _domain_reachability_cache.get(domain)
    return None if cached is None else bool(cached.get("reachable", False))
def _evaluate_domain_checklist(domain: str | None, external_ip: str, internal_ip: str | None = None) -> dict:
"""Evaluate sequential domain diagnostics and return UI-ready checklist data."""
steps: list[dict] = []
@@ -2391,6 +2408,10 @@ async def api_services():
domain,
_cached_external_ip,
)
if not has_domain_issues and domain:
cached_reachable = _is_domain_reachable_cached(domain)
if cached_reachable is False:
has_domain_issues = True
health = "needs_attention" if (has_port_issues or has_domain_issues) else "healthy"
# Check Bitcoin IBD state
if unit == "bitcoind.service" and enabled:
@@ -4333,3 +4354,96 @@ async def _startup_recover_stale_status():
if corrected:
_update_recovery_happened = True
await loop.run_in_executor(None, _recover_stale_status, REBUILD_STATUS, REBUILD_LOG, REBUILD_UNIT)
def _collect_reachability_check_domains(services: list, overrides: dict) -> list[str]:
    """Return the domains of enabled services, de-duplicated in config order.

    For each service entry the feature id is resolved from its unit (falling
    back to its icon); a matching entry in *overrides* replaces the entry's
    own ``enabled`` flag. Enabled services that have a key in
    ``SERVICE_DOMAIN_MAP`` contribute the stripped contents of their file
    under ``DOMAINS_DIR``. Unreadable or empty domain files are skipped.
    """
    unit_to_feature = {
        unit: feat_id
        for feat_id, unit in FEATURE_SERVICE_MAP.items()
        if unit is not None
    }
    domains: list[str] = []
    for entry in services:
        unit = entry.get("unit", "")
        icon = entry.get("icon", "")
        enabled = entry.get("enabled", True)
        feat_id = unit_to_feature.get(unit)
        if feat_id is None:
            feat_id = FEATURE_ICON_MAP.get(icon)
        if feat_id is not None and feat_id in overrides:
            enabled = overrides[feat_id]
        if not enabled:
            continue
        domain_key = SERVICE_DOMAIN_MAP.get(unit)
        if not domain_key:
            continue
        domain_path = os.path.join(DOMAINS_DIR, domain_key)
        try:
            with open(domain_path, "r") as f:
                domain = f.read(512).strip()
        except OSError:
            # A missing/unreadable domain file just means nothing to check.
            continue
        if domain:
            domains.append(domain)
    # dict.fromkeys preserves first-seen order while removing duplicates.
    return list(dict.fromkeys(domains))


async def _background_domain_reachability_checker():
    """Periodically probe configured domains and cache reachability results.

    Sleeps ``_DOMAIN_REACHABILITY_STARTUP_DELAY`` seconds, then loops forever:
    load the config and hub overrides, collect the domains of enabled
    services, run ``_check_domain_reachable`` for each one in the default
    executor, and store every result (stamped with ``"checked_at"``) in
    ``_domain_reachability_cache``. Each pass is followed by a
    ``_DOMAIN_REACHABILITY_TTL``-second sleep.

    Any exception other than cancellation is logged and counted; the loop
    keeps running. ``asyncio.CancelledError`` is re-raised so the task can be
    cancelled at shutdown.
    """
    await asyncio.sleep(_DOMAIN_REACHABILITY_STARTUP_DELAY)
    # Inside a coroutine the running loop is the one we want; fetch it once
    # with get_running_loop() rather than get_event_loop() on every pass.
    loop = asyncio.get_running_loop()
    consecutive_failures = 0
    while True:
        try:
            cfg = load_config()
            overrides, *_ = await loop.run_in_executor(None, _read_hub_overrides)
            unique_domains = _collect_reachability_check_domains(
                cfg.get("services", []),
                overrides,
            )
            if unique_domains:
                # Probes are blocking, so fan them out to the executor.
                results = await asyncio.gather(*[
                    loop.run_in_executor(None, _check_domain_reachable, domain)
                    for domain in unique_domains
                ])
                checked_at = time.time()
                with _domain_reachability_cache_lock:
                    for domain, result in zip(unique_domains, results):
                        result["checked_at"] = checked_at
                        _domain_reachability_cache[domain] = result
            # A pass that got this far without raising counts as a success.
            consecutive_failures = 0
        except asyncio.CancelledError:
            raise
        except Exception:
            consecutive_failures += 1
            logger.exception("Background domain reachability checker error")
            if consecutive_failures >= 3:
                logger.warning(
                    "Background domain reachability checker has failed %d consecutive times",
                    consecutive_failures,
                )
        await asyncio.sleep(_DOMAIN_REACHABILITY_TTL)
@app.on_event("startup")
async def _startup_domain_reachability():
    """Launch the background reachability checker unless one is still running."""
    global _domain_reachability_task
    async with _domain_reachability_task_lock:
        current = _domain_reachability_task
        # A live task already exists; do not start a second checker.
        if current is not None and not current.done():
            return
        _domain_reachability_task = asyncio.create_task(_background_domain_reachability_checker())
@app.on_event("shutdown")
async def _shutdown_domain_reachability():
    """Cancel the background reachability checker and wait for it to finish."""
    global _domain_reachability_task
    async with _domain_reachability_task_lock:
        # Detach the handle so a later startup sees no existing task.
        pending, _domain_reachability_task = _domain_reachability_task, None
    if pending is None or pending.done():
        return
    pending.cancel()
    try:
        await pending
    except asyncio.CancelledError:
        # Expected outcome of the cancellation requested just above.
        pass