Merge pull request #247 from naturallaw777/copilot/fix-tile-health-discrepancy
Align `/api/services` tile health with full domain diagnostics via background reachability cache
This commit is contained in:
@@ -4,6 +4,7 @@ from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import contextlib
|
||||
import hashlib
|
||||
import hmac
|
||||
import json
|
||||
@@ -20,6 +21,7 @@ import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from threading import Lock
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
|
||||
@@ -54,6 +56,12 @@ REBUILD_UNIT = "sovran-hub-rebuild.service"
|
||||
# even when the frontend's offset is pointing past the pre-restart content.
|
||||
_update_recovery_happened: bool = False
|
||||
_cached_external_ip: str = "unavailable"
|
||||
_domain_reachability_cache: dict[str, dict] = {}
|
||||
_domain_reachability_cache_lock = Lock()
|
||||
_DOMAIN_REACHABILITY_TTL = 60
|
||||
_DOMAIN_REACHABILITY_STARTUP_DELAY = 5
|
||||
_domain_reachability_task: asyncio.Task | None = None
|
||||
_domain_reachability_task_lock = asyncio.Lock()
|
||||
|
||||
BACKUP_LOG = "/var/log/sovran-hub-backup.log"
|
||||
BACKUP_STATUS = "/var/log/sovran-hub-backup.status"
|
||||
@@ -970,6 +978,15 @@ def _check_domain_health_fast(domain: str | None, external_ip: str) -> bool:
|
||||
return resolved_ip != external_ip
|
||||
|
||||
|
||||
def _is_domain_reachable_cached(domain: str) -> bool | None:
    """Look up the most recently cached reachability verdict for *domain*.

    Returns ``None`` when the cache holds no entry for the domain.
    Otherwise returns the truthiness of the entry's ``"reachable"`` field,
    treating a missing field as ``False``.
    """
    with _domain_reachability_cache_lock:
        cached = _domain_reachability_cache.get(domain)
        # A missing entry is reported as "unknown" rather than "unreachable"
        # so callers can tell the two states apart.
        return None if cached is None else bool(cached.get("reachable", False))
|
||||
|
||||
|
||||
def _evaluate_domain_checklist(domain: str | None, external_ip: str, internal_ip: str | None = None) -> dict:
|
||||
"""Evaluate sequential domain diagnostics and return UI-ready checklist data."""
|
||||
steps: list[dict] = []
|
||||
@@ -2391,6 +2408,10 @@ async def api_services():
|
||||
domain,
|
||||
_cached_external_ip,
|
||||
)
|
||||
if not has_domain_issues and domain:
|
||||
cached_reachable = _is_domain_reachable_cached(domain)
|
||||
if cached_reachable is False:
|
||||
has_domain_issues = True
|
||||
health = "needs_attention" if (has_port_issues or has_domain_issues) else "healthy"
|
||||
# Check Bitcoin IBD state
|
||||
if unit == "bitcoind.service" and enabled:
|
||||
@@ -4333,3 +4354,96 @@ async def _startup_recover_stale_status():
|
||||
if corrected:
|
||||
_update_recovery_happened = True
|
||||
await loop.run_in_executor(None, _recover_stale_status, REBUILD_STATUS, REBUILD_LOG, REBUILD_UNIT)
|
||||
|
||||
|
||||
def _collect_reachability_domains(services: list, overrides: dict) -> list[str]:
    """Return the de-duplicated domains of all enabled services, in config order.

    A service is skipped when it is disabled (after applying *overrides* by
    feature id), when its unit has no entry in ``SERVICE_DOMAIN_MAP``, or when
    its domain file cannot be read or is empty.
    """
    unit_to_feature = {
        unit: feat_id
        for feat_id, unit in FEATURE_SERVICE_MAP.items()
        if unit is not None
    }

    domains: list[str] = []
    for entry in services:
        unit = entry.get("unit", "")
        icon = entry.get("icon", "")
        enabled = entry.get("enabled", True)

        # Resolve the feature id by unit first, falling back to the icon.
        feat_id = unit_to_feature.get(unit)
        if feat_id is None:
            feat_id = FEATURE_ICON_MAP.get(icon)
        if feat_id is not None and feat_id in overrides:
            enabled = overrides[feat_id]
        if not enabled:
            continue

        domain_key = SERVICE_DOMAIN_MAP.get(unit)
        if not domain_key:
            continue
        domain_path = os.path.join(DOMAINS_DIR, domain_key)
        try:
            with open(domain_path, "r") as f:
                domain = f.read(512).strip()
        except OSError:
            continue
        if domain:
            domains.append(domain)

    # Preserve domain order while removing duplicates.
    return list(dict.fromkeys(domains))


async def _background_domain_reachability_checker():
    """Periodically check configured domains and cache the reachability results.

    After an initial ``_DOMAIN_REACHABILITY_STARTUP_DELAY`` sleep, each cycle
    loads the config and hub overrides, runs ``_check_domain_reachable`` for
    every enabled domain in the default executor, and stores each result
    (stamped with ``"checked_at"``) in ``_domain_reachability_cache``.
    Cycles are separated by ``_DOMAIN_REACHABILITY_TTL`` seconds.

    A check that raises for one domain is logged and leaves that domain's
    previous cache entry untouched; results for the other domains are still
    stored. Cancellation propagates to the caller.
    """
    await asyncio.sleep(_DOMAIN_REACHABILITY_STARTUP_DELAY)
    # Inside a coroutine the running loop is the one to use; fetch it once.
    loop = asyncio.get_running_loop()
    consecutive_failures = 0
    while True:
        cycle_ok = False
        try:
            cfg = load_config()
            services = cfg.get("services", [])
            overrides, *_ = await loop.run_in_executor(None, _read_hub_overrides)

            unique_domains = _collect_reachability_domains(services, overrides)
            failed: list[tuple[str, BaseException]] = []
            if unique_domains:
                # return_exceptions=True keeps one failing check from
                # discarding the results of every other domain in this cycle.
                results = await asyncio.gather(
                    *[
                        loop.run_in_executor(None, _check_domain_reachable, domain)
                        for domain in unique_domains
                    ],
                    return_exceptions=True,
                )
                checked_at = time.time()
                with _domain_reachability_cache_lock:
                    for domain, result in zip(unique_domains, results):
                        if isinstance(result, BaseException):
                            failed.append((domain, result))
                            continue
                        result["checked_at"] = checked_at
                        _domain_reachability_cache[domain] = result

            # Log outside the cache lock so readers are not held up by logging.
            for domain, exc in failed:
                logger.error(
                    "Domain reachability check failed for %s",
                    domain,
                    exc_info=exc,
                )
            cycle_ok = not failed
        except asyncio.CancelledError:
            raise
        except Exception:
            logger.exception("Background domain reachability checker error")

        if cycle_ok:
            consecutive_failures = 0
        else:
            consecutive_failures += 1
            if consecutive_failures >= 3:
                logger.warning(
                    "Background domain reachability checker has failed %d consecutive times",
                    consecutive_failures,
                )

        await asyncio.sleep(_DOMAIN_REACHABILITY_TTL)
|
||||
|
||||
|
||||
@app.on_event("startup")
async def _startup_domain_reachability():
    """Launch the background reachability checker unless one is already running."""
    global _domain_reachability_task
    async with _domain_reachability_task_lock:
        current = _domain_reachability_task
        # A task that exists and has not finished is still doing its job.
        if current is not None and not current.done():
            return
        _domain_reachability_task = asyncio.create_task(
            _background_domain_reachability_checker()
        )
|
||||
|
||||
|
||||
@app.on_event("shutdown")
async def _shutdown_domain_reachability():
    """Cancel the background reachability checker and wait for it to finish."""
    global _domain_reachability_task
    async with _domain_reachability_task_lock:
        # Detach the task from the module global before touching it.
        pending, _domain_reachability_task = _domain_reachability_task, None

    if pending is None or pending.done():
        return

    pending.cancel()
    # The cancellation we just requested surfaces here; it is expected.
    with contextlib.suppress(asyncio.CancelledError):
        await pending
|
||||
|
||||
Reference in New Issue
Block a user