Add a background domain reachability checker to the Sovran hub web server.

Summary of the change carried by this patch:
  * new module-level cache (_domain_reachability_cache) guarded by a
    threading.Lock, plus TTL / startup-delay constants and a task handle;
  * _is_domain_reachable_cached(): tri-state cache lookup
    (True / False / None when the domain has not been checked yet);
  * api_services(): a cached "unreachable" result marks the service as
    having domain issues;
  * _background_domain_reachability_checker(): periodic task that collects
    the domains of enabled services and checks them in executor threads;
  * startup / shutdown hooks that create and cancel that task.

NOTE(review): this patch was recovered from a copy whose newlines and leading
whitespace had been collapsed.  Line counts were re-verified against every
hunk header, but indentation (context lines in particular) is reconstructed
from Python syntax and should be confirmed against the target file.  If a
hunk is rejected for whitespace reasons, try `git apply --ignore-whitespace`.

diff --git a/app/sovran_systemsos_web/server.py b/app/sovran_systemsos_web/server.py
index 2ac8e30..3a6cbec 100644
--- a/app/sovran_systemsos_web/server.py
+++ b/app/sovran_systemsos_web/server.py
@@ -4,6 +4,7 @@ from __future__ import annotations
 
 import asyncio
 import base64
+import contextlib
 import hashlib
 import hmac
 import json
@@ -20,6 +21,7 @@ import time
 import urllib.error
 import urllib.parse
 import urllib.request
+from threading import Lock
 
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
@@ -54,6 +56,12 @@ REBUILD_UNIT = "sovran-hub-rebuild.service"
 # even when the frontend's offset is pointing past the pre-restart content.
 _update_recovery_happened: bool = False
 _cached_external_ip: str = "unavailable"
+_domain_reachability_cache: dict[str, dict] = {}
+_domain_reachability_cache_lock = Lock()
+_DOMAIN_REACHABILITY_TTL = 60
+_DOMAIN_REACHABILITY_STARTUP_DELAY = 5
+_domain_reachability_task: asyncio.Task | None = None
+_domain_reachability_task_lock = asyncio.Lock()
 
 BACKUP_LOG = "/var/log/sovran-hub-backup.log"
 BACKUP_STATUS = "/var/log/sovran-hub-backup.status"
@@ -970,6 +978,15 @@ def _check_domain_health_fast(domain: str | None, external_ip: str) -> bool:
     return resolved_ip != external_ip
 
 
+def _is_domain_reachable_cached(domain: str) -> bool | None:
+    """Return cached reachability, or ``None`` if not yet checked."""
+    with _domain_reachability_cache_lock:
+        entry = _domain_reachability_cache.get(domain)
+    if entry is None:
+        return None
+    return bool(entry.get("reachable", False))
+
+
 def _evaluate_domain_checklist(domain: str | None, external_ip: str, internal_ip: str | None = None) -> dict:
     """Evaluate sequential domain diagnostics and return UI-ready checklist data."""
     steps: list[dict] = []
@@ -2391,6 +2408,10 @@ async def api_services():
                 domain,
                 _cached_external_ip,
             )
+        if not has_domain_issues and domain:
+            cached_reachable = _is_domain_reachable_cached(domain)
+            if cached_reachable is False:
+                has_domain_issues = True
         health = "needs_attention" if (has_port_issues or has_domain_issues) else "healthy"
         # Check Bitcoin IBD state
         if unit == "bitcoind.service" and enabled:
@@ -4333,3 +4354,96 @@ async def _startup_recover_stale_status():
     if corrected:
         _update_recovery_happened = True
     await loop.run_in_executor(None, _recover_stale_status, REBUILD_STATUS, REBUILD_LOG, REBUILD_UNIT)
+
+
+async def _background_domain_reachability_checker():
+    """Periodically curl configured domains and cache reachability results."""
+    await asyncio.sleep(_DOMAIN_REACHABILITY_STARTUP_DELAY)
+    consecutive_failures = 0
+    while True:
+        try:
+            cfg = load_config()
+            services = cfg.get("services", [])
+
+            unit_to_feature = {
+                unit: feat_id
+                for feat_id, unit in FEATURE_SERVICE_MAP.items()
+                if unit is not None
+            }
+
+            loop = asyncio.get_event_loop()
+            overrides, *_ = await loop.run_in_executor(None, _read_hub_overrides)
+
+            domains_to_check: list[str] = []
+            for entry in services:
+                unit = entry.get("unit", "")
+                icon = entry.get("icon", "")
+                enabled = entry.get("enabled", True)
+
+                feat_id = unit_to_feature.get(unit)
+                if feat_id is None:
+                    feat_id = FEATURE_ICON_MAP.get(icon)
+                if feat_id is not None and feat_id in overrides:
+                    enabled = overrides[feat_id]
+                if not enabled:
+                    continue
+
+                domain_key = SERVICE_DOMAIN_MAP.get(unit)
+                if not domain_key:
+                    continue
+                domain_path = os.path.join(DOMAINS_DIR, domain_key)
+                try:
+                    with open(domain_path, "r") as f:
+                        domain = f.read(512).strip()
+                    if domain:
+                        domains_to_check.append(domain)
+                except OSError:
+                    continue
+
+            if domains_to_check:
+                # Preserve domain order while removing duplicates.
+                unique_domains = list(dict.fromkeys(domains_to_check))
+                results = await asyncio.gather(*[
+                    loop.run_in_executor(None, _check_domain_reachable, domain)
+                    for domain in unique_domains
+                ])
+                checked_at = time.time()
+                with _domain_reachability_cache_lock:
+                    for domain, result in zip(unique_domains, results):
+                        result["checked_at"] = checked_at
+                        _domain_reachability_cache[domain] = result
+            consecutive_failures = 0
+        except asyncio.CancelledError:
+            raise
+        except Exception:
+            consecutive_failures += 1
+            logger.exception("Background domain reachability checker error")
+            if consecutive_failures >= 3:
+                logger.warning(
+                    "Background domain reachability checker has failed %d consecutive times",
+                    consecutive_failures,
+                )
+
+        await asyncio.sleep(_DOMAIN_REACHABILITY_TTL)
+
+
+@app.on_event("startup")
+async def _startup_domain_reachability():
+    """Start the background domain reachability checker."""
+    global _domain_reachability_task
+    async with _domain_reachability_task_lock:
+        if _domain_reachability_task is None or _domain_reachability_task.done():
+            _domain_reachability_task = asyncio.create_task(_background_domain_reachability_checker())
+
+
+@app.on_event("shutdown")
+async def _shutdown_domain_reachability():
+    """Stop the background domain reachability checker."""
+    global _domain_reachability_task
+    async with _domain_reachability_task_lock:
+        task = _domain_reachability_task
+        _domain_reachability_task = None
+    if task is not None and not task.done():
+        task.cancel()
+        with contextlib.suppress(asyncio.CancelledError):
+            await task