diff --git a/app/sovran_systemsos_web/server.py b/app/sovran_systemsos_web/server.py index cb60627..90532dc 100644 --- a/app/sovran_systemsos_web/server.py +++ b/app/sovran_systemsos_web/server.py @@ -7,7 +7,6 @@ import json import os import socket import subprocess -import time import urllib.request from fastapi import FastAPI, HTTPException @@ -27,6 +26,7 @@ GITEA_API_BASE = "https://git.sovransystems.com/api/v1/repos/Sovran_Systems/Sovr UPDATE_UNIT = "sovran-hub-update.service" UPDATE_LOG = "/var/log/sovran-hub-update.log" +UPDATE_LOCK = "/run/sovran-hub-update.lock" REBOOT_COMMAND = ["reboot"] @@ -70,12 +70,6 @@ if os.path.isdir(_ICONS_DIR): templates = Jinja2Templates(directory=os.path.join(_BASE_DIR, "templates")) -# ── Track when we started an update ────────────────────────────── -# This timestamp lets us know that an update was recently kicked off, -# so we don't prematurely declare it finished if the unit hasn't -# transitioned to "active" yet. -_update_started_at: float = 0.0 - # ── Update check helpers ───────────────────────────────────────── def _get_locked_info(): @@ -173,15 +167,6 @@ def _update_is_active() -> bool: return r.returncode == 0 -def _update_state() -> str: - """Return the ActiveState of the update unit.""" - r = subprocess.run( - ["systemctl", "show", "-p", "ActiveState", "--value", UPDATE_UNIT], - capture_output=True, text=True, - ) - return r.stdout.strip() - - def _update_result() -> str: """Return 'success', 'failed', or 'unknown'.""" r = subprocess.run( @@ -196,6 +181,28 @@ def _update_result() -> str: return "unknown" +def _update_lock_exists() -> bool: + """Check if the file-based update lock exists (survives server restart).""" + return os.path.exists(UPDATE_LOCK) + + +def _create_update_lock(): + """Create the lock file to indicate an update is in progress.""" + try: + with open(UPDATE_LOCK, "w") as f: + f.write(str(os.getpid())) + except OSError: + pass + + +def _remove_update_lock(): + """Remove the lock file.""" + try: + os.unlink(UPDATE_LOCK) + except FileNotFoundError: + pass + + def _read_log(offset: int = 0) -> tuple[str, int]: """Read the update log file from the given byte offset. Returns (new_text, new_offset).""" @@ -342,7 +349,6 @@ async def api_reboot(): @app.post("/api/updates/run") async def api_updates_run(): """Kick off the detached update systemd unit.""" - global _update_started_at loop = asyncio.get_event_loop() running = await loop.run_in_executor(None, _update_is_active) @@ -356,8 +362,8 @@ async def api_updates_run(): stderr=asyncio.subprocess.DEVNULL, ) - # Record the start time so we can handle the race condition - _update_started_at = time.monotonic() + # Create a file-based lock that survives server restarts + _create_update_lock() proc = await asyncio.create_subprocess_exec( "systemctl", "start", "--no-block", UPDATE_UNIT, @@ -372,33 +378,37 @@ async def api_updates_run(): @app.get("/api/updates/status") async def api_updates_status(offset: int = 0): """Poll endpoint: returns running state, result, and new log content.""" - global _update_started_at loop = asyncio.get_event_loop() active = await loop.run_in_executor(None, _update_is_active) - state = await loop.run_in_executor(None, _update_state) result = await loop.run_in_executor(None, _update_result) + lock_exists = _update_lock_exists() new_log, new_offset = await loop.run_in_executor(None, _read_log, offset) - # Race condition guard: if we just started the unit and it hasn't - # transitioned to "activating"/"active" yet, report it as still running. - # Give it up to 10 seconds to appear as active. - if not active and _update_started_at > 0: - elapsed = time.monotonic() - _update_started_at - if elapsed < 10 and state in ("inactive", ""): - # Unit hasn't started yet — tell the frontend it's still running - return { - "running": True, - "result": "pending", - "log": new_log, - "offset": new_offset, - } - else: - # Either it finished or the grace period expired - _update_started_at = 0.0 + # If the unit is active, it's definitely still running + if active: + return { + "running": True, + "result": "pending", + "log": new_log, + "offset": new_offset, + } + # If the lock file exists but the unit is not active, the update + # finished (or the server just restarted after nixos-rebuild switch). + # The lock file persists across server restarts because it's on disk. + if lock_exists: + _remove_update_lock() + return { + "running": False, + "result": result, + "log": new_log, + "offset": new_offset, + } + + # No lock, not active — nothing happening return { - "running": active, + "running": False, "result": result, "log": new_log, "offset": new_offset, diff --git a/app/sovran_systemsos_web/static/app.js b/app/sovran_systemsos_web/static/app.js index bfc2778..f39293b 100644 --- a/app/sovran_systemsos_web/static/app.js +++ b/app/sovran_systemsos_web/static/app.js @@ -4,8 +4,7 @@ const POLL_INTERVAL_SERVICES = 5000; // 5 s const POLL_INTERVAL_UPDATES = 1800000; // 30 min const ACTION_REFRESH_DELAY = 1500; // 1.5 s after start/stop/restart -const UPDATE_POLL_INTERVAL = 1500; // 1.5 s while update is running -const UPDATE_POLL_DELAY = 3000; // 3 s before first poll (let unit start) +const UPDATE_POLL_INTERVAL = 2000; // 2 s while update is running const CATEGORY_ORDER = [ "infrastructure", @@ -29,7 +28,6 @@ let _updatePollTimer = null; let _updateLogOffset = 0; let _serverWasDown = false; let _updateFinished = false; -let _sawRunning = false; // ── DOM refs ────────────────────────────────────────────────────── @@ -74,7 +72,7 @@ async function apiFetch(path, options = {}) { return res.json(); } -// ── Render: initial build ───────────────────────────────────────── +// ── Render: initial build ────────────────────────��──────────────── function buildTiles(services, categoryLabels) { _servicesCache = services; @@ -271,7 +269,6 @@ function openUpdateModal() { _updateLogOffset = 0; _serverWasDown = false; _updateFinished = false; - _sawRunning = false; if ($modalLog) $modalLog.textContent = ""; if ($modalStatus) $modalStatus.textContent = "Starting update…"; if ($modalSpinner) $modalSpinner.classList.add("spinning"); @@ -309,11 +306,9 @@ function startUpdate() { .then(data => { if (data.status === "already_running") { appendLog("[Update already in progress, attaching…]\n\n"); - _sawRunning = true; } if ($modalStatus) $modalStatus.textContent = "Updating…"; - // Delay the first poll to give the systemd unit time to start - setTimeout(startUpdatePoll, UPDATE_POLL_DELAY); + startUpdatePoll(); }) .catch(err => { appendLog(`[Error: failed to start update — ${err}]\n`); @@ -342,6 +337,7 @@ async function pollUpdateStatus() { // Server came back after being down if (_serverWasDown) { _serverWasDown = false; + appendLog("[Server reconnected]\n"); if ($modalStatus) $modalStatus.textContent = "Updating…"; } @@ -351,14 +347,8 @@ async function pollUpdateStatus() { } _updateLogOffset = data.offset; - // Track if we ever saw the unit as running - if (data.running) { - _sawRunning = true; - } - - // Only declare finished if we previously saw it running (or server says so) - // This prevents the race where the unit hasn't started yet - if (!data.running && _sawRunning) { + // Check if finished + if (!data.running) { _updateFinished = true; stopUpdatePoll(); if (data.result === "success") { @@ -368,12 +358,10 @@ async function pollUpdateStatus() { } } } catch (err) { - // Server is likely restarting during nixos-rebuild switch - // This counts as "saw running" since it was running before it died - _sawRunning = true; + // Server is likely restarting during nixos-rebuild switch — keep polling if (!_serverWasDown) { _serverWasDown = true; - appendLog("\n[Server restarting — waiting for it to come back…]\n\n"); + appendLog("\n[Server restarting — waiting for it to come back…]\n"); if ($modalStatus) $modalStatus.textContent = "Server restarting…"; } } diff --git a/modules/core/sovran-hub.nix b/modules/core/sovran-hub.nix index 04bc0ad..121032f 100644 --- a/modules/core/sovran-hub.nix +++ b/modules/core/sovran-hub.nix @@ -28,7 +28,7 @@ let { name = "Matrix-Synapse"; unit = "matrix-synapse.service"; type = "system"; icon = "synapse"; enabled = cfg.services.synapse; category = "communication"; } { name = "Element-Call"; unit = "livekit.service"; type = "system"; icon = "livekit"; enabled = cfg.features.element-calling; category = "communication"; } ] - # ── Self-Hosted Apps ─────────────────────────────────────── + # ── Self-Hosted Apps ────────────────��────────────────────── ++ [ { name = "VaultWarden"; unit = "vaultwarden.service"; type = "system"; icon = "vaultwarden"; enabled = cfg.services.vaultwarden; category = "apps"; } { name = "Nextcloud"; unit = "phpfpm-nextcloud.service"; type = "system"; icon = "nextcloud"; enabled = cfg.services.nextcloud; category = "apps"; } @@ -58,11 +58,18 @@ let export PATH="${lib.makeBinPath [ pkgs.nix pkgs.nixos-rebuild pkgs.git pkgs.flatpak pkgs.coreutils ]}:$PATH" LOG="/var/log/sovran-hub-update.log" + LOCK="/run/sovran-hub-update.lock" + + # Create lock file (survives server restarts, cleared on reboot since /run is tmpfs) + echo $$ > "$LOCK" # Truncate the log and redirect ALL output (stdout + stderr) into it : > "$LOG" exec > >(tee -a "$LOG") 2>&1 + # Ensure lock is removed on exit (success or failure) + trap 'rm -f "$LOCK"' EXIT + echo "══════════════════════════════════════════════════" echo " Sovran_SystemsOS Update — $(date)" echo "══════════════════════════════════════════════════"