updated logging

This commit is contained in:
2026-04-02 13:34:29 -05:00
parent 3e1f672c00
commit a66e8e736f
3 changed files with 64 additions and 59 deletions

View File

@@ -7,7 +7,6 @@ import json
import os
import socket
import subprocess
import time
import urllib.request
from fastapi import FastAPI, HTTPException
@@ -27,6 +26,7 @@ GITEA_API_BASE = "https://git.sovransystems.com/api/v1/repos/Sovran_Systems/Sovr
UPDATE_UNIT = "sovran-hub-update.service"
UPDATE_LOG = "/var/log/sovran-hub-update.log"
UPDATE_LOCK = "/run/sovran-hub-update.lock"
REBOOT_COMMAND = ["reboot"]
@@ -70,12 +70,6 @@ if os.path.isdir(_ICONS_DIR):
templates = Jinja2Templates(directory=os.path.join(_BASE_DIR, "templates"))
# ── Track when we started an update ──────────────────────────────
# This timestamp lets us know that an update was recently kicked off,
# so we don't prematurely declare it finished if the unit hasn't
# transitioned to "active" yet.
_update_started_at: float = 0.0
# ── Update check helpers ─────────────────────────────────────────
def _get_locked_info():
@@ -173,15 +167,6 @@ def _update_is_active() -> bool:
return r.returncode == 0
def _update_state() -> str:
    """Return the systemd ActiveState of the update unit (e.g. 'active', 'inactive').

    Uses ``systemctl show --value`` so stdout carries just the property
    value, with no ``ActiveState=`` prefix to strip off.
    """
    completed = subprocess.run(
        ["systemctl", "show", "-p", "ActiveState", "--value", UPDATE_UNIT],
        capture_output=True,
        text=True,
    )
    return completed.stdout.strip()
def _update_result() -> str:
"""Return 'success', 'failed', or 'unknown'."""
r = subprocess.run(
@@ -196,6 +181,28 @@ def _update_result() -> str:
return "unknown"
def _update_lock_exists() -> bool:
"""Check if the file-based update lock exists (survives server restart)."""
return os.path.exists(UPDATE_LOCK)
def _create_update_lock():
"""Create the lock file to indicate an update is in progress."""
try:
with open(UPDATE_LOCK, "w") as f:
f.write(str(os.getpid()))
except OSError:
pass
def _remove_update_lock():
"""Remove the lock file."""
try:
os.unlink(UPDATE_LOCK)
except FileNotFoundError:
pass
def _read_log(offset: int = 0) -> tuple[str, int]:
"""Read the update log file from the given byte offset.
Returns (new_text, new_offset)."""
@@ -342,7 +349,6 @@ async def api_reboot():
@app.post("/api/updates/run")
async def api_updates_run():
"""Kick off the detached update systemd unit."""
global _update_started_at
loop = asyncio.get_event_loop()
running = await loop.run_in_executor(None, _update_is_active)
@@ -356,8 +362,8 @@ async def api_updates_run():
stderr=asyncio.subprocess.DEVNULL,
)
# Record the start time so we can handle the race condition
_update_started_at = time.monotonic()
# Create a file-based lock that survives server restarts
_create_update_lock()
proc = await asyncio.create_subprocess_exec(
"systemctl", "start", "--no-block", UPDATE_UNIT,
@@ -372,33 +378,37 @@ async def api_updates_run():
@app.get("/api/updates/status")
async def api_updates_status(offset: int = 0):
"""Poll endpoint: returns running state, result, and new log content."""
global _update_started_at
loop = asyncio.get_event_loop()
active = await loop.run_in_executor(None, _update_is_active)
state = await loop.run_in_executor(None, _update_state)
result = await loop.run_in_executor(None, _update_result)
lock_exists = _update_lock_exists()
new_log, new_offset = await loop.run_in_executor(None, _read_log, offset)
# Race condition guard: if we just started the unit and it hasn't
# transitioned to "activating"/"active" yet, report it as still running.
# Give it up to 10 seconds to appear as active.
if not active and _update_started_at > 0:
elapsed = time.monotonic() - _update_started_at
if elapsed < 10 and state in ("inactive", ""):
# Unit hasn't started yet — tell the frontend it's still running
return {
"running": True,
"result": "pending",
"log": new_log,
"offset": new_offset,
}
else:
# Either it finished or the grace period expired
_update_started_at = 0.0
# If the unit is active, it's definitely still running
if active:
return {
"running": True,
"result": "pending",
"log": new_log,
"offset": new_offset,
}
# If the lock file exists but the unit is not active, the update
# finished (or the server just restarted after nixos-rebuild switch).
# The lock file persists across server restarts because it's on disk.
if lock_exists:
_remove_update_lock()
return {
"running": False,
"result": result,
"log": new_log,
"offset": new_offset,
}
# No lock, not active — nothing happening
return {
"running": active,
"running": False,
"result": result,
"log": new_log,
"offset": new_offset,

View File

@@ -4,8 +4,7 @@
const POLL_INTERVAL_SERVICES = 5000; // 5 s
const POLL_INTERVAL_UPDATES = 1800000; // 30 min
const ACTION_REFRESH_DELAY = 1500; // 1.5 s after start/stop/restart
const UPDATE_POLL_INTERVAL = 1500; // 1.5 s while update is running
const UPDATE_POLL_DELAY = 3000; // 3 s before first poll (let unit start)
const UPDATE_POLL_INTERVAL = 2000; // 2 s while update is running
const CATEGORY_ORDER = [
"infrastructure",
@@ -29,7 +28,6 @@ let _updatePollTimer = null;
let _updateLogOffset = 0;
let _serverWasDown = false;
let _updateFinished = false;
let _sawRunning = false;
// ── DOM refs ──────────────────────────────────────────────────────
@@ -74,7 +72,7 @@ async function apiFetch(path, options = {}) {
return res.json();
}
// ── Render: initial build ────────────────────────────────────────
// ── Render: initial build ────────────────────────────────────────
function buildTiles(services, categoryLabels) {
_servicesCache = services;
@@ -271,7 +269,6 @@ function openUpdateModal() {
_updateLogOffset = 0;
_serverWasDown = false;
_updateFinished = false;
_sawRunning = false;
if ($modalLog) $modalLog.textContent = "";
if ($modalStatus) $modalStatus.textContent = "Starting update…";
if ($modalSpinner) $modalSpinner.classList.add("spinning");
@@ -309,11 +306,9 @@ function startUpdate() {
.then(data => {
if (data.status === "already_running") {
appendLog("[Update already in progress, attaching…]\n\n");
_sawRunning = true;
}
if ($modalStatus) $modalStatus.textContent = "Updating…";
// Delay the first poll to give the systemd unit time to start
setTimeout(startUpdatePoll, UPDATE_POLL_DELAY);
startUpdatePoll();
})
.catch(err => {
appendLog(`[Error: failed to start update — ${err}]\n`);
@@ -342,6 +337,7 @@ async function pollUpdateStatus() {
// Server came back after being down
if (_serverWasDown) {
_serverWasDown = false;
appendLog("[Server reconnected]\n");
if ($modalStatus) $modalStatus.textContent = "Updating…";
}
@@ -351,14 +347,8 @@ async function pollUpdateStatus() {
}
_updateLogOffset = data.offset;
// Track if we ever saw the unit as running
if (data.running) {
_sawRunning = true;
}
// Only declare finished if we previously saw it running (or server says so)
// This prevents the race where the unit hasn't started yet
if (!data.running && _sawRunning) {
// Check if finished
if (!data.running) {
_updateFinished = true;
stopUpdatePoll();
if (data.result === "success") {
@@ -368,12 +358,10 @@ async function pollUpdateStatus() {
}
}
} catch (err) {
// Server is likely restarting during nixos-rebuild switch
// This counts as "saw running" since it was running before it died
_sawRunning = true;
// Server is likely restarting during nixos-rebuild switch — keep polling
if (!_serverWasDown) {
_serverWasDown = true;
appendLog("\n[Server restarting — waiting for it to come back…]\n\n");
appendLog("\n[Server restarting — waiting for it to come back…]\n");
if ($modalStatus) $modalStatus.textContent = "Server restarting…";
}
}