updated logging
This commit is contained in:
@@ -7,7 +7,6 @@ import json
|
||||
import os
|
||||
import socket
|
||||
import subprocess
|
||||
import time
|
||||
import urllib.request
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
@@ -27,6 +26,7 @@ GITEA_API_BASE = "https://git.sovransystems.com/api/v1/repos/Sovran_Systems/Sovr
|
||||
|
||||
UPDATE_UNIT = "sovran-hub-update.service"
|
||||
UPDATE_LOG = "/var/log/sovran-hub-update.log"
|
||||
UPDATE_LOCK = "/run/sovran-hub-update.lock"
|
||||
|
||||
REBOOT_COMMAND = ["reboot"]
|
||||
|
||||
@@ -70,12 +70,6 @@ if os.path.isdir(_ICONS_DIR):
|
||||
|
||||
templates = Jinja2Templates(directory=os.path.join(_BASE_DIR, "templates"))
|
||||
|
||||
# ── Track when we started an update ──────────────────────────────
|
||||
# This timestamp lets us know that an update was recently kicked off,
|
||||
# so we don't prematurely declare it finished if the unit hasn't
|
||||
# transitioned to "active" yet.
|
||||
_update_started_at: float = 0.0
|
||||
|
||||
# ── Update check helpers ─────────────────────────────────────────
|
||||
|
||||
def _get_locked_info():
|
||||
@@ -173,15 +167,6 @@ def _update_is_active() -> bool:
|
||||
return r.returncode == 0
|
||||
|
||||
|
||||
def _update_state() -> str:
    """Query systemd for the update unit's ``ActiveState`` property.

    Runs ``systemctl show -p ActiveState --value`` against ``UPDATE_UNIT``
    and returns whatever it printed on stdout with surrounding whitespace
    removed. The exit code is not inspected, so an empty string is returned
    when systemctl prints nothing.
    """
    # Build the argv as a list so no shell is involved.
    query_cmd = ["systemctl", "show", "-p", "ActiveState", "--value", UPDATE_UNIT]
    completed = subprocess.run(query_cmd, capture_output=True, text=True)
    return completed.stdout.strip()
|
||||
|
||||
|
||||
def _update_result() -> str:
|
||||
"""Return 'success', 'failed', or 'unknown'."""
|
||||
r = subprocess.run(
|
||||
@@ -196,6 +181,28 @@ def _update_result() -> str:
|
||||
return "unknown"
|
||||
|
||||
|
||||
def _update_lock_exists() -> bool:
    """Report whether the on-disk update lock file is present.

    The lock lives at ``UPDATE_LOCK`` on the filesystem rather than in
    process memory, so it can still be observed after this server process
    has been restarted.
    """
    lock_present = os.path.exists(UPDATE_LOCK)
    return lock_present
|
||||
|
||||
|
||||
def _create_update_lock():
    """Write the update lock file to mark an update as in progress.

    The file at ``UPDATE_LOCK`` is (re)created and filled with this
    process's PID. This is best-effort: any ``OSError`` raised while
    opening or writing the file is swallowed and the function returns
    normally.
    """
    pid_text = str(os.getpid())
    try:
        with open(UPDATE_LOCK, "w") as lock_file:
            lock_file.write(pid_text)
    except OSError:
        # Deliberately ignored: failing to create the lock must not
        # prevent the caller from continuing.
        return
|
||||
|
||||
|
||||
def _remove_update_lock():
    """Delete the update lock file if it is present.

    A missing file is not an error: ``FileNotFoundError`` is ignored.
    Any other ``OSError`` (for example a permission problem) still
    propagates to the caller.
    """
    try:
        os.remove(UPDATE_LOCK)
    except FileNotFoundError:
        # Already gone, so there is nothing to clean up.
        return
|
||||
|
||||
|
||||
def _read_log(offset: int = 0) -> tuple[str, int]:
|
||||
"""Read the update log file from the given byte offset.
|
||||
Returns (new_text, new_offset)."""
|
||||
@@ -342,7 +349,6 @@ async def api_reboot():
|
||||
@app.post("/api/updates/run")
|
||||
async def api_updates_run():
|
||||
"""Kick off the detached update systemd unit."""
|
||||
global _update_started_at
|
||||
loop = asyncio.get_event_loop()
|
||||
|
||||
running = await loop.run_in_executor(None, _update_is_active)
|
||||
@@ -356,8 +362,8 @@ async def api_updates_run():
|
||||
stderr=asyncio.subprocess.DEVNULL,
|
||||
)
|
||||
|
||||
# Record the start time so we can handle the race condition
|
||||
_update_started_at = time.monotonic()
|
||||
# Create a file-based lock that survives server restarts
|
||||
_create_update_lock()
|
||||
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
"systemctl", "start", "--no-block", UPDATE_UNIT,
|
||||
@@ -372,33 +378,37 @@ async def api_updates_run():
|
||||
@app.get("/api/updates/status")
|
||||
async def api_updates_status(offset: int = 0):
|
||||
"""Poll endpoint: returns running state, result, and new log content."""
|
||||
global _update_started_at
|
||||
loop = asyncio.get_event_loop()
|
||||
|
||||
active = await loop.run_in_executor(None, _update_is_active)
|
||||
state = await loop.run_in_executor(None, _update_state)
|
||||
result = await loop.run_in_executor(None, _update_result)
|
||||
lock_exists = _update_lock_exists()
|
||||
new_log, new_offset = await loop.run_in_executor(None, _read_log, offset)
|
||||
|
||||
# Race condition guard: if we just started the unit and it hasn't
|
||||
# transitioned to "activating"/"active" yet, report it as still running.
|
||||
# Give it up to 10 seconds to appear as active.
|
||||
if not active and _update_started_at > 0:
|
||||
elapsed = time.monotonic() - _update_started_at
|
||||
if elapsed < 10 and state in ("inactive", ""):
|
||||
# Unit hasn't started yet — tell the frontend it's still running
|
||||
return {
|
||||
"running": True,
|
||||
"result": "pending",
|
||||
"log": new_log,
|
||||
"offset": new_offset,
|
||||
}
|
||||
else:
|
||||
# Either it finished or the grace period expired
|
||||
_update_started_at = 0.0
|
||||
# If the unit is active, it's definitely still running
|
||||
if active:
|
||||
return {
|
||||
"running": True,
|
||||
"result": "pending",
|
||||
"log": new_log,
|
||||
"offset": new_offset,
|
||||
}
|
||||
|
||||
# If the lock file exists but the unit is not active, the update
|
||||
# finished (or the server just restarted after nixos-rebuild switch).
|
||||
# The lock file persists across server restarts because it's on disk.
|
||||
if lock_exists:
|
||||
_remove_update_lock()
|
||||
return {
|
||||
"running": False,
|
||||
"result": result,
|
||||
"log": new_log,
|
||||
"offset": new_offset,
|
||||
}
|
||||
|
||||
# No lock, not active — nothing happening
|
||||
return {
|
||||
"running": active,
|
||||
"running": False,
|
||||
"result": result,
|
||||
"log": new_log,
|
||||
"offset": new_offset,
|
||||
|
||||
@@ -4,8 +4,7 @@
|
||||
const POLL_INTERVAL_SERVICES = 5000; // 5 s
|
||||
const POLL_INTERVAL_UPDATES = 1800000; // 30 min
|
||||
const ACTION_REFRESH_DELAY = 1500; // 1.5 s after start/stop/restart
|
||||
const UPDATE_POLL_INTERVAL = 1500; // 1.5 s while update is running
|
||||
const UPDATE_POLL_DELAY = 3000; // 3 s before first poll (let unit start)
|
||||
const UPDATE_POLL_INTERVAL = 2000; // 2 s while update is running
|
||||
|
||||
const CATEGORY_ORDER = [
|
||||
"infrastructure",
|
||||
@@ -29,7 +28,6 @@ let _updatePollTimer = null;
|
||||
let _updateLogOffset = 0;
|
||||
let _serverWasDown = false;
|
||||
let _updateFinished = false;
|
||||
let _sawRunning = false;
|
||||
|
||||
// ── DOM refs ──────────────────────────────────────────────────────
|
||||
|
||||
@@ -74,7 +72,7 @@ async function apiFetch(path, options = {}) {
|
||||
return res.json();
|
||||
}
|
||||
|
||||
// ── Render: initial build ─────────────────────────────────────────
|
||||
// ── Render: initial build ─────────────────────────────────────────
|
||||
|
||||
function buildTiles(services, categoryLabels) {
|
||||
_servicesCache = services;
|
||||
@@ -271,7 +269,6 @@ function openUpdateModal() {
|
||||
_updateLogOffset = 0;
|
||||
_serverWasDown = false;
|
||||
_updateFinished = false;
|
||||
_sawRunning = false;
|
||||
if ($modalLog) $modalLog.textContent = "";
|
||||
if ($modalStatus) $modalStatus.textContent = "Starting update…";
|
||||
if ($modalSpinner) $modalSpinner.classList.add("spinning");
|
||||
@@ -309,11 +306,9 @@ function startUpdate() {
|
||||
.then(data => {
|
||||
if (data.status === "already_running") {
|
||||
appendLog("[Update already in progress, attaching…]\n\n");
|
||||
_sawRunning = true;
|
||||
}
|
||||
if ($modalStatus) $modalStatus.textContent = "Updating…";
|
||||
// Delay the first poll to give the systemd unit time to start
|
||||
setTimeout(startUpdatePoll, UPDATE_POLL_DELAY);
|
||||
startUpdatePoll();
|
||||
})
|
||||
.catch(err => {
|
||||
appendLog(`[Error: failed to start update — ${err}]\n`);
|
||||
@@ -342,6 +337,7 @@ async function pollUpdateStatus() {
|
||||
// Server came back after being down
|
||||
if (_serverWasDown) {
|
||||
_serverWasDown = false;
|
||||
appendLog("[Server reconnected]\n");
|
||||
if ($modalStatus) $modalStatus.textContent = "Updating…";
|
||||
}
|
||||
|
||||
@@ -351,14 +347,8 @@ async function pollUpdateStatus() {
|
||||
}
|
||||
_updateLogOffset = data.offset;
|
||||
|
||||
// Track if we ever saw the unit as running
|
||||
if (data.running) {
|
||||
_sawRunning = true;
|
||||
}
|
||||
|
||||
// Only declare finished if we previously saw it running (or server says so)
|
||||
// This prevents the race where the unit hasn't started yet
|
||||
if (!data.running && _sawRunning) {
|
||||
// Check if finished
|
||||
if (!data.running) {
|
||||
_updateFinished = true;
|
||||
stopUpdatePoll();
|
||||
if (data.result === "success") {
|
||||
@@ -368,12 +358,10 @@ async function pollUpdateStatus() {
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
// Server is likely restarting during nixos-rebuild switch
|
||||
// This counts as "saw running" since it was running before it died
|
||||
_sawRunning = true;
|
||||
// Server is likely restarting during nixos-rebuild switch — keep polling
|
||||
if (!_serverWasDown) {
|
||||
_serverWasDown = true;
|
||||
appendLog("\n[Server restarting — waiting for it to come back…]\n\n");
|
||||
appendLog("\n[Server restarting — waiting for it to come back…]\n");
|
||||
if ($modalStatus) $modalStatus.textContent = "Server restarting…";
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user