Merge pull request #199 from naturallaw777/copilot/fix-stale-status-logging

Fix update modal UX when hub restarts mid-update
This commit is contained in:
Sovran_Systems
2026-04-12 07:17:59 -05:00
committed by GitHub
2 changed files with 72 additions and 12 deletions

View File

@@ -43,6 +43,12 @@ REBUILD_LOG = "/var/log/sovran-hub-rebuild.log"
REBUILD_STATUS = "/var/log/sovran-hub-rebuild.status" REBUILD_STATUS = "/var/log/sovran-hub-rebuild.status"
REBUILD_UNIT = "sovran-hub-rebuild.service" REBUILD_UNIT = "sovran-hub-rebuild.service"
# Set to True by _startup_recover_stale_status() when it corrects a stale
# RUNNING status to SUCCESS/FAILED for the update unit. Consumed by the first call
# to api_updates_status() so that the full log is returned to the frontend
# even when the frontend's offset is pointing past the pre-restart content.
_update_recovery_happened: bool = False
BACKUP_LOG = "/var/log/sovran-hub-backup.log" BACKUP_LOG = "/var/log/sovran-hub-backup.log"
BACKUP_STATUS = "/var/log/sovran-hub-backup.status" BACKUP_STATUS = "/var/log/sovran-hub-backup.status"
BACKUP_SCRIPT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "scripts", "sovran-hub-backup.sh") BACKUP_SCRIPT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "scripts", "sovran-hub-backup.sh")
@@ -2432,19 +2438,34 @@ async def api_updates_status(offset: int = 0):
If the status file says RUNNING but the systemd unit is no longer active If the status file says RUNNING but the systemd unit is no longer active
(e.g. the hub was restarted mid-update), correct the stale state before (e.g. the hub was restarted mid-update), correct the stale state before
returning so the frontend is never permanently stuck. returning so the frontend is never permanently stuck.
When recovery is detected (either during this call or at startup), the log
is returned from offset 0 so the frontend receives the complete output.
""" """
global _update_recovery_happened
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
status = await loop.run_in_executor(None, _read_update_status) status = await loop.run_in_executor(None, _read_update_status)
use_full_log = False
# Detect and correct stale RUNNING state on every poll. # Detect and correct stale RUNNING state on every poll.
if status == "RUNNING": if status == "RUNNING":
await loop.run_in_executor( corrected = await loop.run_in_executor(
None, _recover_stale_status, UPDATE_STATUS, UPDATE_LOG, UPDATE_UNIT None, _recover_stale_status, UPDATE_STATUS, UPDATE_LOG, UPDATE_UNIT
) )
if corrected:
use_full_log = True
status = await loop.run_in_executor(None, _read_update_status) status = await loop.run_in_executor(None, _read_update_status)
new_log, new_offset = await loop.run_in_executor(None, _read_log, offset) # Honour a recovery that happened at server startup (stale RUNNING corrected
# before the frontend had a chance to reconnect).
if _update_recovery_happened:
use_full_log = True
_update_recovery_happened = False
effective_offset = 0 if use_full_log else offset
new_log, new_offset = await loop.run_in_executor(None, _read_log, effective_offset)
running = (status == "RUNNING") running = (status == "RUNNING")
result = "pending" if running else status.lower() result = "pending" if running else status.lower()
@@ -3611,23 +3632,25 @@ async def _startup_save_ip():
_SAFE_UNIT_RE = re.compile(r'^[a-zA-Z0-9@._\-]+\.service$') _SAFE_UNIT_RE = re.compile(r'^[a-zA-Z0-9@._\-]+\.service$')
def _recover_stale_status(status_file: str, log_file: str, unit_name: str): def _recover_stale_status(status_file: str, log_file: str, unit_name: str) -> bool:
"""If status_file says RUNNING but the systemd unit is not active, correct the status. """If status_file says RUNNING but the systemd unit is not active, correct the status.
Queries the unit's Result property to distinguish SUCCESS from FAILED so that Queries the unit's Result property to distinguish SUCCESS from FAILED so that
a completed-but-interrupted update is not wrongly marked as failed. a completed-but-interrupted update is not wrongly marked as failed.
Returns True if a correction was made, False otherwise.
""" """
if not _SAFE_UNIT_RE.match(unit_name): if not _SAFE_UNIT_RE.match(unit_name):
return return False
try: try:
with open(status_file, "r") as f: with open(status_file, "r") as f:
status = f.read().strip() status = f.read().strip()
except FileNotFoundError: except FileNotFoundError:
return return False
if status != "RUNNING": if status != "RUNNING":
return return False
try: try:
result = subprocess.run( result = subprocess.run(
@@ -3658,19 +3681,27 @@ def _recover_stale_status(status_file: str, log_file: str, unit_name: str):
f.write(new_status) f.write(new_status)
except OSError: except OSError:
pass pass
msg = (
"\n[Update completed successfully while the server was restarting.]\n"
if new_status == "SUCCESS"
else "\n[Update encountered an error. See log above for details.]\n"
)
try: try:
with open(log_file, "a") as f: with open(log_file, "a") as f:
f.write( f.write(msg)
f"\n[Hub] Stale RUNNING status detected; unit is not active."
f" Correcting to {new_status}.\n"
)
except OSError: except OSError:
pass pass
return True
return False
@app.on_event("startup") @app.on_event("startup")
async def _startup_recover_stale_status(): async def _startup_recover_stale_status():
"""Reset stale RUNNING status files left by interrupted update/rebuild jobs.""" """Reset stale RUNNING status files left by interrupted update/rebuild jobs."""
global _update_recovery_happened
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
await loop.run_in_executor(None, _recover_stale_status, UPDATE_STATUS, UPDATE_LOG, UPDATE_UNIT) corrected = await loop.run_in_executor(None, _recover_stale_status, UPDATE_STATUS, UPDATE_LOG, UPDATE_UNIT)
if corrected:
_update_recovery_happened = True
await loop.run_in_executor(None, _recover_stale_status, REBUILD_STATUS, REBUILD_LOG, REBUILD_UNIT) await loop.run_in_executor(None, _recover_stale_status, REBUILD_STATUS, REBUILD_LOG, REBUILD_UNIT)

View File

@@ -94,7 +94,36 @@ async function pollUpdateStatus() {
if (_updateFinished) return; if (_updateFinished) return;
try { try {
var data = await apiFetch("/api/updates/status?offset=" + _updateLogOffset); var data = await apiFetch("/api/updates/status?offset=" + _updateLogOffset);
if (_serverWasDown) { _serverWasDown = false; appendLog("[Server reconnected]\n"); if ($modalStatus) $modalStatus.textContent = "Updating…"; } if (_serverWasDown) {
_serverWasDown = false;
if (!data.running) {
// The update finished while the server was restarting. Reset to
// offset 0 and re-fetch so the complete log is shown from the top.
_updateLog = "";
_updateLogOffset = 0;
if ($modalLog) $modalLog.textContent = "";
try {
var fullData = await apiFetch("/api/updates/status?offset=0");
if (fullData.log) appendLog(fullData.log);
_updateLogOffset = fullData.offset;
} catch (e) {
// If the re-fetch fails, fall through with whatever we have.
if (data.log) appendLog(data.log);
_updateLogOffset = data.offset;
}
if (data.result === "success") {
appendLog("[Server restarted — update completed successfully.]\n");
} else {
appendLog("[Server restarted — update encountered an error.]\n");
}
_updateFinished = true;
stopUpdatePoll();
onUpdateDone(data.result === "success");
return;
}
appendLog("[Server reconnected]\n");
if ($modalStatus) $modalStatus.textContent = "Updating…";
}
if (data.log) appendLog(data.log); if (data.log) appendLog(data.log);
_updateLogOffset = data.offset; _updateLogOffset = data.offset;
if (data.running) return; if (data.running) return;