Merge pull request #199 from naturallaw777/copilot/fix-stale-status-logging

Fix update modal UX when hub restarts mid-update
This commit is contained in:
Sovran_Systems
2026-04-12 07:17:59 -05:00
committed by GitHub
2 changed files with 72 additions and 12 deletions

View File

@@ -43,6 +43,12 @@ REBUILD_LOG = "/var/log/sovran-hub-rebuild.log"
REBUILD_STATUS = "/var/log/sovran-hub-rebuild.status" REBUILD_STATUS = "/var/log/sovran-hub-rebuild.status"
REBUILD_UNIT = "sovran-hub-rebuild.service" REBUILD_UNIT = "sovran-hub-rebuild.service"
# Set to True by _startup_recover_stale_status() when it corrects a stale
# RUNNING status to SUCCESS/FAILED for the update unit. Consumed by the first call
# to api_updates_status() so that the full log is returned to the frontend
# even when the frontend's offset is pointing past the pre-restart content.
_update_recovery_happened: bool = False
BACKUP_LOG = "/var/log/sovran-hub-backup.log" BACKUP_LOG = "/var/log/sovran-hub-backup.log"
BACKUP_STATUS = "/var/log/sovran-hub-backup.status" BACKUP_STATUS = "/var/log/sovran-hub-backup.status"
BACKUP_SCRIPT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "scripts", "sovran-hub-backup.sh") BACKUP_SCRIPT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "scripts", "sovran-hub-backup.sh")
@@ -2432,19 +2438,34 @@ async def api_updates_status(offset: int = 0):
If the status file says RUNNING but the systemd unit is no longer active If the status file says RUNNING but the systemd unit is no longer active
(e.g. the hub was restarted mid-update), correct the stale state before (e.g. the hub was restarted mid-update), correct the stale state before
returning so the frontend is never permanently stuck. returning so the frontend is never permanently stuck.
When recovery is detected (either during this call or at startup), the log
is returned from offset 0 so the frontend receives the complete output.
""" """
global _update_recovery_happened
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
status = await loop.run_in_executor(None, _read_update_status) status = await loop.run_in_executor(None, _read_update_status)
use_full_log = False
# Detect and correct stale RUNNING state on every poll. # Detect and correct stale RUNNING state on every poll.
if status == "RUNNING": if status == "RUNNING":
await loop.run_in_executor( corrected = await loop.run_in_executor(
None, _recover_stale_status, UPDATE_STATUS, UPDATE_LOG, UPDATE_UNIT None, _recover_stale_status, UPDATE_STATUS, UPDATE_LOG, UPDATE_UNIT
) )
if corrected:
use_full_log = True
status = await loop.run_in_executor(None, _read_update_status) status = await loop.run_in_executor(None, _read_update_status)
new_log, new_offset = await loop.run_in_executor(None, _read_log, offset) # Honour a recovery that happened at server startup (stale RUNNING corrected
# before the frontend had a chance to reconnect).
if _update_recovery_happened:
use_full_log = True
_update_recovery_happened = False
effective_offset = 0 if use_full_log else offset
new_log, new_offset = await loop.run_in_executor(None, _read_log, effective_offset)
running = (status == "RUNNING") running = (status == "RUNNING")
result = "pending" if running else status.lower() result = "pending" if running else status.lower()
@@ -3611,23 +3632,25 @@ async def _startup_save_ip():
_SAFE_UNIT_RE = re.compile(r'^[a-zA-Z0-9@._\-]+\.service$') _SAFE_UNIT_RE = re.compile(r'^[a-zA-Z0-9@._\-]+\.service$')
def _recover_stale_status(status_file: str, log_file: str, unit_name: str): def _recover_stale_status(status_file: str, log_file: str, unit_name: str) -> bool:
"""If status_file says RUNNING but the systemd unit is not active, correct the status. """If status_file says RUNNING but the systemd unit is not active, correct the status.
Queries the unit's Result property to distinguish SUCCESS from FAILED so that Queries the unit's Result property to distinguish SUCCESS from FAILED so that
a completed-but-interrupted update is not wrongly marked as failed. a completed-but-interrupted update is not wrongly marked as failed.
Returns True if a correction was made, False otherwise.
""" """
if not _SAFE_UNIT_RE.match(unit_name): if not _SAFE_UNIT_RE.match(unit_name):
return return False
try: try:
with open(status_file, "r") as f: with open(status_file, "r") as f:
status = f.read().strip() status = f.read().strip()
except FileNotFoundError: except FileNotFoundError:
return return False
if status != "RUNNING": if status != "RUNNING":
return return False
try: try:
result = subprocess.run( result = subprocess.run(
@@ -3658,19 +3681,27 @@ def _recover_stale_status(status_file: str, log_file: str, unit_name: str):
f.write(new_status) f.write(new_status)
except OSError: except OSError:
pass pass
msg = (
"\n[Update completed successfully while the server was restarting.]\n"
if new_status == "SUCCESS"
else "\n[Update encountered an error. See log above for details.]\n"
)
try: try:
with open(log_file, "a") as f: with open(log_file, "a") as f:
f.write( f.write(msg)
f"\n[Hub] Stale RUNNING status detected; unit is not active."
f" Correcting to {new_status}.\n"
)
except OSError: except OSError:
pass pass
return True
return False
@app.on_event("startup") @app.on_event("startup")
async def _startup_recover_stale_status(): async def _startup_recover_stale_status():
"""Reset stale RUNNING status files left by interrupted update/rebuild jobs.""" """Reset stale RUNNING status files left by interrupted update/rebuild jobs."""
global _update_recovery_happened
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
await loop.run_in_executor(None, _recover_stale_status, UPDATE_STATUS, UPDATE_LOG, UPDATE_UNIT) corrected = await loop.run_in_executor(None, _recover_stale_status, UPDATE_STATUS, UPDATE_LOG, UPDATE_UNIT)
if corrected:
_update_recovery_happened = True
await loop.run_in_executor(None, _recover_stale_status, REBUILD_STATUS, REBUILD_LOG, REBUILD_UNIT) await loop.run_in_executor(None, _recover_stale_status, REBUILD_STATUS, REBUILD_LOG, REBUILD_UNIT)

View File

@@ -94,7 +94,36 @@ async function pollUpdateStatus() {
if (_updateFinished) return; if (_updateFinished) return;
try { try {
var data = await apiFetch("/api/updates/status?offset=" + _updateLogOffset); var data = await apiFetch("/api/updates/status?offset=" + _updateLogOffset);
if (_serverWasDown) { _serverWasDown = false; appendLog("[Server reconnected]\n"); if ($modalStatus) $modalStatus.textContent = "Updating…"; } if (_serverWasDown) {
_serverWasDown = false;
if (!data.running) {
// The update finished while the server was restarting. Reset to
// offset 0 and re-fetch so the complete log is shown from the top.
_updateLog = "";
_updateLogOffset = 0;
if ($modalLog) $modalLog.textContent = "";
try {
var fullData = await apiFetch("/api/updates/status?offset=0");
if (fullData.log) appendLog(fullData.log);
_updateLogOffset = fullData.offset;
} catch (e) {
// If the re-fetch fails, fall through with whatever we have.
if (data.log) appendLog(data.log);
_updateLogOffset = data.offset;
}
if (data.result === "success") {
appendLog("[Server restarted — update completed successfully.]\n");
} else {
appendLog("[Server restarted — update encountered an error.]\n");
}
_updateFinished = true;
stopUpdatePoll();
onUpdateDone(data.result === "success");
return;
}
appendLog("[Server reconnected]\n");
if ($modalStatus) $modalStatus.textContent = "Updating…";
}
if (data.log) appendLog(data.log); if (data.log) appendLog(data.log);
_updateLogOffset = data.offset; _updateLogOffset = data.offset;
if (data.running) return; if (data.running) return;