fix: user-friendly stale recovery messages and complete log on reconnect
- _recover_stale_status(): returns True when corrected; changes message from
  internal '[Hub] Stale RUNNING...' to user-friendly text
- _startup_recover_stale_status(): sets _update_recovery_happened flag when
  update recovery happens at startup
- api_updates_status(): uses offset=0 when recovery happened so frontend
  receives the full log, not just a stale delta
- pollUpdateStatus(): when reconnecting after server-down with update done,
  resets offset to 0, re-fetches full log, shows
  '[Server restarted — update completed successfully.]' instead of
  '[Server reconnected]'

Agent-Logs-Url: https://github.com/naturallaw777/staging_alpha/sessions/90b535d1-bc3b-4147-9d62-3c7a93b1c8e4
Co-authored-by: naturallaw777 <99053422+naturallaw777@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
d2d2ed58a6
commit
c7005c93b5
@@ -43,6 +43,12 @@ REBUILD_LOG = "/var/log/sovran-hub-rebuild.log"
|
||||
REBUILD_STATUS = "/var/log/sovran-hub-rebuild.status"
|
||||
REBUILD_UNIT = "sovran-hub-rebuild.service"
|
||||
|
||||
# Set to True by _startup_recover_stale_status() when it corrects a stale
|
||||
# RUNNING → SUCCESS/FAILED for the update unit. Consumed by the first call
|
||||
# to api_updates_status() so that the full log is returned to the frontend
|
||||
# even when the frontend's offset is pointing past the pre-restart content.
|
||||
_update_recovery_happened: bool = False
|
||||
|
||||
BACKUP_LOG = "/var/log/sovran-hub-backup.log"
|
||||
BACKUP_STATUS = "/var/log/sovran-hub-backup.status"
|
||||
BACKUP_SCRIPT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "scripts", "sovran-hub-backup.sh")
|
||||
@@ -2432,19 +2438,34 @@ async def api_updates_status(offset: int = 0):
|
||||
If the status file says RUNNING but the systemd unit is no longer active
|
||||
(e.g. the hub was restarted mid-update), correct the stale state before
|
||||
returning so the frontend is never permanently stuck.
|
||||
|
||||
When recovery is detected (either during this call or at startup), the log
|
||||
is returned from offset 0 so the frontend receives the complete output.
|
||||
"""
|
||||
global _update_recovery_happened
|
||||
loop = asyncio.get_event_loop()
|
||||
|
||||
status = await loop.run_in_executor(None, _read_update_status)
|
||||
|
||||
use_full_log = False
|
||||
|
||||
# Detect and correct stale RUNNING state on every poll.
|
||||
if status == "RUNNING":
|
||||
await loop.run_in_executor(
|
||||
corrected = await loop.run_in_executor(
|
||||
None, _recover_stale_status, UPDATE_STATUS, UPDATE_LOG, UPDATE_UNIT
|
||||
)
|
||||
if corrected:
|
||||
use_full_log = True
|
||||
status = await loop.run_in_executor(None, _read_update_status)
|
||||
|
||||
new_log, new_offset = await loop.run_in_executor(None, _read_log, offset)
|
||||
# Honour a recovery that happened at server startup (stale RUNNING corrected
|
||||
# before the frontend had a chance to reconnect).
|
||||
if _update_recovery_happened:
|
||||
use_full_log = True
|
||||
_update_recovery_happened = False
|
||||
|
||||
effective_offset = 0 if use_full_log else offset
|
||||
new_log, new_offset = await loop.run_in_executor(None, _read_log, effective_offset)
|
||||
|
||||
running = (status == "RUNNING")
|
||||
result = "pending" if running else status.lower()
|
||||
@@ -3611,23 +3632,25 @@ async def _startup_save_ip():
|
||||
_SAFE_UNIT_RE = re.compile(r'^[a-zA-Z0-9@._\-]+\.service$')
|
||||
|
||||
|
||||
def _recover_stale_status(status_file: str, log_file: str, unit_name: str):
|
||||
def _recover_stale_status(status_file: str, log_file: str, unit_name: str) -> bool:
|
||||
"""If status_file says RUNNING but the systemd unit is not active, correct the status.
|
||||
|
||||
Queries the unit's Result property to distinguish SUCCESS from FAILED so that
|
||||
a completed-but-interrupted update is not wrongly marked as failed.
|
||||
|
||||
Returns True if a correction was made, False otherwise.
|
||||
"""
|
||||
if not _SAFE_UNIT_RE.match(unit_name):
|
||||
return
|
||||
return False
|
||||
|
||||
try:
|
||||
with open(status_file, "r") as f:
|
||||
status = f.read().strip()
|
||||
except FileNotFoundError:
|
||||
return
|
||||
return False
|
||||
|
||||
if status != "RUNNING":
|
||||
return
|
||||
return False
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
@@ -3658,19 +3681,27 @@ def _recover_stale_status(status_file: str, log_file: str, unit_name: str):
|
||||
f.write(new_status)
|
||||
except OSError:
|
||||
pass
|
||||
msg = (
|
||||
"\n[Update completed successfully while the server was restarting.]\n"
|
||||
if new_status == "SUCCESS"
|
||||
else "\n[Update encountered an error. See log above for details.]\n"
|
||||
)
|
||||
try:
|
||||
with open(log_file, "a") as f:
|
||||
f.write(
|
||||
f"\n[Hub] Stale RUNNING status detected; unit is not active."
|
||||
f" Correcting to {new_status}.\n"
|
||||
)
|
||||
f.write(msg)
|
||||
except OSError:
|
||||
pass
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def _startup_recover_stale_status():
|
||||
"""Reset stale RUNNING status files left by interrupted update/rebuild jobs."""
|
||||
global _update_recovery_happened
|
||||
loop = asyncio.get_event_loop()
|
||||
await loop.run_in_executor(None, _recover_stale_status, UPDATE_STATUS, UPDATE_LOG, UPDATE_UNIT)
|
||||
corrected = await loop.run_in_executor(None, _recover_stale_status, UPDATE_STATUS, UPDATE_LOG, UPDATE_UNIT)
|
||||
if corrected:
|
||||
_update_recovery_happened = True
|
||||
await loop.run_in_executor(None, _recover_stale_status, REBUILD_STATUS, REBUILD_LOG, REBUILD_UNIT)
|
||||
|
||||
@@ -94,7 +94,36 @@ async function pollUpdateStatus() {
|
||||
if (_updateFinished) return;
|
||||
try {
|
||||
var data = await apiFetch("/api/updates/status?offset=" + _updateLogOffset);
|
||||
if (_serverWasDown) { _serverWasDown = false; appendLog("[Server reconnected]\n"); if ($modalStatus) $modalStatus.textContent = "Updating…"; }
|
||||
if (_serverWasDown) {
|
||||
_serverWasDown = false;
|
||||
if (!data.running) {
|
||||
// The update finished while the server was restarting. Reset to
|
||||
// offset 0 and re-fetch so the complete log is shown from the top.
|
||||
_updateLog = "";
|
||||
_updateLogOffset = 0;
|
||||
if ($modalLog) $modalLog.textContent = "";
|
||||
try {
|
||||
var fullData = await apiFetch("/api/updates/status?offset=0");
|
||||
if (fullData.log) appendLog(fullData.log);
|
||||
_updateLogOffset = fullData.offset;
|
||||
} catch (e) {
|
||||
// If the re-fetch fails, fall through with whatever we have.
|
||||
if (data.log) appendLog(data.log);
|
||||
_updateLogOffset = data.offset;
|
||||
}
|
||||
if (data.result === "success") {
|
||||
appendLog("[Server restarted — update completed successfully.]\n");
|
||||
} else {
|
||||
appendLog("[Server restarted — update encountered an error.]\n");
|
||||
}
|
||||
_updateFinished = true;
|
||||
stopUpdatePoll();
|
||||
onUpdateDone(data.result === "success");
|
||||
return;
|
||||
}
|
||||
appendLog("[Server reconnected]\n");
|
||||
if ($modalStatus) $modalStatus.textContent = "Updating…";
|
||||
}
|
||||
if (data.log) appendLog(data.log);
|
||||
_updateLogOffset = data.offset;
|
||||
if (data.running) return;
|
||||
|
||||
Reference in New Issue
Block a user