fix: detect and correct stale RUNNING update status on poll and startup
Agent-Logs-Url: https://github.com/naturallaw777/staging_alpha/sessions/3c41cfb3-08f3-4e27-900c-7312a9204d4c
Co-authored-by: naturallaw777 <99053422+naturallaw777@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
1a7ed3cb6c
commit
d5b16da57e
@@ -2402,10 +2402,23 @@ async def api_updates_run():
|
|||||||
|
|
||||||
@app.get("/api/updates/status")
|
@app.get("/api/updates/status")
|
||||||
async def api_updates_status(offset: int = 0):
|
async def api_updates_status(offset: int = 0):
|
||||||
"""Poll endpoint: reads status file + log file. No systemctl needed."""
|
"""Poll endpoint: reads status file + log file.
|
||||||
|
|
||||||
|
If the status file says RUNNING but the systemd unit is no longer active
|
||||||
|
(e.g. the hub was restarted mid-update), correct the stale state before
|
||||||
|
returning so the frontend is never permanently stuck.
|
||||||
|
"""
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
|
|
||||||
status = await loop.run_in_executor(None, _read_update_status)
|
status = await loop.run_in_executor(None, _read_update_status)
|
||||||
|
|
||||||
|
# Detect and correct stale RUNNING state on every poll.
|
||||||
|
if status == "RUNNING":
|
||||||
|
await loop.run_in_executor(
|
||||||
|
None, _recover_stale_status, UPDATE_STATUS, UPDATE_LOG, UPDATE_UNIT
|
||||||
|
)
|
||||||
|
status = await loop.run_in_executor(None, _read_update_status)
|
||||||
|
|
||||||
new_log, new_offset = await loop.run_in_executor(None, _read_log, offset)
|
new_log, new_offset = await loop.run_in_executor(None, _read_log, offset)
|
||||||
|
|
||||||
running = (status == "RUNNING")
|
running = (status == "RUNNING")
|
||||||
@@ -3574,7 +3587,11 @@ _SAFE_UNIT_RE = re.compile(r'^[a-zA-Z0-9@._\-]+\.service$')
|
|||||||
|
|
||||||
|
|
||||||
def _recover_stale_status(status_file: str, log_file: str, unit_name: str):
|
def _recover_stale_status(status_file: str, log_file: str, unit_name: str):
|
||||||
"""If status_file says RUNNING but the systemd unit is not active, reset to FAILED."""
|
"""If status_file says RUNNING but the systemd unit is not active, correct the status.
|
||||||
|
|
||||||
|
Queries the unit's Result property to distinguish SUCCESS from FAILED so that
|
||||||
|
a completed-but-interrupted update is not wrongly marked as failed.
|
||||||
|
"""
|
||||||
if not _SAFE_UNIT_RE.match(unit_name):
|
if not _SAFE_UNIT_RE.match(unit_name):
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -3597,16 +3614,30 @@ def _recover_stale_status(status_file: str, log_file: str, unit_name: str):
|
|||||||
active = False
|
active = False
|
||||||
|
|
||||||
if not active:
|
if not active:
|
||||||
|
# Check the unit's Result property to determine actual outcome.
|
||||||
|
unit_result = "failed"
|
||||||
|
try:
|
||||||
|
show = subprocess.run(
|
||||||
|
["systemctl", "show", unit_name, "--property=Result"],
|
||||||
|
capture_output=True, text=True, timeout=10,
|
||||||
|
)
|
||||||
|
# Output is "Result=success", "Result=failed", etc.
|
||||||
|
if show.returncode == 0 and show.stdout.strip() == "Result=success":
|
||||||
|
unit_result = "success"
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
new_status = "SUCCESS" if unit_result == "success" else "FAILED"
|
||||||
try:
|
try:
|
||||||
with open(status_file, "w") as f:
|
with open(status_file, "w") as f:
|
||||||
f.write("FAILED")
|
f.write(new_status)
|
||||||
except OSError:
|
except OSError:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
with open(log_file, "a") as f:
|
with open(log_file, "a") as f:
|
||||||
f.write(
|
f.write(
|
||||||
"\n[Hub] Process was interrupted (stale RUNNING status detected"
|
f"\n[Hub] Stale RUNNING status detected; unit is not active."
|
||||||
" on startup). Marking as failed.\n"
|
f" Correcting to {new_status}.\n"
|
||||||
)
|
)
|
||||||
except OSError:
|
except OSError:
|
||||||
pass
|
pass
|
||||||
|
|||||||
Reference in New Issue
Block a user