Merge pull request #202 from naturallaw777/copilot/fix-sovran-hub-update-issue

Fix update process being killed mid-run by `nixos-rebuild switch`
This commit is contained in:
Sovran_Systems
2026-04-12 09:23:04 -05:00
committed by GitHub
2 changed files with 64 additions and 35 deletions

View File

@@ -3636,8 +3636,9 @@ _SAFE_UNIT_RE = re.compile(r'^[a-zA-Z0-9@._\-]+\.service$')
def _recover_stale_status(status_file: str, log_file: str, unit_name: str) -> bool:
"""If status_file says RUNNING but the systemd unit is not active, correct the status.
Queries the unit's Result property to distinguish SUCCESS from FAILED so that
a completed-but-interrupted update is not wrongly marked as failed.
Uses MainPID to confirm the process is truly gone before correcting, and
checks ExecMainStatus (actual exit code) instead of Result (which may
reflect a prior run) to determine SUCCESS vs FAILED.
Returns True if a correction was made, False otherwise.
"""
@@ -3653,25 +3654,51 @@ def _recover_stale_status(status_file: str, log_file: str, unit_name: str) -> bo
if status != "RUNNING":
return False
# Check if the unit is actively running
try:
result = subprocess.run(
["systemctl", "is-active", unit_name],
capture_output=True, text=True, timeout=10,
)
active = result.stdout.strip() == "active"
if result.stdout.strip() == "active":
return False # Still genuinely running — nothing to recover
except Exception:
active = False
return False # Can't determine state — don't touch anything
if not active:
# Check the unit's Result property to determine actual outcome.
# Double-check: if MainPID is still alive, the unit is still running
# (systemctl is-active can transiently lie during daemon-reload)
try:
show = subprocess.run(
["systemctl", "show", unit_name, "--property=MainPID"],
capture_output=True, text=True, timeout=10,
)
if show.returncode == 0:
pid_line = show.stdout.strip() # "MainPID=12345"
pid_str = pid_line.split("=", 1)[-1] if "=" in pid_line else "0"
pid = int(pid_str)
if pid > 0:
try:
os.kill(pid, 0) # Signal 0 = check if process exists
return False # PID is still alive — unit is still running
except ProcessLookupError:
pass # PID is gone — unit truly finished
except PermissionError:
return False # Process exists but we can't signal it — assume running
except Exception:
pass
# Unit is truly not running. Determine outcome from ExecMainStatus
# (the actual exit code), NOT Result (which may be stale from a prior run).
unit_result = "failed"
try:
show = subprocess.run(
["systemctl", "show", unit_name, "--property=Result"],
["systemctl", "show", unit_name, "--property=ExecMainStatus"],
capture_output=True, text=True, timeout=10,
)
# Output is "Result=success", "Result=failed", etc.
if show.returncode == 0 and show.stdout.strip() == "Result=success":
if show.returncode == 0:
# Output is "ExecMainStatus=0" for success, non-zero for failure
val = show.stdout.strip().split("=", 1)[-1] if "=" in show.stdout.strip() else ""
if val == "0":
unit_result = "success"
except Exception:
pass
@@ -3694,8 +3721,6 @@ def _recover_stale_status(status_file: str, log_file: str, unit_name: str) -> bo
pass
return True
return False
@app.on_event("startup")
async def _startup_recover_stale_status():

View File

@@ -350,6 +350,8 @@ in
systemd.services.sovran-hub-update = {
description = "Sovran_SystemsOS System Update";
restartIfChanged = false; # Don't let nixos-rebuild kill an in-flight update
stopIfChanged = false; # Don't stop it during activation either
serviceConfig = {
Type = "oneshot";
ExecStart = "${update-script}";
@@ -358,6 +360,8 @@ in
systemd.services.sovran-hub-rebuild = {
description = "Sovran_SystemsOS System Rebuild";
restartIfChanged = false; # Don't let nixos-rebuild kill an in-flight rebuild
stopIfChanged = false; # Don't stop it during activation either
serviceConfig = {
Type = "oneshot";
ExecStart = "${rebuild-script}";