fix: prevent nixos-rebuild from killing in-flight update; improve stale status recovery

Part A (modules/core/sovran-hub.nix):
- Add restartIfChanged=false and stopIfChanged=false to sovran-hub-update service
- Add restartIfChanged=false and stopIfChanged=false to sovran-hub-rebuild service
These settings prevent `nixos-rebuild switch` from stopping or restarting either service during activation, which would terminate an in-flight update or rebuild mid-execution.

Part B (app/sovran_systemsos_web/server.py):
- Replace _recover_stale_status() with improved version
- Use MainPID + os.kill(pid, 0) to confirm the process is really gone, guarding against `systemctl is-active` transiently misreporting the unit state during daemon-reload
- Use ExecMainStatus (actual exit code) instead of Result (may be stale from prior run)

Agent-Logs-Url: https://github.com/naturallaw777/staging_alpha/sessions/63bf2cd5-9c02-4542-8926-44aa9ed63bf0

Co-authored-by: naturallaw777 <99053422+naturallaw777@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot]
2026-04-12 13:47:49 +00:00
committed by GitHub
parent 8310028546
commit 008a003fa1
2 changed files with 64 additions and 35 deletions

View File

@@ -3636,8 +3636,9 @@ _SAFE_UNIT_RE = re.compile(r'^[a-zA-Z0-9@._\-]+\.service$')
def _recover_stale_status(status_file: str, log_file: str, unit_name: str) -> bool: def _recover_stale_status(status_file: str, log_file: str, unit_name: str) -> bool:
"""If status_file says RUNNING but the systemd unit is not active, correct the status. """If status_file says RUNNING but the systemd unit is not active, correct the status.
Queries the unit's Result property to distinguish SUCCESS from FAILED so that Uses MainPID to confirm the process is truly gone before correcting, and
a completed-but-interrupted update is not wrongly marked as failed. checks ExecMainStatus (actual exit code) instead of Result (which may
reflect a prior run) to determine SUCCESS vs FAILED.
Returns True if a correction was made, False otherwise. Returns True if a correction was made, False otherwise.
""" """
@@ -3653,48 +3654,72 @@ def _recover_stale_status(status_file: str, log_file: str, unit_name: str) -> bo
if status != "RUNNING": if status != "RUNNING":
return False return False
# Check if the unit is actively running
try: try:
result = subprocess.run( result = subprocess.run(
["systemctl", "is-active", unit_name], ["systemctl", "is-active", unit_name],
capture_output=True, text=True, timeout=10, capture_output=True, text=True, timeout=10,
) )
active = result.stdout.strip() == "active" if result.stdout.strip() == "active":
return False # Still genuinely running — nothing to recover
except Exception: except Exception:
active = False return False # Can't determine state — don't touch anything
if not active: # Double-check: if MainPID is still alive, the unit is still running
# Check the unit's Result property to determine actual outcome. # (systemctl is-active can transiently lie during daemon-reload)
unit_result = "failed" try:
try: show = subprocess.run(
show = subprocess.run( ["systemctl", "show", unit_name, "--property=MainPID"],
["systemctl", "show", unit_name, "--property=Result"], capture_output=True, text=True, timeout=10,
capture_output=True, text=True, timeout=10,
)
# Output is "Result=success", "Result=failed", etc.
if show.returncode == 0 and show.stdout.strip() == "Result=success":
unit_result = "success"
except Exception:
pass
new_status = "SUCCESS" if unit_result == "success" else "FAILED"
try:
with open(status_file, "w") as f:
f.write(new_status)
except OSError:
pass
msg = (
"\n[Update completed successfully while the server was restarting.]\n"
if new_status == "SUCCESS"
else "\n[Update encountered an error. See log above for details.]\n"
) )
try: if show.returncode == 0:
with open(log_file, "a") as f: pid_line = show.stdout.strip() # "MainPID=12345"
f.write(msg) pid_str = pid_line.split("=", 1)[-1] if "=" in pid_line else "0"
except OSError: pid = int(pid_str)
pass if pid > 0:
return True try:
os.kill(pid, 0) # Signal 0 = check if process exists
return False # PID is still alive — unit is still running
except ProcessLookupError:
pass # PID is gone — unit truly finished
except PermissionError:
return False # Process exists but we can't signal it — assume running
except Exception:
pass
return False # Unit is truly not running. Determine outcome from ExecMainStatus
# (the actual exit code), NOT Result (which may be stale from a prior run).
unit_result = "failed"
try:
show = subprocess.run(
["systemctl", "show", unit_name, "--property=ExecMainStatus"],
capture_output=True, text=True, timeout=10,
)
if show.returncode == 0:
# Output is "ExecMainStatus=0" for success, non-zero for failure
val = show.stdout.strip().split("=", 1)[-1] if "=" in show.stdout.strip() else ""
if val == "0":
unit_result = "success"
except Exception:
pass
new_status = "SUCCESS" if unit_result == "success" else "FAILED"
try:
with open(status_file, "w") as f:
f.write(new_status)
except OSError:
pass
msg = (
"\n[Update completed successfully while the server was restarting.]\n"
if new_status == "SUCCESS"
else "\n[Update encountered an error. See log above for details.]\n"
)
try:
with open(log_file, "a") as f:
f.write(msg)
except OSError:
pass
return True
@app.on_event("startup") @app.on_event("startup")

View File

@@ -350,6 +350,8 @@ in
systemd.services.sovran-hub-update = { systemd.services.sovran-hub-update = {
description = "Sovran_SystemsOS System Update"; description = "Sovran_SystemsOS System Update";
restartIfChanged = false; # Don't let nixos-rebuild kill an in-flight update
stopIfChanged = false; # Don't stop it during activation either
serviceConfig = { serviceConfig = {
Type = "oneshot"; Type = "oneshot";
ExecStart = "${update-script}"; ExecStart = "${update-script}";
@@ -358,6 +360,8 @@ in
systemd.services.sovran-hub-rebuild = { systemd.services.sovran-hub-rebuild = {
description = "Sovran_SystemsOS System Rebuild"; description = "Sovran_SystemsOS System Rebuild";
restartIfChanged = false; # Don't let nixos-rebuild kill an in-flight rebuild
stopIfChanged = false; # Don't stop it during activation either
serviceConfig = { serviceConfig = {
Type = "oneshot"; Type = "oneshot";
ExecStart = "${rebuild-script}"; ExecStart = "${rebuild-script}";