Merge pull request #202 from naturallaw777/copilot/fix-sovran-hub-update-issue
Fix update process killed mid-run by nixos-rebuild switch
This commit is contained in:
@@ -3636,8 +3636,9 @@ _SAFE_UNIT_RE = re.compile(r'^[a-zA-Z0-9@._\-]+\.service$')
|
|||||||
def _recover_stale_status(status_file: str, log_file: str, unit_name: str) -> bool:
|
def _recover_stale_status(status_file: str, log_file: str, unit_name: str) -> bool:
|
||||||
"""If status_file says RUNNING but the systemd unit is not active, correct the status.
|
"""If status_file says RUNNING but the systemd unit is not active, correct the status.
|
||||||
|
|
||||||
Queries the unit's Result property to distinguish SUCCESS from FAILED so that
|
Uses MainPID to confirm the process is truly gone before correcting, and
|
||||||
a completed-but-interrupted update is not wrongly marked as failed.
|
checks ExecMainStatus (actual exit code) instead of Result (which may
|
||||||
|
reflect a prior run) to determine SUCCESS vs FAILED.
|
||||||
|
|
||||||
Returns True if a correction was made, False otherwise.
|
Returns True if a correction was made, False otherwise.
|
||||||
"""
|
"""
|
||||||
@@ -3653,25 +3654,51 @@ def _recover_stale_status(status_file: str, log_file: str, unit_name: str) -> bo
|
|||||||
if status != "RUNNING":
|
if status != "RUNNING":
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# Check if the unit is actively running
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
["systemctl", "is-active", unit_name],
|
["systemctl", "is-active", unit_name],
|
||||||
capture_output=True, text=True, timeout=10,
|
capture_output=True, text=True, timeout=10,
|
||||||
)
|
)
|
||||||
active = result.stdout.strip() == "active"
|
if result.stdout.strip() == "active":
|
||||||
|
return False # Still genuinely running — nothing to recover
|
||||||
except Exception:
|
except Exception:
|
||||||
active = False
|
return False # Can't determine state — don't touch anything
|
||||||
|
|
||||||
if not active:
|
# Double-check: if MainPID is still alive, the unit is still running
|
||||||
# Check the unit's Result property to determine actual outcome.
|
# (a Type=oneshot unit reports "activating", not "active", while its ExecStart is still running, and is-active can also be transiently wrong during daemon-reload)
|
||||||
|
try:
|
||||||
|
show = subprocess.run(
|
||||||
|
["systemctl", "show", unit_name, "--property=MainPID"],
|
||||||
|
capture_output=True, text=True, timeout=10,
|
||||||
|
)
|
||||||
|
if show.returncode == 0:
|
||||||
|
pid_line = show.stdout.strip() # "MainPID=12345"
|
||||||
|
pid_str = pid_line.split("=", 1)[-1] if "=" in pid_line else "0"
|
||||||
|
pid = int(pid_str)
|
||||||
|
if pid > 0:
|
||||||
|
try:
|
||||||
|
os.kill(pid, 0) # Signal 0 = check if process exists
|
||||||
|
return False # PID is still alive — unit is still running
|
||||||
|
except ProcessLookupError:
|
||||||
|
pass # PID is gone — unit truly finished
|
||||||
|
except PermissionError:
|
||||||
|
return False # Process exists but we can't signal it — assume running
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Unit is truly not running. Determine outcome from ExecMainStatus
|
||||||
|
# (the actual exit code), NOT Result (which may be stale from a prior run).
|
||||||
unit_result = "failed"
|
unit_result = "failed"
|
||||||
try:
|
try:
|
||||||
show = subprocess.run(
|
show = subprocess.run(
|
||||||
["systemctl", "show", unit_name, "--property=Result"],
|
["systemctl", "show", unit_name, "--property=ExecMainStatus"],
|
||||||
capture_output=True, text=True, timeout=10,
|
capture_output=True, text=True, timeout=10,
|
||||||
)
|
)
|
||||||
# Output is "Result=success", "Result=failed", etc.
|
if show.returncode == 0:
|
||||||
if show.returncode == 0 and show.stdout.strip() == "Result=success":
|
# Output is "ExecMainStatus=0" for success, non-zero for failure
|
||||||
|
val = show.stdout.strip().split("=", 1)[-1] if "=" in show.stdout.strip() else ""
|
||||||
|
if val == "0":
|
||||||
unit_result = "success"
|
unit_result = "success"
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
@@ -3694,8 +3721,6 @@ def _recover_stale_status(status_file: str, log_file: str, unit_name: str) -> bo
|
|||||||
pass
|
pass
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
@app.on_event("startup")
|
@app.on_event("startup")
|
||||||
async def _startup_recover_stale_status():
|
async def _startup_recover_stale_status():
|
||||||
|
|||||||
@@ -350,6 +350,8 @@ in
|
|||||||
|
|
||||||
systemd.services.sovran-hub-update = {
|
systemd.services.sovran-hub-update = {
|
||||||
description = "Sovran_SystemsOS System Update";
|
description = "Sovran_SystemsOS System Update";
|
||||||
|
restartIfChanged = false; # Don't let nixos-rebuild kill an in-flight update
|
||||||
|
stopIfChanged = false; # Don't stop it during activation either
|
||||||
serviceConfig = {
|
serviceConfig = {
|
||||||
Type = "oneshot";
|
Type = "oneshot";
|
||||||
ExecStart = "${update-script}";
|
ExecStart = "${update-script}";
|
||||||
@@ -358,6 +360,8 @@ in
|
|||||||
|
|
||||||
systemd.services.sovran-hub-rebuild = {
|
systemd.services.sovran-hub-rebuild = {
|
||||||
description = "Sovran_SystemsOS System Rebuild";
|
description = "Sovran_SystemsOS System Rebuild";
|
||||||
|
restartIfChanged = false; # Don't let nixos-rebuild kill an in-flight rebuild
|
||||||
|
stopIfChanged = false; # Don't stop it during activation either
|
||||||
serviceConfig = {
|
serviceConfig = {
|
||||||
Type = "oneshot";
|
Type = "oneshot";
|
||||||
ExecStart = "${rebuild-script}";
|
ExecStart = "${rebuild-script}";
|
||||||
|
|||||||
Reference in New Issue
Block a user