diff --git a/app/sovran_systemsos_web/server.py b/app/sovran_systemsos_web/server.py index 8fba0d0..fce9ebb 100644 --- a/app/sovran_systemsos_web/server.py +++ b/app/sovran_systemsos_web/server.py @@ -3636,8 +3636,9 @@ _SAFE_UNIT_RE = re.compile(r'^[a-zA-Z0-9@._\-]+\.service$') def _recover_stale_status(status_file: str, log_file: str, unit_name: str) -> bool: """If status_file says RUNNING but the systemd unit is not active, correct the status. - Queries the unit's Result property to distinguish SUCCESS from FAILED so that - a completed-but-interrupted update is not wrongly marked as failed. + Uses MainPID to confirm the process is truly gone before correcting, and + checks ExecMainStatus (actual exit code) instead of Result (which may + reflect a prior run) to determine SUCCESS vs FAILED. Returns True if a correction was made, False otherwise. """ @@ -3653,48 +3654,72 @@ def _recover_stale_status(status_file: str, log_file: str, unit_name: str) -> bo if status != "RUNNING": return False + # Check if the unit is actively running try: result = subprocess.run( ["systemctl", "is-active", unit_name], capture_output=True, text=True, timeout=10, ) - active = result.stdout.strip() == "active" + if result.stdout.strip() == "active": + return False # Still genuinely running — nothing to recover except Exception: - active = False + return False # Can't determine state — don't touch anything - if not active: - # Check the unit's Result property to determine actual outcome. - unit_result = "failed" - try: - show = subprocess.run( - ["systemctl", "show", unit_name, "--property=Result"], - capture_output=True, text=True, timeout=10, - ) - # Output is "Result=success", "Result=failed", etc. - if show.returncode == 0 and show.stdout.strip() == "Result=success": - unit_result = "success" - except Exception: - pass - - new_status = "SUCCESS" if unit_result == "success" else "FAILED" - try: - with open(status_file, "w") as f: - f.write(new_status) - except OSError: - pass - msg = ( - "\n[Update completed successfully while the server was restarting.]\n" - if new_status == "SUCCESS" - else "\n[Update encountered an error. See log above for details.]\n" + # Double-check: if MainPID is still alive, the unit is still running + # (systemctl is-active can transiently lie during daemon-reload) + try: + show = subprocess.run( + ["systemctl", "show", unit_name, "--property=MainPID"], + capture_output=True, text=True, timeout=10, ) - try: - with open(log_file, "a") as f: - f.write(msg) - except OSError: - pass - return True + if show.returncode == 0: + pid_line = show.stdout.strip() # "MainPID=12345" + pid_str = pid_line.split("=", 1)[-1] if "=" in pid_line else "0" + pid = int(pid_str) + if pid > 0: + try: + os.kill(pid, 0) # Signal 0 = check if process exists + return False # PID is still alive — unit is still running + except ProcessLookupError: + pass # PID is gone — unit truly finished + except PermissionError: + return False # Process exists but we can't signal it — assume running + except Exception: + pass - return False + # Unit is truly not running. Determine outcome from ExecMainStatus + # (the actual exit code), NOT Result (which may be stale from a prior run). + unit_result = "failed" + try: + show = subprocess.run( + ["systemctl", "show", unit_name, "--property=ExecMainStatus"], + capture_output=True, text=True, timeout=10, + ) + if show.returncode == 0: + # Output is "ExecMainStatus=0" for success, non-zero for failure + val = show.stdout.strip().split("=", 1)[-1] if "=" in show.stdout.strip() else "" + if val == "0": + unit_result = "success" + except Exception: + pass + + new_status = "SUCCESS" if unit_result == "success" else "FAILED" + try: + with open(status_file, "w") as f: + f.write(new_status) + except OSError: + pass + msg = ( + "\n[Update completed successfully while the server was restarting.]\n" + if new_status == "SUCCESS" + else "\n[Update encountered an error. See log above for details.]\n" + ) + try: + with open(log_file, "a") as f: + f.write(msg) + except OSError: + pass + return True @app.on_event("startup") diff --git a/modules/core/sovran-hub.nix b/modules/core/sovran-hub.nix index d0e51f9..f6600e4 100644 --- a/modules/core/sovran-hub.nix +++ b/modules/core/sovran-hub.nix @@ -350,6 +350,8 @@ in systemd.services.sovran-hub-update = { description = "Sovran_SystemsOS System Update"; + restartIfChanged = false; # Don't let nixos-rebuild kill an in-flight update + stopIfChanged = false; # Don't stop it during activation either serviceConfig = { Type = "oneshot"; ExecStart = "${update-script}"; @@ -358,6 +360,8 @@ in systemd.services.sovran-hub-rebuild = { description = "Sovran_SystemsOS System Rebuild"; + restartIfChanged = false; # Don't let nixos-rebuild kill an in-flight rebuild + stopIfChanged = false; # Don't stop it during activation either serviceConfig = { Type = "oneshot"; ExecStart = "${rebuild-script}";