therealityreport · therealityreport · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/docs/observability/modal-v439-v440-serve-backend-api-crash-loop-2026-05-28.md b/docs/observability/modal-v439-v440-serve-backend-api-crash-loop-2026-05-28.md
@@ -0,0 +1,104 @@
+# Modal serve_backend_api Crash Loop: v439 to v440
+
+Date: 2026-05-28
+Workspace: `admin-56995`
+App: `trr-backend-jobs`
+Function: `trr_backend.modal_jobs.serve_backend_api`
+
+## Summary
+
+Modal emailed that `serve_backend_api` containers were repeatedly failing to
+start after deploy `v439`. The API image could resolve the deployed web
+endpoint, but cold-started containers failed while importing `api.main`.
+
+## Timeline
+
+- `2026-05-28 10:37:31 EDT`: deploy `v439` completed in workspace
+  `admin-56995`.
+- `2026-05-28 10:37:37 EDT`: first `serve_backend_api` import failure logged.
+- `2026-05-28 10:42:18 EDT`: Modal emitted crash-looping signal for
+  `serve_backend_api`.
+- `2026-05-28 11:20 EDT`: readiness still resolved the app and web endpoint,
+  but logs showed repeated cold-start failures.
+- `2026-05-28 11:23:23 EDT`: deploy `v440` completed through the pinned
+  `admin-56995` wrapper.
+- `2026-05-28 11:23:41 EDT`: `/health` returned HTTP 200 with database
+  connected.
+
+## Cause
+
+`serve_backend_api` imports `api.main`, and `api.main` imports routers including
+`api.routers.admin_show_sync`. That router imported `scripts.sync.*` modules at
+module import time. The lean API Modal image mounted `api` and `trr_backend`,
+but did not mount the top-level `scripts` package, so container startup failed:
+
+```text
+ModuleNotFoundError("No module named 'scripts'")
+```
+
+## Fix
+
+- `api.routers.admin_show_sync` now lazy-loads `scripts.sync.*` modules only
+  when script-backed endpoints execute.
+- The lean Modal image mounts the minimal script payload required by API/admin
+  sync paths:
+  - `scripts/__init__.py`
+  - `scripts/_sync_common.py`
+  - `scripts/sync`
+- Modal image payload validation now checks required local mount paths for each
+  image family before image construction.
+- The deploy wrapper now runs readiness and an API `/health` cold-start canary
+  after successful deploys.
+
+## Verification
+
+- Deploy `v440` URL:
+  `https://modal.com/apps/admin-56995/main/deployed/trr-backend-jobs`
+- Readiness:
+  - `ok = true`
+  - `api_web_url` resolved for `serve_backend_api`
+  - `missing_functions = []`
+  - `missing_web_endpoints = []`
+- `/health` returned HTTP 200 three times after `v440`.
+- Logs after `2026-05-28 11:23:20 EDT` showed API startup and health probes,
+  with no new `ModuleNotFoundError` or crash-looping entry.
+
+## Follow-up Hardening
+
+- Keep the deploy wrapper as the only deploy path for Modal backend changes.
+- Treat readiness-only success as insufficient for API deploys; the cold-start
+  canary must pass.
+- Keep script-backed admin endpoints lazy so API startup does not depend on
+  optional script payloads.
+
+## v442 Guard Regression And Replacement
+
+Deploy `v442` briefly exposed a second startup failure after the image payload
+guard was added. The guard correctly knew about browser-family script mounts,
+but it ran inside deployed Modal containers where those source paths are not
+present. That made the API function import `trr_backend.modal_jobs` and fail
+before serving `/health`.
+
+The fix was to keep the guard strict for local deploy/image construction while
+skipping local-path validation inside remote runtime containers. Deploy `v443`
+replaced `v442`; readiness returned `ok = true`, the API canary returned HTTP
+200 on attempt 1, and recent logs no longer showed `Runner failed` or
+crash-looping entries.
+
+<!-- modal-deploy-history:start -->
+## Deploy History Stamp
+
+- Last stamped: `2026-05-28T13:55:36-04:00`
+- Workspace: `admin-56995`
+- Profile: `admin-56995`
+- Canary: `https://admin-56995--trr-backend-api.modal.run/health` HTTP `200` on attempt `1`
+
+| Version | Deployed At | Deployed By | Commit | Client |
+| --- | --- | --- | --- | --- |
+| v443 | 2026-05-28 13:55:19-04:00 | admin-56995 | c150a64* | 1.4.0 |
+| v442 | 2026-05-28 13:51:21-04:00 | admin-56995 | c150a64* | 1.4.0 |
+| v441 | 2026-05-28 11:46:53-04:00 | admin-56995 | c150a64* | 1.4.0 |
+| v440 | 2026-05-28 11:23:23-04:00 | admin-56995 | c150a64* | 1.4.0 |
+| v439 | 2026-05-28 10:37:31-04:00 | admin-56995 | c150a64* | 1.4.0 |
+
+<!-- modal-deploy-history:end -->
diff --git a/scripts/modal/api_canary.py b/scripts/modal/api_canary.py
@@ -0,0 +1,47 @@
+"""Shared Modal API canary helpers."""
+
+from __future__ import annotations
+
+import urllib.error
+import urllib.parse
+import urllib.request
+from typing import Any
+
+DEFAULT_CANARY_ATTEMPTS = 3
+DEFAULT_CANARY_TIMEOUT_SECONDS = 20
+
+
+def health_url(api_web_url: str) -> str:
+    base = str(api_web_url or "").strip().rstrip("/")
+    if not base:
+        raise RuntimeError("Modal readiness did not return api_web_url for cold-start canary.")
+    parsed = urllib.parse.urlparse(base)
+    if parsed.scheme not in {"http", "https"}:
+        raise RuntimeError(f"Invalid URL scheme for cold-start canary: {base}")
+    return f"{base}/health"
+
+
+def run_api_cold_start_canary(
+    api_web_url: str,
+    *,
+    attempts: int = DEFAULT_CANARY_ATTEMPTS,
+    timeout_seconds: int = DEFAULT_CANARY_TIMEOUT_SECONDS,
+) -> dict[str, Any]:
+    url = health_url(api_web_url)
+    last_error = ""
+    for attempt in range(1, max(1, attempts) + 1):
+        try:
+            request = urllib.request.Request(url, headers={"User-Agent": "trr-modal-deploy-canary/1"})
+            with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
+                body = response.read(4096).decode("utf-8", errors="replace")
+                status = int(getattr(response, "status", 0) or response.getcode())
+            if 200 <= status < 300:
+                return {"ok": True, "url": url, "status": status, "attempt": attempt, "body": body}
+            last_error = f"HTTP {status}: {body[:200]}"
+        except (urllib.error.URLError, TimeoutError, OSError) as exc:
+            last_error = str(exc)
+    raise RuntimeError(f"Modal API cold-start canary failed for {url}: {last_error}")
+
+
+def skipped_api_canary(reason: str = "not_requested") -> dict[str, Any]:
+    return {"ok": None, "ran": False, "reason": reason}
diff --git a/scripts/modal/cleanup_wrong_workspace_deploy.py b/scripts/modal/cleanup_wrong_workspace_deploy.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+"""Clean up a mistaken TRR Modal deployment from a non-authoritative workspace."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+from pathlib import Path
+from typing import Any
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+
+from scripts.modal.deploy_backend import (  # noqa: E402
+    REQUIRED_MODAL_PROFILE,
+    REQUIRED_MODAL_WORKSPACE,
+    pinned_modal_env,
+)
+
+DEFAULT_APP_NAME = "trr-backend-jobs"
+DEFAULT_WRONG_PROFILE = "thb-bbl"
+DEFAULT_WRONG_WORKSPACE = "tommy-hulihan-basketball"
+
+
+def python_command() -> str:
+    repo_venv_python = REPO_ROOT / ".venv" / "bin" / "python"
+    if repo_venv_python.is_file():
+        return str(repo_venv_python)
+    return sys.executable or "python3.11"
+
+
+def modal_profile_env(profile: str, environ: dict[str, str] | None = None) -> dict[str, str]:
+    env = dict(environ or os.environ)
+    env["MODAL_PROFILE"] = profile
+    return env
+
+
+def _modal_command(*args: str, modal_environment: str = "") -> list[str]:
+    command = [python_command(), "-m", "modal", *args]
+    if modal_environment:
+        command.extend(["--env", modal_environment])
+    return command
+
+
+def _run_json(command: list[str], *, env: dict[str, str], timeout_seconds: int = 120) -> Any:
+    completed = subprocess.run(
+        command,
+        cwd=REPO_ROOT,
+        env=env,
+        check=True,
+        capture_output=True,
+        text=True,
+        timeout=timeout_seconds,
+    )
+    return json.loads(completed.stdout or "[]")
+
+
+def _run_text(command: list[str], *, env: dict[str, str], timeout_seconds: int = 120) -> str:
+    completed = subprocess.run(
+        command,
+        cwd=REPO_ROOT,
+        env=env,
+        check=True,
+        capture_output=True,
+        text=True,
+        timeout=timeout_seconds,
+    )
+    return completed.stdout
+
+
+def _active_workspace(profile: str, *, env: dict[str, str] | None = None) -> dict[str, Any]:
+    rows = _run_json(
+        _modal_command("profile", "list") + ["--json"],
+        env=modal_profile_env(profile, env),
+    )
+    active = next((row for row in rows if isinstance(row, dict) and row.get("active") is True), None)
+    return {
+        "profile": str(active.get("name") or "") if active else "",
+        "workspace": str(active.get("workspace") or "") if active else "",
+    }
+
+
+def _verify_authoritative_workspace(*, modal_environment: str = "") -> dict[str, Any]:
+    command = [
+        python_command(),
+        str(REPO_ROOT / "scripts" / "modal" / "verify_modal_readiness.py"),
+        "--json",
+    ]
+    if modal_environment:
+        command.extend(["--env", modal_environment])
+    payload = _run_json(command, env=pinned_modal_env(), timeout_seconds=180)
+    modal_workspace = payload.get("modal_workspace") if isinstance(payload, dict) else None
+    return {
+        "ok": bool(payload.get("ok")) if isinstance(payload, dict) else False,
+        "modal_workspace": modal_workspace if isinstance(modal_workspace, dict) else {},
+        "blocking_probe_failures": (
+            list(payload.get("blocking_probe_failures") or []) if isinstance(payload, dict) else []
+        ),
+    }
+
+
+def _app_rows(*, profile: str, modal_environment: str = "") -> list[dict[str, Any]]:
+    payload = _run_json(
+        _modal_command("app", "list", modal_environment=modal_environment) + ["--json"],
+        env=modal_profile_env(profile),
+    )
+    return [row for row in payload if isinstance(row, dict)] if isinstance(payload, list) else []
+
+
+def _app_is_present(rows: list[dict[str, Any]], app_name: str) -> bool:
+    for row in rows:
+        values = {str(value).strip() for value in row.values() if isinstance(value, str)}
+        if app_name in values:
+            return True
+    return False
+
+
+def cleanup_wrong_workspace_deploy(
+    *,
+    wrong_profile: str,
+    wrong_workspace: str,
+    app_name: str,
+    modal_environment: str = "",
+    stop: bool = False,
+) -> dict[str, Any]:
+    authoritative = _verify_authoritative_workspace(modal_environment=modal_environment)
+    if not authoritative["ok"]:
+        return {
+            "ok": False,
+            "failure_reason": "authoritative_workspace_not_ready",
+            "authoritative": authoritative,
+            "wrong_workspace": None,
+            "wrong_app_present": None,
+            "stopped": False,
+        }
+
+    wrong_context = _active_workspace(wrong_profile)
+    if wrong_context["profile"] == REQUIRED_MODAL_PROFILE or wrong_context["workspace"] == REQUIRED_MODAL_WORKSPACE:
+        return {
+            "ok": False,
+            "failure_reason": "wrong_profile_resolves_to_authoritative_workspace",
+            "authoritative": authoritative,
+            "wrong_workspace": wrong_context,
+            "wrong_app_present": None,
+            "stopped": False,
+        }
+    if wrong_workspace and wrong_context["workspace"] != wrong_workspace:
+        return {
+            "ok": False,
+            "failure_reason": "wrong_workspace_mismatch",
+            "authoritative": authoritative,
+            "wrong_workspace": wrong_context,
+            "expected_wrong_workspace": wrong_workspace,
+            "wrong_app_present": None,
+            "stopped": False,
+        }
+
+    rows = _app_rows(profile=wrong_profile, modal_environment=modal_environment)
+    app_present = _app_is_present(rows, app_name)
+    history = _run_json(
+        _modal_command("app", "history", app_name, modal_environment=modal_environment) + ["--json"],
+        env=modal_profile_env(wrong_profile),
+    ) if app_present else []
+
+    stopped = False
+    if stop and app_present:
+        _run_text(
+            _modal_command("app", "stop", app_name, "--yes", modal_environment=modal_environment),
+            env=modal_profile_env(wrong_profile),
+        )
+        stopped = True
+
+    return {
+        "ok": True,
+        "failure_reason": None,
+        "authoritative": authoritative,
+        "wrong_workspace": wrong_context,
+        "wrong_app_name": app_name,
+        "wrong_app_present": app_present,
+        "wrong_app_history_count": len(history) if isinstance(history, list) else None,
+        "stopped": stopped,
+    }
+
+
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--wrong-profile",
+        default=DEFAULT_WRONG_PROFILE,
+        help="Modal profile that owns the mistaken app.",
+    )
+    parser.add_argument(
+        "--wrong-workspace",
+        default=DEFAULT_WRONG_WORKSPACE,
+        help="Expected non-authoritative workspace name for the wrong profile.",
+    )
+    parser.add_argument(
+        "--app-name",
+        default=DEFAULT_APP_NAME,
+        help=f"Wrong-workspace app name (default: {DEFAULT_APP_NAME}).",
+    )
+    parser.add_argument("--env", default="", help="Optional Modal environment name.")
+    parser.add_argument(
+        "--stop",
+        action="store_true",
+        help="Stop the wrong-workspace app after the authoritative workspace is healthy.",
+    )
+    parser.add_argument("--json", action="store_true", help="Emit the cleanup summary as JSON.")
+    return parser.parse_args(argv)
+
+
+def _print_text(summary: dict[str, Any]) -> None:
+    wrong_workspace = summary.get("wrong_workspace") or {}
+    print("Wrong-workspace Modal cleanup")
+    print(f"  Authoritative ready: {bool((summary.get('authoritative') or {}).get('ok'))}")
+    print(f"  Wrong profile: {wrong_workspace.get('profile') or '<unknown>'}")
+    print(f"  Wrong workspace: {wrong_workspace.get('workspace') or '<unknown>'}")
+    print(f"  Wrong app present: {summary.get('wrong_app_present')}")
+    print(f"  Stopped: {summary.get('stopped')}")
+    print(f"  Result: {'ok' if summary.get('ok') else summary.get('failure_reason')}")
+
+
+def main(argv: list[str] | None = None) -> int:
+    args = parse_args(argv)
+    summary = cleanup_wrong_workspace_deploy(
+        wrong_profile=str(args.wrong_profile or "").strip(),
+        wrong_workspace=str(args.wrong_workspace or "").strip(),
+        app_name=str(args.app_name or "").strip() or DEFAULT_APP_NAME,
+        modal_environment=str(args.env or "").strip(),
+        stop=bool(args.stop),
+    )
+    if args.json:
+        print(json.dumps(summary, indent=2, sort_keys=True))
+    else:
+        _print_text(summary)
+    return 0 if summary.get("ok") else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())