therealityreport · therealityreport · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/api/auth.py b/api/auth.py
@@ -8,6 +8,7 @@
 from __future__ import annotations
 
 import hmac
+import ipaddress
 import logging
 import os
 from typing import Annotated, Any
@@ -20,13 +21,67 @@
 logger = logging.getLogger(__name__)
 
 
+_LOCAL_INTERNAL_ADMIN_PROXY_HEADER = "x-trr-local-admin-proxy"
+
+
 def _env_flag_strict(name: str, default: bool = False) -> bool:
     raw = (os.getenv(name) or "").strip().lower()
     if not raw:
         return default
     return raw in {"1", "true", "yes", "on"}
 
 
+def _normalize_host(value: str | None) -> str:
+    raw = (value or "").strip().lower()
+    if not raw:
+        return ""
+    if raw.startswith("[") and "]" in raw:
+        return raw[1 : raw.index("]")]
+    if raw.count(":") > 1:
+        return raw
+    return raw.rsplit(":", 1)[0]
+
+
+def _is_loopback_host(value: str | None) -> bool:
+    host = _normalize_host(value)
+    if host in {"localhost", "ip6-localhost", "ip6-loopback"} or host.endswith(".localhost"):
+        return True
+    try:
+        return ipaddress.ip_address(host).is_loopback
+    except ValueError:
+        return False
+
+
+def _local_internal_admin_proxy_allowed(request: Request) -> bool:
+    if (request.headers.get(_LOCAL_INTERNAL_ADMIN_PROXY_HEADER) or "").strip().lower() not in {
+        "1",
+        "true",
+        "yes",
+        "on",
+    }:
+        return False
+    client_host = request.client.host if request.client else ""
+    if not _is_loopback_host(client_host):
+        return False
+    return _is_loopback_host(request.headers.get("host"))
+
+
+def _build_local_internal_admin_identity(request: Request) -> dict[str, Any]:
+    admin_uid = (request.headers.get("x-trr-admin-uid") or "").strip() or "local-loopback"
+    admin_email = (request.headers.get("x-trr-admin-email") or "").strip() or None
+    return {
+        "id": f"internal-admin:{admin_uid}",
+        "email": admin_email,
+        "role": "internal_admin",
+        "token": get_bearer_token(request),
+        "issuer": "local-loopback",
+        "scope": "internal_admin",
+        "admin_uid": admin_uid,
+        "admin_email": admin_email,
+        "verified_at": request.headers.get("x-trr-admin-verified-at"),
+    }
+
+
 def get_bearer_token(request: Request) -> str | None:
     """
     Extract Bearer token from Authorization header.
@@ -242,6 +297,9 @@ async def require_internal_admin(request: Request) -> dict:
                 detail="Authentication service unavailable",
                 headers={"x-error-code": "AUTH_SERVICE_UNAVAILABLE"},
             ) from exc
+    if _local_internal_admin_proxy_allowed(request):
+        logger.warning("[auth] local loopback internal-admin proxy bypass accepted")
+        return _build_local_internal_admin_identity(request)
 
     if current_user:
         raise HTTPException(status_code=403, detail="Allowlist admin access required")

diff --git a/docs/observability/modal-v439-v440-serve-backend-api-crash-loop-2026-05-28.md b/docs/observability/modal-v439-v440-serve-backend-api-crash-loop-2026-05-28.md
@@ -0,0 +1,104 @@
+# Modal serve_backend_api Crash Loop: v439 to v440
+
+Date: 2026-05-28
+Workspace: `admin-56995`
+App: `trr-backend-jobs`
+Function: `trr_backend.modal_jobs.serve_backend_api`
+
+## Summary
+
+Modal emailed that `serve_backend_api` containers were repeatedly failing to
+start after deploy `v439`. The API image could resolve the deployed web
+endpoint, but cold-started containers failed while importing `api.main`.
+
+## Timeline
+
+- `2026-05-28 10:37:31 EDT`: deploy `v439` completed in workspace
+  `admin-56995`.
+- `2026-05-28 10:37:37 EDT`: first `serve_backend_api` import failure logged.
+- `2026-05-28 10:42:18 EDT`: Modal emitted crash-looping signal for
+  `serve_backend_api`.
+- `2026-05-28 11:20 EDT`: readiness still resolved the app and web endpoint,
+  but logs showed repeated cold-start failures.
+- `2026-05-28 11:23:23 EDT`: deploy `v440` completed through the pinned
+  `admin-56995` wrapper.
+- `2026-05-28 11:23:41 EDT`: `/health` returned HTTP 200 with database
+  connected.
+
+## Cause
+
+`serve_backend_api` imports `api.main`, and `api.main` imports routers including
+`api.routers.admin_show_sync`. That router imported `scripts.sync.*` modules at
+module import time. The lean API Modal image mounted `api` and `trr_backend`,
+but did not mount the top-level `scripts` package, so container startup failed:
+
+```text
+ModuleNotFoundError("No module named 'scripts'")
+```
+
+## Fix
+
+- `api.routers.admin_show_sync` now lazy-loads `scripts.sync.*` modules only
+  when script-backed endpoints execute.
+- The lean Modal image mounts the minimal script payload required by API/admin
+  sync paths:
+  - `scripts/__init__.py`
+  - `scripts/_sync_common.py`
+  - `scripts/sync`
+- Modal image payload validation now checks required local mount paths for each
+  image family before image construction.
+- The deploy wrapper now runs readiness and an API `/health` cold-start canary
+  after successful deploys.
+
+## Verification
+
+- Deploy `v440` URL:
+  `https://modal.com/apps/admin-56995/main/deployed/trr-backend-jobs`
+- Readiness:
+  - `ok = true`
+  - `api_web_url` resolved for `serve_backend_api`
+  - `missing_functions = []`
+  - `missing_web_endpoints = []`
+- `/health` returned HTTP 200 three times after `v440`.
+- Logs after `2026-05-28 11:23:20 EDT` showed API startup and health probes,
+  with no new `ModuleNotFoundError` or crash-looping entry.
+
+## Follow-up Hardening
+
+- Keep the deploy wrapper as the only deploy path for Modal backend changes.
+- Treat readiness-only success as insufficient for API deploys; the cold-start
+  canary must pass.
+- Keep script-backed admin endpoints lazy so API startup does not depend on
+  optional script payloads.
+
+## v442 Guard Regression And Replacement
+
+Deploy `v442` briefly exposed a second startup failure after the image payload
+guard was added. The guard correctly knew about browser-family script mounts,
+but it ran inside deployed Modal containers where those source paths are not
+present. That made the API function import `trr_backend.modal_jobs` and fail
+before serving `/health`.
+
+The fix was to keep the guard strict for local deploy/image construction while
+skipping local-path validation inside remote runtime containers. Deploy `v443`
+replaced `v442`; readiness returned `ok = true`, the API canary returned HTTP
+200 on attempt 1, and recent logs no longer showed `Runner failed` or
+crash-looping entries.
+
+<!-- modal-deploy-history:start -->
+## Deploy History Stamp
+
+- Last stamped: `2026-05-28T13:55:36-04:00`
+- Workspace: `admin-56995`
+- Profile: `admin-56995`
+- Canary: `https://admin-56995--trr-backend-api.modal.run/health` HTTP `200` on attempt `1`
+
+| Version | Deployed At | Deployed By | Commit | Client |
+| --- | --- | --- | --- | --- |
+| v443 | 2026-05-28 13:55:19-04:00 | admin-56995 | c150a64* | 1.4.0 |
+| v442 | 2026-05-28 13:51:21-04:00 | admin-56995 | c150a64* | 1.4.0 |
+| v441 | 2026-05-28 11:46:53-04:00 | admin-56995 | c150a64* | 1.4.0 |
+| v440 | 2026-05-28 11:23:23-04:00 | admin-56995 | c150a64* | 1.4.0 |
+| v439 | 2026-05-28 10:37:31-04:00 | admin-56995 | c150a64* | 1.4.0 |
+
+<!-- modal-deploy-history:end -->
diff --git a/scripts/modal/api_canary.py b/scripts/modal/api_canary.py
@@ -0,0 +1,47 @@
+"""Shared Modal API canary helpers."""
+
+from __future__ import annotations
+
+import urllib.error
+import urllib.parse
+import urllib.request
+from typing import Any
+
+DEFAULT_CANARY_ATTEMPTS = 3
+DEFAULT_CANARY_TIMEOUT_SECONDS = 20
+
+
+def health_url(api_web_url: str) -> str:
+    base = str(api_web_url or "").strip().rstrip("/")
+    if not base:
+        raise RuntimeError("Modal readiness did not return api_web_url for cold-start canary.")
+    parsed = urllib.parse.urlparse(base)
+    if parsed.scheme not in {"http", "https"}:
+        raise RuntimeError(f"Invalid URL scheme for cold-start canary: {base}")
+    return f"{base}/health"
+
+
+def run_api_cold_start_canary(
+    api_web_url: str,
+    *,
+    attempts: int = DEFAULT_CANARY_ATTEMPTS,
+    timeout_seconds: int = DEFAULT_CANARY_TIMEOUT_SECONDS,
+) -> dict[str, Any]:
+    url = health_url(api_web_url)
+    last_error = ""
+    for attempt in range(1, max(1, attempts) + 1):
+        try:
+            request = urllib.request.Request(url, headers={"User-Agent": "trr-modal-deploy-canary/1"})
+            with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
+                body = response.read(4096).decode("utf-8", errors="replace")
+                status = int(getattr(response, "status", 0) or response.getcode())
+            if 200 <= status < 300:
+                return {"ok": True, "url": url, "status": status, "attempt": attempt, "body": body}
+            last_error = f"HTTP {status}: {body[:200]}"
+        except (urllib.error.URLError, TimeoutError, OSError) as exc:
+            last_error = str(exc)
+    raise RuntimeError(f"Modal API cold-start canary failed for {url}: {last_error}")
+
+
+def skipped_api_canary(reason: str = "not_requested") -> dict[str, Any]:
+    return {"ok": None, "ran": False, "reason": reason}