Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions api/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from __future__ import annotations

import hmac
import ipaddress
import logging
import os
from typing import Annotated, Any
Expand All @@ -20,13 +21,67 @@
logger = logging.getLogger(__name__)


_LOCAL_INTERNAL_ADMIN_PROXY_HEADER = "x-trr-local-admin-proxy"


def _env_flag_strict(name: str, default: bool = False) -> bool:
raw = (os.getenv(name) or "").strip().lower()
if not raw:
return default
return raw in {"1", "true", "yes", "on"}


def _normalize_host(value: str | None) -> str:
raw = (value or "").strip().lower()
if not raw:
return ""
if raw.startswith("[") and "]" in raw:
return raw[1 : raw.index("]")]
if raw.count(":") > 1:
return raw
return raw.rsplit(":", 1)[0]


def _is_loopback_host(value: str | None) -> bool:
host = _normalize_host(value)
if host in {"localhost", "ip6-localhost", "ip6-loopback"} or host.endswith(".localhost"):
return True
try:
return ipaddress.ip_address(host).is_loopback
except ValueError:
return False


def _local_internal_admin_proxy_allowed(request: Request) -> bool:
if (request.headers.get(_LOCAL_INTERNAL_ADMIN_PROXY_HEADER) or "").strip().lower() not in {
"1",
"true",
"yes",
"on",
}:
return False
client_host = request.client.host if request.client else ""
if not _is_loopback_host(client_host):
return False
return _is_loopback_host(request.headers.get("host"))

Comment on lines +55 to +67

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

This opens an unauthenticated internal-admin path behind any loopback proxy.

This branch trusts a caller-set marker header plus request.client.host/Host to mint an internal_admin identity. If the app is ever fronted by a same-host reverse proxy, public requests will also arrive with a loopback client address, so forwarding x-trr-local-admin-proxy lets a remote caller bypass the JWT/shared-secret checks and even choose x-trr-admin-* identity fields. Please make this path opt-in and require a server-controlled credential before returning _build_local_internal_admin_identity().

Minimal hardening direction
 def _local_internal_admin_proxy_allowed(request: Request) -> bool:
+    if not _env_flag_strict("TRR_INTERNAL_ADMIN_ALLOW_LOCAL_LOOPBACK_PROXY", False):
+        return False
     if (request.headers.get(_LOCAL_INTERNAL_ADMIN_PROXY_HEADER) or "").strip().lower() not in {
         "1",
         "true",
         "yes",
         "on",
     }:
         return False
     client_host = request.client.host if request.client else ""
     if not _is_loopback_host(client_host):
         return False
-    return _is_loopback_host(request.headers.get("host"))
+    return _is_loopback_host(request.headers.get("host")) and _internal_admin_secret_matches(request)

Also applies to: 300-302

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@api/auth.py` around lines 55 - 67, The _local_internal_admin_proxy_allowed
check currently trusts a client-supplied marker header plus loopback addresses
which allows bypass; make this opt-in by requiring a server-controlled
credential before returning _build_local_internal_admin_identity(): add a
server-side secret/config (e.g. _LOCAL_INTERNAL_ADMIN_SECRET) and validate the
incoming marker header value against that secret (or validate a proxy-signed
token) in _local_internal_admin_proxy_allowed (and similar logic used around
lines ~300-302) in addition to the loopback checks; only if the header matches
the configured secret and both client.host and Host are loopback should you
allow minting the local internal admin identity.


def _build_local_internal_admin_identity(request: Request) -> dict[str, Any]:
admin_uid = (request.headers.get("x-trr-admin-uid") or "").strip() or "local-loopback"
admin_email = (request.headers.get("x-trr-admin-email") or "").strip() or None
return {
"id": f"internal-admin:{admin_uid}",
"email": admin_email,
"role": "internal_admin",
"token": get_bearer_token(request),
"issuer": "local-loopback",
"scope": "internal_admin",
"admin_uid": admin_uid,
"admin_email": admin_email,
"verified_at": request.headers.get("x-trr-admin-verified-at"),
}


def get_bearer_token(request: Request) -> str | None:
"""
Extract Bearer token from Authorization header.
Expand Down Expand Up @@ -242,6 +297,9 @@ async def require_internal_admin(request: Request) -> dict:
detail="Authentication service unavailable",
headers={"x-error-code": "AUTH_SERVICE_UNAVAILABLE"},
) from exc
if _local_internal_admin_proxy_allowed(request):
logger.warning("[auth] local loopback internal-admin proxy bypass accepted")
return _build_local_internal_admin_identity(request)

if current_user:
raise HTTPException(status_code=403, detail="Allowlist admin access required")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Modal serve_backend_api Crash Loop: v439 to v440

Date: 2026-05-28
Workspace: `admin-56995`
App: `trr-backend-jobs`
Function: `trr_backend.modal_jobs.serve_backend_api`

## Summary

Modal emailed that `serve_backend_api` containers were repeatedly failing to
start after deploy `v439`. The API image could resolve the deployed web
endpoint, but cold-started containers failed while importing `api.main`.

## Timeline

- `2026-05-28 10:37:31 EDT`: deploy `v439` completed in workspace
`admin-56995`.
- `2026-05-28 10:37:37 EDT`: first `serve_backend_api` import failure logged.
- `2026-05-28 10:42:18 EDT`: Modal emitted crash-looping signal for
`serve_backend_api`.
- `2026-05-28 11:20 EDT`: readiness still resolved the app and web endpoint,
but logs showed repeated cold-start failures.
- `2026-05-28 11:23:23 EDT`: deploy `v440` completed through the pinned
`admin-56995` wrapper.
- `2026-05-28 11:23:41 EDT`: `/health` returned HTTP 200 with database
connected.

## Cause

`serve_backend_api` imports `api.main`, and `api.main` imports routers including
`api.routers.admin_show_sync`. That router imported `scripts.sync.*` modules at
module import time. The lean API Modal image mounted `api` and `trr_backend`,
but did not mount the top-level `scripts` package, so container startup failed:

```text
ModuleNotFoundError("No module named 'scripts'")
```

## Fix

- `api.routers.admin_show_sync` now lazy-loads `scripts.sync.*` modules only
when script-backed endpoints execute.
- The lean Modal image mounts the minimal script payload required by API/admin
sync paths:
- `scripts/__init__.py`
- `scripts/_sync_common.py`
- `scripts/sync`
- Modal image payload validation now checks required local mount paths for each
image family before image construction.
- The deploy wrapper now runs readiness and an API `/health` cold-start canary
after successful deploys.

## Verification

- Deploy `v440` URL:
`https://modal.com/apps/admin-56995/main/deployed/trr-backend-jobs`
- Readiness:
- `ok = true`
- `api_web_url` resolved for `serve_backend_api`
- `missing_functions = []`
- `missing_web_endpoints = []`
- `/health` returned HTTP 200 three times after `v440`.
- Logs after `2026-05-28 11:23:20 EDT` showed API startup and health probes,
with no new `ModuleNotFoundError` or crash-looping entry.

## Follow-up Hardening

- Keep the deploy wrapper as the only deploy path for Modal backend changes.
- Treat readiness-only success as insufficient for API deploys; the cold-start
canary must pass.
- Keep script-backed admin endpoints lazy so API startup does not depend on
optional script payloads.

## v442 Guard Regression And Replacement

Deploy `v442` briefly exposed a second startup failure after the image payload
guard was added. The guard correctly knew about browser-family script mounts,
but it ran inside deployed Modal containers where those source paths are not
present. That made the API function import `trr_backend.modal_jobs` and fail
before serving `/health`.

The fix was to keep the guard strict for local deploy/image construction while
skipping local-path validation inside remote runtime containers. Deploy `v443`
replaced `v442`; readiness returned `ok = true`, the API canary returned HTTP
200 on attempt 1, and recent logs no longer showed `Runner failed` or
crash-looping entries.

<!-- modal-deploy-history:start -->
## Deploy History Stamp

- Last stamped: `2026-05-28T13:55:36-04:00`
- Workspace: `admin-56995`
- Profile: `admin-56995`
- Canary: `https://admin-56995--trr-backend-api.modal.run/health` HTTP `200` on attempt `1`

| Version | Deployed At | Deployed By | Commit | Client |
| --- | --- | --- | --- | --- |
| v443 | 2026-05-28 13:55:19-04:00 | admin-56995 | c150a64* | 1.4.0 |
| v442 | 2026-05-28 13:51:21-04:00 | admin-56995 | c150a64* | 1.4.0 |
| v441 | 2026-05-28 11:46:53-04:00 | admin-56995 | c150a64* | 1.4.0 |
| v440 | 2026-05-28 11:23:23-04:00 | admin-56995 | c150a64* | 1.4.0 |
| v439 | 2026-05-28 10:37:31-04:00 | admin-56995 | c150a64* | 1.4.0 |

<!-- modal-deploy-history:end -->
47 changes: 47 additions & 0 deletions scripts/modal/api_canary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""Shared Modal API canary helpers."""

from __future__ import annotations

import urllib.error
import urllib.parse
import urllib.request
from typing import Any

DEFAULT_CANARY_ATTEMPTS = 3
DEFAULT_CANARY_TIMEOUT_SECONDS = 20


def health_url(api_web_url: str) -> str:
base = str(api_web_url or "").strip().rstrip("/")
if not base:
raise RuntimeError("Modal readiness did not return api_web_url for cold-start canary.")
parsed = urllib.parse.urlparse(base)
if parsed.scheme not in {"http", "https"}:
raise RuntimeError(f"Invalid URL scheme for cold-start canary: {base}")
return f"{base}/health"
Comment on lines +14 to +21

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Add URL scheme validation for defense-in-depth.

The health_url function validates that the base URL is non-empty but does not verify the URL scheme. While api_web_url is expected to come from Modal's readiness API in trusted contexts, adding scheme validation (ensuring https:// or http://) would prevent potential SSRF if the input source changes or is compromised.

🛡️ Proposed fix to validate URL scheme
 def health_url(api_web_url: str) -> str:
     base = str(api_web_url or "").strip().rstrip("/")
     if not base:
         raise RuntimeError("Modal readiness did not return api_web_url for cold-start canary.")
+    if not base.startswith(("https://", "http://")):
+        raise RuntimeError(f"Invalid URL scheme for cold-start canary: {base}")
     return f"{base}/health"
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@scripts/modal/api_canary.py` around lines 13 - 17, The health_url function
currently only checks for non-empty base but doesn't validate the URL scheme;
update health_url to parse api_web_url (e.g., using urllib.parse.urlparse) and
ensure the scheme is either 'https' or 'http' before returning the health path,
raising a RuntimeError if the scheme is missing/invalid; keep the existing
trimming/rstrip behavior and build the returned string from the validated base.



def run_api_cold_start_canary(
api_web_url: str,
*,
attempts: int = DEFAULT_CANARY_ATTEMPTS,
timeout_seconds: int = DEFAULT_CANARY_TIMEOUT_SECONDS,
) -> dict[str, Any]:
url = health_url(api_web_url)
last_error = ""
for attempt in range(1, max(1, attempts) + 1):
try:
request = urllib.request.Request(url, headers={"User-Agent": "trr-modal-deploy-canary/1"})
with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
body = response.read(4096).decode("utf-8", errors="replace")
status = int(getattr(response, "status", 0) or response.getcode())
if 200 <= status < 300:
return {"ok": True, "url": url, "status": status, "attempt": attempt, "body": body}
last_error = f"HTTP {status}: {body[:200]}"
except (urllib.error.URLError, TimeoutError, OSError) as exc:
last_error = str(exc)
raise RuntimeError(f"Modal API cold-start canary failed for {url}: {last_error}")


def skipped_api_canary(reason: str = "not_requested") -> dict[str, Any]:
return {"ok": None, "ran": False, "reason": reason}
Loading
Loading