fix(prod): harden production configuration for launch

- Gate Swagger/ReDoc/OpenAPI behind DEBUG (no public API schema in prod)
- Sentry send_default_pii only in dev (no auth headers/bodies in events)
- Remove alembic from Dockerfile CMD (releaseCommand owns migrations; CMD copy raced across replicas/restarts)
- Decouple rate limiting from DEBUG via RATE_LIMIT_ENABLED (PR envs with DEBUG=true were unlimited); tests disable the live limiter in conftest
- max_instances=1 on the 4 scheduler jobs missing it
- Boot-time failure (outside DEBUG) when SELF_SERVE_ENABLED is set without RESEND_API_KEY/ANTHROPIC_API_KEY/FRONTEND_URL
- Reject localhost OAUTH_REDIRECT_BASE outside DEBUG
- pool_pre_ping + pool_recycle on the app engine
- Frontend: DEV-gate stale-async console.warn; document VITE_SELF_SERVE_ENABLED fallback semantics in Dockerfile

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-12 19:22:35 -04:00
parent b69447767a
commit c4947218a4
9 changed files with 70 additions and 14 deletions
--- a/backend/.env.example
+++ b/backend/.env.example
@@ -40,3 +40,5 @@ STRIPE_WEBHOOK_SECRET=whsec_
 # before the public flip. Empty by default.
 SELF_SERVE_ENABLED=false
 INTERNAL_TESTER_EMAILS=
+# Rate limiting (decoupled from DEBUG). false is for local dev only; keep true in PR/staging/prod.
+RATE_LIMIT_ENABLED=false
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -24,5 +24,6 @@ COPY . .
 # Expose port (Railway uses PORT env variable)
 EXPOSE 8000

-# Run migrations then start the application
-CMD alembic upgrade head && uvicorn app.main:app --host 0.0.0.0 --port ${PORT:-8000}
+# Migrations run exclusively via Railway releaseCommand (scripts/release) —
+# running them here too would race across replicas/restarts.
+CMD uvicorn app.main:app --host 0.0.0.0 --port ${PORT:-8000}
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -85,6 +85,10 @@ class Settings(BaseSettings):
    # Security
    BCRYPT_ROUNDS: int = 12

+    # Rate limiting — independent of DEBUG so PR/staging envs running with
+    # DEBUG=true still rate-limit auth and AI endpoints.
+    RATE_LIMIT_ENABLED: bool = True
+
    # Security Headers
    CSP_REPORT_ONLY: bool = True  # Set False to enforce CSP
    CSP_EXTRA_SCRIPT_SOURCES: list[str] = []  # Additional script-src domains
@@ -259,6 +263,18 @@ class Settings(BaseSettings):
    MS_CLIENT_SECRET: Optional[str] = None
    OAUTH_REDIRECT_BASE: str = "http://localhost:5173"

+    @field_validator("OAUTH_REDIRECT_BASE", mode="after")
+    @classmethod
+    def reject_localhost_redirect_in_production(cls, v: str, info) -> str:
+        """OAuth code exchange against a localhost redirect_uri is always a
+        misconfiguration outside DEBUG — fail at boot, not at first sign-in."""
+        debug = info.data.get("DEBUG", False)
+        if not debug and v.startswith("http://localhost"):
+            raise ValueError(
+                "OAUTH_REDIRECT_BASE must be set to the public frontend URL in production"
+            )
+        return v
+
    # Monitoring
    SENTRY_DSN: Optional[str] = None

--- a/backend/app/core/database.py
+++ b/backend/app/core/database.py
@@ -7,7 +7,10 @@ from app.core.tenant_context import register_tenant_listener
 engine = create_async_engine(
    settings.DATABASE_URL,
    echo=settings.DEBUG,
-    future=True
+    future=True,
+    # pre-ping detects connections dropped by DB restarts/maintenance; recycle retires any older than 30 min
+    pool_pre_ping=True,
+    pool_recycle=1800,
 )

 # Create async session factory
--- a/backend/app/core/rate_limit.py
+++ b/backend/app/core/rate_limit.py
@@ -3,4 +3,4 @@ from slowapi.util import get_remote_address

 from app.core.config import settings

-limiter = Limiter(key_func=get_remote_address, enabled=not settings.DEBUG)
+limiter = Limiter(key_func=get_remote_address, enabled=settings.RATE_LIMIT_ENABLED)
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -15,7 +15,9 @@ if settings.SENTRY_DSN:
    sentry_sdk.init(
        dsn=settings.SENTRY_DSN,
        environment="development" if settings.DEBUG else "production",
-        send_default_pii=True,
+        # PII (headers, bodies, IPs) only in dev — prod events must not capture
+        # auth tokens or customer data from a multi-tenant MSP product.
+        send_default_pii=settings.DEBUG,
        traces_sample_rate=1.0 if settings.DEBUG else 0.2,
        # Profiling — included in free plan
        profiles_sample_rate=1.0 if settings.DEBUG else 0.2,
@@ -151,6 +153,21 @@ async def lifespan(app: FastAPI):
    logger.info("Starting ResolutionFlow API server...")
    logger.info(f"Environment: {'Development' if settings.DEBUG else 'Production'}")
    logger.info(f"ALLOW_RAILWAY_ORIGINS: {settings.ALLOW_RAILWAY_ORIGINS}")
+
+    # Self-serve signup is broken without these — outside DEBUG, fail loudly at
+    # boot rather than at the first customer's signup attempt.
+    if settings.SELF_SERVE_ENABLED and not settings.DEBUG:
+        missing = [
+            name for name, value in (
+                ("RESEND_API_KEY", settings.RESEND_API_KEY),
+                ("ANTHROPIC_API_KEY", settings.ANTHROPIC_API_KEY),
+                ("FRONTEND_URL", settings.FRONTEND_URL),
+            ) if not value
+        ]
+        if missing:
+            raise RuntimeError(
+                f"SELF_SERVE_ENABLED=true but required settings are unset: {', '.join(missing)}"
+            )
    # Note: In production, use Alembic migrations instead of init_db
    # await init_db()

@@ -170,6 +187,7 @@ async def lifespan(app: FastAPI):
        hours=1,
        id="cleanup_ai_conversations",
        replace_existing=True,
+        max_instances=1,
    )

    # Chat retention cleanup (daily)
@@ -179,6 +197,7 @@ async def lifespan(app: FastAPI):
        hours=24,
        id="cleanup_expired_chats",
        replace_existing=True,
+        max_instances=1,
    )

    # Auto-archive stale AI chat sessions (daily at 3 AM)
@@ -188,6 +207,7 @@ async def lifespan(app: FastAPI):
        hour=3,
        id="archive_stale_ai_sessions",
        replace_existing=True,
+        max_instances=1,
    )

    # PSA push retry (every 5 minutes)
@@ -198,6 +218,7 @@ async def lifespan(app: FastAPI):
        minutes=5,
        id="psa_push_retry",
        replace_existing=True,
+        max_instances=1,
    )

    # Knowledge Flywheel analysis (every 5 minutes)
@@ -252,9 +273,10 @@ app = FastAPI(
    title=settings.APP_NAME,
    description="ResolutionFlow - Take the path MOST traveled. Guided troubleshooting with automatic documentation.",
    version="1.0.0",
-    docs_url="/api/docs",
-    redoc_url="/api/redoc",
-    openapi_url="/api/openapi.json",
+    # Interactive docs + schema are dev-only; prod must not expose the full API surface.
+    docs_url="/api/docs" if settings.DEBUG else None,
+    redoc_url="/api/redoc" if settings.DEBUG else None,
+    openapi_url="/api/openapi.json" if settings.DEBUG else None,
    lifespan=lifespan
 )

--- a/backend/tests/conftest.py
+++ b/backend/tests/conftest.py
@@ -29,6 +29,12 @@ from app import models as _models  # noqa: F401
 # Disable invite code requirement for tests
 settings.REQUIRE_INVITE_CODE = False

+# Disable rate limiting in tests — auth-heavy suites would trip the
+# per-minute login/register limits. The limiter is constructed at import
+# time, so flip the live instance rather than the setting.
+from app.core.rate_limit import limiter as _limiter  # noqa: E402
+_limiter.enabled = False
+
 # Test database URL — NEVER reuse DATABASE_URL. The test_db fixture does
 # `DROP SCHEMA public CASCADE` on every test; if DATABASE_URL (which normally
 # points at the dev/prod DB) leaked into this value, running `pytest tests/`
--- a/frontend/Dockerfile
+++ b/frontend/Dockerfile
@@ -13,6 +13,10 @@ RUN npm ci
 COPY . .

 # Build arguments (set at build time)
+# NOTE: VITE_SELF_SERVE_ENABLED is only a network-error fallback — the live
+# toggle is the backend /config/public flag; rebuilding with a new value here
+# does NOT flip self-serve. VITE_STRIPE_PUBLISHABLE_KEY is currently unused
+# (checkout is fully backend-driven); kept for a future client-side Stripe.js.
 ARG VITE_API_URL
 ARG VITE_SENTRY_DSN
 ARG VITE_PUBLIC_POSTHOG_KEY
--- a/frontend/src/pages/AssistantChatPage.tsx
+++ b/frontend/src/pages/AssistantChatPage.tsx
@@ -266,11 +266,13 @@ export default function AssistantChatPage() {
  const loadedChatIdsRef = useRef<Set<string>>(new Set())
  const guardCurrentChat = useCallback((expectedChatId: string, source: string) => {
    if (currentChatRef.current === expectedChatId) return true
+    if (import.meta.env.DEV) {
      console.warn('[AssistantChat] Discarded stale async result', {
        source,
        expectedChatId,
        currentChatId: currentChatRef.current,
      })
+    }
    return false
  }, [])