fix(prod): harden production configuration for launch

- Gate Swagger/ReDoc/OpenAPI behind DEBUG (no public API schema in prod)
- Sentry send_default_pii only in dev (no auth headers/bodies in events)
- Remove alembic from Dockerfile CMD (releaseCommand owns migrations; the CMD copy raced across replicas/restarts)
- Decouple rate limiting from DEBUG via RATE_LIMIT_ENABLED (PR envs with DEBUG=true were unlimited); tests disable the live limiter in conftest
- max_instances=1 on the 4 scheduler jobs missing it
- Boot-time failure when SELF_SERVE_ENABLED without RESEND_API_KEY/ANTHROPIC_API_KEY/FRONTEND_URL
- Reject localhost OAUTH_REDIRECT_BASE outside DEBUG
- pool_pre_ping + pool_recycle on the app engine
- Frontend: DEV-gate stale-async console.warn; document VITE_SELF_SERVE_ENABLED fallback semantics in Dockerfile

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-12 19:22:35 -04:00
parent 87236b57d2
commit 4dc4894fc7
9 changed files with 70 additions and 14 deletions
--- a/backend/.env.example
+++ b/backend/.env.example
@@ -39,4 +39,6 @@ STRIPE_WEBHOOK_SECRET=whsec_
 # global flag for specific users — used for prod test-mode validation
 # before the public flip. Empty by default.
 SELF_SERVE_ENABLED=false
-INTERNAL_TESTER_EMAILS=
+INTERNAL_TESTER_EMAILS=
 # Rate limiting (decoupled from DEBUG). The false below is intended for local
 # development only — set RATE_LIMIT_ENABLED=true in PR/staging/prod.
 RATE_LIMIT_ENABLED=false
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -24,5 +24,6 @@ COPY . .
 # Expose port (Railway uses PORT env variable)
 EXPOSE 8000
-# Run migrations then start the application
+# Migrations run exclusively via Railway releaseCommand (scripts/release) —
-CMD alembic upgrade head && uvicorn app.main:app --host 0.0.0.0 --port ${PORT:-8000}
+# running them here too would race across replicas/restarts.
 CMD uvicorn app.main:app --host 0.0.0.0 --port ${PORT:-8000}
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -85,6 +85,10 @@ class Settings(BaseSettings):
    # Security
    BCRYPT_ROUNDS: int = 12
    # Rate limiting — independent of DEBUG so PR/staging envs running with
    # DEBUG=true still rate-limit auth and AI endpoints.
    RATE_LIMIT_ENABLED: bool = True
    # Security Headers
    CSP_REPORT_ONLY: bool = True  # Set False to enforce CSP
    CSP_EXTRA_SCRIPT_SOURCES: list[str] = []  # Additional script-src domains
@@ -255,6 +259,18 @@ class Settings(BaseSettings):
    MS_CLIENT_SECRET: Optional[str] = None
    OAUTH_REDIRECT_BASE: str = "http://localhost:5173"
    @field_validator("OAUTH_REDIRECT_BASE", mode="after")
    @classmethod
    def reject_localhost_redirect_in_production(cls, v: str, info) -> str:
        """Reject a loopback OAUTH_REDIRECT_BASE unless DEBUG is enabled.

        OAuth code exchange against a localhost redirect_uri is always a
        misconfiguration outside DEBUG — fail at boot, not at first sign-in.

        The check inspects the URL's parsed host instead of a string prefix,
        so ``https://localhost``, ``http://127.0.0.1``, ``http://[::1]`` and
        mixed-case variants are rejected as well, while a public host that
        merely begins with "localhost" (e.g. ``localhost.example.com``) is
        accepted.

        Args:
            v: The OAUTH_REDIRECT_BASE value being validated.
            info: Validation context; ``info.data`` is consulted for DEBUG.

        Returns:
            ``v`` unchanged when it is acceptable.

        Raises:
            ValueError: If DEBUG is falsy and the host is a loopback address.
        """
        import ipaddress
        from urllib.parse import urlsplit

        # NOTE(review): info.data only holds fields validated before this one,
        # so this presumably relies on DEBUG being declared earlier in
        # Settings — confirm. When DEBUG is absent we assume production.
        if info.data.get("DEBUG", False):
            return v

        try:
            host = (urlsplit(v.strip()).hostname or "").lower().rstrip(".")
        except ValueError:
            # Unparseable authority (e.g. unbalanced IPv6 brackets): keep the
            # legacy prefix check so nothing previously rejected slips through.
            host = "localhost" if v.startswith("http://localhost") else ""

        if host == "localhost" or host.endswith(".localhost"):
            is_loopback = True
        else:
            try:
                is_loopback = ipaddress.ip_address(host).is_loopback
            except ValueError:
                # Not an IP literal (or empty) — treat as a regular DNS name.
                is_loopback = False

        if is_loopback:
            raise ValueError(
                "OAUTH_REDIRECT_BASE must be set to the public frontend URL in production"
            )
        return v
    # Monitoring
    SENTRY_DSN: Optional[str] = None
--- a/backend/app/core/database.py
+++ b/backend/app/core/database.py
@@ -7,7 +7,10 @@ from app.core.tenant_context import register_tenant_listener
 engine = create_async_engine(
    settings.DATABASE_URL,
    echo=settings.DEBUG,
-    future=True
+    future=True,
    # Detect connections dropped by DB restarts/maintenance instead of failing requests
    pool_pre_ping=True,
    pool_recycle=1800,
 )
 # Create async session factory
--- a/backend/app/core/rate_limit.py
+++ b/backend/app/core/rate_limit.py
@@ -3,4 +3,4 @@ from slowapi.util import get_remote_address
 from app.core.config import settings
-limiter = Limiter(key_func=get_remote_address, enabled=not settings.DEBUG)
+limiter = Limiter(key_func=get_remote_address, enabled=settings.RATE_LIMIT_ENABLED)
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -15,7 +15,9 @@ if settings.SENTRY_DSN:
    sentry_sdk.init(
        dsn=settings.SENTRY_DSN,
        environment="development" if settings.DEBUG else "production",
-        send_default_pii=True,
+        # PII (headers, bodies, IPs) only in dev — prod events must not capture
        # auth tokens or customer data from a multi-tenant MSP product.
        send_default_pii=settings.DEBUG,
        traces_sample_rate=1.0 if settings.DEBUG else 0.2,
        # Profiling — included in free plan
        profiles_sample_rate=1.0 if settings.DEBUG else 0.2,
@@ -151,6 +153,21 @@ async def lifespan(app: FastAPI):
    logger.info("Starting ResolutionFlow API server...")
    logger.info(f"Environment: {'Development' if settings.DEBUG else 'Production'}")
    logger.info(f"ALLOW_RAILWAY_ORIGINS: {settings.ALLOW_RAILWAY_ORIGINS}")
    # Self-serve signup is broken without these — fail loudly at boot, not at the
    # first customer's signup attempt.
    if settings.SELF_SERVE_ENABLED and not settings.DEBUG:
        missing = [
            name for name, value in (
                ("RESEND_API_KEY", settings.RESEND_API_KEY),
                ("ANTHROPIC_API_KEY", settings.ANTHROPIC_API_KEY),
                ("FRONTEND_URL", settings.FRONTEND_URL),
            ) if not value
        ]
        if missing:
            raise RuntimeError(
                f"SELF_SERVE_ENABLED=true but required settings are unset: {', '.join(missing)}"
            )
    # Note: In production, use Alembic migrations instead of init_db
    # await init_db()
@@ -170,6 +187,7 @@ async def lifespan(app: FastAPI):
        hours=1,
        id="cleanup_ai_conversations",
        replace_existing=True,
        max_instances=1,
    )
    # Chat retention cleanup (daily)
@@ -179,6 +197,7 @@ async def lifespan(app: FastAPI):
        hours=24,
        id="cleanup_expired_chats",
        replace_existing=True,
        max_instances=1,
    )
    # Auto-archive stale AI chat sessions (daily at 3 AM)
@@ -188,6 +207,7 @@ async def lifespan(app: FastAPI):
        hour=3,
        id="archive_stale_ai_sessions",
        replace_existing=True,
        max_instances=1,
    )
    # PSA push retry (every 5 minutes)
@@ -198,6 +218,7 @@ async def lifespan(app: FastAPI):
        minutes=5,
        id="psa_push_retry",
        replace_existing=True,
        max_instances=1,
    )
    # Knowledge Flywheel analysis (every 5 minutes)
@@ -252,9 +273,10 @@ app = FastAPI(
    title=settings.APP_NAME,
    description="ResolutionFlow - Take the path MOST traveled. Guided troubleshooting with automatic documentation.",
    version="1.0.0",
-    docs_url="/api/docs",
+    # Interactive docs + schema are dev-only; prod must not expose the full API surface.
-    redoc_url="/api/redoc",
+    docs_url="/api/docs" if settings.DEBUG else None,
-    openapi_url="/api/openapi.json",
+    redoc_url="/api/redoc" if settings.DEBUG else None,
    openapi_url="/api/openapi.json" if settings.DEBUG else None,
    lifespan=lifespan
 )
--- a/backend/tests/conftest.py
+++ b/backend/tests/conftest.py
@@ -29,6 +29,12 @@ from app import models as _models  # noqa: F401
 # Disable invite code requirement for tests
 settings.REQUIRE_INVITE_CODE = False
 # Disable rate limiting in tests — auth-heavy suites would trip the
 # per-minute login/register limits. The limiter is constructed at import
 # time, so flip the live instance rather than the setting.
 from app.core.rate_limit import limiter as _limiter  # noqa: E402
 _limiter.enabled = False
 # Test database URL — NEVER reuse DATABASE_URL. The test_db fixture does
 # `DROP SCHEMA public CASCADE` on every test; if DATABASE_URL (which normally
 # points at the dev/prod DB) leaked into this value, running `pytest tests/`
--- a/frontend/Dockerfile
+++ b/frontend/Dockerfile
@@ -13,6 +13,10 @@ RUN npm ci
 COPY . .
 # Build arguments (set at build time)
 # NOTE: VITE_SELF_SERVE_ENABLED is only a network-error fallback — the live
 # toggle is the backend /config/public flag; rebuilding with a new value here
 # does NOT flip self-serve. VITE_STRIPE_PUBLISHABLE_KEY is currently unused
 # (checkout is fully backend-driven); kept for a future client-side Stripe.js.
 ARG VITE_API_URL
 ARG VITE_SENTRY_DSN
 ARG VITE_PUBLIC_POSTHOG_KEY
--- a/frontend/src/pages/AssistantChatPage.tsx
+++ b/frontend/src/pages/AssistantChatPage.tsx
@@ -266,11 +266,13 @@ export default function AssistantChatPage() {
  const loadedChatIdsRef = useRef<Set<string>>(new Set())
  const guardCurrentChat = useCallback((expectedChatId: string, source: string) => {
    if (currentChatRef.current === expectedChatId) return true
-    console.warn('[AssistantChat] Discarded stale async result', {
+    if (import.meta.env.DEV) {
-      source,
+      console.warn('[AssistantChat] Discarded stale async result', {
-      expectedChatId,
+        source,
-      currentChatId: currentChatRef.current,
+        expectedChatId,
-    })
+        currentChatId: currentChatRef.current,
      })
    }
    return false
  }, [])
@@ -3,4 +3,4 @@ from slowapi.util import get_remote_address
 from app.core.config import settings
-limiter = Limiter(key_func=get_remote_address, enabled=not settings.DEBUG)
+limiter = Limiter(key_func=get_remote_address, enabled=settings.RATE_LIMIT_ENABLED)