From 4dc4894fc7a6a8ab7f45de4f361afe88e070581f Mon Sep 17 00:00:00 2001 From: Michael Chihlas Date: Fri, 12 Jun 2026 19:22:35 -0400 Subject: [PATCH] fix(prod): harden production configuration for launch - Gate Swagger/ReDoc/OpenAPI behind DEBUG (no public API schema in prod) - Sentry send_default_pii only in dev (no auth headers/bodies in events) - Remove alembic from Dockerfile CMD (releaseCommand owns migrations; CMD copy raced across replicas/restarts) - Decouple rate limiting from DEBUG via RATE_LIMIT_ENABLED (PR envs with DEBUG=true were unlimited); tests disable the live limiter in conftest - max_instances=1 on the 4 scheduler jobs missing it - Fail at boot (outside DEBUG) when SELF_SERVE_ENABLED is set without RESEND_API_KEY/ANTHROPIC_API_KEY/FRONTEND_URL - Reject localhost OAUTH_REDIRECT_BASE outside DEBUG - pool_pre_ping + pool_recycle on the app engine - Frontend: DEV-gate stale-async console.warn; document VITE_SELF_SERVE_ENABLED fallback semantics in Dockerfile Co-Authored-By: Claude Opus 4.7 --- backend/.env.example | 4 +++- backend/Dockerfile | 5 ++-- backend/app/core/config.py | 16 +++++++++++++ backend/app/core/database.py | 5 +++- backend/app/core/rate_limit.py | 2 +- backend/app/main.py | 30 ++++++++++++++++++++---- backend/tests/conftest.py | 6 +++++ frontend/Dockerfile | 4 ++++ frontend/src/pages/AssistantChatPage.tsx | 12 ++++++---- 9 files changed, 70 insertions(+), 14 deletions(-) diff --git a/backend/.env.example b/backend/.env.example index 05d46396..b7f21d3b 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -39,4 +39,6 @@ STRIPE_WEBHOOK_SECRET=whsec_ # global flag for specific users — used for prod test-mode validation # before the public flip. Empty by default. 
SELF_SERVE_ENABLED=false -INTERNAL_TESTER_EMAILS= \ No newline at end of file +INTERNAL_TESTER_EMAILS= +# Rate limiting (decoupled from DEBUG). false here is for local dev only; keep true in PR/staging/prod +RATE_LIMIT_ENABLED=false diff --git a/backend/Dockerfile b/backend/Dockerfile index 21f5ab52..45c9c853 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -24,5 +24,6 @@ COPY . . # Expose port (Railway uses PORT env variable) EXPOSE 8000 -# Run migrations then start the application -CMD alembic upgrade head && uvicorn app.main:app --host 0.0.0.0 --port ${PORT:-8000} +# Migrations run exclusively via Railway releaseCommand (scripts/release) — +# running them here too would race across replicas/restarts. +CMD uvicorn app.main:app --host 0.0.0.0 --port ${PORT:-8000} diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 5f215cda..c2a3198f 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -85,6 +85,10 @@ class Settings(BaseSettings): # Security BCRYPT_ROUNDS: int = 12 + # Rate limiting — independent of DEBUG so PR/staging envs running with + # DEBUG=true still rate-limit auth and AI endpoints. 
+ RATE_LIMIT_ENABLED: bool = True + # Security Headers CSP_REPORT_ONLY: bool = True # Set False to enforce CSP CSP_EXTRA_SCRIPT_SOURCES: list[str] = [] # Additional script-src domains @@ -255,6 +259,18 @@ class Settings(BaseSettings): MS_CLIENT_SECRET: Optional[str] = None OAUTH_REDIRECT_BASE: str = "http://localhost:5173" + @field_validator("OAUTH_REDIRECT_BASE", mode="after") + @classmethod + def reject_localhost_redirect_in_production(cls, v: str, info) -> str: + """OAuth code exchange against a localhost redirect_uri is always a + misconfiguration outside DEBUG — fail at boot, not at first sign-in.""" + debug = info.data.get("DEBUG", False) + if not debug and v.startswith("http://localhost"): + raise ValueError( + "OAUTH_REDIRECT_BASE must be set to the public frontend URL in production" + ) + return v + # Monitoring SENTRY_DSN: Optional[str] = None diff --git a/backend/app/core/database.py b/backend/app/core/database.py index c8132156..4f963913 100644 --- a/backend/app/core/database.py +++ b/backend/app/core/database.py @@ -7,7 +7,10 @@ from app.core.tenant_context import register_tenant_listener engine = create_async_engine( settings.DATABASE_URL, echo=settings.DEBUG, - future=True + future=True, + # Detect connections dropped by DB restarts/maintenance instead of failing requests + pool_pre_ping=True, + pool_recycle=1800, ) # Create async session factory diff --git a/backend/app/core/rate_limit.py b/backend/app/core/rate_limit.py index c2ca6a1a..1fff89ab 100644 --- a/backend/app/core/rate_limit.py +++ b/backend/app/core/rate_limit.py @@ -3,4 +3,4 @@ from slowapi.util import get_remote_address from app.core.config import settings -limiter = Limiter(key_func=get_remote_address, enabled=not settings.DEBUG) +limiter = Limiter(key_func=get_remote_address, enabled=settings.RATE_LIMIT_ENABLED) diff --git a/backend/app/main.py b/backend/app/main.py index b6a77f62..adeb14e6 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -15,7 +15,9 @@ if 
settings.SENTRY_DSN: sentry_sdk.init( dsn=settings.SENTRY_DSN, environment="development" if settings.DEBUG else "production", - send_default_pii=True, + # PII (headers, bodies, IPs) only in dev — prod events must not capture + # auth tokens or customer data from a multi-tenant MSP product. + send_default_pii=settings.DEBUG, traces_sample_rate=1.0 if settings.DEBUG else 0.2, # Profiling — included in free plan profiles_sample_rate=1.0 if settings.DEBUG else 0.2, @@ -151,6 +153,21 @@ async def lifespan(app: FastAPI): logger.info("Starting ResolutionFlow API server...") logger.info(f"Environment: {'Development' if settings.DEBUG else 'Production'}") logger.info(f"ALLOW_RAILWAY_ORIGINS: {settings.ALLOW_RAILWAY_ORIGINS}") + + # Self-serve signup is broken without these — fail loudly at boot, not at the + # first customer's signup attempt. + if settings.SELF_SERVE_ENABLED and not settings.DEBUG: + missing = [ + name for name, value in ( + ("RESEND_API_KEY", settings.RESEND_API_KEY), + ("ANTHROPIC_API_KEY", settings.ANTHROPIC_API_KEY), + ("FRONTEND_URL", settings.FRONTEND_URL), + ) if not value + ] + if missing: + raise RuntimeError( + f"SELF_SERVE_ENABLED=true but required settings are unset: {', '.join(missing)}" + ) # Note: In production, use Alembic migrations instead of init_db # await init_db() @@ -170,6 +187,7 @@ async def lifespan(app: FastAPI): hours=1, id="cleanup_ai_conversations", replace_existing=True, + max_instances=1, ) # Chat retention cleanup (daily) @@ -179,6 +197,7 @@ async def lifespan(app: FastAPI): hours=24, id="cleanup_expired_chats", replace_existing=True, + max_instances=1, ) # Auto-archive stale AI chat sessions (daily at 3 AM) @@ -188,6 +207,7 @@ async def lifespan(app: FastAPI): hour=3, id="archive_stale_ai_sessions", replace_existing=True, + max_instances=1, ) # PSA push retry (every 5 minutes) @@ -198,6 +218,7 @@ async def lifespan(app: FastAPI): minutes=5, id="psa_push_retry", replace_existing=True, + max_instances=1, ) # Knowledge Flywheel 
analysis (every 5 minutes) @@ -252,9 +273,10 @@ app = FastAPI( title=settings.APP_NAME, description="ResolutionFlow - Take the path MOST traveled. Guided troubleshooting with automatic documentation.", version="1.0.0", - docs_url="/api/docs", - redoc_url="/api/redoc", - openapi_url="/api/openapi.json", + # Interactive docs + schema are dev-only; prod must not expose the full API surface. + docs_url="/api/docs" if settings.DEBUG else None, + redoc_url="/api/redoc" if settings.DEBUG else None, + openapi_url="/api/openapi.json" if settings.DEBUG else None, lifespan=lifespan ) diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py index cd4aa10b..49ab8ba2 100644 --- a/backend/tests/conftest.py +++ b/backend/tests/conftest.py @@ -29,6 +29,12 @@ from app import models as _models # noqa: F401 # Disable invite code requirement for tests settings.REQUIRE_INVITE_CODE = False +# Disable rate limiting in tests — auth-heavy suites would trip the +# per-minute login/register limits. The limiter is constructed at import +# time, so flip the live instance rather than the setting. +from app.core.rate_limit import limiter as _limiter # noqa: E402 +_limiter.enabled = False + # Test database URL — NEVER reuse DATABASE_URL. The test_db fixture does # `DROP SCHEMA public CASCADE` on every test; if DATABASE_URL (which normally # points at the dev/prod DB) leaked into this value, running `pytest tests/` diff --git a/frontend/Dockerfile b/frontend/Dockerfile index 66b67c4a..3c6dd012 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -13,6 +13,10 @@ RUN npm ci COPY . . # Build arguments (set at build time) +# NOTE: VITE_SELF_SERVE_ENABLED is only a network-error fallback — the live +# toggle is the backend /config/public flag; rebuilding with a new value here +# does NOT flip self-serve. VITE_STRIPE_PUBLISHABLE_KEY is currently unused +# (checkout is fully backend-driven); kept for a future client-side Stripe.js. 
ARG VITE_API_URL ARG VITE_SENTRY_DSN ARG VITE_PUBLIC_POSTHOG_KEY diff --git a/frontend/src/pages/AssistantChatPage.tsx b/frontend/src/pages/AssistantChatPage.tsx index bcd214de..a46f02b9 100644 --- a/frontend/src/pages/AssistantChatPage.tsx +++ b/frontend/src/pages/AssistantChatPage.tsx @@ -266,11 +266,13 @@ export default function AssistantChatPage() { const loadedChatIdsRef = useRef>(new Set()) const guardCurrentChat = useCallback((expectedChatId: string, source: string) => { if (currentChatRef.current === expectedChatId) return true - console.warn('[AssistantChat] Discarded stale async result', { - source, - expectedChatId, - currentChatId: currentChatRef.current, - }) + if (import.meta.env.DEV) { + console.warn('[AssistantChat] Discarded stale async result', { + source, + expectedChatId, + currentChatId: currentChatRef.current, + }) + } return false }, [])