fix(prod): harden production configuration for launch
Some checks failed
Mirror to GitHub / mirror (push) Successful in 5s
CI / frontend (pull_request) Successful in 7m12s
CI / e2e (pull_request) Failing after 6m56s
CI / backend (pull_request) Failing after 9m55s

- Gate Swagger/ReDoc/OpenAPI behind DEBUG (no public API schema in prod)
- Sentry send_default_pii only in dev (no auth headers/bodies in events)
- Remove alembic from Dockerfile CMD (releaseCommand owns migrations; CMD copy raced across replicas/restarts)
- Decouple rate limiting from DEBUG via RATE_LIMIT_ENABLED (PR envs with DEBUG=true were unlimited); tests disable the live limiter in conftest
- max_instances=1 on the 4 scheduler jobs missing it
- Boot-time failure when SELF_SERVE_ENABLED is set outside DEBUG without RESEND_API_KEY/ANTHROPIC_API_KEY/FRONTEND_URL
- Reject localhost OAUTH_REDIRECT_BASE outside DEBUG
- pool_pre_ping + pool_recycle on the app engine
- Frontend: DEV-gate stale-async console.warn; document VITE_SELF_SERVE_ENABLED fallback semantics in Dockerfile

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-06-12 19:22:35 -04:00
parent b69447767a
commit c4947218a4
9 changed files with 70 additions and 14 deletions

View File

@@ -40,3 +40,5 @@ STRIPE_WEBHOOK_SECRET=whsec_
# before the public flip. Empty by default.
SELF_SERVE_ENABLED=false
INTERNAL_TESTER_EMAILS=
# Rate limiting (decoupled from DEBUG). The false below is for local
# development only; set RATE_LIMIT_ENABLED=true in PR/staging/prod.
RATE_LIMIT_ENABLED=false

View File

@@ -24,5 +24,6 @@ COPY . .
# Expose port (Railway uses PORT env variable)
EXPOSE 8000
# Run migrations then start the application
CMD alembic upgrade head && uvicorn app.main:app --host 0.0.0.0 --port ${PORT:-8000}
# Migrations run exclusively via Railway releaseCommand (scripts/release) —
# running them here too would race across replicas/restarts.
CMD uvicorn app.main:app --host 0.0.0.0 --port ${PORT:-8000}

View File

@@ -85,6 +85,10 @@ class Settings(BaseSettings):
# Security
BCRYPT_ROUNDS: int = 12
# Rate limiting — independent of DEBUG so PR/staging envs running with
# DEBUG=true still rate-limit auth and AI endpoints.
RATE_LIMIT_ENABLED: bool = True
# Security Headers
CSP_REPORT_ONLY: bool = True # Set False to enforce CSP
CSP_EXTRA_SCRIPT_SOURCES: list[str] = [] # Additional script-src domains
@@ -259,6 +263,18 @@ class Settings(BaseSettings):
MS_CLIENT_SECRET: Optional[str] = None
OAUTH_REDIRECT_BASE: str = "http://localhost:5173"
@field_validator("OAUTH_REDIRECT_BASE", mode="after")
@classmethod
def reject_localhost_redirect_in_production(cls, v: str, info) -> str:
    """Reject a loopback OAUTH_REDIRECT_BASE unless DEBUG is enabled.

    OAuth code exchange against a localhost redirect_uri is always a
    misconfiguration outside DEBUG, so fail at boot, not at first sign-in.

    The host is parsed out of the URL rather than prefix-matched, so that
    ``https://localhost``, ``http://127.0.0.1`` and ``http://[::1]`` are
    rejected too, while a public host that merely starts with "localhost"
    (e.g. ``http://localhost.example.com``) is accepted.

    Args:
        v: The configured OAUTH_REDIRECT_BASE value.
        info: Validation info; ``info.data`` is read for the DEBUG flag.

    Returns:
        ``v`` unchanged when it is acceptable.

    Raises:
        ValueError: If DEBUG is off and ``v`` points at a loopback host.
    """
    # Local imports keep this block self-contained within the settings module.
    from ipaddress import ip_address
    from urllib.parse import urlsplit

    # A missing DEBUG entry is treated as production (fail closed).
    # NOTE(review): info.data only holds fields validated before this one, so
    # this assumes DEBUG is declared earlier in the class; confirm.
    if info.data.get("DEBUG", False):
        return v

    try:
        host = (urlsplit(v).hostname or "").lower().rstrip(".")
    except ValueError:
        # Unparseable URL: not this validator's concern, so do not flag it here.
        host = ""

    is_loopback = host == "localhost" or host.endswith(".localhost")
    if host and not is_loopback:
        try:
            is_loopback = ip_address(host).is_loopback
        except ValueError:
            # Not an IP literal; an ordinary DNS name is fine.
            is_loopback = False

    if is_loopback:
        raise ValueError(
            "OAUTH_REDIRECT_BASE must be set to the public frontend URL in production"
        )
    return v
# Monitoring
SENTRY_DSN: Optional[str] = None

View File

@@ -7,7 +7,10 @@ from app.core.tenant_context import register_tenant_listener
engine = create_async_engine(
settings.DATABASE_URL,
echo=settings.DEBUG,
future=True
future=True,
# Detect connections dropped by DB restarts/maintenance instead of failing requests
pool_pre_ping=True,
pool_recycle=1800,
)
# Create async session factory

View File

@@ -3,4 +3,4 @@ from slowapi.util import get_remote_address
from app.core.config import settings
limiter = Limiter(key_func=get_remote_address, enabled=not settings.DEBUG)
limiter = Limiter(key_func=get_remote_address, enabled=settings.RATE_LIMIT_ENABLED)

View File

@@ -15,7 +15,9 @@ if settings.SENTRY_DSN:
sentry_sdk.init(
dsn=settings.SENTRY_DSN,
environment="development" if settings.DEBUG else "production",
send_default_pii=True,
# PII (headers, bodies, IPs) only in dev — prod events must not capture
# auth tokens or customer data from a multi-tenant MSP product.
send_default_pii=settings.DEBUG,
traces_sample_rate=1.0 if settings.DEBUG else 0.2,
# Profiling — included in free plan
profiles_sample_rate=1.0 if settings.DEBUG else 0.2,
@@ -151,6 +153,21 @@ async def lifespan(app: FastAPI):
logger.info("Starting ResolutionFlow API server...")
logger.info(f"Environment: {'Development' if settings.DEBUG else 'Production'}")
logger.info(f"ALLOW_RAILWAY_ORIGINS: {settings.ALLOW_RAILWAY_ORIGINS}")
# Self-serve signup is broken without these — fail loudly at boot, not at the
# first customer's signup attempt.
if settings.SELF_SERVE_ENABLED and not settings.DEBUG:
missing = [
name for name, value in (
("RESEND_API_KEY", settings.RESEND_API_KEY),
("ANTHROPIC_API_KEY", settings.ANTHROPIC_API_KEY),
("FRONTEND_URL", settings.FRONTEND_URL),
) if not value
]
if missing:
raise RuntimeError(
f"SELF_SERVE_ENABLED=true but required settings are unset: {', '.join(missing)}"
)
# Note: In production, use Alembic migrations instead of init_db
# await init_db()
@@ -170,6 +187,7 @@ async def lifespan(app: FastAPI):
hours=1,
id="cleanup_ai_conversations",
replace_existing=True,
max_instances=1,
)
# Chat retention cleanup (daily)
@@ -179,6 +197,7 @@ async def lifespan(app: FastAPI):
hours=24,
id="cleanup_expired_chats",
replace_existing=True,
max_instances=1,
)
# Auto-archive stale AI chat sessions (daily at 3 AM)
@@ -188,6 +207,7 @@ async def lifespan(app: FastAPI):
hour=3,
id="archive_stale_ai_sessions",
replace_existing=True,
max_instances=1,
)
# PSA push retry (every 5 minutes)
@@ -198,6 +218,7 @@ async def lifespan(app: FastAPI):
minutes=5,
id="psa_push_retry",
replace_existing=True,
max_instances=1,
)
# Knowledge Flywheel analysis (every 5 minutes)
@@ -252,9 +273,10 @@ app = FastAPI(
title=settings.APP_NAME,
description="ResolutionFlow - Take the path MOST traveled. Guided troubleshooting with automatic documentation.",
version="1.0.0",
docs_url="/api/docs",
redoc_url="/api/redoc",
openapi_url="/api/openapi.json",
# Interactive docs + schema are dev-only; prod must not expose the full API surface.
docs_url="/api/docs" if settings.DEBUG else None,
redoc_url="/api/redoc" if settings.DEBUG else None,
openapi_url="/api/openapi.json" if settings.DEBUG else None,
lifespan=lifespan
)

View File

@@ -29,6 +29,12 @@ from app import models as _models # noqa: F401
# Disable invite code requirement for tests
settings.REQUIRE_INVITE_CODE = False
# Disable rate limiting in tests — auth-heavy suites would trip the
# per-minute login/register limits. The limiter is constructed at import
# time, so flip the live instance rather than the setting.
from app.core.rate_limit import limiter as _limiter # noqa: E402
_limiter.enabled = False
# Test database URL — NEVER reuse DATABASE_URL. The test_db fixture does
# `DROP SCHEMA public CASCADE` on every test; if DATABASE_URL (which normally
# points at the dev/prod DB) leaked into this value, running `pytest tests/`

View File

@@ -13,6 +13,10 @@ RUN npm ci
COPY . .
# Build arguments (set at build time)
# NOTE: VITE_SELF_SERVE_ENABLED is only a network-error fallback — the live
# toggle is the backend /config/public flag; rebuilding with a new value here
# does NOT flip self-serve. VITE_STRIPE_PUBLISHABLE_KEY is currently unused
# (checkout is fully backend-driven); kept for a future client-side Stripe.js.
ARG VITE_API_URL
ARG VITE_SENTRY_DSN
ARG VITE_PUBLIC_POSTHOG_KEY

View File

@@ -266,11 +266,13 @@ export default function AssistantChatPage() {
const loadedChatIdsRef = useRef<Set<string>>(new Set())
// Staleness guard for async work: returns true when `expectedChatId` still
// matches the chat id held in `currentChatRef`, false otherwise. `source` is
// only used to label the dev-mode warning. Presumably callers use a false
// result to drop a response that belongs to a chat the user has since left.
// The empty dependency list is safe because the current chat is read through
// the ref at call time rather than captured from render scope.
const guardCurrentChat = useCallback((expectedChatId: string, source: string) => {
if (currentChatRef.current === expectedChatId) return true
// Mismatch: warn only when import.meta.env.DEV is set, so production
// consoles stay quiet; the false return below applies in every environment.
if (import.meta.env.DEV) {
console.warn('[AssistantChat] Discarded stale async result', {
source,
expectedChatId,
currentChatId: currentChatRef.current,
})
}
return false
}, [])