Files
resolutionflow/backend/app/services/redaction_service.py
chihlasm 303570ca2c feat: add sensitive data redaction to export (Phase C)
Server-side regex redaction masks IPs, emails, bearer/API tokens, and
UNC paths in exported session content. Redaction runs post-generation
and post-variable-resolution with fail-closed error handling. Frontend
gets a "Mask Sensitive Data" toggle in the export preview modal with
a summary of what was redacted. 24 unit tests passing, frontend build clean.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-14 00:11:20 -05:00

114 lines
3.3 KiB
Python

"""Sensitive data redaction service for export content.
Applies regex-based pattern matching to mask IPs, emails, tokens, and UNC paths.
Redaction is non-persistent and request-scoped — database records are never mutated.
"""
import re
from dataclasses import dataclass, field
from typing import Callable
@dataclass
class RedactionSummary:
    """Per-category tally of the values masked during one redaction run.

    Every counter defaults to zero and instances are mutable, so counters
    can be updated in place after construction.
    """

    # Number of IP address matches that were masked.
    ips: int = 0
    # Number of email address matches that were masked.
    emails: int = 0
    # Number of token / API-key matches that were masked.
    tokens: int = 0
    # Number of UNC path matches that were masked.
    unc_paths: int = 0

    @property
    def total(self) -> int:
        """Return the sum of all four category counters."""
        return sum((self.ips, self.emails, self.tokens, self.unc_paths))

    def to_dict(self) -> dict:
        """Return the counters plus the derived ``total`` as a plain dict."""
        return dict(
            ips=self.ips,
            emails=self.emails,
            tokens=self.tokens,
            unc_paths=self.unc_paths,
            total=self.total,
        )
# --- Compiled patterns (module-level, not per-request) ---
# Each entry is (compiled regex, replacement placeholder, name of the
# RedactionSummary counter to increment).
#
# The patterns are applied one after another in list order, each one working
# on the output of the previous one, so order matters:
#   * Structured patterns (Bearer header, UNC path, email, IP) come first.
#   * The generic "long token" pattern comes last. It is the least specific
#     pattern; if it ran earlier it could swallow part of an email or UNC
#     path (e.g. a 32+ character local part or share name), after which the
#     structured pattern would no longer match and the remaining part (the
#     domain or server name) would stay in clear text.
#   * IPv6 runs before IPv4 so that IPv4-mapped IPv6 addresses are masked as
#     a single unit instead of being split.
_PATTERNS: list[tuple[re.Pattern[str], str, str]] = [
    # 1. Bearer tokens. The character class follows the token syntax of
    #    RFC 6750 (letters, digits, "-", ".", "_", "~", "+", "/", then
    #    optional "=" padding) so tokens containing base64 characters are
    #    masked completely rather than only up to the first "+" or "/".
    (
        re.compile(r"Bearer\s+[A-Za-z0-9._~+/\-]+=*", re.ASCII),
        "[TOKEN REDACTED]",
        "tokens",
    ),
    # 2. UNC paths (\\server\share). Only the server and share components
    #    are matched; anything after the share name is left untouched.
    (
        re.compile(r"\\\\[\w.\-]+\\[\w$.\-]+"),
        "[UNC PATH REDACTED]",
        "unc_paths",
    ),
    # 3. Email addresses
    (
        re.compile(r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b"),
        "[EMAIL REDACTED]",
        "emails",
    ),
    # 4. IPv6 (before IPv4 to avoid partial matches on mixed notation).
    #    Three shapes are accepted:
    #      - compressed with groups before "::"   (fe80::1, 2001:db8::)
    #      - compressed starting with "::"        (::1, ::ffff:192.0.2.1)
    #      - an uncompressed run of 4-8 hex groups
    #    Requiring at least four groups in the uncompressed shape keeps
    #    HH:MM:SS timestamps from being masked as addresses, while longer
    #    colon-separated hex runs are still caught. Each shape may end in a
    #    dotted-quad tail so embedded IPv4 addresses are masked with it.
    (
        re.compile(
            r"""
            (?<!\w)
            (?:
                # Compressed form with groups before the double colon.
                [0-9A-Fa-f]{1,4}(?::[0-9A-Fa-f]{1,4}){0,6}::
                (?:
                    [0-9A-Fa-f]{1,4}(?::[0-9A-Fa-f]{1,4}){0,6}
                    (?:(?:\.\d{1,3}){3})?
                )?
              |
                # Compressed form that starts with the double colon.
                ::[0-9A-Fa-f]{1,4}(?::[0-9A-Fa-f]{1,4}){0,6}
                (?:(?:\.\d{1,3}){3})?
              |
                # Uncompressed run of four to eight hex groups.
                (?:[0-9A-Fa-f]{1,4}:){3,7}[0-9A-Fa-f]{1,4}
                (?:(?:\.\d{1,3}){3})?
            )
            (?!\w)
            """,
            re.VERBOSE,
        ),
        "[IP REDACTED]",
        "ips",
    ),
    # 5. IPv4
    (
        re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
        "[IP REDACTED]",
        "ips",
    ),
    # 6. API key / long hex-base64 strings (32+ chars of hex/base64
    #    characters). The run must still end on a word boundary, but any
    #    "+", "/" or "=" immediately after it (typically base64 padding) is
    #    consumed too so it does not remain next to the placeholder.
    (
        re.compile(r"\b[A-Za-z0-9+/=_\-]{32,}\b[+/=]*", re.ASCII),
        "[TOKEN REDACTED]",
        "tokens",
    ),
]
def apply_redaction_to_text(content: str) -> tuple[str, RedactionSummary]:
    """Mask every sensitive value found in ``content``.

    Each entry of ``_PATTERNS`` is applied in list order with ``subn``, which
    replaces and counts in a single pass per pattern. Later patterns operate
    on the text already rewritten by earlier ones.

    Args:
        content: Text to redact. A falsy value (empty string, ``None``) is
            returned unchanged.

    Returns:
        A ``(redacted_content, summary)`` tuple, where ``summary`` holds the
        number of replacements made per category.
    """
    summary = RedactionSummary()
    if not content:
        # Nothing to scan; hand back the input together with an all-zero tally.
        return content, summary

    redacted = content
    for regex, placeholder, counter_name in _PATTERNS:
        redacted, hits = regex.subn(placeholder, redacted)
        if hits:
            # Several patterns can share one counter, so accumulate.
            setattr(summary, counter_name, getattr(summary, counter_name) + hits)
    return redacted, summary
def format_redaction_footer(summary: RedactionSummary) -> str:
    """Render a one-line, human-readable footer describing what was redacted.

    Args:
        summary: Per-category redaction counts.

    Returns:
        An empty string when ``summary.total`` is zero. Otherwise a line that
        starts with a newline and lists each category with a positive count,
        in the order IPs, emails, tokens, UNC paths, pluralised as needed.
    """
    if summary.total == 0:
        return ""

    # (count, singular noun) pairs in the order they appear in the footer.
    labelled_counts = (
        (summary.ips, "IP"),
        (summary.emails, "email"),
        (summary.tokens, "token"),
        (summary.unc_paths, "UNC path"),
    )
    fragments = [
        f"{count} {noun}{'s' if count != 1 else ''}"
        for count, noun in labelled_counts
        if count > 0
    ]
    return f"\n--- Redacted: {', '.join(fragments)} ---"