"""Sensitive data redaction service for export content. Applies regex-based pattern matching to mask IPs, emails, tokens, and UNC paths. Redaction is non-persistent and request-scoped — database records are never mutated. """ import re from dataclasses import dataclass, field from typing import Callable @dataclass class RedactionSummary: ips: int = 0 emails: int = 0 tokens: int = 0 unc_paths: int = 0 @property def total(self) -> int: return self.ips + self.emails + self.tokens + self.unc_paths def to_dict(self) -> dict: return { "ips": self.ips, "emails": self.emails, "tokens": self.tokens, "unc_paths": self.unc_paths, "total": self.total, } # --- Compiled patterns (module-level, not per-request) --- # Order matters: more specific/longer patterns first to prevent partial matches. _PATTERNS: list[tuple[re.Pattern, str, str]] = [ # 1. Bearer tokens (before general token detection) ( re.compile(r"Bearer\s+[A-Za-z0-9._\-]+", re.ASCII), "[TOKEN REDACTED]", "tokens", ), # 2. API key / long hex-base64 strings (32+ chars of hex/base64 characters) ( re.compile(r"\b[A-Za-z0-9+/=_\-]{32,}\b", re.ASCII), "[TOKEN REDACTED]", "tokens", ), # 3. UNC paths (\\server\share) ( re.compile(r"\\\\[\w.\-]+\\[\w$.\-]+"), "[UNC PATH REDACTED]", "unc_paths", ), # 4. Email addresses ( re.compile(r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b"), "[EMAIL REDACTED]", "emails", ), # 5. IPv6 (before IPv4 to avoid partial matches on mixed notation) ( re.compile(r"\b(?:[0-9a-fA-F]{1,4}:){2,7}[0-9a-fA-F]{1,4}\b"), "[IP REDACTED]", "ips", ), # 6. IPv4 ( re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"), "[IP REDACTED]", "ips", ), ] def apply_redaction_to_text(content: str) -> tuple[str, RedactionSummary]: """Apply all redaction patterns to text content. Uses re.subn for replacement + counting in one pass per pattern. Patterns are applied in priority order (most specific first). Returns (redacted_content, summary). """ if not content: return content, RedactionSummary() summary = RedactionSummary() for pattern, replacement, category in _PATTERNS: content, count = pattern.subn(replacement, content) if count > 0: current = getattr(summary, category) setattr(summary, category, current + count) return content, summary def format_redaction_footer(summary: RedactionSummary) -> str: """Build a human-readable footer line summarizing what was redacted.""" if summary.total == 0: return "" parts = [] if summary.ips > 0: parts.append(f"{summary.ips} IP{'s' if summary.ips != 1 else ''}") if summary.emails > 0: parts.append(f"{summary.emails} email{'s' if summary.emails != 1 else ''}") if summary.tokens > 0: parts.append(f"{summary.tokens} token{'s' if summary.tokens != 1 else ''}") if summary.unc_paths > 0: parts.append(f"{summary.unc_paths} UNC path{'s' if summary.unc_paths != 1 else ''}") return f"\n--- Redacted: {', '.join(parts)} ---"