Server-side regex redaction masks IPs, emails, bearer/API tokens, and UNC paths in exported session content. Redaction runs post-generation and post-variable-resolution with fail-closed error handling. Frontend gets a "Mask Sensitive Data" toggle in the export preview modal with a summary of what was redacted. 24 unit tests passing, frontend build clean. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
114 lines
3.3 KiB
Python
114 lines
3.3 KiB
Python
"""Sensitive data redaction service for export content.
|
|
|
|
Applies regex-based pattern matching to mask IPs, emails, tokens, and UNC paths.
|
|
Redaction is non-persistent and request-scoped — database records are never mutated.
|
|
"""
|
|
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from typing import Callable
|
|
|
|
|
|
@dataclass
class RedactionSummary:
    """Per-category tally of how many substrings were masked.

    All counters start at zero; callers bump them as replacements are made.
    """

    ips: int = 0  # replacements counted under the "ips" category
    emails: int = 0  # replacements counted under the "emails" category
    tokens: int = 0  # replacements counted under the "tokens" category
    unc_paths: int = 0  # replacements counted under the "unc_paths" category

    @property
    def total(self) -> int:
        """Return the sum of the four category counters."""
        return sum((self.ips, self.emails, self.tokens, self.unc_paths))

    def to_dict(self) -> dict:
        """Return the counters plus ``total`` as a plain dict.

        Keys appear in the order: ips, emails, tokens, unc_paths, total.
        """
        payload = {
            name: getattr(self, name)
            for name in ("ips", "emails", "tokens", "unc_paths")
        }
        payload["total"] = self.total
        return payload
|
|
|
|
|
|
# --- Compiled patterns (module-level, not per-request) ---
# Each entry is (compiled regex, replacement placeholder, summary counter name).
# Order matters: more specific/longer patterns first to prevent partial matches.

# Building blocks shared by the IP expressions below.
_HEXTET = r"[0-9a-fA-F]{1,4}"
_DOTTED_QUAD = r"(?:\d{1,3}\.){3}\d{1,3}"

_IPV6 = (
    r"\b(?:"
    # Compressed form: 1-7 leading groups, the "::" gap, then an optional tail
    # that may end in an embedded dotted quad (e.g. 64:ff9b::192.0.2.33).
    # A leading group is required so bare "::" in code (slices, scope
    # operators between non-hex identifiers) is left alone.
    rf"(?:{_HEXTET}:){{1,7}}:"
    rf"(?:(?:{_HEXTET}:){{0,6}}(?:{_DOTTED_QUAD}|{_HEXTET})\b|(?!\w))"
    r"|"
    # Uncompressed form: 3-8 single-colon groups, as before, now also
    # consuming a trailing dotted quad so its last octets are not left behind.
    # NOTE(review): this alternative also matches other colon-separated hex
    # runs such as clock times (12:34:56) and MAC addresses; kept so nothing
    # that was masked before becomes visible.
    rf"(?:{_HEXTET}:){{2,7}}(?:{_DOTTED_QUAD}|{_HEXTET})\b"
    r")"
)

_PATTERNS: list[tuple[re.Pattern, str, str]] = [
    # 1. Bearer tokens (before general token detection).
    #    Character set follows the RFC 6750 b64token grammar: the unreserved
    #    characters plus "~", "+", "/" and trailing "=" padding. Without
    #    "+", "/" and "=" the match stopped early and the rest of the token
    #    stayed in the output.
    (
        re.compile(r"Bearer\s+[A-Za-z0-9._~+/\-]+=*", re.ASCII),
        "[TOKEN REDACTED]",
        "tokens",
    ),
    # 2. API key / long hex-base64 strings (32+ chars of hex/base64 characters).
    #    The run must end on a word boundary; up to two "=" after that
    #    boundary are consumed so base64 padding does not trail the placeholder.
    (
        re.compile(r"\b[A-Za-z0-9+/=_\-]{32,}\b={0,2}", re.ASCII),
        "[TOKEN REDACTED]",
        "tokens",
    ),
    # 3. UNC paths (\\server\share). Only the server and share components are
    #    consumed; anything after the share name is left in place.
    (
        re.compile(r"\\\\[\w.\-]+\\[\w$.\-]+"),
        "[UNC PATH REDACTED]",
        "unc_paths",
    ),
    # 4. Email addresses
    (
        re.compile(r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b"),
        "[EMAIL REDACTED]",
        "emails",
    ),
    # 5. IPv6 (before IPv4 to avoid partial matches on mixed notation)
    (
        re.compile(_IPV6),
        "[IP REDACTED]",
        "ips",
    ),
    # 6. IPv4 (octet ranges are not validated; any dotted quad of 1-3 digit
    #    groups is masked)
    (
        re.compile(rf"\b{_DOTTED_QUAD}\b"),
        "[IP REDACTED]",
        "ips",
    ),
]
|
|
|
|
|
|
def apply_redaction_to_text(content: str) -> tuple[str, RedactionSummary]:
    """Mask every match of the module's redaction patterns in ``content``.

    Patterns run one after another in the order they are listed in
    ``_PATTERNS``; each pass substitutes its placeholder and reports how many
    substitutions it made, which is added to the matching summary counter.

    Falsy input (e.g. an empty string) is handed back untouched together
    with an all-zero summary.

    Returns (redacted_content, summary).
    """
    tally = RedactionSummary()
    if not content:
        return content, tally

    redacted = content
    for regex, placeholder, bucket in _PATTERNS:
        redacted, hits = regex.subn(placeholder, redacted)
        if hits:
            # ``bucket`` names the RedactionSummary attribute to increment.
            setattr(tally, bucket, getattr(tally, bucket) + hits)

    return redacted, tally
|
|
|
|
|
|
def format_redaction_footer(summary: RedactionSummary) -> str:
    """Render a one-line, human-readable recap of the redaction counts.

    Returns an empty string when ``summary.total`` is zero. Otherwise the
    result starts with a newline and lists each category with a positive
    count (IPs, emails, tokens, UNC paths, in that order), pluralised with a
    trailing "s" unless the count is exactly one.
    """
    if summary.total == 0:
        return ""

    labelled_counts = (
        (summary.ips, "IP"),
        (summary.emails, "email"),
        (summary.tokens, "token"),
        (summary.unc_paths, "UNC path"),
    )
    fragments = [
        f"{count} {noun}{'' if count == 1 else 's'}"
        for count, noun in labelled_counts
        if count > 0
    ]
    listing = ", ".join(fragments)
    return f"\n--- Redacted: {listing} ---"
|