# resolutionflow/backend/app/services/redaction_service.py

"""Sensitive data redaction service for export content.

Applies regex-based pattern matching to mask IPs, emails, tokens, and UNC paths.
Redaction is non-persistent and request-scoped — database records are never mutated.
"""

import re
from dataclasses import dataclass, field
from typing import Callable


@dataclass
class RedactionSummary:
    """Per-category tally of the substitutions made while redacting text."""

    # One counter per redaction category; each starts at zero.
    ips: int = 0
    emails: int = 0
    tokens: int = 0
    unc_paths: int = 0

    @property
    def total(self) -> int:
        """Return the combined number of redactions across all categories."""
        return sum((self.ips, self.emails, self.tokens, self.unc_paths))

    def to_dict(self) -> dict:
        """Return the four counters plus ``total`` as a plain dictionary."""
        counts = {
            name: getattr(self, name)
            for name in ("ips", "emails", "tokens", "unc_paths")
        }
        counts["total"] = self.total
        return counts


# --- Compiled patterns (module-level, not per-request) ---
# Order matters: more specific/longer patterns first to prevent partial matches.

# Building blocks for the IPv6 pattern.
_HEX_GROUP = r"[0-9A-Fa-f]{1,4}"
_DOTTED_QUAD = r"\d{1,3}(?:\.\d{1,3}){3}"
# Last component of an IPv6 address: an embedded dotted quad (tried first so
# "::ffff:10.0.0.1" is consumed whole) or a plain hex group.
_IPV6_LAST = rf"(?:{_DOTTED_QUAD}|{_HEX_GROUP})"
# Everything that may follow "::": up to six "group:" prefixes, then the last component.
_IPV6_TAIL = rf"(?:{_HEX_GROUP}:){{0,6}}{_IPV6_LAST}"

# Three alternatives, in order:
#   1. "::"-compressed with groups in front   (fe80::1, 2001:db8::, 2001:db8::1)
#   2. "::"-compressed with nothing in front  (::1, ::ffff:10.0.0.1)
#   3. uncompressed run of 4-8 groups         (full addresses; MAC-style runs are
#      still masked, as they were before)
# Requiring "::" or at least four groups keeps clock times such as 12:34:56
# from being mistaken for addresses.
_IPV6_RE = re.compile(
    rf"""
      (?<!\w) {_HEX_GROUP} (?: :{_HEX_GROUP} ){{0,6}} :: (?: {_IPV6_TAIL} )? (?!\w)
    | (?<![\w:]) :: {_IPV6_TAIL} (?!\w)
    | (?<!\w) (?: {_HEX_GROUP}: ){{3,7}} {_IPV6_LAST} (?!\w)
    """,
    re.VERBOSE,
)

# Each entry is (compiled pattern, replacement text, RedactionSummary field name).
# Entries are applied top to bottom, each one to the output of the previous one.
_PATTERNS: list[tuple[re.Pattern, str, str]] = [
    # 1. Bearer tokens (before general token detection).
    #    The class covers the full RFC 6750 b64token alphabet plus trailing "="
    #    padding, so no tail of the credential is left behind after the mask.
    (
        re.compile(r"Bearer\s+[A-Za-z0-9._~+/\-]+=*", re.ASCII),
        "[TOKEN REDACTED]",
        "tokens",
    ),
    # 2. API key / long hex-base64 strings (32+ chars of hex/base64 characters).
    #    \b cannot sit after "=", so trailing base64 padding is picked up separately.
    (
        re.compile(r"\b[A-Za-z0-9+/=_\-]{32,}\b={0,2}", re.ASCII),
        "[TOKEN REDACTED]",
        "tokens",
    ),
    # 3. UNC paths (\\server\share)
    (
        re.compile(r"\\\\[\w.\-]+\\[\w$.\-]+"),
        "[UNC PATH REDACTED]",
        "unc_paths",
    ),
    # 4. Email addresses
    (
        re.compile(r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b"),
        "[EMAIL REDACTED]",
        "emails",
    ),
    # 5. IPv6 (before IPv4 to avoid partial matches on mixed notation)
    (
        _IPV6_RE,
        "[IP REDACTED]",
        "ips",
    ),
    # 6. IPv4
    (
        re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
        "[IP REDACTED]",
        "ips",
    ),
]


def apply_redaction_to_text(content: str) -> tuple[str, RedactionSummary]:
    """Mask every match of the module's redaction patterns in ``content``.

    The patterns in ``_PATTERNS`` are applied one after another, in list
    order, each one working on the output of the previous one. ``subn``
    performs the substitution and reports how many replacements it made,
    which is added to the summary counter named by the pattern's category.

    Falsy input (such as an empty string) is handed back untouched together
    with an all-zero summary.

    Returns (redacted_content, summary).
    """
    summary = RedactionSummary()
    if not content:
        return content, summary

    redacted = content
    for regex, mask, bucket in _PATTERNS:
        redacted, hits = regex.subn(mask, redacted)
        if hits:
            # The category string doubles as the summary attribute name.
            setattr(summary, bucket, getattr(summary, bucket) + hits)

    return redacted, summary


def format_redaction_footer(summary: RedactionSummary) -> str:
    """Render a one-line, human-readable footer describing what was redacted.

    Categories with a zero count are left out, and each label is pluralised
    unless its count is exactly one. Returns an empty string when the
    summary's total is zero.
    """
    if summary.total == 0:
        return ""

    # (count, singular label) pairs, in the order they appear in the footer.
    labelled_counts = (
        (summary.ips, "IP"),
        (summary.emails, "email"),
        (summary.tokens, "token"),
        (summary.unc_paths, "UNC path"),
    )
    parts = [
        f"{count} {label}" + ("" if count == 1 else "s")
        for count, label in labelled_counts
        if count > 0
    ]

    return f"\n--- Redacted: {', '.join(parts)} ---"