Add M365 and additional networking seed trees
This commit is contained in:
761
backend/scripts/seed_trees_ad.py
Normal file
761
backend/scripts/seed_trees_ad.py
Normal file
@@ -0,0 +1,761 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
ResolutionFlow Decision Trees - Batch 2b: Active Directory / Entra ID
|
||||
|
||||
Six AD/Entra ID troubleshooting trees for MSP engineers.
|
||||
Imported by seed_trees_v2.py for seeding.
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def get_repeated_lockout_tree() -> dict[str, Any]:
    """Build the "User Account Locked Out (Repeated)" Active Directory tree.

    Returns:
        A dict with ``name``, ``description``, ``category`` and a nested
        ``tree_structure``. Every node carries an ``id`` and a ``type``:

        * ``decision`` nodes have ``question``, ``help_text`` and ``options``
          (each option names its target via ``next_node_id``).
        * ``action`` nodes have ``title``, ``description`` and a single
          ``next_node_id``.
        * ``solution`` nodes have ``title`` and ``description`` and end a path.

        ``next_node_id`` values refer to node ids anywhere in this tree, not
        only to direct children (e.g. the root links to ``check_brute_force``,
        which is nested under ``lockout_source_result``).
    """
    return {
        "name": "User Account Locked Out (Repeated)",
        "description": "Investigate and resolve repeated Active Directory account lockouts. Covers lockout source identification, common causes like stale credentials, service accounts, and mobile devices, with PowerShell diagnostics.",
        "category": "Active Directory",
        "tree_structure": {
            "id": "root",
            "type": "decision",
            "question": "Is this a one-time lockout or has the user been locked out multiple times recently?",
            "help_text": "Check AD account properties and recent lockout history. A single lockout is usually a forgotten password; repeated lockouts indicate a deeper issue.",
            "options": [
                {"id": "one_time", "label": "First or one-time lockout", "next_node_id": "simple_unlock"},
                {"id": "repeated", "label": "Multiple lockouts (keeps happening)", "next_node_id": "find_lockout_source"},
                {"id": "many_users", "label": "Multiple users getting locked out", "next_node_id": "check_brute_force"},
            ],
            "children": [
                # --- Path 1: simple one-time lockout -------------------------
                {
                    "id": "simple_unlock",
                    "type": "action",
                    "title": "Unlock Account and Verify",
                    "description": "Simple lockout — unlock and confirm.\n\n**PowerShell:**\n```\nUnlock-ADAccount -Identity \"username\"\nGet-ADUser -Identity \"username\" -Properties LockedOut,PasswordLastSet,PasswordExpired\n```\n\n**Ask the user:**\n- Did you recently change your password?\n- Are you typing the right password?\n- Is Caps Lock on?\n\n**If password expired:** Reset it.\n**If user forgot password:** Reset and have them set a new one at next login.",
                    "next_node_id": "verify_simple_unlock",
                },
                {
                    "id": "verify_simple_unlock",
                    "type": "decision",
                    "question": "Can the user log in successfully now?",
                    "help_text": "Have the user try logging in after the unlock",
                    "options": [
                        {"id": "success", "label": "Yes, user is logged in", "next_node_id": "solution_simple_unlock"},
                        {"id": "locked_again", "label": "User got locked out again within minutes", "next_node_id": "find_lockout_source"},
                        {"id": "wrong_password", "label": "User says password is wrong (but it's correct in AD)", "next_node_id": "check_password_sync"},
                    ],
                    "children": [
                        {
                            "id": "solution_simple_unlock",
                            "type": "solution",
                            "title": "Resolved: Account Unlocked",
                            "description": "Simple lockout resolved by unlocking the account.\n\n**Ticket Notes:** Account was locked due to failed login attempts. Unlocked via PowerShell. User confirmed successful login.\n\n**If this recurs:** Use the 'repeated lockout' path to investigate the source.",
                        },
                        {
                            "id": "check_password_sync",
                            "type": "action",
                            "title": "Check Password Sync Status",
                            "description": "User's password works in AD but not at the login prompt. This may be a sync/replication issue.\n\n**Check AD replication:**\n```\nrepadmin /replsummary\nrepadmin /showrepl\n```\n\n**Check which DC the user is authenticating against:**\n```\nnltest /dsgetdc:yourdomain.local\necho %LOGONSERVER%\n```\n\n**If using Entra ID / M365:** Check if password hash sync is current in Entra Connect.\n\n**Common cause:** Password was reset on DC1 but DC2 hasn't replicated yet. User's workstation is authenticating against DC2.",
                            "next_node_id": "find_lockout_source",
                        },
                    ],
                },
                # --- Path 2: repeated lockouts, locate the source ------------
                {
                    "id": "find_lockout_source",
                    "type": "action",
                    "title": "Identify Lockout Source Computer",
                    # The query filters by user BEFORE limiting to 20 results;
                    # limiting first (-MaxEvents) would only inspect the 20
                    # newest lockouts of any user and could miss this one.
                    "description": "Find which computer or device is causing the lockouts.\n\n**Step 1: Find the PDC Emulator** (lockout events are forwarded here):\n```\nGet-ADDomain | Select PDCEmulator\n```\n\n**Step 2: Query lockout events on the PDC:**\n```\nGet-WinEvent -ComputerName <PDC_NAME> -FilterHashtable @{\n LogName='Security'\n Id=4740\n} | Where-Object {\n $_.Properties[0].Value -eq 'username'\n} | Select -First 20 TimeCreated,\n @{N='User';E={$_.Properties[0].Value}},\n @{N='SourceComputer';E={$_.Properties[1].Value}}\n```\n\n**Alternative:** Use Microsoft Account Lockout Status Tool (LockoutStatus.exe) for a GUI approach.\n\n**Document:** The source computer name and timestamps.",
                    "next_node_id": "lockout_source_result",
                },
                {
                    "id": "lockout_source_result",
                    "type": "decision",
                    "question": "What is the lockout source?",
                    "help_text": "The SourceComputer field in Event 4740 tells you where the bad attempts come from",
                    "options": [
                        {"id": "user_workstation", "label": "User's own workstation", "next_node_id": "check_cached_creds_workstation"},
                        {"id": "mobile_device", "label": "Mobile device or Exchange/ActiveSync", "next_node_id": "check_mobile_device"},
                        {"id": "server", "label": "A server (file server, app server, etc.)", "next_node_id": "check_service_account"},
                        {"id": "multiple_sources", "label": "Multiple different source computers", "next_node_id": "check_brute_force"},
                        {"id": "cant_determine", "label": "Source is blank or can't determine", "next_node_id": "enable_netlogon_logging"},
                    ],
                    "children": [
                        {
                            "id": "check_cached_creds_workstation",
                            "type": "action",
                            "title": "Check for Cached/Saved Credentials on Workstation",
                            "description": "The user's own workstation is sending bad credentials.\n\n**Check on the user's workstation:**\n\n**1. Windows Credential Manager:**\n```\nrundll32.exe keymgr.dll, KRShowKeyMgr\n# Or: Control Panel > Credential Manager\n```\nLook for saved credentials with old passwords.\n\n**2. Mapped drives with saved credentials:**\n```\nnet use\n```\nCheck for drives mapped with explicit credentials.\n\n**3. Scheduled tasks running as the user:**\n```\nGet-ScheduledTask | Where-Object {$_.Principal.UserId -like '*username*'}\n```\n\n**4. Browser saved passwords** — check Edge, Chrome for saved domain passwords.\n\n**5. RDP saved connections** — check for .rdp files with saved credentials.",
                            "next_node_id": "cached_cred_result",
                        },
                        {
                            "id": "cached_cred_result",
                            "type": "decision",
                            "question": "Did you find stale credentials?",
                            "help_text": "Any saved password that doesn't match the current AD password will cause lockouts",
                            "options": [
                                {"id": "found_cred_manager", "label": "Found old entries in Credential Manager", "next_node_id": "fix_credential_manager"},
                                {"id": "found_mapped_drive", "label": "Found mapped drive with saved creds", "next_node_id": "fix_mapped_drives"},
                                {"id": "found_scheduled_task", "label": "Found scheduled task running as user", "next_node_id": "fix_scheduled_task"},
                                {"id": "nothing_found", "label": "Nothing obvious found", "next_node_id": "check_deeper_sources"},
                            ],
                            "children": [
                                {
                                    "id": "fix_credential_manager",
                                    "type": "solution",
                                    "title": "Resolved: Remove Stale Credential Manager Entries",
                                    "description": "Old passwords saved in Credential Manager were causing lockouts.\n\n**Fix:**\n1. Open Credential Manager (Control Panel)\n2. Under 'Windows Credentials', find entries for your domain\n3. Remove or update entries with the correct password\n4. Restart the workstation\n5. Unlock the AD account: `Unlock-ADAccount -Identity \"username\"`\n\n**Prevention:** Educate user that after password changes, they should update saved credentials.\n\n**Ticket Notes:** Stale credentials in Credential Manager causing lockouts. Entries removed/updated.",
                                },
                                {
                                    "id": "fix_mapped_drives",
                                    "type": "solution",
                                    "title": "Resolved: Fix Mapped Drive Credentials",
                                    "description": "A mapped network drive was using old credentials.\n\n**Fix:**\n```\n# Remove the problematic mapping\nnet use Z: /delete\n\n# Remap without saved credentials (will use current login)\nnet use Z: \\\\server\\share /persistent:yes\n```\n\n**Or use Group Policy** to manage drive mappings (preferred for enterprise).\n\n**After fixing:** Unlock the account and monitor for recurrence.",
                                },
                                {
                                    "id": "fix_scheduled_task",
                                    "type": "solution",
                                    "title": "Resolved: Fix Scheduled Task Credentials",
                                    "description": "A scheduled task was running with the user's old password.\n\n**Fix:**\n1. Open Task Scheduler on the affected machine\n2. Find the task running as the user\n3. Update the password in the task properties\n\n**PowerShell:**\n```\nGet-ScheduledTask | Where-Object {$_.Principal.UserId -like '*username*'} | Select TaskName,TaskPath\n```\n\n**Best practice:** Scheduled tasks should use service accounts, not user accounts.\n\n**After fixing:** Unlock the account.",
                                },
                                {
                                    "id": "check_deeper_sources",
                                    "type": "action",
                                    "title": "Check Less Obvious Lockout Sources",
                                    "description": "Common sources cleared. Check these less obvious causes:\n\n**1. Outlook/Teams on another device:**\nIs the user logged into Outlook or Teams on a second computer, tablet, or phone with old password?\n\n**2. WiFi authentication (802.1X):**\nIf your WiFi uses domain credentials, the saved WiFi password may be old.\n\n**3. VPN client:**\nSaved VPN credentials with old password.\n\n**4. Applications with saved logins:**\nLOB apps, web portals using Windows auth.\n\n**5. Another user's machine:**\nIs someone else trying to access a share using this person's credentials?\n\n**Ask the user:** Have you logged into any other devices recently? Changed your password recently? Using any company apps on your phone?",
                                    "next_node_id": "escalate_persistent_lockout",
                                },
                                {
                                    "id": "escalate_persistent_lockout",
                                    "type": "solution",
                                    "title": "Escalate: Persistent Lockout - Source Unknown",
                                    "description": "Unable to identify the lockout source through standard methods.\n\n**Advanced investigation needed:**\n1. Enable detailed Netlogon logging on DCs\n2. Use network packet capture to find authentication attempts\n3. Review RADIUS/NPS logs if using 802.1X\n4. Check Entra ID sign-in logs for cloud auth attempts\n\n**Temporary workaround:**\n- Increase account lockout threshold temporarily\n- Or add user to a 'lockout exempt' fine-grained password policy (if available)\n\n**Escalate to:** Senior Systems Administrator\n**Include:** Event 4740 logs, source computers found, items already checked.",
                                },
                            ],
                        },
                        {
                            "id": "check_mobile_device",
                            "type": "solution",
                            "title": "Fix Mobile Device / Exchange ActiveSync",
                            "description": "A mobile device (phone/tablet) is sending old credentials via ActiveSync or Outlook mobile.\n\n**Fix:**\n1. Have the user update their password on their mobile device:\n - iPhone: Settings > Passwords & Accounts > Exchange > re-enter password\n - Android: Settings > Accounts > Exchange > update password\n - Outlook Mobile: Profile > Account > re-enter password\n2. If that doesn't work, remove and re-add the email account on the device\n\n**To confirm it's ActiveSync:**\nCheck Exchange/M365 ActiveSync logs for the user:\n```\nGet-MobileDeviceStatistics -Mailbox user@domain.com | Select DeviceFriendlyName,LastSyncAttemptTime,Status\n```\n\n**After fixing:** Unlock the AD account.\n\n**Prevention:** Consider using Intune or MDM to manage device password policies.",
                        },
                        {
                            "id": "check_service_account",
                            "type": "solution",
                            "title": "Fix Service or Application Using User Credentials",
                            "description": "A server or application is using this user's credentials (usually incorrectly).\n\n**Check on the source server:**\n```\n# Services running as this user\nGet-WmiObject Win32_Service | Where-Object {$_.StartName -like '*username*'} | Select Name,StartName,State\n\n# IIS App Pools\nGet-IISAppPool | Where-Object {$_.ProcessModel.UserName -like '*username*'}\n\n# Scheduled Tasks\nGet-ScheduledTask | Where-Object {$_.Principal.UserId -like '*username*'}\n\n# COM+ Applications\n# Check via Component Services (dcomcnfg)\n```\n\n**Best practice:** Services should use dedicated service accounts (preferably Managed Service Accounts), never personal user accounts.\n\n**Fix:** Update the password in the service/app or migrate to a proper service account.\n\n**After fixing:** Unlock the AD account.",
                        },
                        # --- Path 3: many users / many sources ---------------
                        {
                            "id": "check_brute_force",
                            "type": "action",
                            "title": "Investigate Potential Brute Force Attack",
                            "description": "Multiple users getting locked out or lockouts from many different sources could indicate an attack.\n\n**Check Security Event Log for patterns:**\n```\n# Failed logon attempts (Event 4625)\nGet-WinEvent -FilterHashtable @{LogName='Security';Id=4625} -MaxEvents 100 |\n Group-Object {$_.Properties[5].Value} | Sort Count -Descending |\n Select Count,Name -First 20\n```\n\n**Red flags:**\n- Lockouts from unknown/external IPs\n- Lockouts happening at unusual hours\n- Many accounts targeted simultaneously\n- Attempts from multiple geographic locations\n\n**If this looks like an attack:**\n1. Do NOT just unlock accounts — investigate first\n2. Check if any accounts were actually compromised\n3. Review VPN and external-facing authentication logs",
                            "next_node_id": "brute_force_result",
                        },
                        {
                            "id": "brute_force_result",
                            "type": "decision",
                            "question": "Does this appear to be a security incident?",
                            "help_text": "Look at the pattern of lockouts, source IPs, and timing",
                            "options": [
                                {"id": "likely_attack", "label": "Yes, appears to be an attack / security incident", "next_node_id": "escalate_security"},
                                {"id": "not_attack", "label": "No, appears to be a system/config issue", "next_node_id": "check_common_mass_lockout"},
                            ],
                            "children": [
                                {
                                    "id": "escalate_security",
                                    "type": "solution",
                                    "title": "SECURITY INCIDENT: Escalate Immediately",
                                    "description": "**Priority: CRITICAL — Potential security incident.**\n\n**Do NOT just unlock accounts.**\n\n**Immediate actions:**\n1. Document all affected accounts and lockout sources\n2. Check if any accounts show successful logins from suspicious IPs\n3. Preserve event logs for forensics\n4. Check if MFA was bypassed\n\n**Escalate to:** Security team / CISO immediately\n**Include:** Event log exports, list of affected accounts, source IPs, timeline\n\n**Consider:**\n- Blocking suspicious source IPs at the firewall\n- Forcing password resets for affected accounts\n- Enabling enhanced logging\n\n**Communication:** Follow your incident response plan.",
                                },
                                {
                                    "id": "check_common_mass_lockout",
                                    "type": "solution",
                                    "title": "Investigate Mass Lockout (Non-Security)",
                                    "description": "Multiple users locked out but doesn't appear to be an attack.\n\n**Common causes of mass lockouts:**\n\n1. **Password policy change:** New policy locked accounts that don't comply\n2. **Application with hardcoded credentials:** An app using a shared credential that was changed\n3. **GPO change:** New GPO tightened lockout thresholds\n4. **Service account cascade:** A service account got locked, causing dependent services to fail and retry\n5. **Kerberos ticket issues:** Time sync problem between DCs and clients\n\n**Check:**\n```\n# Recent GPO changes\nGet-GPO -All | Sort ModificationTime -Descending | Select DisplayName,ModificationTime -First 10\n\n# Time sync\nw32tm /query /status\n```\n\n**Escalate to:** Senior AD Administrator with pattern analysis.",
                                },
                            ],
                        },
                        {
                            "id": "enable_netlogon_logging",
                            "type": "solution",
                            "title": "Enable Netlogon Logging for Detailed Tracking",
                            "description": "Event 4740 doesn't show the source. Enable Netlogon debug logging.\n\n**On the PDC Emulator:**\n```\n# Enable Netlogon debug logging\nnltest /dbflag:0x2080ffff\n\n# Log location\n# C:\\Windows\\debug\\netlogon.log\n```\n\n**Wait for the next lockout**, then search the log:\n```\nSelect-String -Path C:\\Windows\\debug\\netlogon.log -Pattern 'username'\n```\n\n**IMPORTANT:** Disable logging after troubleshooting:\n```\nnltest /dbflag:0x0\n```\n\nNetlogon logging is verbose and can fill disk space if left on.\n\n**Escalate to:** Senior AD admin if you need help interpreting the logs.",
                        },
                    ],
                },
            ],
        },
    }
|
||||
|
||||
|
||||
def get_ad_replication_tree() -> dict[str, Any]:
    """Build the "AD Replication Failures" troubleshooting tree.

    Returns:
        A dict with ``name``, ``description``, ``category`` and a nested
        ``tree_structure``. Every node carries an ``id`` and a ``type``:

        * ``decision`` nodes have ``question``, ``help_text`` and ``options``
          (each option names its target via ``next_node_id``).
        * ``action`` nodes have ``title``, ``description`` and a single
          ``next_node_id``.
        * ``solution`` nodes have ``title`` and ``description`` and end a path.

        ``next_node_id`` values refer to node ids anywhere in this tree, not
        only to direct children.
    """
    return {
        "name": "AD Replication Failures",
        "description": "Diagnose and resolve Active Directory replication issues between domain controllers. Covers repadmin diagnostics, common error codes, DNS dependencies, and RPC connectivity troubleshooting.",
        "category": "Active Directory",
        "tree_structure": {
            "id": "root",
            "type": "decision",
            "question": "How was the AD replication issue discovered?",
            "help_text": "Replication failures can cause inconsistent data across DCs — different users see different results for passwords, group memberships, GPOs, and DNS.",
            "options": [
                {"id": "monitoring_alert", "label": "Monitoring alert / repadmin check", "next_node_id": "run_repl_diagnostics"},
                {"id": "user_symptoms", "label": "User-reported symptoms (password not working on some PCs, etc.)", "next_node_id": "confirm_repl_issue"},
                {"id": "dcdiag_failure", "label": "DCDiag reported failures", "next_node_id": "run_repl_diagnostics"},
                {"id": "new_dc", "label": "New DC not replicating", "next_node_id": "check_new_dc"},
            ],
            "children": [
                # --- Triage: is it really replication? -----------------------
                {
                    "id": "confirm_repl_issue",
                    "type": "action",
                    "title": "Confirm This Is a Replication Issue",
                    "description": "User symptoms may or may not be replication. Quick check:\n\n```\nrepadmin /replsummary\n```\n\nIf you see failures or large 'number of failures' counts, replication is broken.\n\n**Also try:**\n```\nrepadmin /showrepl\ndcdiag /test:replications\n```\n\n**If replication looks healthy:** The user's issue is likely something else (password reset needed, group membership change, etc.)",
                    "next_node_id": "repl_confirmed",
                },
                {
                    "id": "repl_confirmed",
                    "type": "decision",
                    "question": "Does repadmin /replsummary show failures?",
                    "help_text": "Look for non-zero failure counts and error codes",
                    "options": [
                        {"id": "yes_failures", "label": "Yes, replication failures shown", "next_node_id": "run_repl_diagnostics"},
                        {"id": "no_failures", "label": "No, replication looks healthy", "next_node_id": "solution_repl_healthy"},
                    ],
                    "children": [
                        {
                            "id": "solution_repl_healthy",
                            "type": "solution",
                            "title": "AD Replication is Healthy",
                            "description": "Replication is working correctly. The user's issue has a different root cause.\n\n**Common alternative causes for 'replication-like' symptoms:**\n- Password was recently changed and user hit a DC that hasn't processed it yet (wait 15 min, normal delay)\n- Group membership change (Kerberos ticket needs renewal — user must log out/in)\n- DNS stale record (different from AD replication)\n\n**Ticket Notes:** AD replication verified healthy. User issue has different root cause.",
                        },
                    ],
                },
                # --- Diagnostics and error-code branching --------------------
                {
                    "id": "run_repl_diagnostics",
                    "type": "action",
                    "title": "Run Detailed Replication Diagnostics",
                    # The lingering-object check uses /advisory_mode so this
                    # information-gathering step only reports and never
                    # deletes directory objects.
                    "description": "Gather comprehensive replication status.\n\n```\n# Summary of all replication partnerships\nrepadmin /replsummary\n\n# Detailed per-DC replication status\nrepadmin /showrepl * /csv > C:\\temp\\replstatus.csv\n\n# Check for lingering objects (advisory mode only reports, removes nothing)\nrepadmin /removelingeringobjects <DEST_DC> <SOURCE_DC_GUID> <NAMING_CONTEXT> /advisory_mode\n\n# Full DC health check\ndcdiag /v /c /d /e /s:<DC_NAME>\n```\n\n**Key things to note:**\n- Which DCs are failing?\n- What error codes are shown?\n- How long has replication been failing?\n- Is it one-way or both directions?",
                    "next_node_id": "repl_error_type",
                },
                {
                    "id": "repl_error_type",
                    "type": "decision",
                    "question": "What replication error code or message do you see?",
                    "help_text": "Check the error code in repadmin /showrepl output",
                    "options": [
                        {"id": "rpc_error", "label": "RPC server unavailable (Error 1722)", "next_node_id": "fix_rpc"},
                        # 8453 is the access-denied code, so it is listed only
                        # on the access_denied option below.
                        {"id": "dns_error", "label": "DNS lookup failure (Error 8524)", "next_node_id": "fix_repl_dns"},
                        {"id": "access_denied", "label": "Access denied (Error 8453/5)", "next_node_id": "fix_repl_access"},
                        {"id": "schema_mismatch", "label": "Schema mismatch / version error", "next_node_id": "fix_schema"},
                        {"id": "other_error", "label": "Different error or not sure", "next_node_id": "general_repl_troubleshooting"},
                    ],
                    "children": [
                        {
                            "id": "fix_rpc",
                            "type": "action",
                            "title": "Fix RPC Connectivity (Error 1722)",
                            "description": "AD replication uses RPC. Error 1722 means DCs can't communicate.\n\n**Test RPC connectivity:**\n```\n# Test from source DC to destination DC\nTest-NetConnection -ComputerName <TARGET_DC> -Port 135\nTest-NetConnection -ComputerName <TARGET_DC> -Port 445\n\n# Test RPC endpoint mapper\nportqry -n <TARGET_DC> -e 135\n```\n\n**Common causes:**\n- Firewall blocking RPC ports (135 + dynamic range 49152-65535)\n- DC is offline or unreachable\n- DNS returning wrong IP for the DC\n- Windows Firewall enabled with wrong rules\n\n**Check DNS resolution for the DC:**\n```\nnslookup <TARGET_DC_NAME>\nnslookup <TARGET_DC_FQDN>\n```",
                            "next_node_id": "rpc_result",
                        },
                        {
                            "id": "rpc_result",
                            "type": "decision",
                            "question": "Can you reach the target DC on port 135?",
                            "help_text": "Test-NetConnection result",
                            "options": [
                                {"id": "port_blocked", "label": "Port 135 blocked", "next_node_id": "escalate_rpc_firewall"},
                                {"id": "dc_offline", "label": "DC is completely unreachable", "next_node_id": "escalate_dc_offline"},
                                {"id": "port_open_still_fails", "label": "Port open but replication still fails", "next_node_id": "check_rpc_dynamic_ports"},
                            ],
                            "children": [
                                {
                                    "id": "escalate_rpc_firewall",
                                    "type": "solution",
                                    "title": "Escalate: Firewall Blocking AD Replication",
                                    "description": "A firewall is blocking RPC between DCs.\n\n**Required ports for AD replication:**\n- TCP 135 (RPC Endpoint Mapper)\n- TCP 389 (LDAP)\n- TCP 636 (LDAP SSL)\n- TCP 3268 (Global Catalog)\n- TCP 88 (Kerberos)\n- TCP 445 (SMB)\n- TCP 49152-65535 (RPC dynamic ports)\n - Or restrict to a fixed port range via registry\n\n**Escalate to:** Network team to open required ports between DCs.\n**Priority:** High — AD replication is critical infrastructure.",
                                },
                                {
                                    "id": "escalate_dc_offline",
                                    "type": "solution",
                                    "title": "Escalate: Domain Controller Offline",
                                    "description": "The target DC is unreachable.\n\n**Check:**\n1. Is the server powered on? (hypervisor, iLO/iDRAC)\n2. Is the OS running? (try RDP, ping)\n3. Was it recently decommissioned?\n\n**If permanently offline:** The DC metadata needs to be cleaned from AD:\n```\nntdsutil\n metadata cleanup\n connections\n connect to server <WORKING_DC>\n quit\n select operation target\n list domains\n ...\n```\n\n**Escalate to:** Senior AD Administrator\n**Priority:** High",
                                },
                                {
                                    "id": "check_rpc_dynamic_ports",
                                    "type": "solution",
                                    "title": "Check RPC Dynamic Port Range",
                                    "description": "Port 135 is open but RPC dynamic ports may be blocked.\n\nAD replication uses dynamic RPC ports (49152-65535 by default).\n\n**To restrict to a specific range** (makes firewall rules easier):\n```\n# On each DC - set fixed RPC port range\nreg add HKLM\\SYSTEM\\CurrentControlSet\\Services\\NTDS\\Parameters /v \"TCP/IP Port\" /t REG_DWORD /d 50000\n```\nRestart the NTDS service after.\n\n**Escalate to:** Network team with the dynamic port range information.",
                                },
                            ],
                        },
                        {
                            "id": "fix_repl_dns",
                            "type": "solution",
                            "title": "Fix DNS Issues Blocking Replication",
                            "description": "AD replication depends heavily on DNS. DCs find each other via SRV records.\n\n**Check DNS health:**\n```\n# Verify DC SRV records exist\nnslookup -type=srv _ldap._tcp.dc._msdcs.yourdomain.local\n\n# Re-register DC DNS records\nipconfig /registerdns\nnet stop netlogon && net start netlogon\n\n# Verify DNS on the DC\ndcdiag /test:dns /v\n```\n\n**Common causes:**\n- DC's DNS records missing or stale\n- DC pointing to wrong DNS server\n- DNS zone not replicating\n\n**Each DC should point to:** Itself and at least one other DC for DNS.\n\n**Escalate to:** DNS/AD Administrator if records are missing and won't re-register.",
                        },
                        {
                            "id": "fix_repl_access",
                            "type": "solution",
                            "title": "Fix Access Denied Errors in Replication",
                            "description": "Replication is being denied — authentication or permission issue.\n\n**Common causes:**\n- Time skew between DCs (Kerberos requires <5 min difference)\n- Computer account password expired\n- Permissions removed from DC object in AD\n\n**Check time sync:**\n```\nw32tm /query /status\nw32tm /query /peers\n\n# Force time resync\nw32tm /resync /force\n```\n\n**If time is more than 5 minutes off:** Kerberos will fail. Fix time sync first.\n\n**Check secure channel:**\n```\nTest-ComputerSecureChannel -Verbose\nTest-ComputerSecureChannel -Repair\n```\n\n**Escalate to:** Senior AD Administrator if permissions or secure channel repair fails.",
                        },
                        {
                            "id": "fix_schema",
                            "type": "solution",
                            "title": "Escalate: Schema Version Mismatch",
                            "description": "Schema versions don't match between DCs.\n\n**Check schema version:**\n```\nGet-ADObject (Get-ADRootDSE).schemaNamingContext -Properties objectVersion | Select objectVersion\n```\n\n**This usually happens when:** A DC was promoted or demoted improperly, or an AD upgrade (schema extension) partially completed.\n\n**This requires:** Senior AD administrator intervention. Do not attempt schema repairs without expertise.\n\n**Escalate to:** Senior AD Administrator / Directory Services specialist\n**Priority:** High — schema issues can corrupt the directory.",
                        },
                        {
                            "id": "general_repl_troubleshooting",
                            "type": "solution",
                            "title": "General Replication Troubleshooting",
                            "description": "For errors not covered above, try these general steps:\n\n**1. Force replication:**\n```\nrepadmin /syncall /APed\n```\n\n**2. Check DC health:**\n```\ndcdiag /v /c\n```\n\n**3. Check event logs:**\n```\nGet-WinEvent -FilterHashtable @{LogName='Directory Service';Level=2,3} -MaxEvents 20\n```\n\n**4. Verify AD sites and subnets:**\nAD Sites and Services — are DCs in the correct sites? Are site links configured?\n\n**5. Check USN rollback:**\nIf a DC was restored from snapshot incorrectly, USN rollback can break replication permanently for that DC.\n\n**Escalate to:** Senior AD Administrator with dcdiag output and event logs.",
                        },
                    ],
                },
                # --- New DC that never started replicating -------------------
                {
                    "id": "check_new_dc",
                    "type": "solution",
                    "title": "Troubleshoot New DC Not Replicating",
                    "description": "A newly promoted DC isn't replicating.\n\n**Check in order:**\n\n1. **DNS:** Is the new DC registered in DNS? Can it resolve other DCs?\n```\nnslookup <OTHER_DC_NAME>\ndcdiag /test:dns\n```\n\n2. **Site assignment:** Is the new DC in the correct AD site?\n Open AD Sites and Services and verify.\n\n3. **Replication partners:** Does it have replication partners?\n```\nrepadmin /showrepl <NEW_DC_NAME>\n```\n\n4. **Initial replication:** After promotion, initial replication can take time. Wait 15-30 minutes.\n\n5. **Network:** Can the new DC reach other DCs on required ports?\n\n**If still not replicating after 30 minutes:** Run `dcdiag /v` and `repadmin /showrepl` and escalate with the output.",
                },
            ],
        },
    }
|
||||
|
||||
|
||||
def get_gpo_not_applying_tree() -> dict[str, Any]:
    """Build the "Group Policy Not Applying" troubleshooting tree.

    Returns:
        A dict with ``name``, ``description``, ``category`` and a nested
        ``tree_structure``. Nodes in this tree come in three shapes:

        * ``decision`` - ``question``, ``help_text``, ``options`` (each option
          carries a ``next_node_id``) and usually ``children``.
        * ``action`` - ``title``, ``description`` and a ``next_node_id``.
        * ``solution`` - ``title`` and ``description`` only (leaf nodes).

        Every ``next_node_id`` used here refers to a node ``id`` defined
        somewhere inside this same tree.
    """
    return {
        "name": "Group Policy Not Applying",
        "description": "Troubleshoot Group Policy Objects that aren't applying to users or computers. Covers GPResult diagnostics, scope filtering, WMI filters, inheritance, and common GPO processing issues.",
        "category": "Active Directory",
        "tree_structure": {
            "id": "root",
            "type": "decision",
            "question": "Is the GPO not applying to a single user/computer or multiple?",
            "help_text": "This determines whether it's a scoping/targeting issue or a broader GPO infrastructure problem.",
            "options": [
                {"id": "single_target", "label": "Single user or computer", "next_node_id": "run_gpresult"},
                {"id": "multiple_targets", "label": "Multiple users/computers", "next_node_id": "check_gpo_config"},
                {"id": "new_gpo", "label": "Newly created GPO not working", "next_node_id": "check_new_gpo"},
                {"id": "gpo_stopped", "label": "GPO was working but stopped", "next_node_id": "check_gpo_changes"},
            ],
            "children": [
                # --- Single target: collect GPResult, then branch on what it shows ---
                {
                    "id": "run_gpresult",
                    "type": "action",
                    "title": "Run GPResult on the Affected Machine",
                    "description": "GPResult shows exactly which GPOs are applied and which are filtered out.\n\n**PowerShell (as Administrator on the affected machine):**\n```\n# Full HTML report (most useful)\ngpresult /h C:\\temp\\gpresult.html\nstart C:\\temp\\gpresult.html\n\n# Quick console output\ngpresult /r\n\n# For a specific user\ngpresult /user domain\\username /r\n```\n\n**Look for your GPO in the report:**\n- Is it listed under 'Applied GPOs'?\n- Is it listed under 'Denied GPOs' or 'Filtered GPOs'?\n- Is it missing entirely?",
                    "next_node_id": "gpresult_result",
                },
                {
                    "id": "gpresult_result",
                    "type": "decision",
                    "question": "Where does your GPO appear in the GPResult report?",
                    "help_text": "Check both Computer Configuration and User Configuration sections",
                    "options": [
                        {"id": "applied", "label": "GPO shows as Applied but settings not working", "next_node_id": "check_conflicting_gpo"},
                        {"id": "filtered_security", "label": "GPO shows as Filtered (Security)", "next_node_id": "fix_security_filtering"},
                        {"id": "filtered_wmi", "label": "GPO shows as Filtered (WMI)", "next_node_id": "fix_wmi_filter"},
                        {"id": "not_listed", "label": "GPO not listed at all", "next_node_id": "check_gpo_link"},
                        {"id": "denied", "label": "GPO shows as Denied", "next_node_id": "check_block_inheritance"},
                    ],
                    "children": [
                        {
                            "id": "check_conflicting_gpo",
                            "type": "solution",
                            "title": "Check for Conflicting GPO / Precedence",
                            # The list below is the processing order, so the FIRST entry has
                            # the LOWEST precedence; the heading says so explicitly to stay
                            # consistent with "Later-applied GPOs win".
                            "description": "GPO is applied but settings aren't taking effect. Another GPO may be overriding it.\n\n**GPO processing order (applied first to last — lowest to highest precedence):**\n1. Local GPO\n2. Site GPOs\n3. Domain GPOs\n4. OU GPOs (child OU overrides parent OU)\n\n**Later-applied GPOs win** when settings conflict.\n\n**In the GPResult report:** Look for other GPOs that configure the same setting. The last one applied wins.\n\n**Also check:**\n- Is the setting under Computer or User configuration? It must match what you configured.\n- Are Preferences vs Policies confused? (Preferences can be overridden by users)\n\n**Fix:** Adjust GPO link order, use Enforced on the important GPO, or remove conflicting settings.",
                        },
                        {
                            "id": "fix_security_filtering",
                            "type": "solution",
                            "title": "Fix Security Filtering",
                            "description": "GPO is filtered out by security permissions.\n\n**Check in Group Policy Management Console:**\n1. Select the GPO\n2. Check 'Security Filtering' section\n3. By default, 'Authenticated Users' should be listed\n\n**Common issues:**\n- Removed 'Authenticated Users' and added a specific group, but target isn't in that group\n- Missing 'Domain Computers' read permission (required since MS16-072 patch)\n\n**Fix for MS16-072:**\nThe GPO needs 'Domain Computers' (for computer policies) or 'Authenticated Users' with Read permission in the Delegation tab, even if security filtering targets a specific group.\n\n**GPMC:** GPO > Delegation tab > Add 'Domain Computers' with Read permission.",
                        },
                        {
                            "id": "fix_wmi_filter",
                            "type": "solution",
                            "title": "Fix WMI Filter",
                            "description": "A WMI filter is preventing the GPO from applying.\n\n**Check the WMI filter query:**\nGPMC > Select GPO > WMI Filtering section — note the filter name.\nThen check: GPMC > WMI Filters > open the filter to see the query.\n\n**Test the WMI filter on the target machine:**\n```\n# Run the WMI query directly\nGet-WmiObject -Query \"SELECT * FROM Win32_OperatingSystem WHERE Version LIKE '10%'\"\n```\nIf it returns nothing, the filter is excluding this machine.\n\n**Common WMI filter issues:**\n- OS version filter excludes newer Windows versions\n- Hardware filter doesn't match (laptop vs desktop)\n- WMI repository corruption on client\n\n**Fix WMI on client:** `winmgmt /salvagerepository`",
                        },
                        {
                            "id": "check_gpo_link",
                            "type": "solution",
                            "title": "GPO Not Linked or Wrong OU",
                            "description": "GPO doesn't appear in GPResult at all — it's likely not linked to the correct OU or the object is in the wrong OU.\n\n**Check:**\n1. **Where is the user/computer in AD?**\n```\nGet-ADUser -Identity username | Select DistinguishedName\nGet-ADComputer -Identity computername | Select DistinguishedName\n```\n\n2. **Where is the GPO linked?**\nGPMC > Select GPO > check 'Scope' tab > 'Links' section\n\n3. **Does the OU match?** The GPO link OU must be the same OU (or a parent OU) where the user/computer object lives.\n\n**Common issues:**\n- Computer/user in wrong OU\n- GPO linked to wrong OU\n- GPO link is disabled (check the link status)\n\n**Fix:** Move the object to correct OU or link GPO to correct OU.",
                        },
                        {
                            "id": "check_block_inheritance",
                            "type": "solution",
                            "title": "Check Block Inheritance / Enforced",
                            "description": "GPO is being denied — likely by Block Inheritance on the OU.\n\n**In GPMC:** Check the OU where the target object resides. If it has a blue exclamation mark, 'Block Inheritance' is enabled.\n\n**Options to fix:**\n1. Remove Block Inheritance on the OU (affects all GPOs)\n2. Set the GPO to 'Enforced' — this overrides Block Inheritance\n3. Link the GPO directly to the blocking OU\n\n**Use Enforced sparingly** — it overrides normal precedence and can cause unexpected behavior.",
                        },
                    ],
                },
                # --- Multiple targets: look at the GPO itself and SYSVOL replication ---
                {
                    "id": "check_gpo_config",
                    "type": "solution",
                    "title": "Check GPO Configuration for Multiple Targets",
                    "description": "GPO not applying to multiple targets. Check the GPO itself.\n\n**In GPMC:**\n1. Is the GPO link enabled? (Not disabled or unenforced)\n2. Is the GPO status correct? (Not 'All settings disabled')\n - GPO > Details tab > GPO Status\n3. Are the settings in the correct section?\n - Computer settings only apply to computer objects\n - User settings only apply to user objects\n\n**Force GP update on a test machine:**\n```\ngpupdate /force\n```\n\n**Check SYSVOL replication:**\n```\n# Compare GPO version on different DCs\nGet-GPO -Name \"Your GPO Name\" -Server DC1 | Select DisplayName,Computer,User\nGet-GPO -Name \"Your GPO Name\" -Server DC2 | Select DisplayName,Computer,User\n```\n\n**If versions differ:** SYSVOL replication (DFS-R or FRS) may be broken.",
                },
                # --- Newly created GPO: checklist of common setup mistakes ---
                {
                    "id": "check_new_gpo",
                    "type": "solution",
                    "title": "New GPO Checklist",
                    "description": "Newly created GPO not working. Verify these common mistakes:\n\n**1. Is it linked?** Creating a GPO doesn't link it automatically.\n**2. Is the link enabled?** Check for the green checkmark on the link.\n**3. Security filtering:** Default is 'Authenticated Users' (correct).\n**4. Computer vs User settings:** Make sure settings are in the right section.\n**5. Loopback processing:** If applying user settings based on computer location, you need loopback processing enabled.\n**6. Replication time:** New GPO needs to replicate to all DCs. Wait 15-30 minutes.\n\n**Force update:**\n```\ngpupdate /force\ngpresult /r\n```\n\n**Still not working:** Check the GPResult report for why it's filtered.",
                },
                # --- Previously working GPO: find out what changed ---
                {
                    "id": "check_gpo_changes",
                    "type": "solution",
                    "title": "Investigate GPO That Stopped Working",
                    "description": "GPO was working but stopped. Something changed.\n\n**Check recent changes:**\n```\n# When was the GPO last modified?\nGet-GPO -Name \"Your GPO Name\" | Select DisplayName,ModificationTime\n\n# All recently modified GPOs\nGet-GPO -All | Where-Object {$_.ModificationTime -gt (Get-Date).AddDays(-7)} | Sort ModificationTime -Descending\n```\n\n**Common causes:**\n- Someone edited the GPO and broke a setting\n- Security filtering was changed\n- WMI filter was added or modified\n- OU structure changed (objects moved)\n- SYSVOL replication broke\n- A Windows update changed how a setting works\n\n**Check SYSVOL health:**\n```\ndcdiag /test:sysvolcheck\ndcdiag /test:dfsrevent\n```\n\n**Escalate to:** Whoever manages GPOs with the modification timeline.",
                },
            ],
        },
    }
|
||||
|
||||
|
||||
def get_entra_id_sync_tree() -> dict[str, Any]:
    """Build the "Entra ID Sync Issues (AD Connect)" troubleshooting tree.

    Returns:
        A dict with ``name``, ``description``, ``category`` and a nested
        ``tree_structure``. Nodes are ``decision`` (question + options +
        children), ``action`` (title + description + ``next_node_id``) or
        ``solution`` (title + description, leaf). Several branches converge
        on the shared ``check_sync_errors`` solution node, and every
        ``next_node_id`` used here refers to a node ``id`` defined inside
        this same tree.
    """
    return {
        "name": "Entra ID Sync Issues (AD Connect)",
        "description": "Troubleshoot Microsoft Entra Connect (formerly Azure AD Connect) synchronization failures. Covers sync cycle errors, password hash sync, attribute conflicts, and connector space issues.",
        "category": "Active Directory",
        "tree_structure": {
            "id": "root",
            "type": "decision",
            "question": "What type of Entra ID sync issue are you experiencing?",
            "help_text": "Entra Connect syncs on-premises AD objects to Entra ID (Azure AD). Issues affect M365 services, SSO, and cloud app access.",
            "options": [
                {"id": "sync_stopped", "label": "Sync has completely stopped", "next_node_id": "check_sync_service"},
                {"id": "specific_user", "label": "Specific user/group not syncing", "next_node_id": "check_user_sync"},
                {"id": "password_sync", "label": "Password changes not syncing to cloud", "next_node_id": "check_password_hash_sync"},
                {"id": "export_errors", "label": "Sync errors / export failures", "next_node_id": "check_sync_errors"},
            ],
            "children": [
                # --- Sync stopped: check the ADSync service and scheduler ---
                {
                    "id": "check_sync_service",
                    "type": "action",
                    "title": "Check Entra Connect Sync Service",
                    "description": "Verify the sync service is running.\n\n**On the Entra Connect server:**\n```\n# Check sync service status\nGet-Service ADSync\n\n# Check last sync time\nGet-ADSyncScheduler\n\n# Check sync cycle status\nGet-ADSyncScheduler | Select SyncCycleEnabled,NextSyncCycleStartTimeInUTC,CurrentlyEffectiveSyncCycleInterval\n```\n\n**Also check:** Entra admin center > Entra Connect > Sync status\n\n**If the service is stopped:** Start it: `Start-Service ADSync`",
                    "next_node_id": "sync_service_result",
                },
                {
                    "id": "sync_service_result",
                    "type": "decision",
                    "question": "What is the sync service status?",
                    "help_text": "Check service state and scheduler status",
                    "options": [
                        {"id": "service_stopped", "label": "ADSync service is stopped", "next_node_id": "fix_sync_service"},
                        {"id": "scheduler_disabled", "label": "Service running but scheduler disabled", "next_node_id": "enable_scheduler"},
                        {"id": "service_running", "label": "Service running, scheduler active", "next_node_id": "check_sync_errors"},
                        {"id": "server_unreachable", "label": "Entra Connect server is down", "next_node_id": "escalate_connect_server"},
                    ],
                    "children": [
                        {
                            "id": "fix_sync_service",
                            "type": "action",
                            "title": "Start ADSync Service",
                            "description": "```\nStart-Service ADSync\nGet-Service ADSync\n\n# If it won't start, check event logs\nGet-WinEvent -FilterHashtable @{LogName='Application';ProviderName='ADSync'} -MaxEvents 20\n```\n\n**Common causes of service failure:**\n- SQL Server Express instance is down (ADSync uses a local SQL)\n- Disk space full on the Entra Connect server\n- Service account password changed\n- Windows update broke something\n\n**Check SQL:** `Get-Service 'ADSync' ; Get-Service MSSQL*`",
                            "next_node_id": "check_sync_errors",
                        },
                        {
                            "id": "enable_scheduler",
                            "type": "solution",
                            "title": "Re-enable Sync Scheduler",
                            "description": "Scheduler was disabled (commonly done during maintenance).\n\n```\n# Re-enable the scheduler\nSet-ADSyncScheduler -SyncCycleEnabled $true\n\n# Trigger an immediate sync\nStart-ADSyncSyncCycle -PolicyType Delta\n\n# Verify\nGet-ADSyncScheduler\n```\n\n**Note:** Scheduler is sometimes disabled during maintenance or troubleshooting. If someone disabled it, check if there's ongoing work before re-enabling.\n\n**Ticket Notes:** Sync scheduler was disabled. Re-enabled and triggered delta sync.",
                        },
                        {
                            "id": "escalate_connect_server",
                            "type": "solution",
                            "title": "CRITICAL: Entra Connect Server Down",
                            "description": "**Priority: HIGH** — Sync will stop but existing cloud accounts continue working.\n\n**Impact:** Password changes, new users, and group changes won't sync to M365.\n\n**Immediate actions:**\n1. Check VM/server status in hypervisor\n2. Existing users can still log into M365 (cached auth)\n3. Password changes won't sync until server is back\n\n**Escalate to:** Infrastructure team to restore the server\n**Note:** If server can't be recovered, Entra Connect can be reinstalled on another server (requires config backup or reconfiguration).",
                        },
                    ],
                },
                # --- Single object not syncing: scope, conflicts, sync rules ---
                {
                    "id": "check_user_sync",
                    "type": "action",
                    "title": "Check Why Specific User Isn't Syncing",
                    "description": "Use the Entra Connect Synchronization Service Manager or PowerShell.\n\n```\n# Search for the user in connector space\n$csUser = Get-ADSyncCSObject -ConnectorName \"yourdomain.local\" -DistinguishedName \"CN=User Name,OU=Users,DC=yourdomain,DC=local\"\n\n# Check if user is in sync scope\n# (Simpler approach - check if user exists in Entra)\nGet-AzureADUser -SearchString \"username\" | Select DisplayName,UserPrincipalName,DirSyncEnabled\n```\n\n**Common reasons a user doesn't sync:**\n- User is in an OU not selected for sync (OU filtering)\n- User is filtered by attribute-based sync rule\n- Duplicate or conflicting attribute (UPN, proxyAddress)\n- User was soft-deleted in Entra and conflicts",
                    "next_node_id": "user_sync_result",
                },
                {
                    "id": "user_sync_result",
                    "type": "decision",
                    "question": "Why is the user not syncing?",
                    "help_text": "Check Entra Connect OU filtering and sync rules",
                    "options": [
                        {"id": "wrong_ou", "label": "User is in an OU not selected for sync", "next_node_id": "fix_ou_filtering"},
                        {"id": "attribute_conflict", "label": "Duplicate attribute conflict (UPN, email)", "next_node_id": "fix_attribute_conflict"},
                        {"id": "filtered_rule", "label": "Filtered by a sync rule", "next_node_id": "fix_sync_rule"},
                        {"id": "unclear", "label": "Not sure why", "next_node_id": "check_sync_errors"},
                    ],
                    "children": [
                        {
                            "id": "fix_ou_filtering",
                            "type": "solution",
                            "title": "Fix OU Filtering",
                            "description": "The user's OU is not included in the sync scope.\n\n**Options:**\n1. **Move the user** to an OU that's in sync scope\n2. **Add the OU** to the sync configuration:\n - Run the Entra Connect wizard\n - Choose 'Customize synchronization options'\n - Select the additional OU\n - Complete the wizard\n\n**After changing:** Run a delta sync:\n```\nStart-ADSyncSyncCycle -PolicyType Delta\n```\n\n**Caution:** Adding a large OU may sync many objects — verify your Entra ID license count.",
                        },
                        {
                            "id": "fix_attribute_conflict",
                            "type": "solution",
                            "title": "Fix Duplicate Attribute Conflict",
                            "description": "Another object already has the same UPN or proxyAddress in Entra ID.\n\n**Check Entra admin center:**\nEntra ID > Users > search for the conflicting UPN or email.\n\n**Common conflicts:**\n- User was deleted and recreated with same UPN (soft-deleted copy still in Entra recycle bin)\n- Two AD users have the same proxyAddress/email\n- A cloud-only user exists with the same UPN\n\n**Fixes:**\n1. If soft-deleted: Permanently delete the old object in Entra recycle bin\n2. If duplicate email: Fix the duplicate in AD\n3. If cloud-only conflict: Delete the cloud user or change its UPN\n\n**After fixing:** Run delta sync: `Start-ADSyncSyncCycle -PolicyType Delta`",
                        },
                        {
                            "id": "fix_sync_rule",
                            "type": "solution",
                            "title": "Escalate: Custom Sync Rule Filtering",
                            "description": "A custom sync rule is filtering out this user.\n\n**Check sync rules:**\nOpen 'Synchronization Rules Editor' on the Entra Connect server.\n\nCustom rules are risky to modify without understanding the full sync configuration.\n\n**Escalate to:** Identity/Cloud Administrator who manages Entra Connect\n**Include:** User's DN, the sync rule name, and why the user needs to sync.",
                        },
                    ],
                },
                # --- Password hash sync ---
                {
                    "id": "check_password_hash_sync",
                    "type": "solution",
                    "title": "Troubleshoot Password Hash Sync",
                    # Invoke-ADSyncDiagnostics is a reporting tool, so it is presented as a
                    # diagnostic (overall, then per-user) rather than as a "force sync".
                    # Events 656/657 are described as request/result, not success/failure.
                    "description": "Password changes in AD aren't reflecting in M365/Entra ID.\n\n**Check PHS status:**\n```\nInvoke-ADSyncDiagnostics -PasswordSync\n```\n\n**Check Event Log:**\n```\nGet-WinEvent -FilterHashtable @{LogName='Application';ProviderName='Directory Synchronization';Id=656,657} -MaxEvents 10\n```\n\n**Event 656:** Password sync request (a password change was picked up)\n**Event 657:** Password sync result (check each entry for Success or Failed)\n\n**Common causes:**\n- Password hash sync feature disabled in Entra Connect config\n- Connector account permissions changed in AD\n- Recent password change hasn't synced yet (wait for next cycle, usually 2 min)\n\n**Diagnose a single user's password sync:**\n```\nInvoke-ADSyncDiagnostics -PasswordSync -ADConnectorName \"yourdomain.local\" -DistinguishedName \"CN=User Name,OU=Users,DC=yourdomain,DC=local\"\n```\n\n**Note:** `Invoke-ADSyncDiagnostics` only reports status; it does not force a sync. Password changes are picked up automatically on the next password sync cycle.\n\n**If PHS is disabled:** Re-run the Entra Connect wizard and enable it.\n\n**Escalate to:** Identity Administrator if the connector account needs permission fixes.",
                },
                # --- Shared endpoint: review export/sync errors ---
                {
                    "id": "check_sync_errors",
                    "type": "solution",
                    "title": "Review Sync Errors",
                    "description": "Check for export errors and sync failures.\n\n**On the Entra Connect server:**\n1. Open **Synchronization Service Manager**\n2. Check the **Operations** tab for recent sync cycles\n3. Look for 'export' operations with errors\n4. Click on the error count for details\n\n**PowerShell:**\n```\n# Get recent sync results\nGet-ADSyncRunProfileResult | Sort StartDate -Descending | Select -First 5\n\n# Check Entra portal\n# Entra admin center > Entra Connect > Sync errors\n```\n\n**Common export errors:**\n- InvalidSoftMatch: Attribute conflict in cloud\n- DataValidationFailed: Invalid characters in attributes\n- LargeObject: Object exceeds attribute size limits\n\n**Escalate to:** Identity/Cloud Administrator with the specific error details.",
                },
            ],
        },
    }
|
||||
|
||||
|
||||
def get_domain_join_tree() -> dict[str, Any]:
    """Build the "Computer Cannot Join Domain" troubleshooting tree.

    Returns:
        A dict with ``name``, ``description``, ``category`` and a nested
        ``tree_structure``. Nodes are ``decision`` (question + options +
        children), ``action`` (title + description + ``next_node_id``) or
        ``solution`` (title + description, leaf). The DNS branch hands off
        to the top-level ``retry_join`` and ``check_join_permissions`` nodes,
        and every ``next_node_id`` used here refers to a node ``id`` defined
        inside this same tree.
    """
    return {
        "name": "Computer Cannot Join Domain",
        "description": "Troubleshoot domain join failures for new or reimaged computers. Covers DNS requirements, authentication issues, computer account limits, and common error codes.",
        "category": "Active Directory",
        "tree_structure": {
            "id": "root",
            "type": "decision",
            "question": "What error occurs when trying to join the domain?",
            "help_text": "Try joining: System Properties > Computer Name > Change > Domain. Note the exact error message.",
            "options": [
                {"id": "domain_not_found", "label": "Domain could not be contacted / not found", "next_node_id": "check_dns_for_domain"},
                {"id": "access_denied", "label": "Access denied / insufficient permissions", "next_node_id": "check_join_permissions"},
                {"id": "account_exists", "label": "Computer account already exists", "next_node_id": "fix_existing_account"},
                {"id": "other_error", "label": "Different error message", "next_node_id": "check_general_join"},
            ],
            "children": [
                # --- Domain not found: DNS resolution and reachability of a DC ---
                {
                    "id": "check_dns_for_domain",
                    "type": "action",
                    "title": "Verify DNS Can Resolve Domain Controllers",
                    "description": "Domain join requires DNS to find DCs. This is the most common failure.\n\n**On the computer being joined:**\n```\n# Check DNS settings\nipconfig /all\n\n# Can you resolve the domain?\nnslookup yourdomain.local\n\n# Can you find DC SRV records?\nnslookup -type=srv _ldap._tcp.dc._msdcs.yourdomain.local\n\n# Can you ping a DC?\nping <DC_HOSTNAME>\n```\n\n**The computer's DNS MUST point to an internal DNS server** that has the AD DNS zones. Public DNS (8.8.8.8) won't work for domain join.",
                    "next_node_id": "dns_join_result",
                },
                {
                    "id": "dns_join_result",
                    "type": "decision",
                    "question": "Can the computer resolve the domain name?",
                    "help_text": "nslookup should return DC IP addresses",
                    "options": [
                        {"id": "wrong_dns", "label": "DNS is pointing to wrong server (public DNS, etc.)", "next_node_id": "fix_dns_for_join"},
                        {"id": "dns_ok_cant_reach", "label": "DNS resolves but can't reach the DC", "next_node_id": "check_network_to_dc"},
                        {"id": "dns_resolves_ok", "label": "DNS resolves and can ping DC", "next_node_id": "check_join_permissions"},
                    ],
                    "children": [
                        {
                            "id": "fix_dns_for_join",
                            "type": "action",
                            "title": "Set DNS to Internal DNS Servers",
                            "description": "The computer must use your AD DNS servers.\n\n```\n# Set DNS to your domain controllers/DNS servers\nSet-DnsClientServerAddress -InterfaceAlias 'Ethernet' -ServerAddresses '<DC1_IP>','<DC2_IP>'\n\n# Verify\nnslookup yourdomain.local\n```\n\n**If using DHCP:** The DHCP scope should be assigning internal DNS. If not, fix the DHCP scope options.\n\n**After setting DNS:** Retry the domain join.",
                            "next_node_id": "retry_join",
                        },
                        {
                            "id": "check_network_to_dc",
                            "type": "solution",
                            "title": "Check Network Connectivity to Domain Controller",
                            "description": "DNS resolves but can't reach the DC. Check network path.\n\n```\nTest-NetConnection -ComputerName <DC_IP> -Port 389\nTest-NetConnection -ComputerName <DC_IP> -Port 445\ntracert <DC_IP>\n```\n\n**Required ports for domain join:**\n- TCP/UDP 389 (LDAP)\n- TCP 445 (SMB)\n- TCP/UDP 88 (Kerberos)\n- TCP 135 + dynamic RPC\n- TCP/UDP 53 (DNS)\n\n**Common causes:** VLAN isolation, firewall blocking, VPN not connected.\n\n**Escalate to:** Network team if ports are blocked.",
                        },
                    ],
                },
                # --- Access denied: which credentials are doing the join? ---
                {
                    "id": "check_join_permissions",
                    "type": "decision",
                    "question": "What credentials are being used to join the domain?",
                    "help_text": "Domain join requires specific permissions in AD",
                    "options": [
                        {"id": "regular_user", "label": "Regular domain user account", "next_node_id": "check_join_quota"},
                        {"id": "admin_account", "label": "Domain admin or delegated join account", "next_node_id": "check_admin_join_issue"},
                        {"id": "wrong_creds", "label": "Credentials might be wrong / expired", "next_node_id": "verify_credentials"},
                    ],
                    "children": [
                        {
                            "id": "check_join_quota",
                            "type": "solution",
                            "title": "Check Domain Join Quota",
                            "description": "Regular users can join up to **10 computers** by default (ms-DS-MachineAccountQuota).\n\n**Check current quota:**\n```\nGet-ADObject -Identity (Get-ADDomain).DistinguishedName -Properties ms-DS-MachineAccountQuota | Select ms-DS-MachineAccountQuota\n```\n\n**Check how many the user has joined:**\n```\nGet-ADComputer -Filter {ms-DS-CreatorSID -eq $((Get-ADUser username).SID)} | Measure-Object\n```\n\n**If quota exceeded:**\n1. Use a domain admin account to join instead\n2. Or pre-stage the computer account in AD (allows the user to join that specific computer)\n3. Or increase the quota (not recommended for security)\n\n**Best practice:** Pre-stage computer accounts or use a dedicated join account with delegated permissions.",
                        },
                        {
                            "id": "check_admin_join_issue",
                            "type": "solution",
                            "title": "Admin Account Can't Join - Check OU Permissions",
                            "description": "Even admin accounts can fail if OU permissions are restricted.\n\n**Check:**\n1. Does a computer account already exist with this name? `Get-ADComputer -Identity \"COMPUTERNAME\"`\n2. If pre-staged, does the joining user have 'Reset Password' and 'Write Account Restrictions' on that computer object?\n3. Is the target OU restricted via delegation?\n\n**Try joining to default Computers container first:** If that works, it's an OU permissions issue.\n\n**If admin account is locked or expired:**\n```\nGet-ADUser -Identity adminaccount -Properties LockedOut,Enabled,PasswordExpired\n```",
                        },
                        {
                            "id": "verify_credentials",
                            "type": "solution",
                            "title": "Verify Domain Credentials",
                            "description": "Make sure the credentials are correct.\n\n**Use the full domain format:**\n- `DOMAIN\\username` or `username@domain.local`\n\n**Verify the account works:**\n- Try logging into another domain-joined PC\n- Or test: `runas /user:DOMAIN\\username cmd`\n\n**Check if account is locked/disabled:**\n```\nGet-ADUser -Identity username -Properties LockedOut,Enabled,PasswordExpired\n```",
                        },
                    ],
                },
                # --- Stale/duplicate computer object in AD ---
                {
                    "id": "fix_existing_account",
                    "type": "solution",
                    "title": "Fix Existing Computer Account Conflict",
                    # The reset must target the AD computer object. The machine being
                    # joined is not a domain member yet, so a cmdlet that resets the
                    # *local* machine password cannot help here.
                    "description": "A computer account with this name already exists in AD.\n\n**Options:**\n1. **Delete the old account** (if the old computer is decommissioned):\n```\nRemove-ADComputer -Identity \"COMPUTERNAME\"\n```\n\n2. **Reset the old account** (allows rejoin) — run from a DC or admin workstation, not on the computer being joined:\n```\ndsmod computer \"CN=COMPUTERNAME,CN=Computers,DC=yourdomain,DC=local\" -reset\n```\nOr in AD Users and Computers: right-click the computer object > Reset Account.\n\n3. **Use a different computer name**\n\n4. **Pre-stage:** If the account was pre-staged, the joining user needs permission on that specific object.\n\n**After fixing:** Retry the domain join.",
                },
                # --- Catch-all for other error codes ---
                {
                    "id": "check_general_join",
                    "type": "solution",
                    "title": "General Domain Join Troubleshooting",
                    "description": "For other domain join errors:\n\n**Check the basics:**\n1. Time sync: Is the computer within 5 minutes of the DC?\n ```\n w32tm /query /status\n net time \\\\<DC_NAME>\n ```\n2. Network: Can you access `\\\\<DC_NAME>\\SYSVOL`?\n3. Firewall: Is Windows Firewall blocking domain traffic?\n4. Secure channel: For rejoins, try: `Test-ComputerSecureChannel -Repair`\n\n**Common error codes:**\n- 53: Network path not found (connectivity issue)\n- 1355: Domain not found (DNS issue)\n- 2224: Account already exists\n- 2691: Already joined to a domain (unjoin first)\n\n**Escalate to:** AD Administrator with the exact error code and message.",
                },
                # --- Follow-up after the DNS fix (target of fix_dns_for_join) ---
                {
                    "id": "retry_join",
                    "type": "decision",
                    "question": "Did the domain join succeed after fixing DNS?",
                    "help_text": "Retry: System Properties > Computer Name > Change > Domain",
                    "options": [
                        {"id": "success", "label": "Yes, joined successfully", "next_node_id": "solution_joined"},
                        {"id": "different_error", "label": "Different error now", "next_node_id": "check_general_join"},
                    ],
                    "children": [
                        {
                            "id": "solution_joined",
                            "type": "solution",
                            "title": "Resolved: Computer Joined Domain",
                            "description": "Computer successfully joined the domain.\n\n**Post-join steps:**\n1. Restart the computer (required)\n2. Log in with domain credentials\n3. Verify Group Policy: `gpupdate /force`\n4. Move computer to correct OU if needed\n\n**Ticket Notes:** Domain join completed. Root cause was [DNS/permissions/etc].",
                        },
                    ],
                },
            ],
        },
    }
|
||||
|
||||
|
||||
def get_kerberos_auth_tree() -> dict[str, Any]:
    """Kerberos/NTLM Authentication Failures - AD tree.

    Returns:
        A seed record with ``name``, ``description``, ``category`` and
        ``tree_structure`` keys. ``tree_structure`` is the root decision
        node; further nodes are nested under ``children`` and linked by
        ``next_node_id`` (set on action nodes and on every decision option).
        Each node's ``type`` is ``decision``, ``action`` or ``solution``.
        Node ids are referenced across branches (e.g. ``ticket_result``
        points at the root-level ``check_spn_issues`` node).
    """
    return {
        "name": "Kerberos / NTLM Authentication Failures",
        "description": "Troubleshoot authentication failures including Kerberos ticket issues, NTLM fallback problems, SPN misconfigurations, and time sync issues that affect logins, file shares, and web applications.",
        "category": "Active Directory",
        "tree_structure": {
            "id": "root",
            "type": "decision",
            "question": "What authentication symptom is the user experiencing?",
            "help_text": "Authentication issues can manifest as login failures, access denied to resources, or double-prompts for credentials.",
            "options": [
                {"id": "login_failure", "label": "Can't log into Windows at all", "next_node_id": "check_dc_connectivity"},
                {"id": "resource_access", "label": "Logged in but can't access file shares/apps", "next_node_id": "check_kerberos_tickets"},
                {"id": "double_prompt", "label": "Gets prompted for credentials repeatedly (SSO not working)", "next_node_id": "check_spn_issues"},
                {"id": "intermittent", "label": "Authentication works sometimes, fails other times", "next_node_id": "check_time_sync"},
            ],
            "children": [
                # --- Branch: cannot log in at all -> domain controller reachability ---
                {
                    "id": "check_dc_connectivity",
                    "type": "action",
                    "title": "Check Domain Controller Connectivity",
                    # The snippet is PowerShell (Test-NetConnection, '#' comments), so the
                    # logon server is read via $env:LOGONSERVER; cmd-style %LOGONSERVER%
                    # is not expanded by PowerShell.
                    "description": "Windows login requires DC access for Kerberos authentication.\n\n**On the affected machine:**\n```\n# Which DC is being used?\necho $env:LOGONSERVER\nnltest /dsgetdc:yourdomain.local\n\n# Can you reach a DC?\nTest-NetConnection -ComputerName <DC_NAME> -Port 88\nTest-NetConnection -ComputerName <DC_NAME> -Port 389\n```\n\n**If offline:** Windows will use cached credentials for login (if previously logged in). First-time logins require DC connectivity.\n\n**No DC available:** Check network, VPN, DNS settings.",
                    "next_node_id": "dc_connect_result",
                },
                {
                    "id": "dc_connect_result",
                    "type": "decision",
                    "question": "Can the machine reach a domain controller?",
                    "help_text": "Kerberos uses port 88, LDAP uses port 389",
                    "options": [
                        {"id": "no_dc", "label": "Can't reach any DC", "next_node_id": "fix_dc_connectivity"},
                        {"id": "dc_reachable", "label": "DC is reachable but login still fails", "next_node_id": "check_account_status"},
                        {"id": "cached_login", "label": "Can log in with cached creds only", "next_node_id": "fix_dc_connectivity"},
                    ],
                    "children": [
                        {
                            "id": "fix_dc_connectivity",
                            "type": "solution",
                            "title": "Restore Domain Controller Connectivity",
                            "description": "Machine can't reach a DC. Check:\n\n1. **Network:** Is the machine connected? `ipconfig /all`\n2. **DNS:** Pointing to internal DNS? `nslookup yourdomain.local`\n3. **VPN:** If remote, is VPN connected?\n4. **Firewall:** Ports 88, 389, 445, 135 open to DC?\n5. **DC status:** Are DCs actually online?\n\n**If VPN user:** Connect VPN first, then Ctrl+Alt+Del > Switch User > log in with domain creds (forces DC authentication).\n\n**If all DCs are down:** This is a major outage. Users can only use cached logins.\n\n**Escalate to:** Network team (if routing issue) or Infrastructure (if DC issue).",
                        },
                        {
                            "id": "check_account_status",
                            "type": "solution",
                            "title": "Check AD Account Status",
                            "description": "DC is reachable but auth fails. Check the account.\n\n```\nGet-ADUser -Identity username -Properties LockedOut,Enabled,PasswordExpired,PasswordLastSet,AccountExpirationDate\n```\n\n**Possible issues:**\n- Account locked out → Unlock it\n- Account disabled → Enable or investigate why\n- Password expired → Reset password\n- Account expired → Extend expiration date\n\n**Also check:** Is the computer's secure channel healthy?\n```\nTest-ComputerSecureChannel -Verbose\n```\nIf broken: `Test-ComputerSecureChannel -Repair -Credential (Get-Credential)`",
                        },
                    ],
                },
                # --- Branch: logged in but resources are inaccessible -> ticket cache ---
                {
                    "id": "check_kerberos_tickets",
                    "type": "action",
                    "title": "Check Kerberos Tickets",
                    "description": "User is logged in but can't access resources. Check Kerberos tickets.\n\n```\n# List current Kerberos tickets\nklist\n\n# Purge and get new tickets (forces re-authentication)\nklist purge\n\n# Then access the resource again — new tickets will be requested\n```\n\n**Look for:**\n- Are there valid TGT (krbtgt) tickets?\n- Are there service tickets for the resource you're accessing?\n- Have tickets expired?\n\n**If no tickets at all:** The machine may not be properly domain-joined or DC unreachable.",
                    "next_node_id": "ticket_result",
                },
                {
                    "id": "ticket_result",
                    "type": "decision",
                    "question": "Did purging and refreshing tickets fix the issue?",
                    "help_text": "After klist purge, try accessing the resource again",
                    "options": [
                        {"id": "fixed", "label": "Yes, resource access works now", "next_node_id": "solution_ticket_refresh"},
                        {"id": "still_fails", "label": "Still can't access the resource", "next_node_id": "check_spn_issues"},
                        {"id": "ntlm_fallback", "label": "Works but with credential prompt (NTLM fallback)", "next_node_id": "check_spn_issues"},
                    ],
                    "children": [
                        {
                            "id": "solution_ticket_refresh",
                            "type": "solution",
                            "title": "Resolved: Stale Kerberos Tickets",
                            "description": "Old Kerberos tickets were cached with outdated information.\n\n**Common causes:** Group membership change, password change, DC switchover.\n\n**Resolution:** Purged ticket cache with `klist purge`.\n\n**If this happens frequently:** The user may need to log out and back in after permission changes, or there may be a time sync issue.\n\n**Ticket Notes:** Stale Kerberos tickets cleared. User can access resources normally.",
                        },
                    ],
                },
                # --- Branch: repeated credential prompts / NTLM fallback -> SPNs ---
                {
                    "id": "check_spn_issues",
                    "type": "solution",
                    "title": "Check SPN Configuration",
                    "description": "Kerberos requires correct Service Principal Names (SPNs) on the target service.\n\n**Check SPNs for a service account:**\n```\nsetspn -L <SERVICE_ACCOUNT_OR_COMPUTER>\n\n# Check for duplicate SPNs (common problem)\nsetspn -X\n```\n\n**Common SPN issues:**\n- Missing SPN: Kerberos can't find the service, falls back to NTLM\n- Duplicate SPN: Two accounts claim the same service — Kerberos fails\n- Wrong SPN format: Must match how clients access the service\n\n**Example SPNs:**\n- File share: `HOST/servername`\n- Web app: `HTTP/webapp.domain.local`\n- SQL: `MSSQLSvc/sqlserver.domain.local:1433`\n\n**Fix duplicate SPNs:** Remove the incorrect one: `setspn -D <SPN> <WRONG_ACCOUNT>`\n\n**Escalate to:** Senior AD admin for SPN changes — incorrect SPNs can break other services.",
                },
                # --- Branch: intermittent failures -> clock skew ---
                {
                    "id": "check_time_sync",
                    "type": "action",
                    "title": "Check Time Synchronization",
                    "description": "Kerberos requires clocks to be within 5 minutes of each other.\n\n```\n# Check current time vs DC time\nw32tm /query /status\nnet time \\\\<DC_NAME>\n\n# Check time source\nw32tm /query /source\n\n# Force time resync\nw32tm /resync /force\n\n# Check time offset\nw32tm /stripchart /computer:<DC_NAME> /samples:5\n```\n\n**If time is off by more than 5 minutes:** Kerberos authentication will fail completely.\n\n**Common causes of time drift:**\n- VM time sync disabled\n- Laptop was offline for extended period\n- NTP source unreachable\n- Hyper-V time sync conflicting with domain time",
                    "next_node_id": "time_result",
                },
                {
                    "id": "time_result",
                    "type": "decision",
                    "question": "Was the time more than 5 minutes off?",
                    "help_text": "Compare client time to DC time",
                    "options": [
                        {"id": "time_fixed", "label": "Yes, fixed time sync — auth works now", "next_node_id": "solution_time_sync"},
                        {"id": "time_ok", "label": "Time was fine, issue is something else", "next_node_id": "check_kerberos_tickets"},
                    ],
                    "children": [
                        {
                            "id": "solution_time_sync",
                            "type": "solution",
                            "title": "Resolved: Time Sync Issue",
                            "description": "Kerberos was failing due to clock skew greater than 5 minutes.\n\n**Prevention:**\n- Ensure all domain members sync time from the DC\n- PDC Emulator should sync from an external NTP source\n- For VMs: Disable hypervisor time sync (use domain time hierarchy)\n\n**Verify domain time hierarchy:**\n```\nw32tm /query /source\n```\nDomain members should show a DC. PDC should show an NTP server.\n\n**Ticket Notes:** Authentication failure due to clock skew. Resynced time.",
                        },
                    ],
                },
            ],
        },
    }
|
||||
1269
backend/scripts/seed_trees_m365.py
Normal file
1269
backend/scripts/seed_trees_m365.py
Normal file
File diff suppressed because it is too large
Load Diff
733
backend/scripts/seed_trees_networking.py
Normal file
733
backend/scripts/seed_trees_networking.py
Normal file
@@ -0,0 +1,733 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
ResolutionFlow Decision Trees - Batch 4: Additional Networking
|
||||
|
||||
Three additional networking troubleshooting trees for MSP engineers.
|
||||
Imported by seed_trees_v2.py for seeding.
|
||||
|
||||
Trees:
|
||||
1. Bandwidth / Slow Internet
|
||||
2. Wireless Connectivity Problems
|
||||
3. Firewall Blocking Issues
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tree 1: Bandwidth / Slow Internet
|
||||
# =============================================================================
|
||||
def get_bandwidth_slow_internet_tree() -> dict[str, Any]:
    """Bandwidth / Slow Internet - Networking tree.

    Returns:
        A seed record with ``name``, ``description``, ``category`` and
        ``tree_structure`` keys. ``tree_structure`` is the root decision
        node; further nodes are nested under ``children`` and linked by
        ``next_node_id`` (set on action nodes and on every decision option).
        Each node's ``type`` is ``decision``, ``action`` or ``solution``.
        Node ids are referenced across branches (the intermittent branch
        reuses ``fix_bandwidth_hog``, ``fix_isp_issue`` and
        ``solution_upgrade_circuit`` from other branches).

    NOTE(review): ``lan_result`` option ``lan_ok`` routes to
    ``check_wan_isp`` and ``wan_result`` option ``wan_ok`` routes back to
    ``check_lan_saturation``, so those two checks can send the engineer
    back and forth — confirm this is intended.
    """
    return {
        "name": "Bandwidth / Slow Internet",
        "description": "Diagnose and resolve slow internet or bandwidth issues at client sites. Covers ISP problems, LAN saturation, QoS misconfiguration, bandwidth hogs, and speed test analysis for MSP-managed environments.",
        "category": "Networking",
        "tree_structure": {
            "id": "root",
            "type": "decision",
            "question": "How many users are affected by the slow internet?",
            "help_text": "Scope determines whether this is a single-device issue, LAN problem, or ISP/WAN issue.",
            "options": [
                {"id": "one_user", "label": "Just one user / one device", "next_node_id": "check_single_device"},
                {"id": "several_users", "label": "Several users at the same location", "next_node_id": "check_lan_saturation"},
                {"id": "everyone", "label": "Everyone at the site is slow", "next_node_id": "check_wan_isp"},
                {"id": "intermittent", "label": "Intermittent — comes and goes throughout the day", "next_node_id": "check_intermittent"},
            ],
            "children": [
                # --- Branch: a single user / device is slow ---
                {
                    "id": "check_single_device",
                    "type": "action",
                    "title": "Diagnose Single Device Slow Internet",
                    "description": "Only one user is affected — likely a local device or connection issue.\n\n**Step 1: Speed test on the affected device**\nhttps://www.speedtest.net — note download, upload, jitter, and latency.\n\n**Step 2: Compare to another device on the same network**\nRun a speed test from another computer nearby. If the other device is fast, the issue is device-specific.\n\n**Step 3: Check connection type**\n- Wi-Fi or Ethernet? If Wi-Fi, try Ethernet to rule it out.\n- What speed does the NIC show?\n```\n# Check link speed:\nGet-NetAdapter | Select Name, LinkSpeed, Status\n```\n\n**Step 4: Check for bandwidth hogs on the device**\n- Task Manager > Performance > Open Resource Monitor > Network tab\n- Look for processes with high network usage (OneDrive sync, Windows Update, cloud backup, etc.)\n\n**Step 5: Check NIC driver**\n- Device Manager > Network adapters > check for warnings\n- Update or reinstall the NIC driver",
                    "next_node_id": "single_device_result",
                },
                {
                    "id": "single_device_result",
                    "type": "decision",
                    "question": "What did the single-device check reveal?",
                    "help_text": "Compare speed test results and device checks",
                    "options": [
                        {"id": "wifi_issue", "label": "Wi-Fi is the problem — Ethernet is fast", "next_node_id": "fix_single_wifi"},
                        {"id": "hog_found", "label": "Found a bandwidth hog process", "next_node_id": "fix_bandwidth_hog"},
                        {"id": "nic_issue", "label": "NIC showing errors or slow link speed", "next_node_id": "fix_nic_issue"},
                        {"id": "device_ok", "label": "Device seems fine — issue may be network-wide", "next_node_id": "check_lan_saturation"},
                    ],
                    "children": [
                        {
                            "id": "fix_single_wifi",
                            "type": "action",
                            "title": "Fix Single Device Wi-Fi Performance",
                            "description": "Ethernet is fast but Wi-Fi is slow on this device.\n\n**Check Wi-Fi signal strength:**\n```\nnetsh wlan show interfaces\n```\nLook at 'Signal' percentage — below 70% is problematic.\n\n**Check Wi-Fi band:**\n- 2.4GHz = longer range but slower and more congested\n- 5GHz = faster but shorter range\n- Force 5GHz: Network adapter properties > Advanced > Preferred Band\n\n**Try these fixes:**\n1. Move closer to the access point\n2. Forget and reconnect to the network\n3. Reset the Wi-Fi adapter:\n```\nnetsh winsock reset\nnetsh int ip reset\nipconfig /flushdns\n```\n4. Update or reinstall the wireless driver\n5. Check for interference (Bluetooth, USB 3.0 devices near the antenna)\n\n**If the laptop has an old/cheap Wi-Fi adapter:** An external USB Wi-Fi adapter (Wi-Fi 6) can be a quick fix.",
                            "next_node_id": "solution_single_wifi",
                        },
                        {
                            "id": "solution_single_wifi",
                            "type": "solution",
                            "title": "Resolved: Single Device Wi-Fi Issue",
                            "description": "Wi-Fi performance improved on the affected device.\n\n**Ticket Notes:** User experiencing slow internet. Isolated to Wi-Fi on their device — Ethernet was full speed. Resolved by [switching to 5GHz / moving closer to AP / updating driver / resetting adapter].\n\n**If recurring:** Consider a USB Wi-Fi 6 adapter or relocating the user closer to an AP.",
                        },
                        {
                            "id": "fix_bandwidth_hog",
                            "type": "action",
                            "title": "Address Bandwidth Hog Process",
                            "description": "A process on the device is consuming excessive bandwidth.\n\n**Common offenders:**\n- **OneDrive/SharePoint sync** — large initial sync or many changes\n- **Windows Update** — downloading feature updates (can be several GB)\n- **Cloud backup** (Veeam agent, Carbonite, etc.)\n- **Browser tabs** — streaming video, large downloads\n- **Antivirus** — cloud scanning or definition updates\n- **Teams/Zoom** — video call running in background\n\n**Fixes:**\n- OneDrive: Pause sync or set upload bandwidth limit (OneDrive > Settings > Network)\n- Windows Update: Pause for 7 days if it's disrupting work\n- Cloud backup: Schedule outside business hours\n- Browser: Close unnecessary tabs, check for extensions consuming bandwidth\n\n**Long-term:** Implement QoS or traffic shaping at the firewall to protect critical traffic from bulk transfers.",
                            "next_node_id": "solution_bandwidth_hog",
                        },
                        {
                            "id": "solution_bandwidth_hog",
                            "type": "solution",
                            "title": "Resolved: Bandwidth Hog Identified",
                            "description": "Bandwidth restored after addressing the high-usage process.\n\n**Ticket Notes:** Slow internet on user's device caused by [process name] consuming bandwidth. [Paused sync / rescheduled backup / closed streaming tab / paused Windows Update].\n\n**Prevention:** Configure OneDrive bandwidth limits org-wide via Group Policy. Schedule backups and updates outside business hours.",
                        },
                        {
                            "id": "fix_nic_issue",
                            "type": "action",
                            "title": "Fix NIC / Cable Issue",
                            # Get-NetAdapterStatistics exposes ReceivedPacketErrors,
                            # OutboundPacketErrors and ReceivedDiscardedPackets; selecting
                            # non-existent property names would only print empty columns.
                            "description": "Network adapter showing slow link speed or errors.\n\n**Check link speed:**\n```\nGet-NetAdapter | Select Name, LinkSpeed, Status, MediaConnectionState\n```\nExpected: 1 Gbps for Ethernet. If showing 100 Mbps or 10 Mbps — cable or port issue.\n\n**Check for errors:**\n```\nGet-NetAdapterStatistics | Select Name, ReceivedPacketErrors, OutboundPacketErrors, ReceivedDiscardedPackets\n```\n\n**Common causes of slow link speed:**\n- Bad Ethernet cable (bent pins, damaged cable) — try a different cable\n- Plugged into a 100Mbps switch port — check the switch\n- NIC auto-negotiation failing — try setting speed manually\n- USB docking station — many docks have 100Mbps NICs\n\n**Fix:**\n1. Try a different Ethernet cable\n2. Try a different switch port\n3. Check dock specs if using a docking station\n4. Update NIC driver\n5. If NIC is failing: replace or use a USB Ethernet adapter",
                            "next_node_id": "solution_nic_fixed",
                        },
                        {
                            "id": "solution_nic_fixed",
                            "type": "solution",
                            "title": "Resolved: NIC / Cable Issue",
                            "description": "Network speed restored after fixing the NIC or cable.\n\n**Ticket Notes:** Slow internet caused by [bad cable / 100Mbps dock NIC / NIC errors / wrong switch port]. Resolved by [replacing cable / using direct Ethernet / updating driver / swapping to gigabit port].\n\n**Check:** Confirm link speed is now 1 Gbps with `Get-NetAdapter`.",
                        },
                    ],
                },
                # --- Branch: several users at one location are slow (LAN) ---
                {
                    "id": "check_lan_saturation",
                    "type": "action",
                    "title": "Check LAN for Saturation or Bottleneck",
                    "description": "Multiple users are slow — check if the internal network is the bottleneck.\n\n**Step 1: Speed test at the firewall/router level**\nIf possible, run a speed test from a device directly connected to the firewall. This eliminates LAN issues.\n- If speed is fine at the firewall: LAN bottleneck\n- If speed is slow at the firewall: WAN/ISP issue\n\n**Step 2: Check switch utilization**\n- Log into managed switches and check port utilization\n- Look for ports at 90%+ utilization\n- Check for CRC errors or packet drops on uplink ports\n\n**Step 3: Check for a single device saturating the LAN**\n- Is someone downloading a large file?\n- Is a server doing a backup over the LAN during business hours?\n- Is a NAS replicating?\n\n**Step 4: Check uplinks between switches**\n- Are inter-switch uplinks gigabit or 10G? If only 1G and lots of traffic, they may be saturated.\n\n**Step 5: Look for broadcast storms**\n- High CPU on switches can indicate a loop or broadcast storm\n- Check spanning tree status",
                    "next_node_id": "lan_result",
                },
                {
                    "id": "lan_result",
                    "type": "decision",
                    "question": "Where is the LAN bottleneck?",
                    "help_text": "Based on speed tests and switch checks",
                    "options": [
                        {"id": "device_saturating", "label": "One device is saturating the network", "next_node_id": "fix_lan_hog"},
                        {"id": "uplink_saturated", "label": "Switch uplink is saturated", "next_node_id": "fix_uplink"},
                        {"id": "switch_issue", "label": "Switch errors / spanning tree / loop", "next_node_id": "fix_switch_issue"},
                        {"id": "lan_ok", "label": "LAN is fine — issue is WAN/ISP", "next_node_id": "check_wan_isp"},
                    ],
                    "children": [
                        {
                            "id": "fix_lan_hog",
                            "type": "action",
                            "title": "Address Device Saturating the LAN",
                            "description": "A single device is consuming most of the LAN bandwidth.\n\n**Identify the device:** Check switch port utilization or use a network monitoring tool (PRTG, Auvik, Datto RMM, etc.).\n\n**Common culprits:**\n- Server backup running during business hours\n- NAS replication job\n- Large file copy between servers\n- User downloading/uploading huge files\n- Malware-infected device generating traffic\n\n**Immediate fix:** Rate-limit or pause the offending activity.\n\n**Long-term fixes:**\n- Schedule backups outside business hours\n- Implement QoS on the firewall to prioritize business traffic\n- Segment the network (put backup traffic on its own VLAN)\n- If malware: isolate the device immediately and scan",
                            "next_node_id": "solution_lan_hog",
                        },
                        {
                            "id": "solution_lan_hog",
                            "type": "solution",
                            "title": "Resolved: LAN Bandwidth Hog",
                            "description": "LAN performance restored after addressing the high-traffic device.\n\n**Ticket Notes:** Network slowdown caused by [device/server] consuming excessive LAN bandwidth due to [backup / replication / file transfer / malware]. Resolved by [pausing job / rescheduling / isolating device].\n\n**Recommendations:**\n- Schedule bulk transfers outside 8AM-6PM\n- Implement QoS policies\n- Consider network segmentation (backup VLAN)",
                        },
                        {
                            "id": "fix_uplink",
                            "type": "action",
                            "title": "Fix Saturated Switch Uplink",
                            "description": "The uplink between switches (or switch to firewall) is maxed out.\n\n**Check the uplink:**\n- What speed is it? (1G, 10G?)\n- Is it a single link or LAG (link aggregation)?\n\n**Fixes:**\n- **Upgrade the uplink** to 10G if switches support it\n- **Add a second uplink** and configure Link Aggregation (LACP)\n- **Move heavy-traffic devices** to the switch closest to the firewall\n- **Implement VLANs** to keep local traffic local (e.g., printer traffic shouldn't cross uplinks)\n\n**If the firewall uplink is saturated:**\nThe internet connection itself may be too small for the number of users. See the WAN/ISP troubleshooting path.",
                            "next_node_id": "solution_uplink",
                        },
                        {
                            "id": "solution_uplink",
                            "type": "solution",
                            "title": "Resolved: Switch Uplink Upgraded",
                            "description": "Uplink bottleneck resolved.\n\n**Ticket Notes:** Network slowdown caused by saturated switch uplink (was [speed]). Resolved by [upgrading to 10G / adding LACP / restructuring traffic flow].\n\n**Prevention:** Monitor uplink utilization with network monitoring tools. Set alerts at 70% sustained utilization.",
                        },
                        {
                            "id": "fix_switch_issue",
                            "type": "action",
                            "title": "Fix Switch / Spanning Tree Issue",
                            "description": "Switch is showing errors, high CPU, or possible network loop.\n\n**Check for a network loop:**\n- Unmanaged switches or user-plugged patch cables are common loop sources\n- High CPU + broadcast storm symptoms: everything slows, then briefly recovers, then slows again\n- On managed switches: check spanning tree status for 'blocking' ports\n\n**Check for CRC errors:**\n- Log into managed switch\n- Show interface counters for CRC errors, runts, giants\n- Errors usually indicate bad cables, bad SFPs, or failing ports\n\n**Fix:**\n- Loop: Find and remove the offending cable/switch. Enable spanning tree (BPDU guard, loop protection)\n- CRC errors: Replace the cable or SFP on the erroring port\n- High CPU: Check for broadcast storms, ARP floods, or multicast issues\n\n**If unmanaged switches are present:** Replace with managed switches. Unmanaged switches are a major risk for loops.",
                            "next_node_id": "solution_switch_fixed",
                        },
                        {
                            "id": "solution_switch_fixed",
                            "type": "solution",
                            "title": "Resolved: Switch / Network Issue",
                            "description": "Network performance restored after fixing the switch issue.\n\n**Ticket Notes:** Network slowdown caused by [loop / CRC errors / spanning tree issue / broadcast storm]. Resolved by [removing loop / replacing cable / enabling BPDU guard / replacing unmanaged switch].\n\n**Prevention:**\n- Enable BPDU guard and loop protection on all managed switches\n- Replace unmanaged switches with managed\n- Label all patch cables to prevent accidental loops",
                        },
                    ],
                },
                # --- Branch: everyone at the site is slow (WAN / ISP / firewall) ---
                {
                    "id": "check_wan_isp",
                    "type": "action",
                    "title": "Check WAN / ISP Connection",
                    "description": "Everyone is slow — likely a WAN or ISP issue.\n\n**Step 1: Speed test from the firewall or a directly-connected device**\nCompare results to the contracted ISP speed.\n\n**Step 2: Check the ISP circuit**\n- Is the modem/ONT showing link lights?\n- Any errors on the WAN interface of the firewall?\n- Check firewall WAN interface stats for errors, drops, CRC\n\n**Step 3: Check if the ISP is having an outage**\n- https://downdetector.com — search for the ISP\n- Check ISP's status page\n- Call the ISP NOC\n\n**Step 4: Run a traceroute**\n```\ntracert 8.8.8.8\n```\nLook for high latency or timeouts at specific hops. If the first hop (firewall) is slow, it's internal. If later hops are slow, it's ISP.\n\n**Step 5: Check firewall throughput**\n- Is UTM/IPS/content filtering maxing out the firewall CPU?\n- Some firewalls slow down significantly with all security features enabled\n- Check firewall CPU and memory utilization",
                    "next_node_id": "wan_result",
                },
                {
                    "id": "wan_result",
                    "type": "decision",
                    "question": "What did the WAN/ISP check reveal?",
                    "help_text": "Compare speed tests to contracted speeds and check firewall stats",
                    "options": [
                        {"id": "isp_issue", "label": "ISP speed is well below contracted rate", "next_node_id": "fix_isp_issue"},
                        {"id": "firewall_bottleneck", "label": "Firewall is the bottleneck (high CPU / UTM)", "next_node_id": "fix_firewall_bottleneck"},
                        {"id": "circuit_too_small", "label": "Speed matches contract but is too slow for the site", "next_node_id": "solution_upgrade_circuit"},
                        {"id": "wan_ok", "label": "WAN speed is fine — issue is elsewhere", "next_node_id": "check_lan_saturation"},
                    ],
                    "children": [
                        {
                            "id": "fix_isp_issue",
                            "type": "action",
                            "title": "Address ISP Performance Issue",
                            "description": "Speed is significantly below the contracted rate.\n\n**Document before calling the ISP:**\n- Speed test results (multiple tests, different times)\n- Traceroute showing where the slowdown is\n- WAN interface stats from the firewall\n- Test from a device directly connected to the modem (bypass firewall) to rule out internal issues\n\n**Call the ISP:**\n- Reference your circuit ID / account number\n- Report the speed discrepancy\n- Ask them to check for errors on their side, check the modem/ONT signal levels\n- Request a tech dispatch if they can't resolve remotely\n\n**If the ISP says everything looks fine on their end:**\n- Ask for the modem/ONT signal levels (SNR, attenuation)\n- Power cycle the modem/ONT\n- Check for damaged cabling from the demarc to the modem",
                            "next_node_id": "solution_isp_issue",
                        },
                        {
                            "id": "solution_isp_issue",
                            "type": "solution",
                            "title": "ISP Issue Reported",
                            "description": "ISP performance issue identified and reported.\n\n**Ticket Notes:** Internet speed at [X] Mbps, contracted for [Y] Mbps. Tested from device directly connected to modem to rule out internal issues. ISP ticket opened: [ISP ticket #]. [ISP is dispatching tech / ISP found issue on their side / awaiting ISP response].\n\n**Follow-up:** Retest after ISP resolves. If this is a recurring issue, consider a secondary ISP for failover.",
                        },
                        {
                            "id": "fix_firewall_bottleneck",
                            "type": "action",
                            "title": "Address Firewall Throughput Bottleneck",
                            "description": "The firewall is limiting throughput — CPU is high or UTM features are reducing speed.\n\n**Check firewall CPU and memory:**\nLog into the firewall admin console and check dashboard/system status.\n\n**Common causes:**\n- UTM features (IPS, content filtering, SSL inspection) consuming too much CPU\n- Firewall hardware is undersized for the number of users/throughput\n- Firmware is outdated (newer firmware often has performance improvements)\n- Too many VPN tunnels or NAT sessions\n\n**Quick fixes:**\n- Reduce IPS/UTM logging verbosity\n- Disable SSL deep inspection if not strictly required (major CPU saver)\n- Exclude trusted traffic from UTM scanning (M365, known-good sites)\n- Update firmware\n\n**Long-term:** If the firewall is simply too small, it needs to be right-sized. Check the vendor's throughput specs with UTM enabled (not just raw firewall throughput).",
                            "next_node_id": "solution_firewall_bottleneck",
                        },
                        {
                            "id": "solution_firewall_bottleneck",
                            "type": "solution",
                            "title": "Resolved: Firewall Throughput Issue",
                            "description": "Internet speed improved after addressing firewall bottleneck.\n\n**Ticket Notes:** Internet slow for all users. Firewall CPU at [X]% with UTM enabled. Resolved by [disabling SSL inspection / excluding M365 from UTM / updating firmware / reducing logging]. Speed improved from [X] to [Y] Mbps.\n\n**If firewall is undersized:** Recommend hardware upgrade. Always check vendor specs for 'threat inspection throughput' not just 'firewall throughput' — they can be 5-10x different.",
                        },
                        {
                            "id": "solution_upgrade_circuit",
                            "type": "solution",
                            "title": "Recommendation: Upgrade Internet Circuit",
                            "description": "The internet connection is performing at contracted speed but is insufficient for the site.\n\n**Ticket Notes:** Internet speed matches contracted [X] Mbps but is insufficient for [Y] users at this site. Average utilization during business hours: [Z]%.\n\n**Recommendations:**\n- Current bandwidth per user: [X/Y] Mbps — industry recommendation is 25-50 Mbps per user minimum for cloud-heavy environments\n- Upgrade circuit to [recommended speed]\n- Consider adding a secondary ISP for failover and load balancing\n- In the meantime: Implement QoS to prioritize critical applications (VoIP, video conferencing) over bulk traffic\n\n**Escalate to:** Client decision-maker for circuit upgrade approval.",
                        },
                    ],
                },
                # --- Branch: slowness comes and goes ---
                {
                    "id": "check_intermittent",
                    "type": "action",
                    "title": "Diagnose Intermittent Slowdowns",
                    "description": "Internet speed comes and goes — hard to catch in the moment.\n\n**Step 1: Establish a baseline with continuous monitoring**\n- Set up a continuous ping to 8.8.8.8 and log results:\n```\nping -t 8.8.8.8 > C:\\Temp\\ping_log.txt\n```\n- Use a free monitoring tool: PRTG (100 sensors free) or PingPlotter\n- Let it run for 24-48 hours to catch the pattern\n\n**Step 2: Identify the pattern**\n- Same time every day? → Scheduled job (backup, updates, AV scan)\n- Random but frequent? → ISP instability, bad cable, or overheating equipment\n- Only during heavy usage? → Bandwidth is insufficient for peak demand\n\n**Step 3: Check for scheduled jobs**\n- What time do backups run?\n- When does Windows Update check/install?\n- When does AV push definitions?\n- When do cloud sync tools run full scans?\n\n**Step 4: Check hardware health**\n- Is the modem, switch, or firewall overheating? (check in a hot server room?)\n- Overheating equipment can throttle or restart intermittently",
                    "next_node_id": "intermittent_result",
                },
                {
                    "id": "intermittent_result",
                    "type": "decision",
                    "question": "Did you identify the pattern?",
                    "help_text": "Review monitoring data and scheduled tasks",
                    # Three of these options jump to nodes defined in other branches.
                    "options": [
                        {"id": "scheduled_job", "label": "Coincides with a scheduled job (backup, updates)", "next_node_id": "fix_bandwidth_hog"},
                        {"id": "isp_instability", "label": "ISP connection is dropping/degrading intermittently", "next_node_id": "fix_isp_issue"},
                        {"id": "hardware_issue", "label": "Equipment overheating or failing", "next_node_id": "solution_hardware_issue"},
                        {"id": "peak_usage", "label": "Happens during peak usage times", "next_node_id": "solution_upgrade_circuit"},
                    ],
                    "children": [
                        {
                            "id": "solution_hardware_issue",
                            "type": "solution",
                            "title": "Resolved: Network Hardware Issue",
                            "description": "Intermittent slowdowns caused by failing or overheating network equipment.\n\n**Ticket Notes:** Intermittent internet slowdowns traced to [modem / switch / firewall] [overheating / failing]. [Moved equipment / improved cooling / replaced device].\n\n**Prevention:**\n- Ensure network equipment has adequate ventilation\n- Monitor equipment temperatures (SNMP sensors)\n- Replace aging equipment proactively (switches, firewalls have ~7-10 year lifespans)\n- Keep firmware updated",
                        },
                    ],
                },
            ],
        },
    }
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tree 2: Wireless Connectivity Problems
|
||||
# =============================================================================
|
||||
def get_wireless_connectivity_tree() -> dict[str, Any]:
    """Wireless Connectivity Problems - Networking tree.

    Build the seed definition for the Wi-Fi troubleshooting decision tree.

    Returns:
        A dict with the keys ``name``, ``description``, ``category`` and
        ``tree_structure``. ``tree_structure`` is the root ``decision`` node;
        every node carries an ``id`` and a ``type`` (``decision``, ``action``
        or ``solution``). Decision nodes list ``options`` that point to another
        node through ``next_node_id``; action nodes chain to their follow-up
        node through ``next_node_id``; solution nodes are leaves.

    The root fans out into five branches: cannot connect, frequent drops,
    slow Wi-Fi, missing SSID and roaming problems. Some options deliberately
    point at nodes that are declared under a *different* branch (for example
    the slow-Wi-Fi and missing-SSID decisions reuse ``solution_weak_signal``,
    ``fix_interference``, ``solution_ap_overloaded``, ``fix_ap_issue`` and
    ``fix_wifi_adapter``).
    NOTE(review): this presumes the consumer resolves ``next_node_id`` against
    the whole tree rather than only the local ``children`` list - confirm
    against the loader.
    """
    return {
        "name": "Wireless Connectivity Problems",
        "description": "Troubleshoot Wi-Fi connectivity issues including connection failures, frequent disconnects, slow wireless speeds, roaming problems, and SSID visibility. Covers both single-AP and enterprise wireless environments.",
        "category": "Networking",
        "tree_structure": {
            "id": "root",
            "type": "decision",
            "question": "What is the wireless issue?",
            "help_text": "Identify the specific Wi-Fi problem to narrow down the cause.",
            "options": [
                {"id": "cant_connect", "label": "Can't connect to Wi-Fi at all", "next_node_id": "check_cant_connect"},
                {"id": "keeps_dropping", "label": "Connects but keeps disconnecting", "next_node_id": "check_drops"},
                {"id": "slow_wifi", "label": "Connected but Wi-Fi is very slow", "next_node_id": "check_slow_wifi"},
                {"id": "no_ssid", "label": "Wi-Fi network (SSID) not showing up", "next_node_id": "check_ssid_missing"},
                {"id": "roaming_issues", "label": "Drops when moving between areas / floors", "next_node_id": "check_roaming"}
            ],
            "children": [
                # --- Branch 1: client cannot connect at all ---
                {
                    "id": "check_cant_connect",
                    "type": "action",
                    "title": "Diagnose Wi-Fi Connection Failure",
                    "description": "User can't connect to the wireless network.\n\n**Step 1: Check the basics**\n- Is Wi-Fi turned on? (check hardware switch, Fn key, airplane mode)\n- Is the correct SSID selected?\n- Is the password correct? (most common issue)\n\n**Step 2: Check if other devices can connect**\n- If no devices can connect: AP or RADIUS issue\n- If only this device fails: device-specific problem\n\n**Step 3: Check the Wi-Fi adapter**\n```\nnetsh wlan show interfaces\nnetsh wlan show drivers\n```\nLook for: Radio state (on/off), supported modes, driver version.\n\n**Step 4: Forget and reconnect**\n1. Settings > Network & Internet > Wi-Fi > Manage known networks\n2. Select the network > Forget\n3. Reconnect and enter the password\n\n**Step 5: For enterprise WPA2-Enterprise / 802.1X:**\n- Is the user's certificate valid?\n- Is the RADIUS server reachable and responding?\n- Check the RADIUS server logs for rejection reasons",
                    "next_node_id": "cant_connect_result"
                },
                {
                    "id": "cant_connect_result",
                    "type": "decision",
                    "question": "What's preventing the connection?",
                    "help_text": "Based on the checks above",
                    "options": [
                        {"id": "wrong_password", "label": "Wrong password / credential issue", "next_node_id": "solution_wifi_password"},
                        {"id": "adapter_issue", "label": "Wi-Fi adapter disabled or driver issue", "next_node_id": "fix_wifi_adapter"},
                        {"id": "radius_issue", "label": "802.1X / RADIUS authentication failing", "next_node_id": "fix_radius_auth"},
                        {"id": "ap_issue", "label": "No devices can connect — AP issue", "next_node_id": "fix_ap_issue"}
                    ],
                    "children": [
                        {
                            "id": "solution_wifi_password",
                            "type": "solution",
                            "title": "Resolved: Wi-Fi Password/Credential Issue",
                            "description": "User connected after fixing credentials.\n\n**Ticket Notes:** Wi-Fi connection failed due to [wrong password / expired credentials / incorrect profile]. Forgot the network, reconnected with correct credentials.\n\n**If WPA2-Enterprise:** Ensure the user's certificate hasn't expired and their domain credentials are current."
                        },
                        {
                            # Also the target of the "client_driver" option in ssid_result.
                            "id": "fix_wifi_adapter",
                            "type": "action",
                            "title": "Fix Wi-Fi Adapter Issue",
                            "description": "Wi-Fi adapter is disabled, missing, or has a driver problem.\n\n**Re-enable the adapter:**\n```\n# Check adapter status\nGet-NetAdapter -Name \"Wi-Fi\" | Select Status\n\n# Enable if disabled\nEnable-NetAdapter -Name \"Wi-Fi\"\n```\n\n**Check Device Manager:**\n- Right-click Start > Device Manager > Network adapters\n- Look for the wireless adapter — yellow warning icon means driver issue\n- Right-click > Update driver > Search automatically\n- If no wireless adapter listed: check if it's disabled in BIOS/UEFI\n\n**Reset the adapter stack:**\n```\nnetsh winsock reset\nnetsh int ip reset\nipconfig /flushdns\nipconfig /release\nipconfig /renew\n```\nRestart the computer after running these.\n\n**If driver update doesn't help:** Download the latest driver from the laptop manufacturer's website (not Windows Update).",
                            "next_node_id": "solution_adapter_fixed"
                        },
                        {
                            "id": "solution_adapter_fixed",
                            "type": "solution",
                            "title": "Resolved: Wi-Fi Adapter Fixed",
                            "description": "Wi-Fi adapter restored and connecting.\n\n**Ticket Notes:** Wi-Fi not connecting. Adapter was [disabled / driver corrupted / missing from Device Manager]. Resolved by [re-enabling / updating driver from manufacturer / resetting network stack].\n\n**If BIOS-disabled:** Document that the wireless was disabled in BIOS settings and re-enabled."
                        },
                        {
                            "id": "fix_radius_auth",
                            "type": "action",
                            "title": "Fix RADIUS / 802.1X Authentication",
                            "description": "Enterprise Wi-Fi authentication is failing.\n\n**Check the RADIUS server (NPS):**\n1. Event Viewer on the NPS server > Custom Views > Server Roles > Network Policy and Access Services\n2. Look for reject events — they show the reason code\n\n**Common RADIUS failures:**\n- **Certificate expired** on the user, computer, or RADIUS server\n- **User not in the allowed group** specified in the NPS policy\n- **Computer not domain-joined** (if policy requires domain membership)\n- **NPS policy mismatch** (wrong auth type, encryption settings)\n- **RADIUS shared secret mismatch** between AP and NPS server\n\n**Quick fixes:**\n1. Verify the user is in the correct security group\n2. Check certificate expiration dates\n3. Delete the Wi-Fi profile on the client and re-create it\n4. If using GPO-deployed Wi-Fi profiles: run `gpupdate /force`\n\n**Test with a known-working account** to isolate whether it's user-specific or systemic.",
                            "next_node_id": "solution_radius_fixed"
                        },
                        {
                            "id": "solution_radius_fixed",
                            "type": "solution",
                            "title": "Resolved: RADIUS Authentication Fixed",
                            "description": "802.1X/RADIUS authentication restored.\n\n**Ticket Notes:** Wi-Fi 802.1X authentication failing. NPS logs showed: [reason]. Resolved by [adding user to group / renewing certificate / fixing NPS policy / correcting shared secret].\n\n**If certificate-related:** Check expiration dates for:\n- NPS server certificate\n- Root CA certificate distributed to clients\n- User/computer certificates"
                        },
                        {
                            # Also the target of the "ap_ssid_down" option in ssid_result.
                            "id": "fix_ap_issue",
                            "type": "action",
                            "title": "Troubleshoot Access Point",
                            "description": "No devices can connect — the AP itself may be the problem.\n\n**Step 1: Check AP status**\n- Is the AP powered on? (check LED indicators)\n- Is the AP reachable on the network? (ping its management IP)\n- Log into the wireless controller or AP management console\n\n**Step 2: Check for common AP issues:**\n- **Power cycle the AP** — many issues resolve with a reboot\n- **PoE power** — is the switch providing enough power? (check PoE budget)\n- **DHCP pool exhausted** — clients can't get an IP (check DHCP scope)\n- **Channel congestion** — AP is on a congested channel\n- **Firmware** — is the AP firmware up to date?\n\n**Step 3: If managed by a controller:**\n- Check controller for AP status and alerts\n- Check if the AP has lost its connection to the controller\n- Is the AP's VLAN trunk configured correctly on the switch?\n\n**Step 4: Try a different AP** — swap with a known-good AP to isolate hardware failure.",
                            "next_node_id": "solution_ap_fixed"
                        },
                        {
                            "id": "solution_ap_fixed",
                            "type": "solution",
                            "title": "Resolved: Access Point Issue",
                            "description": "Wi-Fi connectivity restored after fixing the AP.\n\n**Ticket Notes:** No devices could connect to [SSID]. AP at [location] was [unresponsive / PoE issue / firmware crash / controller disconnect]. Resolved by [power cycle / fixing PoE / updating firmware / re-adopting to controller].\n\n**If AP hardware failure:** Replace the unit and configure the replacement."
                        }
                    ]
                },
                # --- Branch 2: connects but keeps disconnecting ---
                {
                    "id": "check_drops",
                    "type": "action",
                    "title": "Diagnose Frequent Wi-Fi Disconnections",
                    "description": "User connects but keeps getting disconnected.\n\n**Step 1: Check event logs for disconnect reasons**\n```\nGet-WinEvent -LogName 'Microsoft-Windows-WLAN-AutoConfig/Operational' -MaxEvents 20 | Select TimeCreated, Message\n```\n\n**Step 2: Check signal strength during a dropout**\n```\nnetsh wlan show interfaces\n```\nSignal below 50% = likely cause of drops.\n\n**Step 3: Common causes:**\n- **Weak signal** — user is too far from the AP\n- **Interference** — microwaves, Bluetooth, cordless phones on 2.4GHz\n- **Driver power management** — Windows is turning off Wi-Fi to save power\n- **AP overloaded** — too many clients on one AP (usually 30+ causes issues)\n- **DHCP lease issues** — very short lease time causing re-auth\n- **DFS channel change** — radar detection causes AP to switch channels, dropping clients\n\n**Step 4: Disable Wi-Fi power saving**\nDevice Manager > Network adapter > Properties > Power Management > Uncheck \"Allow the computer to turn off this device to save power\"\n\nAlso: Adapter properties > Advanced > Power Save Mode > set to Maximum Performance",
                    "next_node_id": "drops_result"
                },
                {
                    "id": "drops_result",
                    "type": "decision",
                    "question": "What's causing the disconnections?",
                    "help_text": "Based on signal strength, event logs, and environment checks",
                    "options": [
                        {"id": "weak_signal", "label": "Weak signal — too far from AP", "next_node_id": "solution_weak_signal"},
                        {"id": "power_mgmt", "label": "Power management turning off Wi-Fi", "next_node_id": "solution_power_mgmt"},
                        {"id": "interference", "label": "Interference on the channel", "next_node_id": "fix_interference"},
                        {"id": "ap_overloaded", "label": "AP is overloaded with too many clients", "next_node_id": "solution_ap_overloaded"}
                    ],
                    "children": [
                        {
                            # Also the target of the "poor_signal" option in slow_wifi_result.
                            "id": "solution_weak_signal",
                            "type": "solution",
                            "title": "Resolved: Weak Wi-Fi Signal",
                            "description": "Disconnections caused by weak signal in the user's area.\n\n**Ticket Notes:** Wi-Fi disconnecting due to weak signal ([X]% signal strength). User is [location], too far from nearest AP at [AP location].\n\n**Fixes applied:** [Moved user / added AP / replaced AP with higher-power model / switched to 2.4GHz for better range].\n\n**If additional coverage is needed:** Recommend a site survey to identify optimal AP placement."
                        },
                        {
                            "id": "solution_power_mgmt",
                            "type": "solution",
                            "title": "Resolved: Wi-Fi Power Management Disabled",
                            "description": "Disconnections stopped after disabling Wi-Fi power management.\n\n**Ticket Notes:** Wi-Fi disconnecting intermittently. Windows power management was turning off the wireless adapter. Disabled in Device Manager and set adapter to Maximum Performance.\n\n**To deploy org-wide:** Use Group Policy:\nComputer Config > Admin Templates > System > Power Management > set wireless adapter to Maximum Performance on AC power."
                        },
                        {
                            # Also the target of the "congested_channel" option in slow_wifi_result.
                            "id": "fix_interference",
                            "type": "action",
                            "title": "Address Wi-Fi Channel Interference",
                            "description": "Wi-Fi channel is congested or has interference.\n\n**Step 1: Scan for competing networks**\nUse a Wi-Fi analyzer app (e.g., WiFi Analyzer for Android, or inSSIDer for Windows).\n- How many SSIDs are on the same channel?\n- Are neighboring businesses on overlapping channels?\n\n**Step 2: Choose the best channel**\n- **2.4GHz:** Only use channels 1, 6, or 11 (non-overlapping). Pick the least crowded.\n- **5GHz:** More channels available — switch to a less crowded one. Avoid DFS channels if radar is an issue.\n\n**Step 3: Change the channel on the AP**\n- Log into the AP or wireless controller\n- Set the radio to the selected channel (disable auto if it keeps picking a bad one)\n\n**Step 4: Check for non-Wi-Fi interference**\n- Microwaves (2.4GHz interference)\n- Bluetooth devices\n- Cordless phones\n- USB 3.0 hubs (known to cause 2.4GHz interference)",
                            "next_node_id": "solution_interference_fixed"
                        },
                        {
                            "id": "solution_interference_fixed",
                            "type": "solution",
                            "title": "Resolved: Wi-Fi Interference",
                            "description": "Wi-Fi stability improved after addressing channel interference.\n\n**Ticket Notes:** Wi-Fi disconnections caused by channel interference. Changed [AP name] from channel [X] to channel [Y] on [2.4/5]GHz band. Also [removed interference source / moved microwave / switched clients to 5GHz].\n\n**Best practice:** For enterprise environments, use a wireless controller with automatic channel management. For small sites, manually set non-overlapping channels."
                        },
                        {
                            # Also the target of the "too_many_clients" option in slow_wifi_result.
                            "id": "solution_ap_overloaded",
                            "type": "solution",
                            "title": "Resolved: AP Overloaded — Too Many Clients",
                            "description": "AP had too many connected clients causing instability.\n\n**Ticket Notes:** AP at [location] had [X] connected clients. Performance degrades above ~25-30 clients per AP. [Added additional AP / load balanced clients / configured band steering to push clients to 5GHz].\n\n**Recommendations:**\n- Deploy additional APs to distribute the client load\n- Enable band steering to push dual-band devices to 5GHz\n- Consider client load balancing on the wireless controller\n- Target 15-25 clients per AP for reliable performance"
                        }
                    ]
                },
                # --- Branch 3: connected but slow ---
                {
                    "id": "check_slow_wifi",
                    "type": "action",
                    "title": "Diagnose Slow Wi-Fi Speeds",
                    "description": "User is connected but Wi-Fi is very slow.\n\n**Step 1: Check connection speed and signal**\n```\nnetsh wlan show interfaces\n```\nLook at: Receive/Transmit rate, Signal, Radio type, Channel\n\n**Step 2: Speed test on Wi-Fi vs Ethernet**\nRun speedtest.net on Wi-Fi, then on Ethernet. This shows how much the Wi-Fi is limiting speed.\n\n**Step 3: Check which band/standard the client is on**\n- 802.11n on 2.4GHz = max ~70Mbps real-world\n- 802.11ac on 5GHz = max ~400Mbps real-world\n- 802.11ax (Wi-Fi 6) on 5GHz = max ~600Mbps+ real-world\n\n**If connected at low rates (e.g., 54Mbps, 72Mbps):**\n- Client may be forcing an older standard\n- Adapter > Properties > Advanced > Wireless Mode > enable all standards\n- Or the AP is configured for legacy compatibility mode (slows everyone down)\n\n**Step 4: Check AP client count**\nMany clients on one AP = everyone gets slower. Over 25 clients is a concern.",
                    "next_node_id": "slow_wifi_result"
                },
                {
                    "id": "slow_wifi_result",
                    "type": "decision",
                    "question": "What's causing slow Wi-Fi?",
                    "help_text": "Compare Wi-Fi speed to Ethernet and check connection parameters",
                    # Three of these options reuse nodes declared under drops_result.
                    "options": [
                        {"id": "old_standard", "label": "Client connected on old/slow standard (11n, 11g)", "next_node_id": "solution_upgrade_wifi_standard"},
                        {"id": "poor_signal", "label": "Signal is weak — degrading speed", "next_node_id": "solution_weak_signal"},
                        {"id": "congested_channel", "label": "Channel is congested", "next_node_id": "fix_interference"},
                        {"id": "too_many_clients", "label": "Too many clients on the AP", "next_node_id": "solution_ap_overloaded"}
                    ],
                    "children": [
                        {
                            "id": "solution_upgrade_wifi_standard",
                            "type": "solution",
                            "title": "Resolved: Wi-Fi Standard Upgrade Needed",
                            "description": "Client is connecting on an older, slower Wi-Fi standard.\n\n**Ticket Notes:** User's Wi-Fi slow due to connection on [802.11n/g] instead of [802.11ac/ax]. [Updated adapter settings / replaced adapter / switched to 5GHz band].\n\n**If the AP only supports 802.11n:** Recommend upgrading to Wi-Fi 5 (802.11ac) or Wi-Fi 6 (802.11ax) APs.\n**If the client only supports 802.11n:** A USB Wi-Fi 6 adapter is an inexpensive upgrade."
                        }
                    ]
                },
                # --- Branch 4: SSID not visible ---
                {
                    "id": "check_ssid_missing",
                    "type": "action",
                    "title": "Troubleshoot Missing SSID",
                    "description": "The Wi-Fi network name isn't appearing in the available networks list.\n\n**Step 1: Can other devices see the SSID?**\n- If no devices see it: AP issue or SSID is disabled\n- If only this device can't see it: client-side issue\n\n**Step 2: Check if SSID is hidden**\n- Some networks are configured as hidden (SSID broadcast disabled)\n- To connect to a hidden SSID: Network & Internet > Wi-Fi > Add a network > enter the SSID manually\n\n**Step 3: Check if the correct band is supported**\n- If the SSID is only on 5GHz and the client only has 2.4GHz, it won't appear\n- Check: `netsh wlan show drivers` — look for 'Supported bands'\n\n**Step 4: Check the AP**\n- Is the SSID still configured and enabled on the AP/controller?\n- Is the AP's radio turned on?\n- Did someone accidentally delete or disable the SSID?\n\n**Step 5: Scan for networks**\n```\nnetsh wlan show networks mode=bssid\n```\nThis shows all detected networks with their channel and signal strength.",
                    "next_node_id": "ssid_result"
                },
                {
                    "id": "ssid_result",
                    "type": "decision",
                    "question": "Why is the SSID not visible?",
                    "help_text": "Based on the checks above",
                    # The last two options reuse nodes declared under cant_connect_result.
                    "options": [
                        {"id": "hidden", "label": "SSID is hidden — need to connect manually", "next_node_id": "solution_hidden_ssid"},
                        {"id": "band_mismatch", "label": "SSID is on 5GHz, client only has 2.4GHz", "next_node_id": "solution_band_mismatch"},
                        {"id": "ap_ssid_down", "label": "SSID was disabled or AP radio is off", "next_node_id": "fix_ap_issue"},
                        {"id": "client_driver", "label": "Client Wi-Fi driver issue — can't scan", "next_node_id": "fix_wifi_adapter"}
                    ],
                    "children": [
                        {
                            "id": "solution_hidden_ssid",
                            "type": "solution",
                            "title": "Resolved: Connected to Hidden SSID",
                            "description": "Network was configured as a hidden SSID. Connected manually.\n\n**Ticket Notes:** Wi-Fi SSID [name] not appearing because SSID broadcast is disabled. Connected manually by adding the network profile.\n\n**Note:** Hidden SSIDs are not more secure — they actually cause the client to broadcast the SSID name while probing. Consider enabling SSID broadcast and using proper WPA2/3 Enterprise for security."
                        },
                        {
                            "id": "solution_band_mismatch",
                            "type": "solution",
                            "title": "Resolved: Band Mismatch",
                            "description": "Client doesn't support the frequency band the SSID is on.\n\n**Ticket Notes:** SSID [name] is configured on 5GHz only. User's device only supports 2.4GHz. [Added 2.4GHz SSID / provided USB dual-band adapter / user connected to alternate SSID].\n\n**Recommendation:** Most enterprise environments should have both 2.4GHz and 5GHz SSIDs available, or a single SSID on both bands with band steering."
                        }
                    ]
                },
                # --- Branch 5: drops while roaming between APs ---
                {
                    "id": "check_roaming",
                    "type": "action",
                    "title": "Troubleshoot Wi-Fi Roaming Issues",
                    "description": "User drops connection when moving between APs (different floors, areas).\n\n**What should happen:** Client seamlessly roams from one AP to the next without disconnecting.\n\n**Step 1: Check roaming configuration**\n- Are all APs on the same SSID and security settings? (Must match exactly)\n- Are all APs on the same VLAN? (Or is there L3 roaming configured?)\n- Is fast roaming enabled? (802.11r, OKC, or PMKSA caching)\n\n**Step 2: Check AP overlap**\n- Adjacent APs should have 15-20% signal overlap\n- If there's a dead zone between APs, the client drops before finding the next AP\n- Use a Wi-Fi survey tool to check coverage\n\n**Step 3: Check client-side roaming aggressiveness**\n- Adapter > Properties > Advanced > Roaming Aggressiveness\n- Set to 'Medium' or 'High' — low aggressiveness means the client clings to a weak AP too long\n\n**Step 4: Check for 'sticky client' behavior**\n- Client stays connected to a distant AP instead of roaming to a closer one\n- Fix: Enable minimum RSSI on the AP (disconnect clients below -75dBm threshold)\n- Enable band steering and fast roaming on the controller",
                    "next_node_id": "roaming_result"
                },
                {
                    "id": "roaming_result",
                    "type": "decision",
                    "question": "What's causing the roaming issue?",
                    "help_text": "Based on coverage analysis and configuration checks",
                    "options": [
                        {"id": "dead_zone", "label": "Dead zone between APs — no overlap", "next_node_id": "solution_dead_zone"},
                        {"id": "sticky_client", "label": "Client is sticky — won't roam", "next_node_id": "solution_sticky_client"},
                        {"id": "config_mismatch", "label": "SSID or security mismatch between APs", "next_node_id": "solution_ssid_mismatch"},
                        {"id": "no_fast_roaming", "label": "Fast roaming (802.11r) not enabled", "next_node_id": "solution_fast_roaming"}
                    ],
                    "children": [
                        {
                            "id": "solution_dead_zone",
                            "type": "solution",
                            "title": "Resolved: Wi-Fi Dead Zone",
                            "description": "Coverage gap between APs causing disconnections.\n\n**Ticket Notes:** Wi-Fi drops when user moves between [area A] and [area B]. Coverage survey confirmed dead zone. [Repositioned AP / added additional AP / increased AP transmit power].\n\n**Recommendation:** Conduct a professional Wi-Fi site survey to identify all dead zones. APs should have 15-20% signal overlap at -67dBm or better for seamless roaming."
                        },
                        {
                            "id": "solution_sticky_client",
                            "type": "solution",
                            "title": "Resolved: Sticky Client Issue",
                            "description": "Client was holding onto a distant AP instead of roaming.\n\n**Ticket Notes:** User's device staying connected to distant AP ([AP name], signal [X]dBm) instead of roaming to closer AP. Resolved by [increasing roaming aggressiveness on client / enabling minimum RSSI on AP / configuring band steering].\n\n**AP-side fixes:**\n- Set minimum RSSI threshold to -75dBm (disconnect weak clients)\n- Enable client load balancing on the controller"
                        },
                        {
                            "id": "solution_ssid_mismatch",
                            "type": "solution",
                            "title": "Resolved: SSID/Security Mismatch",
                            "description": "APs had different SSID or security configurations preventing roaming.\n\n**Ticket Notes:** Roaming failure between APs. [AP at location A] and [AP at location B] had mismatched [SSID / security type / VLAN / WPA settings]. Corrected to match across all APs.\n\n**Prevention:** Use a wireless controller to manage all APs centrally — this prevents configuration drift."
                        },
                        {
                            "id": "solution_fast_roaming",
                            "type": "solution",
                            "title": "Resolved: Fast Roaming Enabled",
                            "description": "Enabled fast roaming protocols to speed up transitions between APs.\n\n**Ticket Notes:** Wi-Fi drops during roaming due to slow re-authentication. Enabled [802.11r (Fast BSS Transition) / OKC (Opportunistic Key Caching) / PMKSA caching] on the wireless controller.\n\n**Note:** 802.11r can cause issues with some older devices. Test before deploying broadly. OKC is usually a safer first option.\n\n**Result:** Roaming transitions now take <50ms instead of 1-3 seconds."
                        }
                    ]
                }
            ]
        }
    }
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tree 3: Firewall Blocking Issues
|
||||
# =============================================================================
|
||||
def get_firewall_blocking_tree() -> dict[str, Any]:
|
||||
"""Firewall Blocking Issues - Networking tree."""
|
||||
return {
|
||||
"name": "Firewall Blocking Issues",
|
||||
"description": "Troubleshoot firewall-related blocking of applications, websites, ports, and services. Covers both Windows Firewall and network firewalls (UTM/NGFW). Includes common port requirements, rule creation, and log analysis.",
|
||||
"category": "Networking",
|
||||
"tree_structure": {
|
||||
"id": "root",
|
||||
"type": "decision",
|
||||
"question": "What is being blocked?",
|
||||
"help_text": "Identify what the user can't access or what application isn't working.",
|
||||
"options": [
|
||||
{"id": "website", "label": "A specific website or web application", "next_node_id": "check_website_block"},
|
||||
{"id": "application", "label": "A desktop application can't connect", "next_node_id": "check_app_block"},
|
||||
{"id": "port_service", "label": "A specific port or service is blocked", "next_node_id": "check_port_block"},
|
||||
{"id": "vpn_blocked", "label": "VPN can't connect through the firewall", "next_node_id": "check_vpn_block"},
|
||||
{"id": "not_sure", "label": "Something isn't working but not sure if it's the firewall", "next_node_id": "diagnose_firewall_vs_other"}
|
||||
],
|
||||
"children": [
|
||||
{
|
||||
"id": "check_website_block",
|
||||
"type": "action",
|
||||
"title": "Diagnose Website/URL Blocking",
|
||||
"description": "User can't access a specific website.\n\n**Step 1: Verify the block**\n- Can other users at the same site access it?\n- Can the user access it from their phone (on cellular, not Wi-Fi)?\n- What error message do they see? (timeout, block page, SSL error, etc.)\n\n**Step 2: Check for a firewall block page**\n- Many UTM firewalls show a branded block page (SonicWall, Fortinet, Sophos, etc.)\n- The block page usually tells you the category (e.g., 'Social Media', 'Uncategorized', 'Security Risk')\n- This confirms it's the firewall content filter\n\n**Step 3: Check the firewall content filter logs**\n- Log into the firewall admin console\n- Check the web filter or content filter log\n- Search for the URL/domain\n- Note the category and policy that blocked it\n\n**Step 4: Check SSL/TLS inspection**\n- If SSL inspection is enabled, it may be causing certificate errors on some sites\n- Some sites use certificate pinning and break with SSL inspection\n- Check if excluding the site from SSL inspection fixes it",
|
||||
"next_node_id": "website_block_cause"
|
||||
},
|
||||
{
|
||||
"id": "website_block_cause",
|
||||
"type": "decision",
|
||||
"question": "Why is the website blocked?",
|
||||
"help_text": "Based on the block page and firewall logs",
|
||||
"options": [
|
||||
{"id": "content_filter", "label": "Content filter category block (intended)", "next_node_id": "fix_content_filter"},
|
||||
{"id": "wrong_category", "label": "Website is miscategorized by the filter", "next_node_id": "fix_miscategorized"},
|
||||
{"id": "ssl_inspection", "label": "SSL inspection causing certificate errors", "next_node_id": "fix_ssl_inspection"},
|
||||
{"id": "dns_filter", "label": "DNS-level filtering (DNS Security, Umbrella, etc.)", "next_node_id": "fix_dns_filter"}
|
||||
],
|
||||
"children": [
|
||||
{
|
||||
"id": "fix_content_filter",
|
||||
"type": "action",
|
||||
"title": "Handle Content Filter Block",
|
||||
"description": "Website is blocked by the content filter policy — this is working as designed.\n\n**If the user needs access for work:**\n1. Verify the business justification\n2. Get approval from the client's manager or IT decision-maker\n3. Options to allow access:\n\n**Option A: Allow the specific URL/domain** (recommended)\n- Firewall > Content Filter > Allow List\n- Add just the specific domain (not the entire category)\n\n**Option B: Allow for specific user/group only**\n- If the firewall supports user-based policies (most NGFWs do)\n- Create a policy for the user/group that allows the category\n\n**Option C: Allow the category** (least recommended)\n- Unblocking an entire category opens it for everyone\n- Only do this if the policy needs to change org-wide\n\n**Document:** Who approved the exception and the business justification.",
|
||||
"next_node_id": "solution_content_filter"
|
||||
},
|
||||
{
|
||||
"id": "solution_content_filter",
|
||||
"type": "solution",
|
||||
"title": "Resolved: Content Filter Exception Added",
|
||||
"description": "Website access granted via content filter exception.\n\n**Ticket Notes:** [URL] blocked by content filter (category: [category]). Business justification: [reason]. Approved by: [approver]. Added domain to [allow list / user-specific policy]. Access confirmed.\n\n**Important:** Document all exceptions for compliance and audit purposes. Review exceptions periodically."
|
||||
},
|
||||
{
|
||||
"id": "fix_miscategorized",
|
||||
"type": "action",
|
||||
"title": "Fix Miscategorized Website",
|
||||
"description": "The website is in the wrong content filter category.\n\n**Immediate fix:** Add the domain to the allow list so the user can work.\n\n**Submit a recategorization request:**\n- Most firewall vendors let you request a category change:\n - **Fortinet:** https://www.fortiguard.com/faq/wfrating\n - **SonicWall:** https://cfssupport.sonicwall.com/\n - **Sophos:** Submit through Sophos Central\n - **Palo Alto:** https://urlfiltering.paloaltonetworks.com/\n - **Cisco/OpenDNS:** https://community.opendns.com/domaintagging/\n\n- Recategorization usually takes 1-3 business days\n\n**After recategorization:** You can remove the manual allow list entry once the category is corrected.",
|
||||
"next_node_id": "solution_recategorized"
|
||||
},
|
||||
{
|
||||
"id": "solution_recategorized",
|
||||
"type": "solution",
|
||||
"title": "Resolved: Website Recategorization Submitted",
|
||||
"description": "Website miscategorized — added to allow list and submitted recategorization.\n\n**Ticket Notes:** [URL] miscategorized as [wrong category] instead of [correct category]. Added to allow list for immediate access. Recategorization request submitted to [vendor]. Will remove allow list entry once category is corrected.\n\n**Follow-up:** Check categorization in 3-5 business days."
|
||||
},
|
||||
{
|
||||
"id": "fix_ssl_inspection",
|
||||
"type": "action",
|
||||
"title": "Fix SSL Inspection Certificate Issues",
|
||||
"description": "SSL deep inspection is causing certificate errors on certain sites.\n\n**Why this happens:** The firewall intercepts HTTPS, re-signs the certificate with its own CA. Sites that use certificate pinning (banking, government, some apps) will reject the firewall's certificate.\n\n**Fix: Exclude the site from SSL inspection**\n1. Firewall > SSL Inspection policy\n2. Add the domain to the SSL inspection bypass/exclusion list\n\n**Common sites that need SSL inspection bypass:**\n- Banking and financial sites\n- Government sites\n- Microsoft 365 (Microsoft recommends bypassing)\n- Video conferencing (Teams, Zoom, WebEx)\n- Healthcare portals\n\n**If the issue is that the firewall's CA cert isn't trusted:**\n- Deploy the firewall's root CA certificate to all domain computers via GPO\n- Non-domain devices will show certificate warnings unless the CA is manually trusted",
|
||||
"next_node_id": "solution_ssl_fixed"
|
||||
},
|
||||
{
|
||||
"id": "solution_ssl_fixed",
|
||||
"type": "solution",
|
||||
"title": "Resolved: SSL Inspection Bypass Added",
|
||||
"description": "Certificate errors resolved by excluding the site from SSL inspection.\n\n**Ticket Notes:** [URL] showing certificate errors due to SSL deep inspection. Added to SSL inspection bypass list. Site now loads correctly.\n\n**If CA deployment is needed:** Deploy firewall root CA to all endpoints via GPO:\nComputer Config > Windows Settings > Security Settings > Public Key Policies > Trusted Root Certification Authorities"
|
||||
},
|
||||
{
|
||||
"id": "fix_dns_filter",
|
||||
"type": "action",
|
||||
"title": "Fix DNS-Level Filtering Block",
|
||||
"description": "Website is blocked at the DNS level (Cisco Umbrella, DNSFilter, Cloudflare Gateway, etc.).\n\n**How to identify DNS filtering:**\n- User gets a block page but it's from the DNS service, not the firewall\n- `nslookup` for the domain returns the DNS filter's block IP instead of the real IP\n```\nnslookup blocked-site.com\n```\n\n**To fix:**\n1. Log into the DNS filtering console (Umbrella, DNSFilter, etc.)\n2. Check the logs for the blocked domain\n3. Add to the allow list if it should be permitted\n\n**If you can't access the DNS filter console:** The DNS filter may be managed by a different team or MSP. Escalate.\n\n**Quick test:** Temporarily change the client's DNS to 8.8.8.8 to bypass DNS filtering and confirm the site works. (Change it back afterward!)",
|
||||
"next_node_id": "solution_dns_filter"
|
||||
},
|
||||
{
|
||||
"id": "solution_dns_filter",
|
||||
"type": "solution",
|
||||
"title": "Resolved: DNS Filter Exception Added",
|
||||
"description": "Website unblocked in DNS filtering service.\n\n**Ticket Notes:** [URL] blocked by [DNS filter service]. Added to allow list in [service name]. Access confirmed.\n\n**Note:** DNS filtering and firewall content filtering are separate layers. A site may need to be allowed in both if the org uses both."
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "check_app_block",
|
||||
"type": "action",
|
||||
"title": "Diagnose Application Connection Block",
|
||||
"description": "A desktop application can't connect to its server or service.\n\n**Step 1: Identify what the app needs**\n- What server/IP does it connect to?\n- What port(s) does it use?\n- Check the vendor's documentation for required ports and IPs\n\n**Step 2: Test connectivity**\n```\n# Test if the port is reachable\nTest-NetConnection -ComputerName server.example.com -Port 443\nTest-NetConnection -ComputerName server.example.com -Port 8080\n\n# Check if Windows Firewall is blocking\nGet-NetFirewallRule | Where-Object {$_.DisplayName -like '*AppName*'} | Select DisplayName, Enabled, Direction, Action\n```\n\n**Step 3: Check Windows Firewall first**\n- Windows Defender Firewall may be blocking the app independently from the network firewall\n- Check: Control Panel > Windows Defender Firewall > Allow an app\n- Temporarily disable Windows Firewall to test (re-enable immediately after)\n\n**Step 4: Check network firewall logs**\n- Search for the source IP (user's computer) in the firewall deny logs\n- Look at what destination IP and port is being blocked",
|
||||
"next_node_id": "app_block_source"
|
||||
},
|
||||
{
|
||||
"id": "app_block_source",
|
||||
"type": "decision",
|
||||
"question": "What is blocking the application?",
|
||||
"help_text": "Based on connectivity tests and firewall log analysis",
|
||||
"options": [
|
||||
{"id": "windows_fw", "label": "Windows Firewall is blocking it", "next_node_id": "fix_windows_firewall"},
|
||||
{"id": "network_fw", "label": "Network firewall is blocking the port/IP", "next_node_id": "fix_network_firewall_rule"},
|
||||
{"id": "both", "label": "Both firewalls need rules", "next_node_id": "fix_windows_firewall"},
|
||||
{"id": "not_firewall", "label": "Connectivity works — issue isn't firewall", "next_node_id": "solution_not_firewall"}
|
||||
],
|
||||
"children": [
|
||||
{
|
||||
"id": "fix_windows_firewall",
|
||||
"type": "action",
|
||||
"title": "Create Windows Firewall Rule",
|
||||
"description": "Windows Firewall is blocking the application.\n\n**Option 1: Allow the app through Windows Firewall**\n1. Control Panel > Windows Defender Firewall > Allow an app\n2. Click 'Change settings' > 'Allow another app'\n3. Browse to the application's .exe file\n4. Check Private and/or Domain as appropriate\n\n**Option 2: Create a port-based rule**\n```\n# Allow inbound on specific port\nNew-NetFirewallRule -DisplayName 'Allow MyApp' -Direction Inbound -Protocol TCP -LocalPort 8080 -Action Allow\n\n# Allow outbound on specific port\nNew-NetFirewallRule -DisplayName 'Allow MyApp Outbound' -Direction Outbound -Protocol TCP -RemotePort 443 -Action Allow\n```\n\n**Option 3: Deploy via Group Policy (for org-wide apps)**\nComputer Config > Windows Settings > Security Settings > Windows Defender Firewall with Advanced Security > Inbound/Outbound Rules\n\n**After adding the rule:** Test the application. If it still doesn't work, also check the network firewall.",
|
||||
"next_node_id": "windows_fw_result"
|
||||
},
|
||||
{
|
||||
"id": "windows_fw_result",
|
||||
"type": "decision",
|
||||
"question": "Did the Windows Firewall rule fix it?",
|
||||
"help_text": "Test the application after adding the rule",
|
||||
"options": [
|
||||
{"id": "yes", "label": "Yes, application works now", "next_node_id": "solution_windows_fw"},
|
||||
{"id": "no", "label": "No, still blocked — network firewall too", "next_node_id": "fix_network_firewall_rule"}
|
||||
],
|
||||
"children": [
|
||||
{
|
||||
"id": "solution_windows_fw",
|
||||
"type": "solution",
|
||||
"title": "Resolved: Windows Firewall Rule Added",
|
||||
"description": "Application connectivity restored after adding Windows Firewall rule.\n\n**Ticket Notes:** [Application] blocked by Windows Defender Firewall. Created [inbound/outbound] rule for [app/port]. Application confirmed working.\n\n**If this needs to be deployed org-wide:** Create the rule via Group Policy to push to all domain computers."
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "fix_network_firewall_rule",
|
||||
"type": "action",
|
||||
"title": "Create Network Firewall Rule",
|
||||
"description": "The network firewall needs a rule to allow the application's traffic.\n\n**Step 1: Gather the requirements**\n- Source: User's subnet or specific IP\n- Destination: Application server IP or FQDN\n- Port(s): TCP/UDP port numbers the app uses\n- Protocol: TCP, UDP, or both\n\n**Step 2: Check vendor documentation**\nAlways check the app vendor's docs for the complete list of required ports and IPs. Common apps:\n- RDP: TCP 3389\n- SQL Server: TCP 1433\n- HTTPS: TCP 443\n- SSH: TCP 22\n- FTP: TCP 20-21, passive ports\n- SIP/VoIP: UDP 5060-5061, RTP 10000-20000\n\n**Step 3: Create the rule**\nLog into the firewall and create an allow rule with the specific source, destination, ports, and protocol.\n\n**Step 4: Test and verify**\n```\nTest-NetConnection -ComputerName destination -Port port_number\n```\n\n**Best practice:** Use the most specific rule possible (exact IPs and ports). Avoid broad 'allow all' rules.",
|
||||
"next_node_id": "solution_network_fw_rule"
|
||||
},
|
||||
{
|
||||
"id": "solution_network_fw_rule",
|
||||
"type": "solution",
|
||||
"title": "Resolved: Network Firewall Rule Created",
|
||||
"description": "Application connectivity restored after creating firewall rule.\n\n**Ticket Notes:** [Application] blocked by network firewall. Created rule: Source [IP/subnet] → Destination [IP/FQDN] Port [ports] [TCP/UDP]. Application confirmed working.\n\n**Documentation:** Record the rule in the client's firewall change log with business justification and approval."
|
||||
},
|
||||
{
|
||||
"id": "solution_not_firewall",
|
||||
"type": "solution",
|
||||
"title": "Not a Firewall Issue",
|
||||
"description": "Connectivity test succeeded — the firewall is not blocking the traffic.\n\n**Ticket Notes:** Application [name] not connecting. Firewall ruled out — port test to [destination:port] succeeds. Issue is likely:\n- Application configuration (wrong server address, credentials)\n- Server-side issue (service down, certificate expired)\n- DNS resolution (app resolving to wrong IP)\n- Application-level authentication failure\n\n**Next steps:** Troubleshoot at the application level."
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "check_port_block",
|
||||
"type": "action",
|
||||
"title": "Test and Fix Specific Port Block",
|
||||
"description": "A specific port or service needs to be opened.\n\n**Step 1: Confirm the port is actually blocked**\n```\n# Test TCP port\nTest-NetConnection -ComputerName target_ip -Port port_number\n\n# If the above isn't available, use telnet:\ntelnet target_ip port_number\n\n# Check what's listening locally\nnetstat -an | findstr :port_number\n```\n\n**Step 2: Determine WHERE the block is**\n1. Test from the server itself (is the service even listening?)\n2. Test from the same subnet (is it a Windows Firewall issue?)\n3. Test from a different subnet (is it the network firewall?)\n4. Test from outside the network (is it the edge firewall?)\n\n**Step 3: Common port requirements by service**\n- HTTP/HTTPS: 80, 443\n- RDP: 3389\n- SSH: 22\n- DNS: 53 (TCP+UDP)\n- SMTP: 25, 587\n- IMAP: 143, 993\n- FTP: 20-21 + passive range\n- SMB: 445\n- SQL: 1433\n- MySQL: 3306\n- PostgreSQL: 5432",
|
||||
"next_node_id": "app_block_source"
|
||||
},
|
||||
{
|
||||
"id": "check_vpn_block",
|
||||
"type": "action",
|
||||
"title": "Troubleshoot VPN Blocked by Firewall",
|
||||
"description": "VPN connection can't establish through the firewall.\n\n**Identify the VPN type and required ports:**\n\n**IPSec VPN:**\n- UDP 500 (IKE)\n- UDP 4500 (NAT Traversal)\n- Protocol 50 (ESP) — note: this is an IP protocol, not a port\n\n**SSL VPN / OpenVPN:**\n- TCP or UDP 443 (most common)\n- Or custom port (check VPN server config)\n\n**WireGuard:**\n- UDP 51820 (default)\n\n**L2TP/IPSec:**\n- UDP 500, UDP 4500, UDP 1701, Protocol 50\n\n**PPTP (legacy, avoid):**\n- TCP 1723, Protocol 47 (GRE)\n\n**Step 1: Check which ports are needed** based on the VPN type above.\n\n**Step 2: Test if the port is reachable**\n```\nTest-NetConnection -ComputerName vpn_server -Port 443\n```\n\n**Step 3: Check both directions**\n- Outbound: Is the user's firewall allowing outbound VPN traffic?\n- Inbound: Is the VPN server's firewall allowing inbound connections?\n- NAT: Is port forwarding configured correctly for the VPN server?\n\n**Step 4: Check for ISP blocking**\nSome ISPs and hotel/public Wi-Fi block VPN protocols. Try port 443 (usually open everywhere).",
|
||||
"next_node_id": "vpn_block_result"
|
||||
},
|
||||
{
|
||||
"id": "vpn_block_result",
|
||||
"type": "decision",
|
||||
"question": "Where is the VPN being blocked?",
|
||||
"help_text": "Based on port tests and firewall log analysis",
|
||||
"options": [
|
||||
{"id": "outbound_fw", "label": "User's network firewall blocking outbound VPN", "next_node_id": "fix_network_firewall_rule"},
|
||||
{"id": "inbound_fw", "label": "VPN server firewall blocking inbound", "next_node_id": "fix_vpn_inbound"},
|
||||
{"id": "nat_issue", "label": "NAT or port forwarding not configured", "next_node_id": "fix_vpn_nat"},
|
||||
{"id": "isp_block", "label": "ISP or public Wi-Fi blocking VPN protocols", "next_node_id": "solution_isp_vpn_block"}
|
||||
],
|
||||
"children": [
|
||||
{
|
||||
"id": "fix_vpn_inbound",
|
||||
"type": "action",
|
||||
"title": "Fix VPN Server Inbound Firewall",
|
||||
"description": "The firewall in front of the VPN server is blocking incoming VPN connections.\n\n**Create the inbound rule:**\nBased on VPN type, allow the required ports/protocols inbound to the VPN server's internal IP.\n\n**For IPSec:** Allow UDP 500, UDP 4500, and IP Protocol 50 to the VPN server.\n\n**For SSL VPN:** Allow TCP 443 (or the custom port) to the VPN server.\n\n**Check NAT:** If the VPN server is behind NAT, port forwarding must be configured (see NAT fix).\n\n**After creating the rule:** Test the VPN connection from outside the network.",
|
||||
"next_node_id": "solution_vpn_inbound"
|
||||
},
|
||||
{
|
||||
"id": "solution_vpn_inbound",
|
||||
"type": "solution",
|
||||
"title": "Resolved: VPN Server Firewall Rule Added",
|
||||
"description": "VPN connections now working after adding inbound firewall rule.\n\n**Ticket Notes:** VPN connections blocked by firewall in front of VPN server. Created inbound rules for [ports/protocols] to [VPN server IP]. VPN confirmed working from external network."
|
||||
},
|
||||
{
|
||||
"id": "fix_vpn_nat",
|
||||
"type": "action",
|
||||
"title": "Fix VPN NAT / Port Forwarding",
|
||||
"description": "VPN server is behind NAT and port forwarding isn't configured.\n\n**Configure port forwarding:**\n1. Log into the edge firewall/router\n2. Create port forwarding rules:\n - External port → Internal VPN server IP : Internal port\n\n**For IPSec behind NAT:**\n- Forward UDP 500 and UDP 4500 to the VPN server\n- NAT-Traversal (NAT-T) must be enabled on both ends\n- Note: Multiple IPSec VPNs behind the same NAT can cause issues\n\n**For SSL VPN behind NAT:**\n- Forward TCP 443 to the VPN server\n- If port 443 is already used by something else, use a different port and update the VPN client config\n\n**Important:** Only ONE device can receive forwarded traffic for a given port. If 443 is forwarded to a web server, the SSL VPN needs a different port.",
|
||||
"next_node_id": "solution_vpn_nat"
|
||||
},
|
||||
{
|
||||
"id": "solution_vpn_nat",
|
||||
"type": "solution",
|
||||
"title": "Resolved: VPN Port Forwarding Configured",
|
||||
"description": "VPN connectivity restored after configuring NAT/port forwarding.\n\n**Ticket Notes:** VPN server behind NAT at [public IP]. Configured port forwarding: [external port] → [internal IP:port]. VPN confirmed working.\n\n**Document:** Record the port forwarding rule in the client's network documentation."
|
||||
},
|
||||
{
|
||||
"id": "solution_isp_vpn_block",
|
||||
"type": "solution",
|
||||
"title": "ISP or Public Wi-Fi Blocking VPN",
|
||||
"description": "The user's ISP or public Wi-Fi is blocking VPN protocols.\n\n**Ticket Notes:** VPN blocked by [ISP / hotel Wi-Fi / public network]. Standard VPN ports are filtered.\n\n**Workarounds:**\n1. Switch VPN to port 443 (TCP) — almost never blocked because it looks like HTTPS\n2. Use SSL VPN instead of IPSec if available\n3. Use a mobile hotspot instead of the public Wi-Fi\n4. Some VPN clients support stealth/obfuscation modes\n\n**If the VPN server supports it:** Configure an alternative listener on TCP 443 for users in restrictive networks."
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "diagnose_firewall_vs_other",
|
||||
"type": "action",
|
||||
"title": "Determine If the Firewall Is the Problem",
|
||||
"description": "Not sure if the firewall is causing the issue. Let's find out.\n\n**Quick test: Is it the firewall?**\n\n**Test 1: Check firewall deny logs**\nSearch the firewall's deny/drop log for the user's IP address in the last hour. If you see blocked traffic, the firewall is involved.\n\n**Test 2: Test from inside vs outside the firewall**\n- Can the user reach the resource from the same subnet? (bypasses the firewall)\n- If it works from the same subnet, the firewall is likely involved\n\n**Test 3: Temporarily create a broad allow rule** (for testing ONLY)\n- Allow all traffic from the user's IP to the destination\n- If it works: firewall is the issue — now narrow down which specific port/protocol is needed\n- **Remove the broad rule immediately after testing**\n\n**Test 4: Check Windows Firewall too**\n```\n# Temporarily disable Windows Firewall to test\nSet-NetFirewallProfile -Profile Domain,Public,Private -Enabled False\n# TEST NOW — then immediately re-enable:\nSet-NetFirewallProfile -Profile Domain,Public,Private -Enabled True\n```\n\n**If none of these point to the firewall:** The issue is likely DNS, application configuration, server-side, or authentication.",
|
||||
"next_node_id": "firewall_diagnosis_result"
|
||||
},
|
||||
{
|
||||
"id": "firewall_diagnosis_result",
|
||||
"type": "decision",
|
||||
"question": "Is the firewall causing the problem?",
|
||||
"help_text": "Based on the tests above",
|
||||
"options": [
|
||||
{"id": "windows_fw", "label": "Yes — Windows Firewall is blocking", "next_node_id": "fix_windows_firewall"},
|
||||
{"id": "network_fw", "label": "Yes — Network firewall is blocking", "next_node_id": "check_app_block"},
|
||||
{"id": "not_fw", "label": "No — Firewall isn't the issue", "next_node_id": "solution_not_firewall"}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
1113
backend/scripts/seed_trees_v2.py
Normal file
1113
backend/scripts/seed_trees_v2.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user