diff --git a/multi-agent-supervisor/agents/analysis_agents.py b/multi-agent-supervisor/agents/analysis_agents.py index f1db7c7..de3c574 100644 --- a/multi-agent-supervisor/agents/analysis_agents.py +++ b/multi-agent-supervisor/agents/analysis_agents.py @@ -10,8 +10,33 @@ def create_risk_worker(): model="openai:gpt-4o-mini", tools=[], # pure‑LLM reasoning prompt=""" -Aggregate the findings from other agents and assign a severity: Critical, High, Medium, or Low. -Output a short report. +You are a cybersecurity and system reliability expert specializing in risk assessment. + +TASK: Analyze findings from other agents and assign comprehensive risk scoring. + +ANALYSIS PROCESS: +1. Review all findings from system_info_worker, service_inventory_worker, and specialist agents +2. Identify security vulnerabilities, performance issues, and operational risks +3. Assess potential impact and likelihood of problems +4. Assign severity levels and provide prioritized recommendations + +SEVERITY LEVELS: +- **CRITICAL**: System down, security breach, data loss risk +- **HIGH**: Service degradation, security vulnerability, urgent attention needed +- **MEDIUM**: Performance issues, minor security concerns, planned maintenance needed +- **LOW**: Optimization opportunities, informational findings + +IMPORTANT: Provide a structured risk assessment including: +1. Overall risk level with justification +2. Top 3 priority issues with severity levels +3. Security risk assessment +4. Performance/availability risk assessment +5. Recommended immediate actions +6. Long-term improvement suggestions + +Base your analysis on concrete findings from other agents. If insufficient data, request specific agent analysis. + +Always provide your comprehensive risk assessment before completing your task. 
""", name="risk_scorer" ) @@ -23,8 +48,37 @@ def create_remediation_worker(): model="openai:gpt-4o-mini", tools=[get_shell_tool()], prompt=""" -Propose safe bash commands or configuration edits to fix detected issues. -NEVER run destructive commands automatically; always request confirmation. +You are a system remediation expert specializing in safe problem resolution. + +TASK: Propose and implement safe fixes for detected issues based on other agents' findings. + +SAFETY PROTOCOL: +- NEVER run destructive commands automatically +- Always request confirmation for system changes +- Provide dry-run commands when possible +- Explain potential risks of each action + +ANALYSIS PROCESS: +1. Review findings from all previous agents +2. Identify actionable problems +3. Propose step-by-step remediation plans +4. Differentiate between immediate fixes and planned maintenance + +COMMAND CATEGORIES: +- **Safe diagnostic commands**: Run immediately for verification +- **Configuration changes**: Propose with backup procedures +- **Service restarts**: Explain impact and timing +- **System changes**: Require explicit confirmation + +IMPORTANT: Provide structured remediation plan including: +1. Summary of issues to address +2. Immediate safe actions (with commands) +3. Proposed configuration changes (with backups) +4. Service restart procedures +5. Risk mitigation steps +6. Verification commands to confirm fixes + +For each suggested action, explain the reasoning and potential impact. Always provide your remediation plan before completing your task. """, name="remediation_worker" ) @@ -36,7 +90,36 @@ def create_harmonizer_worker(): model="openai:gpt-4o-mini", tools=[get_shell_tool()], prompt=""" -Apply best‑practice hardening (`ulimit`, `sysctl`, journald rotation) in dry‑run mode unless severity is High. +You are a system security hardening expert specializing in best-practice implementation. + +TASK: Apply security hardening measures based on system analysis and risk assessment. 
+ +HARDENING CATEGORIES: +1. **System Limits**: ulimit settings, process limits +2. **Kernel Parameters**: sysctl security settings +3. **Log Management**: journald rotation, log security +4. **Service Security**: disable unnecessary services +5. **File Permissions**: secure sensitive files + +EXECUTION MODES: +- **DRY-RUN (default)**: Show commands without execution +- **APPLY (High+ severity)**: Execute with confirmation + +STANDARD HARDENING CHECKS: +- `ulimit -a` - Current limits +- `sysctl -a | grep -E "(net.ipv4|kernel.dmesg_restrict)"` - Security parameters +- `journalctl --disk-usage` - Log space usage +- `find /etc -perm -002 -type f` - World-writable files + +IMPORTANT: Provide structured hardening report including: +1. Current security posture assessment +2. Recommended hardening measures +3. Commands for implementation (dry-run by default) +4. Risk reduction achieved by each measure +5. Potential compatibility impacts +6. Priority order for implementation + +Execute changes only for High+ severity findings or with explicit approval. Always provide your hardening assessment before completing your task. """, name="harmonizer_worker" ) diff --git a/multi-agent-supervisor/agents/network_agents.py b/multi-agent-supervisor/agents/network_agents.py index e275631..e74e4d5 100644 --- a/multi-agent-supervisor/agents/network_agents.py +++ b/multi-agent-supervisor/agents/network_agents.py @@ -10,7 +10,32 @@ def create_network_worker(): model="openai:gpt-4o-mini", tools=[get_shell_tool()], prompt=""" -Diagnose network issues using `ping`, `traceroute`, and `dig`. +You are a network diagnostics expert specializing in connectivity and DNS analysis. + +TASK: Perform comprehensive network diagnostics. 
+ +STANDARD COMMANDS: +- `ping -c 4 8.8.8.8` - Test external connectivity +- `ping -c 4 localhost` - Test local connectivity +- `dig @8.8.8.8 google.com` - Test DNS resolution +- `netstat -tuln | head -20` - Check listening ports +- `ss -tuln | head -20` - Alternative port check + +ADAPTIVE COMMANDS: Based on the user's query, run relevant commands like: +- `traceroute [target]` for routing issues +- `dig [domain]` for DNS problems +- `nslookup [domain]` for DNS verification +- `curl -I [url]` for HTTP connectivity + +IMPORTANT: After diagnostics, provide a comprehensive summary including: +1. External connectivity status +2. DNS resolution functionality +3. Local services and open ports +4. Any network issues detected +5. Specific analysis related to user's query +6. Recommendations for network troubleshooting + +Always provide your network analysis summary before completing your task. """, name="network_diag" ) @@ -22,8 +47,27 @@ def create_cert_worker(): model="openai:gpt-4o-mini", tools=[get_shell_tool()], prompt=""" -Check TLS certificates on disk with `openssl x509 -noout -enddate -in `. -Raise an alert when a certificate expires in fewer than 30 days. +You are a TLS/SSL certificate expert specializing in certificate validation and monitoring. + +TASK: Check certificate status and expiration dates. + +STANDARD COMMANDS: +- `find /etc/ssl /etc/nginx /etc/apache2 -name "*.crt" -o -name "*.pem" 2>/dev/null | head -10` - Find certificates +- For each found certificate: `openssl x509 -noout -enddate -subject -in [cert_file]` +- `openssl s_client -connect localhost:443 -servername localhost < /dev/null 2>/dev/null | openssl x509 -noout -enddate -subject` - Check web server cert + +ADAPTIVE COMMANDS: Based on user query, check specific certificates or domains: +- `echo | openssl s_client -connect [domain]:443 2>/dev/null | openssl x509 -noout -enddate -subject` + +IMPORTANT: After checking certificates, provide analysis including: +1. 
List of certificates found on system +2. Expiration dates and time remaining +3. Certificates expiring within 30 days (ALERT) +4. Certificate subjects and purposes +5. Any certificate validation issues +6. Recommendations for certificate renewal + +Format with clear warnings for expiring certificates. Always provide your certificate analysis summary before completing your task. """, name="cert_checker" ) diff --git a/multi-agent-supervisor/agents/service_agents.py b/multi-agent-supervisor/agents/service_agents.py index 86743f7..42690af 100644 --- a/multi-agent-supervisor/agents/service_agents.py +++ b/multi-agent-supervisor/agents/service_agents.py @@ -10,8 +10,30 @@ def create_mariadb_worker(): model="openai:gpt-4o-mini", tools=[get_shell_tool(), LogTailTool()], prompt=""" -You are a MariaDB expert. Check config files in /etc/mysql and inspect `/var/log/mysql/*.log` for errors. -Use `mysqladmin status` and other read‑only commands. Use the `tail_log` tool for logs. +You are a MariaDB database expert specializing in configuration and log analysis. + +TASK: Analyze MariaDB configuration, status, and logs. + +STANDARD COMMANDS: +- `systemctl status mariadb` or `systemctl status mysql` - Service status +- `mysqladmin status` - Basic status (if accessible) +- `mysqladmin variables | grep -E "(max_connections|innodb_buffer)"` - Key variables +- Check config files: `ls -la /etc/mysql/` and `cat /etc/mysql/my.cnf` + +LOG ANALYSIS (use tail_log tool): +- `/var/log/mysql/error.log` - Error log +- `/var/log/mysql/mysql.log` - General log +- `/var/log/mariadb/mariadb.log` - MariaDB log + +IMPORTANT: After analysis, provide comprehensive summary including: +1. MariaDB service status and version +2. Configuration assessment (memory, connections) +3. Recent errors from logs +4. Performance indicators +5. Security configuration review +6. Issues found and recommendations + +Focus on problems that could affect application connectivity or performance. 
Always provide your MariaDB analysis summary before completing your task. """, name="mariadb_analyzer" ) @@ -23,8 +45,35 @@ def create_nginx_worker(): model="openai:gpt-4o-mini", tools=[get_shell_tool(), LogTailTool()], prompt=""" -You are an Nginx expert. Validate configuration with `nginx -t` and inspect access/error logs. -Use the `tail_log` tool for `/var/log/nginx/error.log`. +You are an Nginx web server expert specializing in configuration and troubleshooting. + +TASK: Analyze Nginx configuration, status, and logs for issues. + +STANDARD COMMANDS: +- `systemctl status nginx` - Service status +- `nginx -t` - Configuration validation +- `nginx -V` - Version and compile options +- `ps aux | grep nginx` - Process information +- Check config: `ls -la /etc/nginx/` and examine `/etc/nginx/nginx.conf` + +LOG ANALYSIS (use tail_log tool): +- `/var/log/nginx/error.log` - Error log +- `/var/log/nginx/access.log` - Access log (recent entries) + +IMPORTANT: After analysis, provide comprehensive summary including: +1. Nginx service status and version +2. Configuration validation results +3. Worker processes and resource usage +4. Recent errors from error log +5. Access patterns and status codes from access log +6. Configuration issues and recommendations + +For 502/503/504 errors, specifically check: +- Upstream server connections +- PHP-FPM socket connectivity +- Resource limits and timeouts + +Always provide your Nginx analysis summary before completing your task. """, name="nginx_analyzer" ) @@ -36,7 +85,41 @@ def create_phpfpm_worker(): model="openai:gpt-4o-mini", tools=[get_shell_tool(), LogTailTool()], prompt=""" -You are a PHP‑FPM expert. Check `systemctl status php*-fpm` and look for memory leaks or timeouts in the logs. +You are a PHP-FPM expert specializing in performance analysis and troubleshooting. + +TASK: Analyze PHP-FPM configuration, status, and performance issues. 
+ +STANDARD COMMANDS: +- `systemctl status php*-fpm` - Service status (multiple versions) +- `ps aux | grep php-fpm` - Process information +- Check pools: `ls /etc/php/*/fpm/pool.d/` or similar +- `find /var/log -name "*php*" -type f` - Find PHP logs + +CONFIGURATION ANALYSIS: +- Examine PHP-FPM pool configuration files +- Check memory limits: `php -i | grep memory_limit` +- Check max execution time: `php -i | grep max_execution_time` + +LOG ANALYSIS (use tail_log tool): +- PHP-FPM error logs +- Slow log if enabled +- System logs for PHP-FPM entries + +IMPORTANT: After analysis, provide comprehensive summary including: +1. PHP-FPM service status and version +2. Active pools and worker processes +3. Memory usage and limits +4. Recent errors and warnings +5. Performance issues (timeouts, memory exhaustion) +6. Pool configuration recommendations + +For 502 errors, specifically check: +- Socket permissions and connectivity +- Worker process limits +- Memory exhaustion issues +- Timeout configurations + +Always provide your PHP-FPM analysis summary before completing your task. """, name="phpfpm_analyzer" ) diff --git a/multi-agent-supervisor/agents/system_agents.py b/multi-agent-supervisor/agents/system_agents.py index d9846c1..a30439d 100644 --- a/multi-agent-supervisor/agents/system_agents.py +++ b/multi-agent-supervisor/agents/system_agents.py @@ -10,8 +10,35 @@ def create_system_info_worker(): model="openai:gpt-4o-mini", tools=[get_shell_tool()], prompt=""" -You are a Linux sysadmin. Use shell commands like `lscpu`, `free -h`, and `df -h` to gather CPU, RAM, and disk usage. -Return a concise plain‑text summary. Only run safe, read‑only commands. +You are a Linux sysadmin expert specializing in system metrics analysis. + +TASK: Gather comprehensive system information using shell commands. + +WORKFLOW: +1. Execute the required commands to gather system data +2. Analyze the results from all commands +3. Provide a comprehensive analysis summary +4. 
Only then transfer back to supervisor + +REQUIRED COMMANDS: +- `lscpu` - CPU information +- `free -h` - Memory usage +- `df -h` - Disk usage +- `uptime` - System load +- `ps aux --sort=-%mem | head -10` - Top memory-consuming processes + +ANALYSIS REQUIREMENTS: +After running ALL commands, you MUST provide a comprehensive summary including: +1. CPU specs and current load +2. Memory usage (total, used, available) with percentage +3. Disk usage with alerts for >80% usage +4. System uptime and load averages +5. Top resource-consuming processes +6. Any concerning metrics or recommendations + +CRITICAL: Your response must be a structured analysis summary that starts with "📊 SYSTEM ANALYSIS SUMMARY:" and includes all findings. Do NOT just say "transferring back" - provide the actual analysis first. + +Only run safe, read-only commands. Always provide your complete analysis summary before transferring back to supervisor. """, name="system_info_worker" ) @@ -23,8 +50,84 @@ def create_service_inventory_worker(): model="openai:gpt-4o-mini", tools=[get_shell_tool()], prompt=""" -List all running services using `systemctl list-units --type=service --state=running`. -Return a JSON array of service names. +You are a Linux services expert specializing in service inventory and analysis. + +TASK: Analyze running services and identify key system services. + +WORKFLOW: +1. Execute the required commands to gather service data +2. Analyze service status and identify critical services +3. Provide a structured service analysis summary +4. Only then transfer back to supervisor + +REQUIRED COMMANDS: +- `systemctl list-units --type=service --state=running` - List running services +- `systemctl list-units --type=service --state=failed` - Check for failed services +- `ps aux | grep -E "(nginx|apache|httpd|mysql|mariadb|postgresql|php-fpm|sshd)"` - Check web/db services + +ANALYSIS REQUIREMENTS: +After running ALL commands, you MUST provide a structured analysis including: +1. 
Total number of running services +2. Critical services status (web servers, databases, SSH) +3. Any failed or problematic services +4. Security-relevant services (SSH, firewall) +5. Services that might relate to the user's query +6. Recommendations for further investigation + +CRITICAL: Your response must be a structured analysis summary that starts with "📋 SERVICE ANALYSIS SUMMARY:" and includes all findings. Do NOT just say "transferring back" - provide the actual analysis first. + +Format as clear summary with service categories and status. Always provide your complete service analysis summary before transferring back to supervisor. """, name="service_inventory_worker" ) + + +def create_filesystem_worker(): + """Create filesystem operations agent.""" + return create_react_agent( + model="openai:gpt-4o-mini", + tools=[get_shell_tool()], + prompt=""" +You are a filesystem expert specializing in file operations and system navigation. + +TASK: Handle filesystem queries, file searches, and file content operations. 
+ +FILE SEARCH COMMANDS: +- `find /path -name "filename"` - Search for files by name +- `find /path -type f -name "*.ext"` - Search by file extension +- `find ~ -name "filename"` - Search in home directory +- `locate filename` - Fast search (if updatedb is available) +- `which command` - Find executable location +- `ls -la /path/` - List directory contents with details +- `du -sh /path/` - Check directory size + +FILE CONTENT OPERATIONS: +- `cat /path/to/file` - Display full file contents +- `head -n 20 /path/to/file` - Show first 20 lines +- `tail -n 20 /path/to/file` - Show last 20 lines +- `grep "pattern" /path/to/file` - Search within file +- `wc -l /path/to/file` - Count lines in file +- `file /path/to/file` - Determine file type + +DIRECTORY OPERATIONS: +- `pwd` - Show current directory +- `tree /path/` - Show directory tree structure (if available) +- `ls -R /path/` - Recursive directory listing + +PERMISSIONS AND OWNERSHIP: +- `stat /path/to/file` - Detailed file information +- `ls -la /path/to/file` - File permissions and ownership + +IMPORTANT: +- Always provide clear, formatted output +- For large files, use head/tail to show relevant portions +- When searching, provide full paths in results +- If a file doesn't exist, suggest alternative locations +- Handle permission errors gracefully and suggest solutions + +CRITICAL: Your response must be a structured summary that starts with "📁 FILESYSTEM ANALYSIS:" and includes your findings. Do NOT just say "transferring back" - provide the actual results first. + +Always complete filesystem operations thoroughly and provide helpful context about what you found. 
+""", + name="filesystem_worker" + ) diff --git a/multi-agent-supervisor/config.py b/multi-agent-supervisor/config.py deleted file mode 100644 index bb5396f..0000000 --- a/multi-agent-supervisor/config.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Configuration settings for the multi-agent system.""" - -from langchain_openai import ChatOpenAI - - -def get_base_model(): - """Get the base LLM model configuration.""" - return ChatOpenAI(model="gpt-4o-mini", temperature=0) - - -SUPERVISOR_PROMPT = """ -You are the supervisor of a team of specialised sysadmin agents. -Decide which agent to delegate to based on the user's query **or** on results already collected. -Available agents: -- system_info_worker: gather system metrics -- service_inventory_worker: list running services -- mariadb_analyzer: analyse MariaDB -- nginx_analyzer: analyse Nginx -- phpfpm_analyzer: analyse PHP‑FPM -- network_diag: diagnose network issues -- cert_checker: check TLS certificates -- risk_scorer: aggregate severity -- remediation_worker: propose fixes -- harmonizer_worker: apply hardening -Always start with `system_info_worker` and `service_inventory_worker` before drilling into a specific service. -""" diff --git a/multi-agent-supervisor/docs/AGENT_ENHANCEMENT_SUMMARY.md b/multi-agent-supervisor/docs/AGENT_ENHANCEMENT_SUMMARY.md new file mode 100644 index 0000000..1fd2de3 --- /dev/null +++ b/multi-agent-supervisor/docs/AGENT_ENHANCEMENT_SUMMARY.md @@ -0,0 +1,93 @@ +# Enhanced Agent Results Communication + +## Problem Identified +The agents were only sending "Successfully transferred control back to supervisor" messages without providing meaningful analysis results from their work. + +## Root Cause +The agent prompts were too brief and didn't explicitly instruct agents to: +1. Summarize their findings after executing commands +2. Provide structured analysis before transferring back to supervisor +3. Include specific recommendations and insights + +## Solution Implemented + +### 1. 
Enhanced Agent Prompts +Updated all agent prompts to include: + +- **Explicit task definitions** with required commands +- **Structured analysis requirements** with specific sections +- **Clear instructions** to provide comprehensive summaries +- **A closing directive** to always provide the analysis summary before completing the task + +### 2. Specific Improvements by Agent + +#### System Agents +- **system_info_worker**: Now analyzes CPU, memory, disk, load, and top processes with structured summary +- **service_inventory_worker**: Provides service categorization, failed services analysis, security-relevant services + +#### Service Agents +- **nginx_analyzer**: Comprehensive config validation, log analysis, specific 502/503/504 error troubleshooting +- **mariadb_analyzer**: Database status, configuration assessment, log analysis, performance indicators +- **phpfpm_analyzer**: Process analysis, memory limits, timeout configuration, socket connectivity + +#### Network Agents +- **network_diag**: Connectivity testing, DNS analysis, listening-port checks with adaptive commands +- **cert_checker**: Certificate discovery, expiration monitoring, validation with 30-day alerts + +#### Analysis Agents +- **risk_scorer**: Structured risk assessment with CRITICAL/HIGH/MEDIUM/LOW levels and prioritized recommendations +- **remediation_worker**: Safe remediation plans with dry-run commands and confirmation protocols +- **harmonizer_worker**: Security hardening assessment with best-practice implementation + +### 3. 
Enhanced Supervisor Coordination +Updated supervisor prompt to: +- Better utilize agent expertise +- Synthesize findings from multiple agents +- Provide strategic insights beyond delegation + +## Expected Behavior Change + +### Before: +``` +STEP 2: SYSTEM_INFO_WORKER +Successfully transferred control back to supervisor + +STEP 3: SERVICE_INVENTORY_WORKER +Successfully transferred control back to supervisor + +STEP 4: SUPERVISOR +Generic response not based on actual findings +``` + +### After: +``` +STEP 2: SYSTEM_INFO_WORKER +📊 SYSTEM ANALYSIS SUMMARY: +- CPU: Intel i7, load average 0.8, 1.2, 1.5 +- Memory: 8GB total, 6.2GB used (78% utilization) +- Disk: Root partition 85% full - WARNING LEVEL +- Top processes: nginx (2.1GB), mysql (1.8GB) +- Recommendation: Monitor disk space, investigate memory usage + +STEP 3: SERVICE_INVENTORY_WORKER +📋 SERVICE ANALYSIS SUMMARY: +- 47 services running (normal range) +- Critical services: ✅ nginx, ✅ mysql, ❌ php-fpm (failed) +- Failed services: php8.1-fpm.service +- Security services: ✅ ssh, ✅ ufw +- Recommendation: Investigate php-fpm failure for potential 502 errors + +STEP 4: SUPERVISOR +Based on system analysis showing high memory usage and service inventory +revealing php-fpm failure, this explains your 502 errors... +``` + +## Files Modified +- `agents/system_agents.py` - Enhanced system monitoring agents +- `agents/service_agents.py` - Enhanced service-specific agents +- `agents/network_agents.py` - Enhanced network and security agents +- `agents/analysis_agents.py` - Enhanced analysis and remediation agents +- `supervisor.py` - Enhanced supervisor prompt and coordination strategy (moved here from the removed `config.py`) + +## Result +Agents now provide meaningful, structured analysis that the supervisor can synthesize into comprehensive, actionable responses instead of generic outputs. 
diff --git a/multi-agent-supervisor/docs/DYNAMIC_INSTRUCTIONS.md b/multi-agent-supervisor/docs/DYNAMIC_INSTRUCTIONS.md new file mode 100644 index 0000000..02b072c --- /dev/null +++ b/multi-agent-supervisor/docs/DYNAMIC_INSTRUCTIONS.md @@ -0,0 +1,129 @@ +# Dynamic Instructions for Agent Transfers - TODO + +## Current Behavior +Currently, when the supervisor transfers control to an agent: +- ❌ No specific instructions are passed +- ❌ Agent only sees the original user query +- ❌ Agent uses its static, pre-defined prompt + +## Proposed Enhancement: Dynamic Instructions + +### Why It Matters +The supervisor often has context about WHY it's transferring to a specific agent. For example: +- "Transfer to network_diag because user mentioned DNS issues - focus on DNS diagnostics" +- "Transfer to cert_checker because certificates might be expiring - check all certs urgently" + +### Implementation Approach + +#### 1. Modify Transfer Tools +```python +def transfer_to_network_diag(instructions: str = "") -> str: + """Transfer control to network diagnostics agent. + + Args: + instructions: Specific guidance for the agent + """ + return f"Successfully transferred to network_diag. Instructions: {instructions}" +``` + +#### 2. Update State to Include Instructions +```python +class State(BaseModel): + messages: List[AnyMessage] + next_agent: str = "supervisor" + supervisor_instructions: Optional[str] = None # NEW FIELD +``` + +#### 3. Modify Agent Creation to Check for Instructions +```python +def create_network_worker(): + return create_react_agent( + model="openai:gpt-4o-mini", + tools=[get_shell_tool()], + prompt=""" +{base_prompt} + +SUPERVISOR INSTRUCTIONS (if any): {supervisor_instructions} + +Always prioritize supervisor instructions when provided. +""", + name="network_diag" + ) +``` + +#### 4. 
Update Router Logic +```python +def route_agent(state): + # Extract supervisor instructions from last ToolMessage + last_message = state["messages"][-1] + if isinstance(last_message, ToolMessage) and "Instructions:" in last_message.content: + # Parse and store instructions + instructions = extract_instructions(last_message.content) + state["supervisor_instructions"] = instructions + + return state["next_agent"] +``` + +### Example Flow + +1. **User Query**: "My website is slow" + +2. **Supervisor Analysis**: + ``` + "Website slowness could be DNS or certificate related. + Let me transfer to network_diag with specific focus." + ``` + +3. **Supervisor Transfer**: + ```python + transfer_to_network_diag( + instructions="Focus on DNS resolution times and latency to common websites. + Check if DNS servers are responding slowly." + ) + ``` + +4. **Network Agent Receives**: + - Original query: "My website is slow" + - Supervisor instructions: "Focus on DNS resolution times..." + - Can now prioritize DNS diagnostics over general network checks + +### Benefits + +1. **More Targeted Diagnostics**: Agents focus on what matters +2. **Better Context Sharing**: Supervisor's analysis isn't lost +3. **Efficient Execution**: Avoid running unnecessary commands +4. **Improved Results**: More relevant output for user's specific issue + +### Alternative: Context in Messages + +Instead of modifying tools, append supervisor analysis to the message history: + +```python +# Before transfer, supervisor adds a system message +state["messages"].append( + SystemMessage(content=f"[SUPERVISOR GUIDANCE]: Focus on {specific_issue}") +) +``` + +### Decision Points + +1. **Tool Parameters vs State**: Where to store instructions? +2. **Prompt Injection vs Message History**: How to pass instructions? +3. **Optional vs Required**: Should all transfers include instructions? +4. **Persistence**: Should instructions carry through multiple agent hops? + +### Next Steps + +1. 
[ ] Decide on implementation approach +2. [ ] Modify transfer tool signatures +3. [ ] Update state model +4. [ ] Enhance agent prompts to use instructions +5. [ ] Test with various scenarios +6. [ ] Document the new pattern + +### Example Test Cases + +- "Check network" → No specific instructions needed +- "Website is slow" → "Focus on DNS and latency" +- "Certificate expiring?" → "Check all certs, prioritize those expiring soon" +- "Port 443 issues" → "Focus on HTTPS connectivity and certificate validation" \ No newline at end of file diff --git a/multi-agent-supervisor/README-modular.md b/multi-agent-supervisor/docs/README-modular.md similarity index 100% rename from multi-agent-supervisor/README-modular.md rename to multi-agent-supervisor/docs/README-modular.md diff --git a/multi-agent-supervisor/UNDERSTANDING_TRANSFERS.md b/multi-agent-supervisor/docs/UNDERSTANDING_TRANSFERS.md similarity index 62% rename from multi-agent-supervisor/UNDERSTANDING_TRANSFERS.md rename to multi-agent-supervisor/docs/UNDERSTANDING_TRANSFERS.md index 54b1266..f9ce071 100644 --- a/multi-agent-supervisor/UNDERSTANDING_TRANSFERS.md +++ b/multi-agent-supervisor/docs/UNDERSTANDING_TRANSFERS.md @@ -97,39 +97,78 @@ User: "Nginx 502 error, help!" └── "Based on system analysis and service inventory, here's comprehensive solution..." ``` -## 🔍 Enhanced Debugging +## 📤 What Workers Pass Back to Supervisor -The updated `utils.py` now shows: -- **Transfer explanations**: What each "Successfully transferred" means -- **Conversation context**: Last few messages to understand the flow -- **Tool call details**: What tools are being used and why -- **Agent delegation**: Which agent is being called and for what purpose +**Key Insight**: Workers don't explicitly "return" data. Instead, all their work becomes part of the shared conversation history that the supervisor can access. 
-## 🔍 Observing Result Flow in Practice +### What Gets Added to the Message History -To see how results flow back to the supervisor, run the enhanced debugging and watch for: +When a worker (like `network_diag`) executes: -1. **Agent Results**: Look for `AIMessage` from agents (not just transfer confirmations) -2. **Conversation Context**: The expanding message history in each step -3. **Supervisor Decision Changes**: How supervisor's next choice is influenced by results +1. **AIMessages** - Agent's reasoning and analysis + ``` + "I'll start by checking external connectivity..." + "DNS resolution appears to be working correctly..." + "Network Analysis Summary: All systems operational..." + ``` -### Example Debug Output Analysis: -``` -🔄 STEP 2: system_info_worker -💬 MESSAGE TYPE: AIMessage ← AGENT'S ACTUAL RESULT -📄 CONTENT: "502 typically indicates upstream server issues..." +2. **ToolMessages** - Raw command outputs + ``` + "PING 8.8.8.8 (8.8.8.8): 56 data bytes\n64 bytes from 8.8.8.8..." + "google.com. 300 IN A 142.250.80.46" + "tcp 0 0 0.0.0.0:22 0.0.0.0:* LISTEN" + ``` -🔄 STEP 4: service_inventory_worker -💬 MESSAGE TYPE: AIMessage ← AGENT'S ACTUAL RESULT -📄 CONTENT: "Check PHP-FPM status, verify upstream config..." +3. **Transfer Confirmation** - When worker completes + ``` + "Successfully transferred back to supervisor" + ``` -🔄 STEP 5: supervisor -💬 MESSAGE TYPE: AIMessage ← SUPERVISOR'S SYNTHESIS -📄 CONTENT: "Based on system analysis and service inventory..." 
-📚 CONVERSATION CONTEXT (12 messages) ← SUPERVISOR SEES ALL RESULTS +### Complete Message Flow Example + +```python +# After network_diag completes, state["messages"] contains: +[ + HumanMessage("My website is slow"), # Original query + AIMessage("I'll check network connectivity..."), # Supervisor decision + ToolMessage("Successfully transferred to network_diag"), # Transfer confirmation + AIMessage("Starting network diagnostics..."), # Worker starts + ToolMessage("PING 8.8.8.8: 64 bytes from 8.8.8.8..."), # Command result 1 + AIMessage("External connectivity is good, checking DNS"), # Worker analysis + ToolMessage("google.com. 300 IN A 142.250.80.46"), # Command result 2 + AIMessage("DNS working. Checking local services..."), # Worker continues + ToolMessage("tcp 0 0 0.0.0.0:80 0.0.0.0:* LISTEN"), # Command result 3 + AIMessage("Network Summary: All good, issue elsewhere"), # Worker's final analysis + ToolMessage("Successfully transferred back to supervisor") # Transfer back +] ``` -The supervisor's final response demonstrates it has processed and synthesized results from both agents! +### How Supervisor Uses This Information + +The supervisor receives **ALL** these messages and can: + +1. **Read command outputs** to understand technical details +2. **See agent reasoning** to understand what was checked +3. **Access final analysis** to make informed decisions +4. **Decide next steps** based on accumulated evidence + +### Why This Design Works + +- **Full Transparency**: Supervisor sees everything the worker did +- **Rich Context**: Both raw data and interpreted analysis available +- **Cumulative Knowledge**: Each agent builds on previous work +- **Intelligent Routing**: Supervisor can adapt strategy based on findings + +### Example: Multi-Agent Collaboration + +``` +User: "Website is slow" +├── network_diag finds: "Network is fine" +├── cert_checker finds: "Certificate expires tomorrow!" 
+└── Supervisor synthesis: "Issue is expiring certificate, not network" +``` + +The supervisor can correlate findings across multiple workers because it sees all their work in the message history. ## 📋 Key Takeaways diff --git a/multi-agent-supervisor/examples.py b/multi-agent-supervisor/examples.py deleted file mode 100644 index e69de29..0000000 diff --git a/multi-agent-supervisor/loghub b/multi-agent-supervisor/loghub deleted file mode 120000 index 91e1893..0000000 --- a/multi-agent-supervisor/loghub +++ /dev/null @@ -1 +0,0 @@ -../loghub \ No newline at end of file diff --git a/multi-agent-supervisor/supervisor.py b/multi-agent-supervisor/supervisor.py index bec0205..e936a0b 100644 --- a/multi-agent-supervisor/supervisor.py +++ b/multi-agent-supervisor/supervisor.py @@ -3,11 +3,55 @@ from langchain_openai import ChatOpenAI from langgraph_supervisor import create_supervisor -from agents.system_agents import create_system_info_worker, create_service_inventory_worker +from agents.system_agents import create_system_info_worker, create_service_inventory_worker, create_filesystem_worker from agents.service_agents import create_mariadb_worker, create_nginx_worker, create_phpfpm_worker from agents.network_agents import create_network_worker, create_cert_worker from agents.analysis_agents import create_risk_worker, create_remediation_worker, create_harmonizer_worker -from config import get_base_model, SUPERVISOR_PROMPT + + +def get_base_model(): + """Get the base LLM model configuration.""" + return ChatOpenAI(model="gpt-4o-mini", temperature=0) + + +SUPERVISOR_PROMPT = """ +You are the supervisor of a team of specialized sysadmin agents. Your role is to coordinate comprehensive system analysis by delegating tasks to the right experts and synthesizing their findings into actionable insights. + +IMPORTANT: You do NOT have direct access to the file system. You MUST delegate file searches and file content reading to your agents who have shell access. 
+ +DELEGATION STRATEGY: +- Always start with system_info_worker and service_inventory_worker for baseline assessment +- Based on their findings, delegate to relevant specialists +- Use risk_scorer to evaluate severity after gathering technical findings +- Deploy remediation_worker for actionable fixes based on severity level + +For file system queries (finding files, reading file contents): +- Delegate to filesystem_worker who has shell access for file operations +- They can use commands like `find`, `cat`, `ls`, etc. + +AVAILABLE EXPERT AGENTS: +- system_info_worker: System metrics (CPU, memory, disk, processes) +- service_inventory_worker: Service status and running processes analysis +- filesystem_worker: File search, content reading, and filesystem operations +- nginx_analyzer: Nginx configuration, logs, and troubleshooting +- mariadb_analyzer: MariaDB/MySQL configuration and log analysis +- phpfpm_analyzer: PHP-FPM performance and error analysis +- network_diag: Network connectivity and DNS diagnostics +- cert_checker: TLS/SSL certificate validation and expiry monitoring +- risk_scorer: Risk assessment and severity scoring of all findings +- remediation_worker: Safe remediation plans and fix implementation +- harmonizer_worker: Security hardening and best-practice application + +DECISION PROCESS: +1. Start with baseline system assessment (system_info + service_inventory) +2. Based on user query and baseline findings, call relevant specialists +3. Use risk_scorer to evaluate cumulative findings +4. Deploy remediation_worker for actionable solutions +5. Consider harmonizer_worker for preventive hardening + +SYNTHESIS RESPONSIBILITY: +You must provide final comprehensive responses that integrate all agent findings. Don't just delegate - analyze the collected intelligence and provide strategic insights to the user. 
+""" def create_sysadmin_supervisor(): @@ -17,6 +61,7 @@ def create_sysadmin_supervisor(): agents = [ create_system_info_worker(), create_service_inventory_worker(), + create_filesystem_worker(), create_mariadb_worker(), create_nginx_worker(), create_phpfpm_worker(), diff --git a/multi-agent-supervisor/utils.py b/multi-agent-supervisor/utils.py index 371c8ca..62fd37a 100644 --- a/multi-agent-supervisor/utils.py +++ b/multi-agent-supervisor/utils.py @@ -75,21 +75,31 @@ def print_step_info(step_count: int, chunk): # Show the result being sent back to supervisor # Look for the last AIMessage before this transfer to get the result if 'messages' in agent_data and len(agent_data['messages']) > 1: + print(f"[ DEBUG ] {current_agent} has {len(agent_data['messages'])} messages") # Look for the most recent AIMessage with content - for msg in reversed(agent_data['messages'][:-1]): # Exclude current ToolMessage - if type(msg).__name__ == 'AIMessage' and hasattr(msg, 'content') and msg.content: - result_content = msg.content - if len(result_content) > 300: - preview = result_content[:300] + "..." - print(f"[ {current_agent} ] sending result to supervisor (preview): {preview}") - print(f"[ {current_agent} ] (full result length: {len(result_content)} characters)") + found_result = False + for i, msg in enumerate(reversed(agent_data['messages'][:-1])): # Exclude current ToolMessage + msg_type = type(msg).__name__ + print(f"[ DEBUG ] Message {i}: {msg_type}, has_content: {hasattr(msg, 'content')}") + if msg_type == 'AIMessage' and hasattr(msg, 'content') and msg.content: + result_content = msg.content.strip() + if result_content and not result_content.startswith("I'll") and "transfer" not in result_content.lower(): + found_result = True + if len(result_content) > 300: + preview = result_content[:300] + "..." 
+ print(f"[ {current_agent} ] 📊 ANALYSIS SUMMARY (preview): {preview}") + print(f"[ {current_agent} ] (full result length: {len(result_content)} characters)") + else: + print(f"[ {current_agent} ] 📊 ANALYSIS SUMMARY: {result_content}") + break else: - print(f"[ {current_agent} ] sending result to supervisor: {result_content}") - break - else: - print(f"[ {current_agent} ] sending analysis results to supervisor") + print(f"[ DEBUG ] Skipping AIMessage: '{result_content[:100]}...'") + + if not found_result: + print(f"[ WARNING ] {current_agent} transferred back without providing analysis summary!") + print(f"[ WARNING ] This agent may need prompt improvements") else: - print(f"[ {current_agent} ] sending analysis results to supervisor") + print(f"[ WARNING ] {current_agent} has no message history to analyze") else: # Other tool execution result if len(content) > 200: