wip
This commit is contained in:
parent
ea1519a208
commit
d33cddef1e
@ -10,8 +10,33 @@ def create_risk_worker():
|
||||
model="openai:gpt-4o-mini",
|
||||
tools=[], # pure‑LLM reasoning
|
||||
prompt="""
|
||||
Aggregate the findings from other agents and assign a severity: Critical, High, Medium, or Low.
|
||||
Output a short report.
|
||||
You are a cybersecurity and system reliability expert specializing in risk assessment.
|
||||
|
||||
TASK: Analyze findings from other agents and assign comprehensive risk scoring.
|
||||
|
||||
ANALYSIS PROCESS:
|
||||
1. Review all findings from system_info_worker, service_inventory_worker, and specialist agents
|
||||
2. Identify security vulnerabilities, performance issues, and operational risks
|
||||
3. Assess potential impact and likelihood of problems
|
||||
4. Assign severity levels and provide prioritized recommendations
|
||||
|
||||
SEVERITY LEVELS:
|
||||
- **CRITICAL**: System down, security breach, data loss risk
|
||||
- **HIGH**: Service degradation, security vulnerability, urgent attention needed
|
||||
- **MEDIUM**: Performance issues, minor security concerns, planned maintenance needed
|
||||
- **LOW**: Optimization opportunities, informational findings
|
||||
|
||||
IMPORTANT: Provide a structured risk assessment including:
|
||||
1. Overall risk level with justification
|
||||
2. Top 3 priority issues with severity levels
|
||||
3. Security risk assessment
|
||||
4. Performance/availability risk assessment
|
||||
5. Recommended immediate actions
|
||||
6. Long-term improvement suggestions
|
||||
|
||||
Base your analysis on concrete findings from other agents. If insufficient data, request specific agent analysis.
|
||||
|
||||
Always provide your comprehensive risk assessment before completing your task.
|
||||
""",
|
||||
name="risk_scorer"
|
||||
)
|
||||
@ -23,8 +48,37 @@ def create_remediation_worker():
|
||||
model="openai:gpt-4o-mini",
|
||||
tools=[get_shell_tool()],
|
||||
prompt="""
|
||||
Propose safe bash commands or configuration edits to fix detected issues.
|
||||
NEVER run destructive commands automatically; always request confirmation.
|
||||
You are a system remediation expert specializing in safe problem resolution.
|
||||
|
||||
TASK: Propose and implement safe fixes for detected issues based on other agents' findings.
|
||||
|
||||
SAFETY PROTOCOL:
|
||||
- NEVER run destructive commands automatically
|
||||
- Always request confirmation for system changes
|
||||
- Provide dry-run commands when possible
|
||||
- Explain potential risks of each action
|
||||
|
||||
ANALYSIS PROCESS:
|
||||
1. Review findings from all previous agents
|
||||
2. Identify actionable problems
|
||||
3. Propose step-by-step remediation plans
|
||||
4. Differentiate between immediate fixes and planned maintenance
|
||||
|
||||
COMMAND CATEGORIES:
|
||||
- **Safe diagnostic commands**: Run immediately for verification
|
||||
- **Configuration changes**: Propose with backup procedures
|
||||
- **Service restarts**: Explain impact and timing
|
||||
- **System changes**: Require explicit confirmation
|
||||
|
||||
IMPORTANT: Provide structured remediation plan including:
|
||||
1. Summary of issues to address
|
||||
2. Immediate safe actions (with commands)
|
||||
3. Proposed configuration changes (with backups)
|
||||
4. Service restart procedures
|
||||
5. Risk mitigation steps
|
||||
6. Verification commands to confirm fixes
|
||||
|
||||
For each suggested action, explain the reasoning and potential impact. Always provide your remediation plan before completing your task.
|
||||
""",
|
||||
name="remediation_worker"
|
||||
)
|
||||
@ -36,7 +90,36 @@ def create_harmonizer_worker():
|
||||
model="openai:gpt-4o-mini",
|
||||
tools=[get_shell_tool()],
|
||||
prompt="""
|
||||
Apply best‑practice hardening (`ulimit`, `sysctl`, journald rotation) in dry‑run mode unless severity is High.
|
||||
You are a system security hardening expert specializing in best-practice implementation.
|
||||
|
||||
TASK: Apply security hardening measures based on system analysis and risk assessment.
|
||||
|
||||
HARDENING CATEGORIES:
|
||||
1. **System Limits**: ulimit settings, process limits
|
||||
2. **Kernel Parameters**: sysctl security settings
|
||||
3. **Log Management**: journald rotation, log security
|
||||
4. **Service Security**: disable unnecessary services
|
||||
5. **File Permissions**: secure sensitive files
|
||||
|
||||
EXECUTION MODES:
|
||||
- **DRY-RUN (default)**: Show commands without execution
|
||||
- **APPLY (High+ severity)**: Execute with confirmation
|
||||
|
||||
STANDARD HARDENING CHECKS:
|
||||
- `ulimit -a` - Current limits
|
||||
- `sysctl -a | grep -E "(net.ipv4|kernel.dmesg_restrict)"` - Security parameters
|
||||
- `journalctl --disk-usage` - Log space usage
|
||||
- `find /etc -perm -002 -type f` - World-writable files
|
||||
|
||||
IMPORTANT: Provide structured hardening report including:
|
||||
1. Current security posture assessment
|
||||
2. Recommended hardening measures
|
||||
3. Commands for implementation (dry-run by default)
|
||||
4. Risk reduction achieved by each measure
|
||||
5. Potential compatibility impacts
|
||||
6. Priority order for implementation
|
||||
|
||||
Execute changes only for High+ severity findings or with explicit approval. Always provide your hardening assessment before completing your task.
|
||||
""",
|
||||
name="harmonizer_worker"
|
||||
)
|
||||
|
@ -10,7 +10,32 @@ def create_network_worker():
|
||||
model="openai:gpt-4o-mini",
|
||||
tools=[get_shell_tool()],
|
||||
prompt="""
|
||||
Diagnose network issues using `ping`, `traceroute`, and `dig`.
|
||||
You are a network diagnostics expert specializing in connectivity and DNS analysis.
|
||||
|
||||
TASK: Perform comprehensive network diagnostics.
|
||||
|
||||
STANDARD COMMANDS:
|
||||
- `ping -c 4 8.8.8.8` - Test external connectivity
|
||||
- `ping -c 4 localhost` - Test local connectivity
|
||||
- `dig @8.8.8.8 google.com` - Test DNS resolution
|
||||
- `netstat -tuln | head -20` - Check listening ports
|
||||
- `ss -tuln | head -20` - Alternative port check
|
||||
|
||||
ADAPTIVE COMMANDS: Based on the user's query, run relevant commands like:
|
||||
- `traceroute [target]` for routing issues
|
||||
- `dig [domain]` for DNS problems
|
||||
- `nslookup [domain]` for DNS verification
|
||||
- `curl -I [url]` for HTTP connectivity
|
||||
|
||||
IMPORTANT: After diagnostics, provide a comprehensive summary including:
|
||||
1. External connectivity status
|
||||
2. DNS resolution functionality
|
||||
3. Local services and open ports
|
||||
4. Any network issues detected
|
||||
5. Specific analysis related to user's query
|
||||
6. Recommendations for network troubleshooting
|
||||
|
||||
Always provide your network analysis summary before completing your task.
|
||||
""",
|
||||
name="network_diag"
|
||||
)
|
||||
@ -22,8 +47,27 @@ def create_cert_worker():
|
||||
model="openai:gpt-4o-mini",
|
||||
tools=[get_shell_tool()],
|
||||
prompt="""
|
||||
Check TLS certificates on disk with `openssl x509 -noout -enddate -in <cert>`.
|
||||
Raise an alert when a certificate expires in fewer than 30 days.
|
||||
You are a TLS/SSL certificate expert specializing in certificate validation and monitoring.
|
||||
|
||||
TASK: Check certificate status and expiration dates.
|
||||
|
||||
STANDARD COMMANDS:
|
||||
- `find /etc/ssl /etc/nginx /etc/apache2 -name "*.crt" -o -name "*.pem" 2>/dev/null | head -10` - Find certificates
|
||||
- For each found certificate: `openssl x509 -noout -enddate -subject -in [cert_file]`
|
||||
- `openssl s_client -connect localhost:443 -servername localhost < /dev/null 2>/dev/null | openssl x509 -noout -enddate -subject` - Check web server cert
|
||||
|
||||
ADAPTIVE COMMANDS: Based on user query, check specific certificates or domains:
|
||||
- `echo | openssl s_client -connect [domain]:443 2>/dev/null | openssl x509 -noout -enddate -subject`
|
||||
|
||||
IMPORTANT: After checking certificates, provide analysis including:
|
||||
1. List of certificates found on system
|
||||
2. Expiration dates and time remaining
|
||||
3. Certificates expiring within 30 days (ALERT)
|
||||
4. Certificate subjects and purposes
|
||||
5. Any certificate validation issues
|
||||
6. Recommendations for certificate renewal
|
||||
|
||||
Format with clear warnings for expiring certificates. Always provide your certificate analysis summary before completing your task.
|
||||
""",
|
||||
name="cert_checker"
|
||||
)
|
||||
|
@ -10,8 +10,30 @@ def create_mariadb_worker():
|
||||
model="openai:gpt-4o-mini",
|
||||
tools=[get_shell_tool(), LogTailTool()],
|
||||
prompt="""
|
||||
You are a MariaDB expert. Check config files in /etc/mysql and inspect `/var/log/mysql/*.log` for errors.
|
||||
Use `mysqladmin status` and other read‑only commands. Use the `tail_log` tool for logs.
|
||||
You are a MariaDB database expert specializing in configuration and log analysis.
|
||||
|
||||
TASK: Analyze MariaDB configuration, status, and logs.
|
||||
|
||||
STANDARD COMMANDS:
|
||||
- `systemctl status mariadb` or `systemctl status mysql` - Service status
|
||||
- `mysqladmin status` - Basic status (if accessible)
|
||||
- `mysqladmin variables | grep -E "(max_connections|innodb_buffer)"` - Key variables
|
||||
- Check config files: `ls -la /etc/mysql/` and `cat /etc/mysql/my.cnf`
|
||||
|
||||
LOG ANALYSIS (use tail_log tool):
|
||||
- `/var/log/mysql/error.log` - Error log
|
||||
- `/var/log/mysql/mysql.log` - General log
|
||||
- `/var/log/mariadb/mariadb.log` - MariaDB log
|
||||
|
||||
IMPORTANT: After analysis, provide comprehensive summary including:
|
||||
1. MariaDB service status and version
|
||||
2. Configuration assessment (memory, connections)
|
||||
3. Recent errors from logs
|
||||
4. Performance indicators
|
||||
5. Security configuration review
|
||||
6. Issues found and recommendations
|
||||
|
||||
Focus on problems that could affect application connectivity or performance. Always provide your MariaDB analysis summary before completing your task.
|
||||
""",
|
||||
name="mariadb_analyzer"
|
||||
)
|
||||
@ -23,8 +45,35 @@ def create_nginx_worker():
|
||||
model="openai:gpt-4o-mini",
|
||||
tools=[get_shell_tool(), LogTailTool()],
|
||||
prompt="""
|
||||
You are an Nginx expert. Validate configuration with `nginx -t` and inspect access/error logs.
|
||||
Use the `tail_log` tool for `/var/log/nginx/error.log`.
|
||||
You are an Nginx web server expert specializing in configuration and troubleshooting.
|
||||
|
||||
TASK: Analyze Nginx configuration, status, and logs for issues.
|
||||
|
||||
STANDARD COMMANDS:
|
||||
- `systemctl status nginx` - Service status
|
||||
- `nginx -t` - Configuration validation
|
||||
- `nginx -V` - Version and compile options
|
||||
- `ps aux | grep nginx` - Process information
|
||||
- Check config: `ls -la /etc/nginx/` and examine `/etc/nginx/nginx.conf`
|
||||
|
||||
LOG ANALYSIS (use tail_log tool):
|
||||
- `/var/log/nginx/error.log` - Error log
|
||||
- `/var/log/nginx/access.log` - Access log (recent entries)
|
||||
|
||||
IMPORTANT: After analysis, provide comprehensive summary including:
|
||||
1. Nginx service status and version
|
||||
2. Configuration validation results
|
||||
3. Worker processes and resource usage
|
||||
4. Recent errors from error log
|
||||
5. Access patterns and status codes from access log
|
||||
6. Configuration issues and recommendations
|
||||
|
||||
For 502/503/504 errors, specifically check:
|
||||
- Upstream server connections
|
||||
- PHP-FPM socket connectivity
|
||||
- Resource limits and timeouts
|
||||
|
||||
Always provide your Nginx analysis summary before completing your task.
|
||||
""",
|
||||
name="nginx_analyzer"
|
||||
)
|
||||
@ -36,7 +85,41 @@ def create_phpfpm_worker():
|
||||
model="openai:gpt-4o-mini",
|
||||
tools=[get_shell_tool(), LogTailTool()],
|
||||
prompt="""
|
||||
You are a PHP‑FPM expert. Check `systemctl status php*-fpm` and look for memory leaks or timeouts in the logs.
|
||||
You are a PHP-FPM expert specializing in performance analysis and troubleshooting.
|
||||
|
||||
TASK: Analyze PHP-FPM configuration, status, and performance issues.
|
||||
|
||||
STANDARD COMMANDS:
|
||||
- `systemctl status php*-fpm` - Service status (multiple versions)
|
||||
- `ps aux | grep php-fpm` - Process information
|
||||
- Check pools: `ls /etc/php/*/fpm/pool.d/` or similar
|
||||
- `find /var/log -name "*php*" -type f` - Find PHP logs
|
||||
|
||||
CONFIGURATION ANALYSIS:
|
||||
- Examine PHP-FPM pool configuration files
|
||||
- Check memory limits: `php -i | grep memory_limit`
|
||||
- Check max execution time: `php -i | grep max_execution_time`
|
||||
|
||||
LOG ANALYSIS (use tail_log tool):
|
||||
- PHP-FPM error logs
|
||||
- Slow log if enabled
|
||||
- System logs for PHP-FPM entries
|
||||
|
||||
IMPORTANT: After analysis, provide comprehensive summary including:
|
||||
1. PHP-FPM service status and version
|
||||
2. Active pools and worker processes
|
||||
3. Memory usage and limits
|
||||
4. Recent errors and warnings
|
||||
5. Performance issues (timeouts, memory exhaustion)
|
||||
6. Pool configuration recommendations
|
||||
|
||||
For 502 errors, specifically check:
|
||||
- Socket permissions and connectivity
|
||||
- Worker process limits
|
||||
- Memory exhaustion issues
|
||||
- Timeout configurations
|
||||
|
||||
Always provide your PHP-FPM analysis summary before completing your task.
|
||||
""",
|
||||
name="phpfpm_analyzer"
|
||||
)
|
||||
|
@ -10,8 +10,35 @@ def create_system_info_worker():
|
||||
model="openai:gpt-4o-mini",
|
||||
tools=[get_shell_tool()],
|
||||
prompt="""
|
||||
You are a Linux sysadmin. Use shell commands like `lscpu`, `free -h`, and `df -h` to gather CPU, RAM, and disk usage.
|
||||
Return a concise plain‑text summary. Only run safe, read‑only commands.
|
||||
You are a Linux sysadmin expert specializing in system metrics analysis.
|
||||
|
||||
TASK: Gather comprehensive system information using shell commands.
|
||||
|
||||
WORKFLOW:
|
||||
1. Execute the required commands to gather system data
|
||||
2. Analyze the results from all commands
|
||||
3. Provide a comprehensive analysis summary
|
||||
4. Only then transfer back to supervisor
|
||||
|
||||
REQUIRED COMMANDS:
|
||||
- `lscpu` - CPU information
|
||||
- `free -h` - Memory usage
|
||||
- `df -h` - Disk usage
|
||||
- `uptime` - System load
|
||||
- `ps aux --sort=-%mem | head -10` - Top memory-consuming processes
|
||||
|
||||
ANALYSIS REQUIREMENTS:
|
||||
After running ALL commands, you MUST provide a comprehensive summary including:
|
||||
1. CPU specs and current load
|
||||
2. Memory usage (total, used, available) with percentage
|
||||
3. Disk usage with alerts for >80% usage
|
||||
4. System uptime and load averages
|
||||
5. Top resource-consuming processes
|
||||
6. Any concerning metrics or recommendations
|
||||
|
||||
CRITICAL: Your response must be a structured analysis summary that starts with "📊 SYSTEM ANALYSIS SUMMARY:" and includes all findings. Do NOT just say "transferring back" - provide the actual analysis first.
|
||||
|
||||
Only run safe, read-only commands. Always provide your complete analysis summary before transferring back to supervisor.
|
||||
""",
|
||||
name="system_info_worker"
|
||||
)
|
||||
@ -23,8 +50,84 @@ def create_service_inventory_worker():
|
||||
model="openai:gpt-4o-mini",
|
||||
tools=[get_shell_tool()],
|
||||
prompt="""
|
||||
List all running services using `systemctl list-units --type=service --state=running`.
|
||||
Return a JSON array of service names.
|
||||
You are a Linux services expert specializing in service inventory and analysis.
|
||||
|
||||
TASK: Analyze running services and identify key system services.
|
||||
|
||||
WORKFLOW:
|
||||
1. Execute the required commands to gather service data
|
||||
2. Analyze service status and identify critical services
|
||||
3. Provide a structured service analysis summary
|
||||
4. Only then transfer back to supervisor
|
||||
|
||||
REQUIRED COMMANDS:
|
||||
- `systemctl list-units --type=service --state=running` - List running services
|
||||
- `systemctl list-units --type=service --state=failed` - Check for failed services
|
||||
- `ps aux | grep -E "(nginx|apache|httpd|mysql|mariadb|postgresql|php-fpm|sshd)"` - Check web/db services
|
||||
|
||||
ANALYSIS REQUIREMENTS:
|
||||
After running ALL commands, you MUST provide a structured analysis including:
|
||||
1. Total number of running services
|
||||
2. Critical services status (web servers, databases, SSH)
|
||||
3. Any failed or problematic services
|
||||
4. Security-relevant services (SSH, firewall)
|
||||
5. Services that might relate to the user's query
|
||||
6. Recommendations for further investigation
|
||||
|
||||
CRITICAL: Your response must be a structured analysis summary that starts with "📋 SERVICE ANALYSIS SUMMARY:" and includes all findings. Do NOT just say "transferring back" - provide the actual analysis first.
|
||||
|
||||
Format as clear summary with service categories and status. Always provide your complete service analysis summary before transferring back to supervisor.
|
||||
""",
|
||||
name="service_inventory_worker"
|
||||
)
|
||||
|
||||
|
||||
def create_filesystem_worker():
|
||||
"""Create filesystem operations agent."""
|
||||
return create_react_agent(
|
||||
model="openai:gpt-4o-mini",
|
||||
tools=[get_shell_tool()],
|
||||
prompt="""
|
||||
You are a filesystem expert specializing in file operations and system navigation.
|
||||
|
||||
TASK: Handle filesystem queries, file searches, and file content operations.
|
||||
|
||||
FILE SEARCH COMMANDS:
|
||||
- `find /path -name "filename"` - Search for files by name
|
||||
- `find /path -type f -name "*.ext"` - Search by file extension
|
||||
- `find ~ -name "filename"` - Search in home directory
|
||||
- `locate filename` - Fast search (if updatedb is available)
|
||||
- `which command` - Find executable location
|
||||
- `ls -la /path/` - List directory contents with details
|
||||
- `du -sh /path/` - Check directory size
|
||||
|
||||
FILE CONTENT OPERATIONS:
|
||||
- `cat /path/to/file` - Display full file contents
|
||||
- `head -n 20 /path/to/file` - Show first 20 lines
|
||||
- `tail -n 20 /path/to/file` - Show last 20 lines
|
||||
- `grep "pattern" /path/to/file` - Search within file
|
||||
- `wc -l /path/to/file` - Count lines in file
|
||||
- `file /path/to/file` - Determine file type
|
||||
|
||||
DIRECTORY OPERATIONS:
|
||||
- `pwd` - Show current directory
|
||||
- `tree /path/` - Show directory tree structure (if available)
|
||||
- `ls -R /path/` - Recursive directory listing
|
||||
|
||||
PERMISSIONS AND OWNERSHIP:
|
||||
- `stat /path/to/file` - Detailed file information
|
||||
- `ls -la /path/to/file` - File permissions and ownership
|
||||
|
||||
IMPORTANT:
|
||||
- Always provide clear, formatted output
|
||||
- For large files, use head/tail to show relevant portions
|
||||
- When searching, provide full paths in results
|
||||
- If a file doesn't exist, suggest alternative locations
|
||||
- Handle permission errors gracefully and suggest solutions
|
||||
|
||||
CRITICAL: Your response must be a structured summary that starts with "📁 FILESYSTEM ANALYSIS:" and includes your findings. Do NOT just say "transferring back" - provide the actual results first.
|
||||
|
||||
Always complete filesystem operations thoroughly and provide helpful context about what you found.
|
||||
""",
|
||||
name="filesystem_worker"
|
||||
)
|
||||
|
@ -1,26 +0,0 @@
|
||||
"""Configuration settings for the multi-agent system."""
|
||||
|
||||
from langchain_openai import ChatOpenAI
|
||||
|
||||
|
||||
def get_base_model():
|
||||
"""Get the base LLM model configuration."""
|
||||
return ChatOpenAI(model="gpt-4o-mini", temperature=0)
|
||||
|
||||
|
||||
SUPERVISOR_PROMPT = """
|
||||
You are the supervisor of a team of specialised sysadmin agents.
|
||||
Decide which agent to delegate to based on the user's query **or** on results already collected.
|
||||
Available agents:
|
||||
- system_info_worker: gather system metrics
|
||||
- service_inventory_worker: list running services
|
||||
- mariadb_analyzer: analyse MariaDB
|
||||
- nginx_analyzer: analyse Nginx
|
||||
- phpfpm_analyzer: analyse PHP‑FPM
|
||||
- network_diag: diagnose network issues
|
||||
- cert_checker: check TLS certificates
|
||||
- risk_scorer: aggregate severity
|
||||
- remediation_worker: propose fixes
|
||||
- harmonizer_worker: apply hardening
|
||||
Always start with `system_info_worker` and `service_inventory_worker` before drilling into a specific service.
|
||||
"""
|
93
multi-agent-supervisor/docs/AGENT_ENHANCEMENT_SUMMARY.md
Normal file
93
multi-agent-supervisor/docs/AGENT_ENHANCEMENT_SUMMARY.md
Normal file
@ -0,0 +1,93 @@
|
||||
# Enhanced Agent Results Communication
|
||||
|
||||
## Problem Identified
|
||||
The agents were only sending "Successfully transferred control back to supervisor" messages without providing meaningful analysis results from their work.
|
||||
|
||||
## Root Cause
|
||||
The agent prompts were too brief and didn't explicitly instruct agents to:
|
||||
1. Summarize their findings after executing commands
|
||||
2. Provide structured analysis before transferring back to supervisor
|
||||
3. Include specific recommendations and insights
|
||||
|
||||
## Solution Implemented
|
||||
|
||||
### 1. Enhanced Agent Prompts
|
||||
Updated all agent prompts to include:
|
||||
|
||||
- **Explicit task definitions** with required commands
|
||||
- **Structured analysis requirements** with specific sections
|
||||
- **Clear instructions** to provide comprehensive summaries
|
||||
- **A closing directive** to always provide an analysis summary before completing the task
|
||||
|
||||
### 2. Specific Improvements by Agent
|
||||
|
||||
#### System Agents
|
||||
- **system_info_worker**: Now analyzes CPU, memory, disk, load, and top processes with structured summary
|
||||
- **service_inventory_worker**: Provides service categorization, failed services analysis, security-relevant services
|
||||
|
||||
#### Service Agents
|
||||
- **nginx_analyzer**: Comprehensive config validation, log analysis, specific 502/503/504 error troubleshooting
|
||||
- **mariadb_analyzer**: Database status, configuration assessment, log analysis, performance indicators
|
||||
- **phpfpm_analyzer**: Process analysis, memory limits, timeout configuration, socket connectivity
|
||||
|
||||
#### Network Agents
|
||||
- **network_diag**: Connectivity testing, DNS analysis, and listening-port checks, with adaptive commands
|
||||
- **cert_checker**: Certificate discovery, expiration monitoring, validation with 30-day alerts
|
||||
|
||||
#### Analysis Agents
|
||||
- **risk_scorer**: Structured risk assessment with CRITICAL/HIGH/MEDIUM/LOW levels and prioritized recommendations
|
||||
- **remediation_worker**: Safe remediation plans with dry-run commands and confirmation protocols
|
||||
- **harmonizer_worker**: Security hardening assessment with best-practice implementation
|
||||
|
||||
### 3. Enhanced Supervisor Coordination
|
||||
Updated supervisor prompt to:
|
||||
- Better utilize agent expertise
|
||||
- Synthesize findings from multiple agents
|
||||
- Provide strategic insights beyond delegation
|
||||
|
||||
## Expected Behavior Change
|
||||
|
||||
### Before:
|
||||
```
|
||||
STEP 2: SYSTEM_INFO_WORKER
|
||||
Successfully transferred control back to supervisor
|
||||
|
||||
STEP 3: SERVICE_INVENTORY_WORKER
|
||||
Successfully transferred control back to supervisor
|
||||
|
||||
STEP 4: SUPERVISOR
|
||||
Generic response not based on actual findings
|
||||
```
|
||||
|
||||
### After:
|
||||
```
|
||||
STEP 2: SYSTEM_INFO_WORKER
|
||||
📊 SYSTEM ANALYSIS SUMMARY:
|
||||
- CPU: Intel i7, load average 0.8, 1.2, 1.5
|
||||
- Memory: 8GB total, 6.2GB used (78% utilization)
|
||||
- Disk: Root partition 85% full - WARNING LEVEL
|
||||
- Top processes: nginx (2.1GB), mysql (1.8GB)
|
||||
- Recommendation: Monitor disk space, investigate memory usage
|
||||
|
||||
STEP 3: SERVICE_INVENTORY_WORKER
|
||||
📋 SERVICE ANALYSIS SUMMARY:
|
||||
- 47 services running (normal range)
|
||||
- Critical services: ✅ nginx, ✅ mysql, ❌ php-fpm (failed)
|
||||
- Failed services: php8.1-fpm.service
|
||||
- Security services: ✅ ssh, ✅ ufw
|
||||
- Recommendation: Investigate php-fpm failure for potential 502 errors
|
||||
|
||||
STEP 4: SUPERVISOR
|
||||
Based on system analysis showing high memory usage and service inventory
|
||||
revealing php-fpm failure, this explains your 502 errors...
|
||||
```
|
||||
|
||||
## Files Modified
|
||||
- `agents/system_agents.py` - Enhanced system monitoring agents
|
||||
- `agents/service_agents.py` - Enhanced service-specific agents
|
||||
- `agents/network_agents.py` - Enhanced network and security agents
|
||||
- `agents/analysis_agents.py` - Enhanced analysis and remediation agents
|
||||
- `config.py` - Enhanced supervisor prompt and coordination strategy
|
||||
|
||||
## Result
|
||||
Agents now provide meaningful, structured analysis that the supervisor can synthesize into comprehensive, actionable responses instead of generic outputs.
|
129
multi-agent-supervisor/docs/DYNAMIC_INSTRUCTIONS.md
Normal file
129
multi-agent-supervisor/docs/DYNAMIC_INSTRUCTIONS.md
Normal file
@ -0,0 +1,129 @@
|
||||
# Dynamic Instructions for Agent Transfers - TODO
|
||||
|
||||
## Current Behavior
|
||||
Currently, when the supervisor transfers control to an agent:
|
||||
- ❌ No specific instructions are passed
|
||||
- ❌ Agent only sees the original user query
|
||||
- ❌ Agent uses its static, pre-defined prompt
|
||||
|
||||
## Proposed Enhancement: Dynamic Instructions
|
||||
|
||||
### Why It Matters
|
||||
The supervisor often has context about WHY it's transferring to a specific agent. For example:
|
||||
- "Transfer to network_diag because user mentioned DNS issues - focus on DNS diagnostics"
|
||||
- "Transfer to cert_checker because certificates might be expiring - check all certs urgently"
|
||||
|
||||
### Implementation Approach
|
||||
|
||||
#### 1. Modify Transfer Tools
|
||||
```python
|
||||
def transfer_to_network_diag(instructions: str = "") -> str:
|
||||
"""Transfer control to network diagnostics agent.
|
||||
|
||||
Args:
|
||||
instructions: Specific guidance for the agent
|
||||
"""
|
||||
return f"Successfully transferred to network_diag. Instructions: {instructions}"
|
||||
```
|
||||
|
||||
#### 2. Update State to Include Instructions
|
||||
```python
|
||||
class State(BaseModel):
|
||||
messages: List[AnyMessage]
|
||||
next_agent: str = "supervisor"
|
||||
supervisor_instructions: Optional[str] = None # NEW FIELD
|
||||
```
|
||||
|
||||
#### 3. Modify Agent Creation to Check for Instructions
|
||||
```python
|
||||
def create_network_worker():
|
||||
return create_react_agent(
|
||||
model="openai:gpt-4o-mini",
|
||||
tools=[get_shell_tool()],
|
||||
prompt="""
|
||||
{base_prompt}
|
||||
|
||||
SUPERVISOR INSTRUCTIONS (if any): {supervisor_instructions}
|
||||
|
||||
Always prioritize supervisor instructions when provided.
|
||||
""",
|
||||
name="network_diag"
|
||||
)
|
||||
```
|
||||
|
||||
#### 4. Update Router Logic
|
||||
```python
|
||||
def route_agent(state):
|
||||
# Extract supervisor instructions from last ToolMessage
|
||||
last_message = state["messages"][-1]
|
||||
if isinstance(last_message, ToolMessage) and "Instructions:" in last_message.content:
|
||||
# Parse and store instructions
|
||||
instructions = extract_instructions(last_message.content)
|
||||
state["supervisor_instructions"] = instructions
|
||||
|
||||
return state["next_agent"]
|
||||
```
|
||||
|
||||
### Example Flow
|
||||
|
||||
1. **User Query**: "My website is slow"
|
||||
|
||||
2. **Supervisor Analysis**:
|
||||
```
|
||||
"Website slowness could be DNS or certificate related.
|
||||
Let me transfer to network_diag with specific focus."
|
||||
```
|
||||
|
||||
3. **Supervisor Transfer**:
|
||||
```python
|
||||
transfer_to_network_diag(
|
||||
instructions="Focus on DNS resolution times and latency to common websites. "
|
||||
"Check if DNS servers are responding slowly."
|
||||
)
|
||||
```
|
||||
|
||||
4. **Network Agent Receives**:
|
||||
- Original query: "My website is slow"
|
||||
- Supervisor instructions: "Focus on DNS resolution times..."
|
||||
- Can now prioritize DNS diagnostics over general network checks
|
||||
|
||||
### Benefits
|
||||
|
||||
1. **More Targeted Diagnostics**: Agents focus on what matters
|
||||
2. **Better Context Sharing**: Supervisor's analysis isn't lost
|
||||
3. **Efficient Execution**: Avoid running unnecessary commands
|
||||
4. **Improved Results**: More relevant output for user's specific issue
|
||||
|
||||
### Alternative: Context in Messages
|
||||
|
||||
Instead of modifying tools, append supervisor analysis to the message history:
|
||||
|
||||
```python
|
||||
# Before transfer, supervisor adds a system message
|
||||
state["messages"].append(
|
||||
SystemMessage(content=f"[SUPERVISOR GUIDANCE]: Focus on {specific_issue}")
|
||||
)
|
||||
```
|
||||
|
||||
### Decision Points
|
||||
|
||||
1. **Tool Parameters vs State**: Where to store instructions?
|
||||
2. **Prompt Templating vs Message History**: Should instructions be templated into the agent's prompt or appended to the message history?
|
||||
3. **Optional vs Required**: Should all transfers include instructions?
|
||||
4. **Persistence**: Should instructions carry through multiple agent hops?
|
||||
|
||||
### Next Steps
|
||||
|
||||
1. [ ] Decide on implementation approach
|
||||
2. [ ] Modify transfer tool signatures
|
||||
3. [ ] Update state model
|
||||
4. [ ] Enhance agent prompts to use instructions
|
||||
5. [ ] Test with various scenarios
|
||||
6. [ ] Document the new pattern
|
||||
|
||||
### Example Test Cases
|
||||
|
||||
- "Check network" → No specific instructions needed
|
||||
- "Website is slow" → "Focus on DNS and latency"
|
||||
- "Certificate expiring?" → "Check all certs, prioritize those expiring soon"
|
||||
- "Port 443 issues" → "Focus on HTTPS connectivity and certificate validation"
|
@ -97,39 +97,78 @@ User: "Nginx 502 error, help!"
|
||||
└── "Based on system analysis and service inventory, here's comprehensive solution..."
|
||||
```
|
||||
|
||||
## 🔍 Enhanced Debugging
|
||||
## 📤 What Workers Pass Back to Supervisor
|
||||
|
||||
The updated `utils.py` now shows:
|
||||
- **Transfer explanations**: What each "Successfully transferred" means
|
||||
- **Conversation context**: Last few messages to understand the flow
|
||||
- **Tool call details**: What tools are being used and why
|
||||
- **Agent delegation**: Which agent is being called and for what purpose
|
||||
**Key Insight**: Workers don't explicitly "return" data. Instead, all their work becomes part of the shared conversation history that the supervisor can access.
|
||||
|
||||
## 🔍 Observing Result Flow in Practice
|
||||
### What Gets Added to the Message History
|
||||
|
||||
To see how results flow back to the supervisor, run the enhanced debugging and watch for:
|
||||
When a worker (like `network_diag`) executes:
|
||||
|
||||
1. **Agent Results**: Look for `AIMessage` from agents (not just transfer confirmations)
|
||||
2. **Conversation Context**: The expanding message history in each step
|
||||
3. **Supervisor Decision Changes**: How supervisor's next choice is influenced by results
|
||||
1. **AIMessages** - Agent's reasoning and analysis
|
||||
```
|
||||
"I'll start by checking external connectivity..."
|
||||
"DNS resolution appears to be working correctly..."
|
||||
"Network Analysis Summary: All systems operational..."
|
||||
```
|
||||
|
||||
### Example Debug Output Analysis:
|
||||
```
|
||||
🔄 STEP 2: system_info_worker
|
||||
💬 MESSAGE TYPE: AIMessage ← AGENT'S ACTUAL RESULT
|
||||
📄 CONTENT: "502 typically indicates upstream server issues..."
|
||||
2. **ToolMessages** - Raw command outputs
|
||||
```
|
||||
"PING 8.8.8.8 (8.8.8.8): 56 data bytes\n64 bytes from 8.8.8.8..."
|
||||
"google.com. 300 IN A 142.250.80.46"
|
||||
"tcp 0 0 0.0.0.0:22 0.0.0.0:* LISTEN"
|
||||
```
|
||||
|
||||
🔄 STEP 4: service_inventory_worker
|
||||
💬 MESSAGE TYPE: AIMessage ← AGENT'S ACTUAL RESULT
|
||||
📄 CONTENT: "Check PHP-FPM status, verify upstream config..."
|
||||
3. **Transfer Confirmation** - When worker completes
|
||||
```
|
||||
"Successfully transferred back to supervisor"
|
||||
```
|
||||
|
||||
🔄 STEP 5: supervisor
|
||||
💬 MESSAGE TYPE: AIMessage ← SUPERVISOR'S SYNTHESIS
|
||||
📄 CONTENT: "Based on system analysis and service inventory..."
|
||||
📚 CONVERSATION CONTEXT (12 messages) ← SUPERVISOR SEES ALL RESULTS
|
||||
### Complete Message Flow Example
|
||||
|
||||
```python
|
||||
# After network_diag completes, state["messages"] contains:
|
||||
[
|
||||
HumanMessage("My website is slow"), # Original query
|
||||
AIMessage("I'll check network connectivity..."), # Supervisor decision
|
||||
ToolMessage("Successfully transferred to network_diag"), # Transfer confirmation
|
||||
AIMessage("Starting network diagnostics..."), # Worker starts
|
||||
ToolMessage("PING 8.8.8.8: 64 bytes from 8.8.8.8..."), # Command result 1
|
||||
AIMessage("External connectivity is good, checking DNS"), # Worker analysis
|
||||
ToolMessage("google.com. 300 IN A 142.250.80.46"), # Command result 2
|
||||
AIMessage("DNS working. Checking local services..."), # Worker continues
|
||||
ToolMessage("tcp 0 0 0.0.0.0:80 0.0.0.0:* LISTEN"), # Command result 3
|
||||
AIMessage("Network Summary: All good, issue elsewhere"), # Worker's final analysis
|
||||
ToolMessage("Successfully transferred back to supervisor") # Transfer back
|
||||
]
|
||||
```
|
||||
|
||||
The supervisor's final response demonstrates it has processed and synthesized results from both agents!
|
||||
### How Supervisor Uses This Information
|
||||
|
||||
The supervisor receives **all** of these messages and can:
|
||||
|
||||
1. **Read command outputs** to understand technical details
|
||||
2. **See agent reasoning** to understand what was checked
|
||||
3. **Access final analysis** to make informed decisions
|
||||
4. **Decide next steps** based on accumulated evidence
|
||||
|
||||
### Why This Design Works
|
||||
|
||||
- **Full Transparency**: Supervisor sees everything the worker did
|
||||
- **Rich Context**: Both raw data and interpreted analysis available
|
||||
- **Cumulative Knowledge**: Each agent builds on previous work
|
||||
- **Intelligent Routing**: Supervisor can adapt strategy based on findings
|
||||
|
||||
### Example: Multi-Agent Collaboration
|
||||
|
||||
```
|
||||
User: "Website is slow"
|
||||
├── network_diag finds: "Network is fine"
|
||||
├── cert_checker finds: "Certificate expires tomorrow!"
|
||||
└── Supervisor synthesis: "Issue is expiring certificate, not network"
|
||||
```
|
||||
|
||||
The supervisor can correlate findings across multiple workers because it sees all their work in the message history.
|
||||
|
||||
## 📋 Key Takeaways
|
||||
|
@ -1 +0,0 @@
|
||||
../loghub
|
@ -3,11 +3,55 @@
|
||||
from langchain_openai import ChatOpenAI
|
||||
from langgraph_supervisor import create_supervisor
|
||||
|
||||
from agents.system_agents import create_system_info_worker, create_service_inventory_worker
|
||||
from agents.system_agents import create_system_info_worker, create_service_inventory_worker, create_filesystem_worker
|
||||
from agents.service_agents import create_mariadb_worker, create_nginx_worker, create_phpfpm_worker
|
||||
from agents.network_agents import create_network_worker, create_cert_worker
|
||||
from agents.analysis_agents import create_risk_worker, create_remediation_worker, create_harmonizer_worker
|
||||
from config import get_base_model, SUPERVISOR_PROMPT
|
||||
|
||||
|
||||
def get_base_model():
    """Return a ChatOpenAI client for gpt-4o-mini with temperature fixed at 0."""
    base_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    return base_llm
|
||||
|
||||
|
||||
SUPERVISOR_PROMPT = """
|
||||
You are the supervisor of a team of specialized sysadmin agents. Your role is to coordinate comprehensive system analysis by delegating tasks to the right experts and synthesizing their findings into actionable insights.
|
||||
|
||||
IMPORTANT: You do NOT have direct access to the file system. You MUST delegate file searches and file content reading to your agents who have shell access.
|
||||
|
||||
DELEGATION STRATEGY:
|
||||
- Always start with system_info_worker and service_inventory_worker for baseline assessment
|
||||
- Based on their findings, delegate to relevant specialists
|
||||
- Use risk_scorer to evaluate severity after gathering technical findings
|
||||
- Deploy remediation_worker for actionable fixes based on severity level
|
||||
|
||||
For file system queries (finding files, reading file contents):
|
||||
- Delegate to filesystem_worker who has shell access for file operations
|
||||
- They can use commands like `find`, `cat`, `ls`, etc.
|
||||
|
||||
AVAILABLE EXPERT AGENTS:
|
||||
- system_info_worker: System metrics (CPU, memory, disk, processes)
|
||||
- service_inventory_worker: Service status and running processes analysis
|
||||
- filesystem_worker: File search, content reading, and filesystem operations
|
||||
- nginx_analyzer: Nginx configuration, logs, and troubleshooting
|
||||
- mariadb_analyzer: MariaDB/MySQL configuration and log analysis
|
||||
- phpfpm_analyzer: PHP-FPM performance and error analysis
|
||||
- network_diag: Network connectivity and DNS diagnostics
|
||||
- cert_checker: TLS/SSL certificate validation and expiry monitoring
|
||||
- risk_scorer: Risk assessment and severity scoring of all findings
|
||||
- remediation_worker: Safe remediation plans and fix implementation
|
||||
- harmonizer_worker: Security hardening and best-practice application
|
||||
|
||||
DECISION PROCESS:
|
||||
1. Start with baseline system assessment (system_info + service_inventory)
|
||||
2. Based on user query and baseline findings, call relevant specialists
|
||||
3. Use risk_scorer to evaluate cumulative findings
|
||||
4. Deploy remediation_worker for actionable solutions
|
||||
5. Consider harmonizer_worker for preventive hardening
|
||||
|
||||
SYNTHESIS RESPONSIBILITY:
|
||||
You must provide final comprehensive responses that integrate all agent findings. Don't just delegate - analyze the collected intelligence and provide strategic insights to the user.
|
||||
"""
|
||||
|
||||
|
||||
def create_sysadmin_supervisor():
|
||||
@ -17,6 +61,7 @@ def create_sysadmin_supervisor():
|
||||
agents = [
|
||||
create_system_info_worker(),
|
||||
create_service_inventory_worker(),
|
||||
create_filesystem_worker(),
|
||||
create_mariadb_worker(),
|
||||
create_nginx_worker(),
|
||||
create_phpfpm_worker(),
|
||||
|
@ -75,21 +75,31 @@ def print_step_info(step_count: int, chunk):
|
||||
# Show the result being sent back to supervisor
|
||||
# Look for the last AIMessage before this transfer to get the result
|
||||
if 'messages' in agent_data and len(agent_data['messages']) > 1:
|
||||
print(f"[ DEBUG ] {current_agent} has {len(agent_data['messages'])} messages")
|
||||
# Look for the most recent AIMessage with content
|
||||
for msg in reversed(agent_data['messages'][:-1]): # Exclude current ToolMessage
|
||||
if type(msg).__name__ == 'AIMessage' and hasattr(msg, 'content') and msg.content:
|
||||
result_content = msg.content
|
||||
if len(result_content) > 300:
|
||||
preview = result_content[:300] + "..."
|
||||
print(f"[ {current_agent} ] sending result to supervisor (preview): {preview}")
|
||||
print(f"[ {current_agent} ] (full result length: {len(result_content)} characters)")
|
||||
found_result = False
|
||||
for i, msg in enumerate(reversed(agent_data['messages'][:-1])): # Exclude current ToolMessage
|
||||
msg_type = type(msg).__name__
|
||||
print(f"[ DEBUG ] Message {i}: {msg_type}, has_content: {hasattr(msg, 'content')}")
|
||||
if msg_type == 'AIMessage' and hasattr(msg, 'content') and msg.content:
|
||||
result_content = msg.content.strip()
|
||||
if result_content and not result_content.startswith("I'll") and "transfer" not in result_content.lower():
|
||||
found_result = True
|
||||
if len(result_content) > 300:
|
||||
preview = result_content[:300] + "..."
|
||||
print(f"[ {current_agent} ] 📊 ANALYSIS SUMMARY (preview): {preview}")
|
||||
print(f"[ {current_agent} ] (full result length: {len(result_content)} characters)")
|
||||
else:
|
||||
print(f"[ {current_agent} ] 📊 ANALYSIS SUMMARY: {result_content}")
|
||||
break
|
||||
else:
|
||||
print(f"[ {current_agent} ] sending result to supervisor: {result_content}")
|
||||
break
|
||||
else:
|
||||
print(f"[ {current_agent} ] sending analysis results to supervisor")
|
||||
print(f"[ DEBUG ] Skipping AIMessage: '{result_content[:100]}...'")
|
||||
|
||||
if not found_result:
|
||||
print(f"[ WARNING ] {current_agent} transferred back without providing analysis summary!")
|
||||
print(f"[ WARNING ] This agent may need prompt improvements")
|
||||
else:
|
||||
print(f"[ {current_agent} ] sending analysis results to supervisor")
|
||||
print(f"[ WARNING ] {current_agent} has no message history to analyze")
|
||||
else:
|
||||
# Other tool execution result
|
||||
if len(content) > 200:
|
||||
|
Loading…
x
Reference in New Issue
Block a user