This commit is contained in:
Gaetan Hurel 2025-06-26 18:02:43 +02:00
parent ea1519a208
commit d33cddef1e
No known key found for this signature in database
13 changed files with 684 additions and 82 deletions

View File

@ -10,8 +10,33 @@ def create_risk_worker():
model="openai:gpt-4o-mini",
tools=[], # pureLLM reasoning
prompt="""
Aggregate the findings from other agents and assign a severity: Critical, High, Medium, or Low.
Output a short report.
You are a cybersecurity and system reliability expert specializing in risk assessment.
TASK: Analyze findings from other agents and assign comprehensive risk scoring.
ANALYSIS PROCESS:
1. Review all findings from system_info_worker, service_inventory_worker, and specialist agents
2. Identify security vulnerabilities, performance issues, and operational risks
3. Assess potential impact and likelihood of problems
4. Assign severity levels and provide prioritized recommendations
SEVERITY LEVELS:
- **CRITICAL**: System down, security breach, data loss risk
- **HIGH**: Service degradation, security vulnerability, urgent attention needed
- **MEDIUM**: Performance issues, minor security concerns, planned maintenance needed
- **LOW**: Optimization opportunities, informational findings
IMPORTANT: Provide a structured risk assessment including:
1. Overall risk level with justification
2. Top 3 priority issues with severity levels
3. Security risk assessment
4. Performance/availability risk assessment
5. Recommended immediate actions
6. Long-term improvement suggestions
Base your analysis on concrete findings from other agents. If insufficient data, request specific agent analysis.
Always provide your comprehensive risk assessment before completing your task.
""",
name="risk_scorer"
)
@ -23,8 +48,37 @@ def create_remediation_worker():
model="openai:gpt-4o-mini",
tools=[get_shell_tool()],
prompt="""
Propose safe bash commands or configuration edits to fix detected issues.
NEVER run destructive commands automatically; always request confirmation.
You are a system remediation expert specializing in safe problem resolution.
TASK: Propose and implement safe fixes for detected issues based on other agents' findings.
SAFETY PROTOCOL:
- NEVER run destructive commands automatically
- Always request confirmation for system changes
- Provide dry-run commands when possible
- Explain potential risks of each action
ANALYSIS PROCESS:
1. Review findings from all previous agents
2. Identify actionable problems
3. Propose step-by-step remediation plans
4. Differentiate between immediate fixes and planned maintenance
COMMAND CATEGORIES:
- **Safe diagnostic commands**: Run immediately for verification
- **Configuration changes**: Propose with backup procedures
- **Service restarts**: Explain impact and timing
- **System changes**: Require explicit confirmation
IMPORTANT: Provide structured remediation plan including:
1. Summary of issues to address
2. Immediate safe actions (with commands)
3. Proposed configuration changes (with backups)
4. Service restart procedures
5. Risk mitigation steps
6. Verification commands to confirm fixes
For each suggested action, explain the reasoning and potential impact. Always provide your remediation plan before completing your task.
""",
name="remediation_worker"
)
@ -36,7 +90,36 @@ def create_harmonizer_worker():
model="openai:gpt-4o-mini",
tools=[get_shell_tool()],
prompt="""
Apply bestpractice hardening (`ulimit`, `sysctl`, journald rotation) in dryrun mode unless severity is High.
You are a system security hardening expert specializing in best-practice implementation.
TASK: Apply security hardening measures based on system analysis and risk assessment.
HARDENING CATEGORIES:
1. **System Limits**: ulimit settings, process limits
2. **Kernel Parameters**: sysctl security settings
3. **Log Management**: journald rotation, log security
4. **Service Security**: disable unnecessary services
5. **File Permissions**: secure sensitive files
EXECUTION MODES:
- **DRY-RUN (default)**: Show commands without execution
- **APPLY (High+ severity)**: Execute with confirmation
STANDARD HARDENING CHECKS:
- `ulimit -a` - Current limits
- `sysctl -a | grep -E "(net.ipv4|kernel.dmesg_restrict)"` - Security parameters
- `journalctl --disk-usage` - Log space usage
- `find /etc -perm -002 -type f` - World-writable files
IMPORTANT: Provide structured hardening report including:
1. Current security posture assessment
2. Recommended hardening measures
3. Commands for implementation (dry-run by default)
4. Risk reduction achieved by each measure
5. Potential compatibility impacts
6. Priority order for implementation
Execute changes only for High+ severity findings or with explicit approval. Always provide your hardening assessment before completing your task.
""",
name="harmonizer_worker"
)

View File

@ -10,7 +10,32 @@ def create_network_worker():
model="openai:gpt-4o-mini",
tools=[get_shell_tool()],
prompt="""
Diagnose network issues using `ping`, `traceroute`, and `dig`.
You are a network diagnostics expert specializing in connectivity and DNS analysis.
TASK: Perform comprehensive network diagnostics.
STANDARD COMMANDS:
- `ping -c 4 8.8.8.8` - Test external connectivity
- `ping -c 4 localhost` - Test local connectivity
- `dig @8.8.8.8 google.com` - Test DNS resolution
- `netstat -tuln | head -20` - Check listening ports
- `ss -tuln | head -20` - Alternative port check
ADAPTIVE COMMANDS: Based on the user's query, run relevant commands like:
- `traceroute [target]` for routing issues
- `dig [domain]` for DNS problems
- `nslookup [domain]` for DNS verification
- `curl -I [url]` for HTTP connectivity
IMPORTANT: After diagnostics, provide a comprehensive summary including:
1. External connectivity status
2. DNS resolution functionality
3. Local services and open ports
4. Any network issues detected
5. Specific analysis related to user's query
6. Recommendations for network troubleshooting
Always provide your network analysis summary before completing your task.
""",
name="network_diag"
)
@ -22,8 +47,27 @@ def create_cert_worker():
model="openai:gpt-4o-mini",
tools=[get_shell_tool()],
prompt="""
Check TLS certificates on disk with `openssl x509 -noout -enddate -in <cert>`.
Raise an alert when a certificate expires in fewer than 30 days.
You are a TLS/SSL certificate expert specializing in certificate validation and monitoring.
TASK: Check certificate status and expiration dates.
STANDARD COMMANDS:
- `find /etc/ssl /etc/nginx /etc/apache2 -name "*.crt" -o -name "*.pem" 2>/dev/null | head -10` - Find certificates
- For each found certificate: `openssl x509 -noout -enddate -subject -in [cert_file]`
- `openssl s_client -connect localhost:443 -servername localhost < /dev/null 2>/dev/null | openssl x509 -noout -enddate -subject` - Check web server cert
ADAPTIVE COMMANDS: Based on user query, check specific certificates or domains:
- `echo | openssl s_client -connect [domain]:443 2>/dev/null | openssl x509 -noout -enddate -subject`
IMPORTANT: After checking certificates, provide analysis including:
1. List of certificates found on system
2. Expiration dates and time remaining
3. Certificates expiring within 30 days (ALERT)
4. Certificate subjects and purposes
5. Any certificate validation issues
6. Recommendations for certificate renewal
Format with clear warnings for expiring certificates. Always provide your certificate analysis summary before completing your task.
""",
name="cert_checker"
)

View File

@ -10,8 +10,30 @@ def create_mariadb_worker():
model="openai:gpt-4o-mini",
tools=[get_shell_tool(), LogTailTool()],
prompt="""
You are a MariaDB expert. Check config files in /etc/mysql and inspect `/var/log/mysql/*.log` for errors.
Use `mysqladmin status` and other readonly commands. Use the `tail_log` tool for logs.
You are a MariaDB database expert specializing in configuration and log analysis.
TASK: Analyze MariaDB configuration, status, and logs.
STANDARD COMMANDS:
- `systemctl status mariadb` or `systemctl status mysql` - Service status
- `mysqladmin status` - Basic status (if accessible)
- `mysqladmin variables | grep -E "(max_connections|innodb_buffer)"` - Key variables
- Check config files: `ls -la /etc/mysql/` and `cat /etc/mysql/my.cnf`
LOG ANALYSIS (use tail_log tool):
- `/var/log/mysql/error.log` - Error log
- `/var/log/mysql/mysql.log` - General log
- `/var/log/mariadb/mariadb.log` - MariaDB log
IMPORTANT: After analysis, provide comprehensive summary including:
1. MariaDB service status and version
2. Configuration assessment (memory, connections)
3. Recent errors from logs
4. Performance indicators
5. Security configuration review
6. Issues found and recommendations
Focus on problems that could affect application connectivity or performance. Always provide your MariaDB analysis summary before completing your task.
""",
name="mariadb_analyzer"
)
@ -23,8 +45,35 @@ def create_nginx_worker():
model="openai:gpt-4o-mini",
tools=[get_shell_tool(), LogTailTool()],
prompt="""
You are an Nginx expert. Validate configuration with `nginx -t` and inspect access/error logs.
Use the `tail_log` tool for `/var/log/nginx/error.log`.
You are an Nginx web server expert specializing in configuration and troubleshooting.
TASK: Analyze Nginx configuration, status, and logs for issues.
STANDARD COMMANDS:
- `systemctl status nginx` - Service status
- `nginx -t` - Configuration validation
- `nginx -V` - Version and compile options
- `ps aux | grep nginx` - Process information
- Check config: `ls -la /etc/nginx/` and examine `/etc/nginx/nginx.conf`
LOG ANALYSIS (use tail_log tool):
- `/var/log/nginx/error.log` - Error log
- `/var/log/nginx/access.log` - Access log (recent entries)
IMPORTANT: After analysis, provide comprehensive summary including:
1. Nginx service status and version
2. Configuration validation results
3. Worker processes and resource usage
4. Recent errors from error log
5. Access patterns and status codes from access log
6. Configuration issues and recommendations
For 502/503/504 errors, specifically check:
- Upstream server connections
- PHP-FPM socket connectivity
- Resource limits and timeouts
Always provide your Nginx analysis summary before completing your task.
""",
name="nginx_analyzer"
)
@ -36,7 +85,41 @@ def create_phpfpm_worker():
model="openai:gpt-4o-mini",
tools=[get_shell_tool(), LogTailTool()],
prompt="""
You are a PHPFPM expert. Check `systemctl status php*-fpm` and look for memory leaks or timeouts in the logs.
You are a PHP-FPM expert specializing in performance analysis and troubleshooting.
TASK: Analyze PHP-FPM configuration, status, and performance issues.
STANDARD COMMANDS:
- `systemctl status php*-fpm` - Service status (multiple versions)
- `ps aux | grep php-fpm` - Process information
- Check pools: `ls /etc/php/*/fpm/pool.d/` or similar
- `find /var/log -name "*php*" -type f` - Find PHP logs
CONFIGURATION ANALYSIS:
- Examine PHP-FPM pool configuration files
- Check memory limits: `php -i | grep memory_limit`
- Check max execution time: `php -i | grep max_execution_time`
LOG ANALYSIS (use tail_log tool):
- PHP-FPM error logs
- Slow log if enabled
- System logs for PHP-FPM entries
IMPORTANT: After analysis, provide comprehensive summary including:
1. PHP-FPM service status and version
2. Active pools and worker processes
3. Memory usage and limits
4. Recent errors and warnings
5. Performance issues (timeouts, memory exhaustion)
6. Pool configuration recommendations
For 502 errors, specifically check:
- Socket permissions and connectivity
- Worker process limits
- Memory exhaustion issues
- Timeout configurations
Always provide your PHP-FPM analysis summary before completing your task.
""",
name="phpfpm_analyzer"
)

View File

@ -10,8 +10,35 @@ def create_system_info_worker():
model="openai:gpt-4o-mini",
tools=[get_shell_tool()],
prompt="""
You are a Linux sysadmin. Use shell commands like `lscpu`, `free -h`, and `df -h` to gather CPU, RAM, and disk usage.
Return a concise plaintext summary. Only run safe, readonly commands.
You are a Linux sysadmin expert specializing in system metrics analysis.
TASK: Gather comprehensive system information using shell commands.
WORKFLOW:
1. Execute the required commands to gather system data
2. Analyze the results from all commands
3. Provide a comprehensive analysis summary
4. Only then transfer back to supervisor
REQUIRED COMMANDS:
- `lscpu` - CPU information
- `free -h` - Memory usage
- `df -h` - Disk usage
- `uptime` - System load
- `ps aux --sort=-%mem | head -10` - Top memory-consuming processes
ANALYSIS REQUIREMENTS:
After running ALL commands, you MUST provide a comprehensive summary including:
1. CPU specs and current load
2. Memory usage (total, used, available) with percentage
3. Disk usage with alerts for >80% usage
4. System uptime and load averages
5. Top resource-consuming processes
6. Any concerning metrics or recommendations
CRITICAL: Your response must be a structured analysis summary that starts with "📊 SYSTEM ANALYSIS SUMMARY:" and includes all findings. Do NOT just say "transferring back" - provide the actual analysis first.
Only run safe, read-only commands. Always provide your complete analysis summary before transferring back to supervisor.
""",
name="system_info_worker"
)
@ -23,8 +50,84 @@ def create_service_inventory_worker():
model="openai:gpt-4o-mini",
tools=[get_shell_tool()],
prompt="""
List all running services using `systemctl list-units --type=service --state=running`.
Return a JSON array of service names.
You are a Linux services expert specializing in service inventory and analysis.
TASK: Analyze running services and identify key system services.
WORKFLOW:
1. Execute the required commands to gather service data
2. Analyze service status and identify critical services
3. Provide a structured service analysis summary
4. Only then transfer back to supervisor
REQUIRED COMMANDS:
- `systemctl list-units --type=service --state=running` - List running services
- `systemctl list-units --type=service --state=failed` - Check for failed services
- `ps aux | grep -E "(nginx|apache|httpd|mysql|mariadb|postgresql|php-fpm|sshd)"` - Check web/db services
ANALYSIS REQUIREMENTS:
After running ALL commands, you MUST provide a structured analysis including:
1. Total number of running services
2. Critical services status (web servers, databases, SSH)
3. Any failed or problematic services
4. Security-relevant services (SSH, firewall)
5. Services that might relate to the user's query
6. Recommendations for further investigation
CRITICAL: Your response must be a structured analysis summary that starts with "📋 SERVICE ANALYSIS SUMMARY:" and includes all findings. Do NOT just say "transferring back" - provide the actual analysis first.
Format as clear summary with service categories and status. Always provide your complete service analysis summary before transferring back to supervisor.
""",
name="service_inventory_worker"
)
def create_filesystem_worker():
    """Build the ``filesystem_worker`` ReAct agent.

    The agent is configured with the ``openai:gpt-4o-mini`` model and the
    shell tool as its only tool. Its prompt covers file search, file
    content display, directory listing and permission inspection, and
    requires the reply to start with "📁 FILESYSTEM ANALYSIS:".

    Returns:
        The object produced by ``create_react_agent`` for this configuration.
    """
    # The literal is kept flush-left on purpose: any leading whitespace
    # inside the triple-quoted string would become part of the prompt.
    worker_prompt = """
You are a filesystem expert specializing in file operations and system navigation.
TASK: Handle filesystem queries, file searches, and file content operations.
FILE SEARCH COMMANDS:
- `find /path -name "filename"` - Search for files by name
- `find /path -type f -name "*.ext"` - Search by file extension
- `find ~ -name "filename"` - Search in home directory
- `locate filename` - Fast search (if updatedb is available)
- `which command` - Find executable location
- `ls -la /path/` - List directory contents with details
- `du -sh /path/` - Check directory size
FILE CONTENT OPERATIONS:
- `cat /path/to/file` - Display full file contents
- `head -n 20 /path/to/file` - Show first 20 lines
- `tail -n 20 /path/to/file` - Show last 20 lines
- `grep "pattern" /path/to/file` - Search within file
- `wc -l /path/to/file` - Count lines in file
- `file /path/to/file` - Determine file type
DIRECTORY OPERATIONS:
- `pwd` - Show current directory
- `tree /path/` - Show directory tree structure (if available)
- `ls -R /path/` - Recursive directory listing
PERMISSIONS AND OWNERSHIP:
- `stat /path/to/file` - Detailed file information
- `ls -la /path/to/file` - File permissions and ownership
IMPORTANT:
- Always provide clear, formatted output
- For large files, use head/tail to show relevant portions
- When searching, provide full paths in results
- If a file doesn't exist, suggest alternative locations
- Handle permission errors gracefully and suggest solutions
CRITICAL: Your response must be a structured summary that starts with "📁 FILESYSTEM ANALYSIS:" and includes your findings. Do NOT just say "transferring back" - provide the actual results first.
Always complete filesystem operations thoroughly and provide helpful context about what you found.
"""
    return create_react_agent(
        model="openai:gpt-4o-mini",
        tools=[get_shell_tool()],
        prompt=worker_prompt,
        name="filesystem_worker",
    )

View File

@ -1,26 +0,0 @@
"""Configuration settings for the multi-agent system."""
from langchain_openai import ChatOpenAI
def get_base_model():
    """Return the base chat model.

    Returns:
        A ``ChatOpenAI`` instance for ``gpt-4o-mini`` with ``temperature=0``.
    """
    base_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    return base_llm
# Prompt text for the supervisor: names the worker agents it can delegate to
# and tells it to begin with system_info_worker and service_inventory_worker.
SUPERVISOR_PROMPT = """
You are the supervisor of a team of specialised sysadmin agents.
Decide which agent to delegate to based on the user's query **or** on results already collected.
Available agents:
- system_info_worker: gather system metrics
- service_inventory_worker: list running services
- mariadb_analyzer: analyse MariaDB
- nginx_analyzer: analyse Nginx
- phpfpm_analyzer: analyse PHPFPM
- network_diag: diagnose network issues
- cert_checker: check TLS certificates
- risk_scorer: aggregate severity
- remediation_worker: propose fixes
- harmonizer_worker: apply hardening
Always start with `system_info_worker` and `service_inventory_worker` before drilling into a specific service.
"""

View File

@ -0,0 +1,93 @@
# Enhanced Agent Results Communication
## Problem Identified
The agents were only sending "Successfully transferred control back to supervisor" messages without providing meaningful analysis results from their work.
## Root Cause
The agent prompts were too brief and didn't explicitly instruct agents to:
1. Summarize their findings after executing commands
2. Provide structured analysis before transferring back to supervisor
3. Include specific recommendations and insights
## Solution Implemented
### 1. Enhanced Agent Prompts
Updated all agent prompts to include:
- **Explicit task definitions** with required commands
- **Structured analysis requirements** with specific sections
- **Clear instructions** to provide comprehensive summaries
- **A closing requirement** to always provide an analysis summary before completing the task
### 2. Specific Improvements by Agent
#### System Agents
- **system_info_worker**: Now analyzes CPU, memory, disk, load, and top processes with structured summary
- **service_inventory_worker**: Provides service categorization, failed services analysis, security-relevant services
#### Service Agents
- **nginx_analyzer**: Comprehensive config validation, log analysis, specific 502/503/504 error troubleshooting
- **mariadb_analyzer**: Database status, configuration assessment, log analysis, performance indicators
- **phpfpm_analyzer**: Process analysis, memory limits, timeout configuration, socket connectivity
#### Network Agents
- **network_diag**: Connectivity testing, DNS analysis, listening-port checks (`netstat`/`ss`) with adaptive commands
- **cert_checker**: Certificate discovery, expiration monitoring, validation with 30-day alerts
#### Analysis Agents
- **risk_scorer**: Structured risk assessment with CRITICAL/HIGH/MEDIUM/LOW levels and prioritized recommendations
- **remediation_worker**: Safe remediation plans with dry-run commands and confirmation protocols
- **harmonizer_worker**: Security hardening assessment with best-practice implementation
### 3. Enhanced Supervisor Coordination
Updated supervisor prompt to:
- Better utilize agent expertise
- Synthesize findings from multiple agents
- Provide strategic insights beyond delegation
## Expected Behavior Change
### Before:
```
STEP 2: SYSTEM_INFO_WORKER
Successfully transferred control back to supervisor
STEP 3: SERVICE_INVENTORY_WORKER
Successfully transferred control back to supervisor
STEP 4: SUPERVISOR
Generic response not based on actual findings
```
### After:
```
STEP 2: SYSTEM_INFO_WORKER
📊 SYSTEM ANALYSIS SUMMARY:
- CPU: Intel i7, load average 0.8, 1.2, 1.5
- Memory: 8GB total, 6.2GB used (78% utilization)
- Disk: Root partition 85% full - WARNING LEVEL
- Top processes: nginx (2.1GB), mysql (1.8GB)
- Recommendation: Monitor disk space, investigate memory usage
STEP 3: SERVICE_INVENTORY_WORKER
📋 SERVICE ANALYSIS SUMMARY:
- 47 services running (normal range)
- Critical services: ✅ nginx, ✅ mysql, ❌ php-fpm (failed)
- Failed services: php8.1-fpm.service
- Security services: ✅ ssh, ✅ ufw
- Recommendation: Investigate php-fpm failure for potential 502 errors
STEP 4: SUPERVISOR
Based on system analysis showing high memory usage and service inventory
revealing php-fpm failure, this explains your 502 errors...
```
## Files Modified
- `agents/system_agents.py` - Enhanced system monitoring agents
- `agents/service_agents.py` - Enhanced service-specific agents
- `agents/network_agents.py` - Enhanced network and security agents
- `agents/analysis_agents.py` - Enhanced analysis and remediation agents
- `config.py` - Removed; `get_base_model()` and the enhanced supervisor prompt and coordination strategy are now defined in the supervisor module
## Result
Agents now provide meaningful, structured analysis that the supervisor can synthesize into comprehensive, actionable responses instead of generic outputs.

View File

@ -0,0 +1,129 @@
# Dynamic Instructions for Agent Transfers - TODO
## Current Behavior
Currently, when the supervisor transfers control to an agent:
- ❌ No specific instructions are passed
- ❌ Agent only sees the original user query
- ❌ Agent uses its static, pre-defined prompt
## Proposed Enhancement: Dynamic Instructions
### Why It Matters
The supervisor often has context about WHY it's transferring to a specific agent. For example:
- "Transfer to network_diag because user mentioned DNS issues - focus on DNS diagnostics"
- "Transfer to cert_checker because certificates might be expiring - check all certs urgently"
### Implementation Approach
#### 1. Modify Transfer Tools
```python
def transfer_to_network_diag(instructions: str = "") -> str:
"""Transfer control to network diagnostics agent.
Args:
instructions: Specific guidance for the agent
"""
return f"Successfully transferred to network_diag. Instructions: {instructions}"
```
#### 2. Update State to Include Instructions
```python
class State(BaseModel):
messages: List[AnyMessage]
next_agent: str = "supervisor"
supervisor_instructions: Optional[str] = None # NEW FIELD
```
#### 3. Modify Agent Creation to Check for Instructions
```python
def create_network_worker():
return create_react_agent(
model="openai:gpt-4o-mini",
tools=[get_shell_tool()],
prompt="""
{base_prompt}
SUPERVISOR INSTRUCTIONS (if any): {supervisor_instructions}
Always prioritize supervisor instructions when provided.
""",
name="network_diag"
)
```
#### 4. Update Router Logic
```python
def route_agent(state):
# Extract supervisor instructions from last ToolMessage
last_message = state["messages"][-1]
if isinstance(last_message, ToolMessage) and "Instructions:" in last_message.content:
# Parse and store instructions
instructions = extract_instructions(last_message.content)
state["supervisor_instructions"] = instructions
return state["next_agent"]
```
### Example Flow
1. **User Query**: "My website is slow"
2. **Supervisor Analysis**:
```
"Website slowness could be DNS or certificate related.
Let me transfer to network_diag with specific focus."
```
3. **Supervisor Transfer**:
```python
transfer_to_network_diag(
instructions="Focus on DNS resolution times and latency to common websites. "
"Check if DNS servers are responding slowly."
)
```
4. **Network Agent Receives**:
- Original query: "My website is slow"
- Supervisor instructions: "Focus on DNS resolution times..."
- Can now prioritize DNS diagnostics over general network checks
### Benefits
1. **More Targeted Diagnostics**: Agents focus on what matters
2. **Better Context Sharing**: Supervisor's analysis isn't lost
3. **Efficient Execution**: Avoid running unnecessary commands
4. **Improved Results**: More relevant output for user's specific issue
### Alternative: Context in Messages
Instead of modifying tools, append supervisor analysis to the message history:
```python
# Before transfer, supervisor adds a system message
state["messages"].append(
SystemMessage(content=f"[SUPERVISOR GUIDANCE]: Focus on {specific_issue}")
)
```
### Decision Points
1. **Tool Parameters vs State**: Where to store instructions?
2. **Prompt Injection vs Message History**: How to pass instructions?
3. **Optional vs Required**: Should all transfers include instructions?
4. **Persistence**: Should instructions carry through multiple agent hops?
### Next Steps
1. [ ] Decide on implementation approach
2. [ ] Modify transfer tool signatures
3. [ ] Update state model
4. [ ] Enhance agent prompts to use instructions
5. [ ] Test with various scenarios
6. [ ] Document the new pattern
### Example Test Cases
- "Check network" → No specific instructions needed
- "Website is slow" → "Focus on DNS and latency"
- "Certificate expiring?" → "Check all certs, prioritize those expiring soon"
- "Port 443 issues" → "Focus on HTTPS connectivity and certificate validation"

View File

@ -97,39 +97,78 @@ User: "Nginx 502 error, help!"
└── "Based on system analysis and service inventory, here's comprehensive solution..."
```
## 🔍 Enhanced Debugging
## 📤 What Workers Pass Back to Supervisor
The updated `utils.py` now shows:
- **Transfer explanations**: What each "Successfully transferred" means
- **Conversation context**: Last few messages to understand the flow
- **Tool call details**: What tools are being used and why
- **Agent delegation**: Which agent is being called and for what purpose
**Key Insight**: Workers don't explicitly "return" data. Instead, all their work becomes part of the shared conversation history that the supervisor can access.
## 🔍 Observing Result Flow in Practice
### What Gets Added to the Message History
To see how results flow back to the supervisor, run the enhanced debugging and watch for:
When a worker (like `network_diag`) executes:
1. **Agent Results**: Look for `AIMessage` from agents (not just transfer confirmations)
2. **Conversation Context**: The expanding message history in each step
3. **Supervisor Decision Changes**: How supervisor's next choice is influenced by results
1. **AIMessages** - Agent's reasoning and analysis
```
"I'll start by checking external connectivity..."
"DNS resolution appears to be working correctly..."
"Network Analysis Summary: All systems operational..."
```
### Example Debug Output Analysis:
```
🔄 STEP 2: system_info_worker
💬 MESSAGE TYPE: AIMessage ← AGENT'S ACTUAL RESULT
📄 CONTENT: "502 typically indicates upstream server issues..."
2. **ToolMessages** - Raw command outputs
```
"PING 8.8.8.8 (8.8.8.8): 56 data bytes\n64 bytes from 8.8.8.8..."
"google.com. 300 IN A 142.250.80.46"
"tcp 0 0 0.0.0.0:22 0.0.0.0:* LISTEN"
```
🔄 STEP 4: service_inventory_worker
💬 MESSAGE TYPE: AIMessage ← AGENT'S ACTUAL RESULT
📄 CONTENT: "Check PHP-FPM status, verify upstream config..."
3. **Transfer Confirmation** - When worker completes
```
"Successfully transferred back to supervisor"
```
🔄 STEP 5: supervisor
💬 MESSAGE TYPE: AIMessage ← SUPERVISOR'S SYNTHESIS
📄 CONTENT: "Based on system analysis and service inventory..."
📚 CONVERSATION CONTEXT (12 messages) ← SUPERVISOR SEES ALL RESULTS
### Complete Message Flow Example
```python
# After network_diag completes, state["messages"] contains:
[
HumanMessage("My website is slow"), # Original query
AIMessage("I'll check network connectivity..."), # Supervisor decision
ToolMessage("Successfully transferred to network_diag"), # Transfer confirmation
AIMessage("Starting network diagnostics..."), # Worker starts
ToolMessage("PING 8.8.8.8: 64 bytes from 8.8.8.8..."), # Command result 1
AIMessage("External connectivity is good, checking DNS"), # Worker analysis
ToolMessage("google.com. 300 IN A 142.250.80.46"), # Command result 2
AIMessage("DNS working. Checking local services..."), # Worker continues
ToolMessage("tcp 0 0 0.0.0.0:80 0.0.0.0:* LISTEN"), # Command result 3
AIMessage("Network Summary: All good, issue elsewhere"), # Worker's final analysis
ToolMessage("Successfully transferred back to supervisor") # Transfer back
]
```
The supervisor's final response demonstrates it has processed and synthesized results from both agents!
### How Supervisor Uses This Information
The supervisor receives **ALL** these messages and can:
1. **Read command outputs** to understand technical details
2. **See agent reasoning** to understand what was checked
3. **Access final analysis** to make informed decisions
4. **Decide next steps** based on accumulated evidence
### Why This Design Works
- **Full Transparency**: Supervisor sees everything the worker did
- **Rich Context**: Both raw data and interpreted analysis available
- **Cumulative Knowledge**: Each agent builds on previous work
- **Intelligent Routing**: Supervisor can adapt strategy based on findings
### Example: Multi-Agent Collaboration
```
User: "Website is slow"
├── network_diag finds: "Network is fine"
├── cert_checker finds: "Certificate expires tomorrow!"
└── Supervisor synthesis: "Issue is expiring certificate, not network"
```
The supervisor can correlate findings across multiple workers because it sees all their work in the message history.
## 📋 Key Takeaways

View File

@ -1 +0,0 @@
../loghub

View File

@ -3,11 +3,55 @@
from langchain_openai import ChatOpenAI
from langgraph_supervisor import create_supervisor
from agents.system_agents import create_system_info_worker, create_service_inventory_worker
from agents.system_agents import create_system_info_worker, create_service_inventory_worker, create_filesystem_worker
from agents.service_agents import create_mariadb_worker, create_nginx_worker, create_phpfpm_worker
from agents.network_agents import create_network_worker, create_cert_worker
from agents.analysis_agents import create_risk_worker, create_remediation_worker, create_harmonizer_worker
from config import get_base_model, SUPERVISOR_PROMPT
def get_base_model():
    """Return the base chat model.

    Returns:
        A ``ChatOpenAI`` instance for ``gpt-4o-mini`` with ``temperature=0``.
    """
    base_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    return base_llm
# Prompt text for the supervisor agent. It states the coordinating role, the
# delegation order (baseline workers first, then specialists, risk scoring and
# remediation), the roster of worker agents by name, and the requirement that
# the supervisor synthesize the workers' findings in its final answer.
# The agent names listed here match the `name=` values the workers are
# created with elsewhere in this change.
SUPERVISOR_PROMPT = """
You are the supervisor of a team of specialized sysadmin agents. Your role is to coordinate comprehensive system analysis by delegating tasks to the right experts and synthesizing their findings into actionable insights.
IMPORTANT: You do NOT have direct access to the file system. You MUST delegate file searches and file content reading to your agents who have shell access.
DELEGATION STRATEGY:
- Always start with system_info_worker and service_inventory_worker for baseline assessment
- Based on their findings, delegate to relevant specialists
- Use risk_scorer to evaluate severity after gathering technical findings
- Deploy remediation_worker for actionable fixes based on severity level
For file system queries (finding files, reading file contents):
- Delegate to filesystem_worker who has shell access for file operations
- They can use commands like `find`, `cat`, `ls`, etc.
AVAILABLE EXPERT AGENTS:
- system_info_worker: System metrics (CPU, memory, disk, processes)
- service_inventory_worker: Service status and running processes analysis
- filesystem_worker: File search, content reading, and filesystem operations
- nginx_analyzer: Nginx configuration, logs, and troubleshooting
- mariadb_analyzer: MariaDB/MySQL configuration and log analysis
- phpfpm_analyzer: PHP-FPM performance and error analysis
- network_diag: Network connectivity and DNS diagnostics
- cert_checker: TLS/SSL certificate validation and expiry monitoring
- risk_scorer: Risk assessment and severity scoring of all findings
- remediation_worker: Safe remediation plans and fix implementation
- harmonizer_worker: Security hardening and best-practice application
DECISION PROCESS:
1. Start with baseline system assessment (system_info + service_inventory)
2. Based on user query and baseline findings, call relevant specialists
3. Use risk_scorer to evaluate cumulative findings
4. Deploy remediation_worker for actionable solutions
5. Consider harmonizer_worker for preventive hardening
SYNTHESIS RESPONSIBILITY:
You must provide final comprehensive responses that integrate all agent findings. Don't just delegate - analyze the collected intelligence and provide strategic insights to the user.
"""
def create_sysadmin_supervisor():
@ -17,6 +61,7 @@ def create_sysadmin_supervisor():
agents = [
create_system_info_worker(),
create_service_inventory_worker(),
create_filesystem_worker(),
create_mariadb_worker(),
create_nginx_worker(),
create_phpfpm_worker(),

View File

@ -75,21 +75,31 @@ def print_step_info(step_count: int, chunk):
# Show the result being sent back to supervisor
# Look for the last AIMessage before this transfer to get the result
if 'messages' in agent_data and len(agent_data['messages']) > 1:
print(f"[ DEBUG ] {current_agent} has {len(agent_data['messages'])} messages")
# Look for the most recent AIMessage with content
for msg in reversed(agent_data['messages'][:-1]): # Exclude current ToolMessage
if type(msg).__name__ == 'AIMessage' and hasattr(msg, 'content') and msg.content:
result_content = msg.content
if len(result_content) > 300:
preview = result_content[:300] + "..."
print(f"[ {current_agent} ] sending result to supervisor (preview): {preview}")
print(f"[ {current_agent} ] (full result length: {len(result_content)} characters)")
found_result = False
for i, msg in enumerate(reversed(agent_data['messages'][:-1])): # Exclude current ToolMessage
msg_type = type(msg).__name__
print(f"[ DEBUG ] Message {i}: {msg_type}, has_content: {hasattr(msg, 'content')}")
if msg_type == 'AIMessage' and hasattr(msg, 'content') and msg.content:
result_content = msg.content.strip()
if result_content and not result_content.startswith("I'll") and "transfer" not in result_content.lower():
found_result = True
if len(result_content) > 300:
preview = result_content[:300] + "..."
print(f"[ {current_agent} ] 📊 ANALYSIS SUMMARY (preview): {preview}")
print(f"[ {current_agent} ] (full result length: {len(result_content)} characters)")
else:
print(f"[ {current_agent} ] 📊 ANALYSIS SUMMARY: {result_content}")
break
else:
print(f"[ {current_agent} ] sending result to supervisor: {result_content}")
break
else:
print(f"[ {current_agent} ] sending analysis results to supervisor")
print(f"[ DEBUG ] Skipping AIMessage: '{result_content[:100]}...'")
if not found_result:
print(f"[ WARNING ] {current_agent} transferred back without providing analysis summary!")
print(f"[ WARNING ] This agent may need prompt improvements")
else:
print(f"[ {current_agent} ] sending analysis results to supervisor")
print(f"[ WARNING ] {current_agent} has no message history to analyze")
else:
# Other tool execution result
if len(content) > 200: