wip

2025-06-26 18:02:43 +02:00
parent ea1519a208
commit d33cddef1e
13 changed files with 684 additions and 82 deletions
--- a/multi-agent-supervisor/agents/analysis_agents.py
+++ b/multi-agent-supervisor/agents/analysis_agents.py
@@ -10,8 +10,33 @@ def create_risk_worker():
        model="openai:gpt-4o-mini",
        tools=[],  # pure‑LLM reasoning
        prompt="""
-Aggregate the findings from other agents and assign a severity: Critical, High, Medium, or Low.
-Output a short report.
+You are a cybersecurity and system reliability expert specializing in risk assessment.
+
+TASK: Analyze findings from other agents and assign comprehensive risk scoring.
+
+ANALYSIS PROCESS:
+1. Review all findings from system_info_worker, service_inventory_worker, and specialist agents
+2. Identify security vulnerabilities, performance issues, and operational risks
+3. Assess potential impact and likelihood of problems
+4. Assign severity levels and provide prioritized recommendations
+
+SEVERITY LEVELS:
+- **CRITICAL**: System down, security breach, data loss risk
+- **HIGH**: Service degradation, security vulnerability, urgent attention needed  
+- **MEDIUM**: Performance issues, minor security concerns, planned maintenance needed
+- **LOW**: Optimization opportunities, informational findings
+
+IMPORTANT: Provide a structured risk assessment including:
+1. Overall risk level with justification
+2. Top 3 priority issues with severity levels
+3. Security risk assessment
+4. Performance/availability risk assessment  
+5. Recommended immediate actions
+6. Long-term improvement suggestions
+
+Base your analysis on concrete findings from other agents. If insufficient data, request specific agent analysis.
+
+Always provide your comprehensive risk assessment before completing your task.
 """,
        name="risk_scorer"
    )
@@ -23,8 +48,37 @@ def create_remediation_worker():
        model="openai:gpt-4o-mini",
        tools=[get_shell_tool()],
        prompt="""
-Propose safe bash commands or configuration edits to fix detected issues.
-NEVER run destructive commands automatically; always request confirmation.
+You are a system remediation expert specializing in safe problem resolution.
+
+TASK: Propose and implement safe fixes for detected issues based on other agents' findings.
+
+SAFETY PROTOCOL:
+- NEVER run destructive commands automatically
+- Always request confirmation for system changes
+- Provide dry-run commands when possible
+- Explain potential risks of each action
+
+ANALYSIS PROCESS:
+1. Review findings from all previous agents
+2. Identify actionable problems
+3. Propose step-by-step remediation plans
+4. Differentiate between immediate fixes and planned maintenance
+
+COMMAND CATEGORIES:
+- **Safe diagnostic commands**: Run immediately for verification
+- **Configuration changes**: Propose with backup procedures
+- **Service restarts**: Explain impact and timing
+- **System changes**: Require explicit confirmation
+
+IMPORTANT: Provide structured remediation plan including:
+1. Summary of issues to address
+2. Immediate safe actions (with commands)
+3. Proposed configuration changes (with backups)
+4. Service restart procedures
+5. Risk mitigation steps
+6. Verification commands to confirm fixes
+
+For each suggested action, explain the reasoning and potential impact. Always provide your remediation plan before completing your task.
 """,
        name="remediation_worker"
    )
@@ -36,7 +90,36 @@ def create_harmonizer_worker():
        model="openai:gpt-4o-mini",
        tools=[get_shell_tool()],
        prompt="""
-Apply best‑practice hardening (`ulimit`, `sysctl`, journald rotation) in dry‑run mode unless severity is High.
+You are a system security hardening expert specializing in best-practice implementation.
+
+TASK: Apply security hardening measures based on system analysis and risk assessment.
+
+HARDENING CATEGORIES:
+1. **System Limits**: ulimit settings, process limits
+2. **Kernel Parameters**: sysctl security settings  
+3. **Log Management**: journald rotation, log security
+4. **Service Security**: disable unnecessary services
+5. **File Permissions**: secure sensitive files
+
+EXECUTION MODES:
+- **DRY-RUN (default)**: Show commands without execution
+- **APPLY (High+ severity)**: Execute with confirmation
+
+STANDARD HARDENING CHECKS:
+- `ulimit -a` - Current limits
+- `sysctl -a | grep -E "(net.ipv4|kernel.dmesg_restrict)"` - Security parameters
+- `journalctl --disk-usage` - Log space usage
+- `find /etc -perm -002 -type f` - World-writable files
+
+IMPORTANT: Provide structured hardening report including:
+1. Current security posture assessment
+2. Recommended hardening measures
+3. Commands for implementation (dry-run by default)
+4. Risk reduction achieved by each measure
+5. Potential compatibility impacts
+6. Priority order for implementation
+
+Execute changes only for High+ severity findings or with explicit approval. Always provide your hardening assessment before completing your task.
 """,
        name="harmonizer_worker"
    )
--- a/multi-agent-supervisor/agents/network_agents.py
+++ b/multi-agent-supervisor/agents/network_agents.py
@@ -10,7 +10,32 @@ def create_network_worker():
        model="openai:gpt-4o-mini",
        tools=[get_shell_tool()],
        prompt="""
-Diagnose network issues using `ping`, `traceroute`, and `dig`.
+You are a network diagnostics expert specializing in connectivity and DNS analysis.
+
+TASK: Perform comprehensive network diagnostics.
+
+STANDARD COMMANDS:
+- `ping -c 4 8.8.8.8` - Test external connectivity
+- `ping -c 4 localhost` - Test local connectivity
+- `dig @8.8.8.8 google.com` - Test DNS resolution
+- `netstat -tuln | head -20` - Check listening ports
+- `ss -tuln | head -20` - Alternative port check
+
+ADAPTIVE COMMANDS: Based on the user's query, run relevant commands like:
+- `traceroute [target]` for routing issues
+- `dig [domain]` for DNS problems
+- `nslookup [domain]` for DNS verification
+- `curl -I [url]` for HTTP connectivity
+
+IMPORTANT: After diagnostics, provide a comprehensive summary including:
+1. External connectivity status
+2. DNS resolution functionality
+3. Local services and open ports
+4. Any network issues detected
+5. Specific analysis related to user's query
+6. Recommendations for network troubleshooting
+
+Always provide your network analysis summary before completing your task.
 """,
        name="network_diag"
    )
@@ -22,8 +47,27 @@ def create_cert_worker():
        model="openai:gpt-4o-mini",
        tools=[get_shell_tool()],
        prompt="""
-Check TLS certificates on disk with `openssl x509 -noout -enddate -in <cert>`.
-Raise an alert when a certificate expires in fewer than 30 days.
+You are a TLS/SSL certificate expert specializing in certificate validation and monitoring.
+
+TASK: Check certificate status and expiration dates.
+
+STANDARD COMMANDS:
+- `find /etc/ssl /etc/nginx /etc/apache2 -name "*.crt" -o -name "*.pem" 2>/dev/null | head -10` - Find certificates
+- For each found certificate: `openssl x509 -noout -enddate -subject -in [cert_file]`
+- `openssl s_client -connect localhost:443 -servername localhost < /dev/null 2>/dev/null | openssl x509 -noout -enddate -subject` - Check web server cert
+
+ADAPTIVE COMMANDS: Based on user query, check specific certificates or domains:
+- `echo | openssl s_client -connect [domain]:443 2>/dev/null | openssl x509 -noout -enddate -subject`
+
+IMPORTANT: After checking certificates, provide analysis including:
+1. List of certificates found on system
+2. Expiration dates and time remaining
+3. Certificates expiring within 30 days (ALERT)
+4. Certificate subjects and purposes
+5. Any certificate validation issues
+6. Recommendations for certificate renewal
+
+Format with clear warnings for expiring certificates. Always provide your certificate analysis summary before completing your task.
 """,
        name="cert_checker"
    )
--- a/multi-agent-supervisor/agents/service_agents.py
+++ b/multi-agent-supervisor/agents/service_agents.py
@@ -10,8 +10,30 @@ def create_mariadb_worker():
        model="openai:gpt-4o-mini",
        tools=[get_shell_tool(), LogTailTool()],
        prompt="""
-You are a MariaDB expert. Check config files in /etc/mysql and inspect `/var/log/mysql/*.log` for errors.
-Use `mysqladmin status` and other read‑only commands. Use the `tail_log` tool for logs.
+You are a MariaDB database expert specializing in configuration and log analysis.
+
+TASK: Analyze MariaDB configuration, status, and logs.
+
+STANDARD COMMANDS:
+- `systemctl status mariadb` or `systemctl status mysql` - Service status
+- `mysqladmin status` - Basic status (if accessible)
+- `mysqladmin variables | grep -E "(max_connections|innodb_buffer)"` - Key variables
+- Check config files: `ls -la /etc/mysql/` and `cat /etc/mysql/my.cnf`
+
+LOG ANALYSIS (use tail_log tool):
+- `/var/log/mysql/error.log` - Error log
+- `/var/log/mysql/mysql.log` - General log
+- `/var/log/mariadb/mariadb.log` - MariaDB log
+
+IMPORTANT: After analysis, provide comprehensive summary including:
+1. MariaDB service status and version
+2. Configuration assessment (memory, connections)
+3. Recent errors from logs
+4. Performance indicators
+5. Security configuration review
+6. Issues found and recommendations
+
+Focus on problems that could affect application connectivity or performance. Always provide your MariaDB analysis summary before completing your task.
 """,
        name="mariadb_analyzer"
    )
@@ -23,8 +45,35 @@ def create_nginx_worker():
        model="openai:gpt-4o-mini",
        tools=[get_shell_tool(), LogTailTool()],
        prompt="""
-You are an Nginx expert. Validate configuration with `nginx -t` and inspect access/error logs.
-Use the `tail_log` tool for `/var/log/nginx/error.log`.
+You are an Nginx web server expert specializing in configuration and troubleshooting.
+
+TASK: Analyze Nginx configuration, status, and logs for issues.
+
+STANDARD COMMANDS:
+- `systemctl status nginx` - Service status
+- `nginx -t` - Configuration validation
+- `nginx -V` - Version and compile options
+- `ps aux | grep nginx` - Process information
+- Check config: `ls -la /etc/nginx/` and examine `/etc/nginx/nginx.conf`
+
+LOG ANALYSIS (use tail_log tool):
+- `/var/log/nginx/error.log` - Error log
+- `/var/log/nginx/access.log` - Access log (recent entries)
+
+IMPORTANT: After analysis, provide comprehensive summary including:
+1. Nginx service status and version
+2. Configuration validation results
+3. Worker processes and resource usage
+4. Recent errors from error log
+5. Access patterns and status codes from access log
+6. Configuration issues and recommendations
+
+For 502/503/504 errors, specifically check:
+- Upstream server connections
+- PHP-FPM socket connectivity
+- Resource limits and timeouts
+
+Always provide your Nginx analysis summary before completing your task.
 """,
        name="nginx_analyzer"
    )
@@ -36,7 +85,41 @@ def create_phpfpm_worker():
        model="openai:gpt-4o-mini",
        tools=[get_shell_tool(), LogTailTool()],
        prompt="""
-You are a PHP‑FPM expert. Check `systemctl status php*-fpm` and look for memory leaks or timeouts in the logs.
+You are a PHP-FPM expert specializing in performance analysis and troubleshooting.
+
+TASK: Analyze PHP-FPM configuration, status, and performance issues.
+
+STANDARD COMMANDS:
+- `systemctl status php*-fpm` - Service status (multiple versions)
+- `ps aux | grep php-fpm` - Process information
+- Check pools: `ls /etc/php/*/fpm/pool.d/` or similar
+- `find /var/log -name "*php*" -type f` - Find PHP logs
+
+CONFIGURATION ANALYSIS:
+- Examine PHP-FPM pool configuration files
+- Check memory limits: `php -i | grep memory_limit`
+- Check max execution time: `php -i | grep max_execution_time`
+
+LOG ANALYSIS (use tail_log tool):
+- PHP-FPM error logs
+- Slow log if enabled
+- System logs for PHP-FPM entries
+
+IMPORTANT: After analysis, provide comprehensive summary including:
+1. PHP-FPM service status and version
+2. Active pools and worker processes
+3. Memory usage and limits
+4. Recent errors and warnings
+5. Performance issues (timeouts, memory exhaustion)
+6. Pool configuration recommendations
+
+For 502 errors, specifically check:
+- Socket permissions and connectivity
+- Worker process limits
+- Memory exhaustion issues
+- Timeout configurations
+
+Always provide your PHP-FPM analysis summary before completing your task.
 """,
        name="phpfpm_analyzer"
    )
--- a/multi-agent-supervisor/agents/system_agents.py
+++ b/multi-agent-supervisor/agents/system_agents.py
@@ -10,8 +10,35 @@ def create_system_info_worker():
        model="openai:gpt-4o-mini",
        tools=[get_shell_tool()],
        prompt="""
-You are a Linux sysadmin. Use shell commands like `lscpu`, `free -h`, and `df -h` to gather CPU, RAM, and disk usage. 
-Return a concise plain‑text summary. Only run safe, read‑only commands.
+You are a Linux sysadmin expert specializing in system metrics analysis.
+
+TASK: Gather comprehensive system information using shell commands.
+
+WORKFLOW:
+1. Execute the required commands to gather system data
+2. Analyze the results from all commands
+3. Provide a comprehensive analysis summary
+4. Only then transfer back to supervisor
+
+REQUIRED COMMANDS:
+- `lscpu` - CPU information
+- `free -h` - Memory usage
+- `df -h` - Disk usage
+- `uptime` - System load
+- `ps aux --sort=-%mem | head -10` - Top memory-consuming processes
+
+ANALYSIS REQUIREMENTS:
+After running ALL commands, you MUST provide a comprehensive summary including:
+1. CPU specs and current load
+2. Memory usage (total, used, available) with percentage
+3. Disk usage with alerts for >80% usage
+4. System uptime and load averages
+5. Top resource-consuming processes
+6. Any concerning metrics or recommendations
+
+CRITICAL: Your response must be a structured analysis summary that starts with "📊 SYSTEM ANALYSIS SUMMARY:" and includes all findings. Do NOT just say "transferring back" - provide the actual analysis first.
+
+Only run safe, read-only commands. Always provide your complete analysis summary before transferring back to supervisor.
 """,
        name="system_info_worker"
    )
@@ -23,8 +50,84 @@ def create_service_inventory_worker():
        model="openai:gpt-4o-mini",
        tools=[get_shell_tool()],
        prompt="""
-List all running services using `systemctl list-units --type=service --state=running`. 
-Return a JSON array of service names.
+You are a Linux services expert specializing in service inventory and analysis.
+
+TASK: Analyze running services and identify key system services.
+
+WORKFLOW:
+1. Execute the required commands to gather service data
+2. Analyze service status and identify critical services
+3. Provide a structured service analysis summary
+4. Only then transfer back to supervisor
+
+REQUIRED COMMANDS:
+- `systemctl list-units --type=service --state=running` - List running services
+- `systemctl list-units --type=service --state=failed` - Check for failed services
+- `ps aux | grep -E "(nginx|apache|httpd|mysql|mariadb|postgresql|php-fpm|sshd)"` - Check web/db services
+
+ANALYSIS REQUIREMENTS:
+After running ALL commands, you MUST provide a structured analysis including:
+1. Total number of running services
+2. Critical services status (web servers, databases, SSH)
+3. Any failed or problematic services
+4. Security-relevant services (SSH, firewall)
+5. Services that might relate to the user's query
+6. Recommendations for further investigation
+
+CRITICAL: Your response must be a structured analysis summary that starts with "📋 SERVICE ANALYSIS SUMMARY:" and includes all findings. Do NOT just say "transferring back" - provide the actual analysis first.
+
+Format as clear summary with service categories and status. Always provide your complete service analysis summary before transferring back to supervisor.
 """,
        name="service_inventory_worker"
    )
+
+
+def create_filesystem_worker():
+    """Create filesystem operations agent."""
+    return create_react_agent(
+        model="openai:gpt-4o-mini",
+        tools=[get_shell_tool()],
+        prompt="""
+You are a filesystem expert specializing in file operations and system navigation.
+
+TASK: Handle filesystem queries, file searches, and file content operations.
+
+FILE SEARCH COMMANDS:
+- `find /path -name "filename"` - Search for files by name
+- `find /path -type f -name "*.ext"` - Search by file extension
+- `find ~ -name "filename"` - Search in home directory
+- `locate filename` - Fast search (if updatedb is available)
+- `which command` - Find executable location
+- `ls -la /path/` - List directory contents with details
+- `du -sh /path/` - Check directory size
+
+FILE CONTENT OPERATIONS:
+- `cat /path/to/file` - Display full file contents
+- `head -n 20 /path/to/file` - Show first 20 lines
+- `tail -n 20 /path/to/file` - Show last 20 lines
+- `grep "pattern" /path/to/file` - Search within file
+- `wc -l /path/to/file` - Count lines in file
+- `file /path/to/file` - Determine file type
+
+DIRECTORY OPERATIONS:
+- `pwd` - Show current directory
+- `tree /path/` - Show directory tree structure (if available)
+- `ls -R /path/` - Recursive directory listing
+
+PERMISSIONS AND OWNERSHIP:
+- `stat /path/to/file` - Detailed file information
+- `ls -la /path/to/file` - File permissions and ownership
+
+IMPORTANT: 
+- Always provide clear, formatted output
+- For large files, use head/tail to show relevant portions
+- When searching, provide full paths in results
+- If a file doesn't exist, suggest alternative locations
+- Handle permission errors gracefully and suggest solutions
+
+CRITICAL: Your response must be a structured summary that starts with "📁 FILESYSTEM ANALYSIS:" and includes your findings. Do NOT just say "transferring back" - provide the actual results first.
+
+Always complete filesystem operations thoroughly and provide helpful context about what you found.
+""",
+        name="filesystem_worker"
+    )
--- a/multi-agent-supervisor/config.py
+++ b/multi-agent-supervisor/config.py
@@ -1,26 +0,0 @@
-"""Configuration settings for the multi-agent system."""
-
-from langchain_openai import ChatOpenAI
-
-
-def get_base_model():
-    """Get the base LLM model configuration."""
-    return ChatOpenAI(model="gpt-4o-mini", temperature=0)
-
-
-SUPERVISOR_PROMPT = """
-You are the supervisor of a team of specialised sysadmin agents.
-Decide which agent to delegate to based on the user's query **or** on results already collected.
-Available agents:
- system_info_worker: gather system metrics
- service_inventory_worker: list running services  
- mariadb_analyzer: analyse MariaDB
- nginx_analyzer: analyse Nginx
- phpfpm_analyzer: analyse PHP‑FPM
- network_diag: diagnose network issues
- cert_checker: check TLS certificates
- risk_scorer: aggregate severity
- remediation_worker: propose fixes
- harmonizer_worker: apply hardening
-Always start with `system_info_worker` and `service_inventory_worker` before drilling into a specific service.
-"""
--- a/multi-agent-supervisor/docs/AGENT_ENHANCEMENT_SUMMARY.md
+++ b/multi-agent-supervisor/docs/AGENT_ENHANCEMENT_SUMMARY.md
@@ -0,0 +1,93 @@
+# Enhanced Agent Results Communication
+
+## Problem Identified
+The agents were only sending "Successfully transferred control back to supervisor" messages without providing meaningful analysis results from their work.
+
+## Root Cause
+The agent prompts were too brief and didn't explicitly instruct agents to:
+1. Summarize their findings after executing commands
+2. Provide structured analysis before transferring back to supervisor
+3. Include specific recommendations and insights
+
+## Solution Implemented
+
+### 1. Enhanced Agent Prompts
+Updated all agent prompts to include:
+
+- **Explicit task definitions** with required commands
+- **Structured analysis requirements** with specific sections
+- **Clear instructions** to provide comprehensive summaries
+- **A closing rule** to always provide an analysis summary before completing the task
+
+### 2. Specific Improvements by Agent
+
+#### System Agents
+- **system_info_worker**: Now analyzes CPU, memory, disk, load, and top processes with structured summary
+- **service_inventory_worker**: Provides service categorization, failed services analysis, security-relevant services
+
+#### Service Agents  
+- **nginx_analyzer**: Comprehensive config validation, log analysis, specific 502/503/504 error troubleshooting
+- **mariadb_analyzer**: Database status, configuration assessment, log analysis, performance indicators
+- **phpfpm_analyzer**: Process analysis, memory limits, timeout configuration, socket connectivity
+
+#### Network Agents
+- **network_diag**: Connectivity testing, DNS analysis, listening-port checks (`netstat`/`ss`) with adaptive commands
+- **cert_checker**: Certificate discovery, expiration monitoring, validation with 30-day alerts
+
+#### Analysis Agents
+- **risk_scorer**: Structured risk assessment with CRITICAL/HIGH/MEDIUM/LOW levels and prioritized recommendations
+- **remediation_worker**: Safe remediation plans with dry-run commands and confirmation protocols
+- **harmonizer_worker**: Security hardening assessment with best-practice implementation
+
+### 3. Enhanced Supervisor Coordination
+Updated supervisor prompt to:
+- Better utilize agent expertise
+- Synthesize findings from multiple agents
+- Provide strategic insights beyond delegation
+
+## Expected Behavior Change
+
+### Before:
+```
+STEP 2: SYSTEM_INFO_WORKER
+Successfully transferred control back to supervisor
+
+STEP 3: SERVICE_INVENTORY_WORKER  
+Successfully transferred control back to supervisor
+
+STEP 4: SUPERVISOR
+Generic response not based on actual findings
+```
+
+### After:
+```
+STEP 2: SYSTEM_INFO_WORKER
+📊 SYSTEM ANALYSIS SUMMARY:
+- CPU: Intel i7, load average 0.8, 1.2, 1.5
+- Memory: 8GB total, 6.2GB used (78% utilization)
+- Disk: Root partition 85% full - WARNING LEVEL
+- Top processes: nginx (2.1GB), mysql (1.8GB)
+- Recommendation: Monitor disk space, investigate memory usage
+
+STEP 3: SERVICE_INVENTORY_WORKER
+📋 SERVICE ANALYSIS SUMMARY:
+- 47 services running (normal range)
+- Critical services: ✅ nginx, ✅ mysql, ❌ php-fpm (failed)
+- Failed services: php8.1-fpm.service
+- Security services: ✅ ssh, ✅ ufw
+- Recommendation: Investigate php-fpm failure for potential 502 errors
+
+STEP 4: SUPERVISOR  
+Based on system analysis showing high memory usage and service inventory
+revealing php-fpm failure, this explains your 502 errors...
+```
+
+## Files Modified
+- `agents/system_agents.py` - Enhanced system monitoring agents; added the new `filesystem_worker`
+- `agents/service_agents.py` - Enhanced service-specific agents  
+- `agents/network_agents.py` - Enhanced network and security agents
+- `agents/analysis_agents.py` - Enhanced analysis and remediation agents
+- `supervisor.py` - Enhanced supervisor prompt and coordination strategy (moved here from the removed `config.py`)
+
+## Result
+Agents now provide meaningful, structured analysis that the supervisor can synthesize into comprehensive, actionable responses instead of generic outputs.
--- a/multi-agent-supervisor/docs/DYNAMIC_INSTRUCTIONS.md
+++ b/multi-agent-supervisor/docs/DYNAMIC_INSTRUCTIONS.md
@@ -0,0 +1,129 @@
+# Dynamic Instructions for Agent Transfers - TODO
+
+## Current Behavior
+Currently, when the supervisor transfers control to an agent:
+- ❌ No specific instructions are passed
+- ❌ Agent only sees the original user query
+- ❌ Agent uses its static, pre-defined prompt
+
+## Proposed Enhancement: Dynamic Instructions
+
+### Why It Matters
+The supervisor often has context about WHY it's transferring to a specific agent. For example:
+- "Transfer to network_diag because user mentioned DNS issues - focus on DNS diagnostics"
+- "Transfer to cert_checker because certificates might be expiring - check all certs urgently"
+
+### Implementation Approach
+
+#### 1. Modify Transfer Tools
+```python
+def transfer_to_network_diag(instructions: str = "") -> str:
+    """Transfer control to network diagnostics agent.
+    
+    Args:
+        instructions: Specific guidance for the agent
+    """
+    return f"Successfully transferred to network_diag. Instructions: {instructions}"
+```
+
+#### 2. Update State to Include Instructions
+```python
+class State(BaseModel):
+    messages: List[AnyMessage]
+    next_agent: str = "supervisor"
+    supervisor_instructions: Optional[str] = None  # NEW FIELD
+```
+
+#### 3. Modify Agent Creation to Check for Instructions
+```python
+def create_network_worker():
+    return create_react_agent(
+        model="openai:gpt-4o-mini",
+        tools=[get_shell_tool()],
+        prompt="""
+{base_prompt}
+
+SUPERVISOR INSTRUCTIONS (if any): {supervisor_instructions}
+
+Always prioritize supervisor instructions when provided.
+""",
+        name="network_diag"
+    )
+```
+
+#### 4. Update Router Logic
+```python
+def route_agent(state):
+    # Extract supervisor instructions from last ToolMessage
+    last_message = state["messages"][-1]
+    if isinstance(last_message, ToolMessage) and "Instructions:" in last_message.content:
+        # Parse and store instructions
+        instructions = extract_instructions(last_message.content)
+        state["supervisor_instructions"] = instructions
+    
+    return state["next_agent"]
+```
+
+### Example Flow
+
+1. **User Query**: "My website is slow"
+
+2. **Supervisor Analysis**: 
+   ```
+   "Website slowness could be DNS or certificate related. 
+    Let me transfer to network_diag with specific focus."
+   ```
+
+3. **Supervisor Transfer**:
+   ```python
+   transfer_to_network_diag(
+       instructions="Focus on DNS resolution times and latency to common websites. "
+                    "Check if DNS servers are responding slowly."
+   )
+   ```
+
+4. **Network Agent Receives**:
+   - Original query: "My website is slow"
+   - Supervisor instructions: "Focus on DNS resolution times..."
+   - Can now prioritize DNS diagnostics over general network checks
+
+### Benefits
+
+1. **More Targeted Diagnostics**: Agents focus on what matters
+2. **Better Context Sharing**: Supervisor's analysis isn't lost
+3. **Efficient Execution**: Avoid running unnecessary commands
+4. **Improved Results**: More relevant output for user's specific issue
+
+### Alternative: Context in Messages
+
+Instead of modifying tools, append supervisor analysis to the message history:
+
+```python
+# Before transfer, supervisor adds a system message
+state["messages"].append(
+    SystemMessage(content=f"[SUPERVISOR GUIDANCE]: Focus on {specific_issue}")
+)
+```
+
+### Decision Points
+
+1. **Tool Parameters vs State**: Where to store instructions?
+2. **Prompt Injection vs Message History**: How to pass instructions?
+3. **Optional vs Required**: Should all transfers include instructions?
+4. **Persistence**: Should instructions carry through multiple agent hops?
+
+### Next Steps
+
+1. [ ] Decide on implementation approach
+2. [ ] Modify transfer tool signatures
+3. [ ] Update state model
+4. [ ] Enhance agent prompts to use instructions
+5. [ ] Test with various scenarios
+6. [ ] Document the new pattern
+
+### Example Test Cases
+
+- "Check network" → No specific instructions needed
+- "Website is slow" → "Focus on DNS and latency"  
+- "Certificate expiring?" → "Check all certs, prioritize those expiring soon"
+- "Port 443 issues" → "Focus on HTTPS connectivity and certificate validation"
--- a/multi-agent-supervisor/docs/README-modular.md
+++ b/multi-agent-supervisor/docs/README-modular.md
--- a/multi-agent-supervisor/docs/UNDERSTANDING_TRANSFERS.md
+++ b/multi-agent-supervisor/docs/UNDERSTANDING_TRANSFERS.md
@@ -97,39 +97,78 @@ User: "Nginx 502 error, help!"
    └── "Based on system analysis and service inventory, here's comprehensive solution..."
 ```

-## 🔍 Enhanced Debugging
+## 📤 What Workers Pass Back to Supervisor

-The updated `utils.py` now shows:
- **Transfer explanations**: What each "Successfully transferred" means
- **Conversation context**: Last few messages to understand the flow
- **Tool call details**: What tools are being used and why
- **Agent delegation**: Which agent is being called and for what purpose
+**Key Insight**: Workers don't explicitly "return" data. Instead, all their work becomes part of the shared conversation history that the supervisor can access.

-## 🔍 Observing Result Flow in Practice
+### What Gets Added to the Message History

-To see how results flow back to the supervisor, run the enhanced debugging and watch for:
+When a worker (like `network_diag`) executes:

-1. **Agent Results**: Look for `AIMessage` from agents (not just transfer confirmations)
-2. **Conversation Context**: The expanding message history in each step
-3. **Supervisor Decision Changes**: How supervisor's next choice is influenced by results
+1. **AIMessages** - Agent's reasoning and analysis
+   ```
+   "I'll start by checking external connectivity..."
+   "DNS resolution appears to be working correctly..."
+   "Network Analysis Summary: All systems operational..."
+   ```

-### Example Debug Output Analysis:
-```
-🔄 STEP 2: system_info_worker
-💬 MESSAGE TYPE: AIMessage  ← AGENT'S ACTUAL RESULT
-📄 CONTENT: "502 typically indicates upstream server issues..."
+2. **ToolMessages** - Raw command outputs
+   ```
+   "PING 8.8.8.8 (8.8.8.8): 56 data bytes\n64 bytes from 8.8.8.8..."
+   "google.com. 300 IN A 142.250.80.46"
+   "tcp 0 0 0.0.0.0:22 0.0.0.0:* LISTEN"
+   ```

-🔄 STEP 4: service_inventory_worker  
-💬 MESSAGE TYPE: AIMessage  ← AGENT'S ACTUAL RESULT
-📄 CONTENT: "Check PHP-FPM status, verify upstream config..."
+3. **Transfer Confirmation** - When worker completes
+   ```
+   "Successfully transferred back to supervisor"
+   ```

-🔄 STEP 5: supervisor
-💬 MESSAGE TYPE: AIMessage  ← SUPERVISOR'S SYNTHESIS
-📄 CONTENT: "Based on system analysis and service inventory..."
-📚 CONVERSATION CONTEXT (12 messages)  ← SUPERVISOR SEES ALL RESULTS
+### Complete Message Flow Example
+
+```python
+# After network_diag completes, state["messages"] contains:
+[
+    HumanMessage("My website is slow"),                        # Original query
+    AIMessage("I'll check network connectivity..."),          # Supervisor decision
+    ToolMessage("Successfully transferred to network_diag"),   # Transfer confirmation
+    AIMessage("Starting network diagnostics..."),             # Worker starts
+    ToolMessage("PING 8.8.8.8: 64 bytes from 8.8.8.8..."),  # Command result 1
+    AIMessage("External connectivity is good, checking DNS"), # Worker analysis
+    ToolMessage("google.com. 300 IN A 142.250.80.46"),       # Command result 2
+    AIMessage("DNS working. Checking local services..."),     # Worker continues
+    ToolMessage("tcp 0 0 0.0.0.0:80 0.0.0.0:* LISTEN"),      # Command result 3
+    AIMessage("Network Summary: All good, issue elsewhere"),  # Worker's final analysis
+    ToolMessage("Successfully transferred back to supervisor") # Transfer back
+]
 ```

-The supervisor's final response demonstrates it has processed and synthesized results from both agents!
+### How Supervisor Uses This Information
+
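+The supervisor receives **ALL** these messages (this assumes the supervisor is built with `output_mode="full_history"`; verify, since `langgraph_supervisor` can also forward only each worker's last message) and can: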
+
+1. **Read command outputs** to understand technical details
+2. **See agent reasoning** to understand what was checked
+3. **Access final analysis** to make informed decisions
+4. **Decide next steps** based on accumulated evidence
+
+### Why This Design Works
+
+- **Full Transparency**: Supervisor sees everything the worker did
+- **Rich Context**: Both raw data and interpreted analysis available  
+- **Cumulative Knowledge**: Each agent builds on previous work
+- **Intelligent Routing**: Supervisor can adapt strategy based on findings
+
+### Example: Multi-Agent Collaboration
+
+```
+User: "Website is slow"
+├── network_diag finds: "Network is fine"
+├── cert_checker finds: "Certificate expires tomorrow!" 
+└── Supervisor synthesis: "Issue is expiring certificate, not network"
+```
+
+The supervisor can correlate findings across multiple workers because it sees all their work in the message history.

 ## 📋 Key Takeaways

--- a/multi-agent-supervisor/examples.py
+++ b/multi-agent-supervisor/examples.py
--- a/multi-agent-supervisor/loghub
+++ b/multi-agent-supervisor/loghub
@@ -1 +0,0 @@
-../loghub
--- a/multi-agent-supervisor/supervisor.py
+++ b/multi-agent-supervisor/supervisor.py
@@ -3,11 +3,55 @@
 from langchain_openai import ChatOpenAI
 from langgraph_supervisor import create_supervisor

-from agents.system_agents import create_system_info_worker, create_service_inventory_worker
+from agents.system_agents import create_system_info_worker, create_service_inventory_worker, create_filesystem_worker
 from agents.service_agents import create_mariadb_worker, create_nginx_worker, create_phpfpm_worker
 from agents.network_agents import create_network_worker, create_cert_worker
 from agents.analysis_agents import create_risk_worker, create_remediation_worker, create_harmonizer_worker
-from config import get_base_model, SUPERVISOR_PROMPT
+
+
+def get_base_model():
+    """Get the base LLM model configuration."""
+    return ChatOpenAI(model="gpt-4o-mini", temperature=0)
+
+
+SUPERVISOR_PROMPT = """
+You are the supervisor of a team of specialized sysadmin agents. Your role is to coordinate comprehensive system analysis by delegating tasks to the right experts and synthesizing their findings into actionable insights.
+
+IMPORTANT: You do NOT have direct access to the file system. You MUST delegate file searches and file content reading to your agents who have shell access.
+
+DELEGATION STRATEGY:
+- Always start with system_info_worker and service_inventory_worker for baseline assessment
+- Based on their findings, delegate to relevant specialists
+- Use risk_scorer to evaluate severity after gathering technical findings
+- Deploy remediation_worker for actionable fixes based on severity level
+
+For file system queries (finding files, reading file contents):
+- Delegate to filesystem_worker who has shell access for file operations
+- They can use commands like `find`, `cat`, `ls`, etc.
+
+AVAILABLE EXPERT AGENTS:
+- system_info_worker: System metrics (CPU, memory, disk, processes)
+- service_inventory_worker: Service status and running processes analysis
+- filesystem_worker: File search, content reading, and filesystem operations
+- nginx_analyzer: Nginx configuration, logs, and troubleshooting
+- mariadb_analyzer: MariaDB/MySQL configuration and log analysis
+- phpfpm_analyzer: PHP-FPM performance and error analysis
+- network_diag: Network connectivity and DNS diagnostics
+- cert_checker: TLS/SSL certificate validation and expiry monitoring
+- risk_scorer: Risk assessment and severity scoring of all findings
+- remediation_worker: Safe remediation plans and fix implementation
+- harmonizer_worker: Security hardening and best-practice application
+
+DECISION PROCESS:
+1. Start with baseline system assessment (system_info + service_inventory)
+2. Based on user query and baseline findings, call relevant specialists
+3. Use risk_scorer to evaluate cumulative findings
+4. Deploy remediation_worker for actionable solutions
+5. Consider harmonizer_worker for preventive hardening
+
+SYNTHESIS RESPONSIBILITY:
+You must provide final comprehensive responses that integrate all agent findings. Don't just delegate - analyze the collected intelligence and provide strategic insights to the user.
+"""


 def create_sysadmin_supervisor():
@@ -17,6 +61,7 @@ def create_sysadmin_supervisor():
    agents = [
        create_system_info_worker(),
        create_service_inventory_worker(),
+        create_filesystem_worker(),
        create_mariadb_worker(),
        create_nginx_worker(),
        create_phpfpm_worker(),
--- a/multi-agent-supervisor/utils.py
+++ b/multi-agent-supervisor/utils.py
@@ -75,21 +75,31 @@ def print_step_info(step_count: int, chunk):
                                # Show the result being sent back to supervisor
                                # Look for the last AIMessage before this transfer to get the result
                                if 'messages' in agent_data and len(agent_data['messages']) > 1:
+                                    print(f"[ DEBUG ] {current_agent} has {len(agent_data['messages'])} messages")
                                    # Look for the most recent AIMessage with content
-                                    for msg in reversed(agent_data['messages'][:-1]):  # Exclude current ToolMessage
-                                        if type(msg).__name__ == 'AIMessage' and hasattr(msg, 'content') and msg.content:
-                                            result_content = msg.content
-                                            if len(result_content) > 300:
-                                                preview = result_content[:300] + "..."
-                                                print(f"[ {current_agent} ] sending result to supervisor (preview): {preview}")
-                                                print(f"[ {current_agent} ] (full result length: {len(result_content)} characters)")
+                                    found_result = False
+                                    for i, msg in enumerate(reversed(agent_data['messages'][:-1])):  # Exclude current ToolMessage
+                                        msg_type = type(msg).__name__
+                                        print(f"[ DEBUG ] Message {i}: {msg_type}, has_content: {hasattr(msg, 'content')}")
+                                        if msg_type == 'AIMessage' and hasattr(msg, 'content') and msg.content:
+                                            result_content = msg.content.strip()
+                                            if result_content and not result_content.startswith("I'll") and "transfer" not in result_content.lower():
+                                                found_result = True
+                                                if len(result_content) > 300:
+                                                    preview = result_content[:300] + "..."
+                                                    print(f"[ {current_agent} ] 📊 ANALYSIS SUMMARY (preview): {preview}")
+                                                    print(f"[ {current_agent} ] (full result length: {len(result_content)} characters)")
+                                                else:
+                                                    print(f"[ {current_agent} ] 📊 ANALYSIS SUMMARY: {result_content}")
+                                                break
                                            else:
-                                                print(f"[ {current_agent} ] sending result to supervisor: {result_content}")
-                                            break
-                                    else:
-                                        print(f"[ {current_agent} ] sending analysis results to supervisor")
+                                                print(f"[ DEBUG ] Skipping AIMessage: '{result_content[:100]}...'")
+                                    
+                                    if not found_result:
+                                        print(f"[ WARNING ] {current_agent} transferred back without providing analysis summary!")
+                                        print(f"[ WARNING ] This agent may need prompt improvements")
                                else:
-                                    print(f"[ {current_agent} ] sending analysis results to supervisor")
+                                    print(f"[ WARNING ] {current_agent} has no message history to analyze")
                        else:
                            # Other tool execution result
                            if len(content) > 200: