Simplify the multi-agent approach

This commit is contained in:
Gaetan Hurel
2025-06-27 11:40:30 +02:00
parent 1a8d63c5f0
commit b26e50ed35
15 changed files with 365 additions and 862 deletions

View File

@@ -1,33 +1,11 @@
"""Agent definitions for the multi-agent sysadmin system."""
from .system_agents import (
create_system_info_worker,
create_service_inventory_worker,
)
from .service_agents import (
create_mariadb_worker,
create_nginx_worker,
create_phpfpm_worker,
)
from .network_agents import (
create_network_worker,
create_cert_worker,
)
from .analysis_agents import (
create_risk_worker,
create_remediation_worker,
create_harmonizer_worker,
)
from .os_detector import create_os_detector_worker
from .logs_analyzer import create_logs_analyzer_worker
from .performance_analyzer import create_performance_analyzer_worker
# Public names of this package: every entry is one of the worker factory
# functions imported above from the individual agent modules.
__all__ = [
"create_system_info_worker",
"create_service_inventory_worker",
"create_mariadb_worker",
"create_nginx_worker",
"create_phpfpm_worker",
"create_network_worker",
"create_cert_worker",
"create_risk_worker",
"create_remediation_worker",
"create_harmonizer_worker",
"create_os_detector_worker",
"create_logs_analyzer_worker",
"create_performance_analyzer_worker"
]

View File

@@ -1,125 +0,0 @@
"""Analysis and remediation agents."""
from langgraph.prebuilt import create_react_agent
from custom_tools import get_shell_tool
def create_risk_worker():
    """Build the risk-assessment ReAct agent.

    The agent is created with an empty tool list, so it can only reason over
    the text it is given. It uses the ``openai:gpt-4o-mini`` model string and
    is named ``risk_scorer``.

    Returns:
        The agent object produced by ``create_react_agent``.
    """
    # Prompt text is kept at column 0 so the string content is unchanged.
    risk_prompt = """
You are a cybersecurity and system reliability expert specializing in risk assessment.
TASK: Analyze findings from other agents and assign comprehensive risk scoring.
ANALYSIS PROCESS:
1. Review all findings from system_info_worker, service_inventory_worker, and specialist agents
2. Identify security vulnerabilities, performance issues, and operational risks
3. Assess potential impact and likelihood of problems
4. Assign severity levels and provide prioritized recommendations
SEVERITY LEVELS:
- **CRITICAL**: System down, security breach, data loss risk
- **HIGH**: Service degradation, security vulnerability, urgent attention needed
- **MEDIUM**: Performance issues, minor security concerns, planned maintenance needed
- **LOW**: Optimization opportunities, informational findings
IMPORTANT: Provide a structured risk assessment including:
1. Overall risk level with justification
2. Top 3 priority issues with severity levels
3. Security risk assessment
4. Performance/availability risk assessment
5. Recommended immediate actions
6. Long-term improvement suggestions
Base your analysis on concrete findings from other agents. If insufficient data, request specific agent analysis.
Always provide your comprehensive risk assessment before completing your task.
"""
    # No tools on purpose: this agent performs pure LLM reasoning.
    no_tools = []
    return create_react_agent(
        name="risk_scorer",
        model="openai:gpt-4o-mini",
        tools=no_tools,
        prompt=risk_prompt,
    )
def create_remediation_worker():
    """Build the remediation ReAct agent.

    The agent receives the shell tool from ``get_shell_tool()`` and a prompt
    that asks it to propose fixes under a safety protocol (confirmation for
    changes, dry-run commands where possible). It is named
    ``remediation_worker``.

    Returns:
        The agent object produced by ``create_react_agent``.
    """
    # Built first so the tool is created before the agent is assembled.
    shell_tools = [get_shell_tool()]
    remediation_prompt = """
You are a system remediation expert specializing in safe problem resolution.
TASK: Propose and implement safe fixes for detected issues based on other agents' findings.
SAFETY PROTOCOL:
- NEVER run destructive commands automatically
- Always request confirmation for system changes
- Provide dry-run commands when possible
- Explain potential risks of each action
ANALYSIS PROCESS:
1. Review findings from all previous agents
2. Identify actionable problems
3. Propose step-by-step remediation plans
4. Differentiate between immediate fixes and planned maintenance
COMMAND CATEGORIES:
- **Safe diagnostic commands**: Run immediately for verification
- **Configuration changes**: Propose with backup procedures
- **Service restarts**: Explain impact and timing
- **System changes**: Require explicit confirmation
IMPORTANT: Provide structured remediation plan including:
1. Summary of issues to address
2. Immediate safe actions (with commands)
3. Proposed configuration changes (with backups)
4. Service restart procedures
5. Risk mitigation steps
6. Verification commands to confirm fixes
For each suggested action, explain the reasoning and potential impact. Always provide your remediation plan before completing your task.
"""
    return create_react_agent(
        name="remediation_worker",
        model="openai:gpt-4o-mini",
        tools=shell_tools,
        prompt=remediation_prompt,
    )
def create_harmonizer_worker():
    """Build the system-hardening ReAct agent.

    Despite the "harmonizer" in its name, the prompt configures a security
    hardening expert. The agent receives the shell tool from
    ``get_shell_tool()`` and is named ``harmonizer_worker``.

    Returns:
        The agent object produced by ``create_react_agent``.
    """
    shell_tools = [get_shell_tool()]
    hardening_prompt = """
You are a system security hardening expert specializing in best-practice implementation.
TASK: Apply security hardening measures based on system analysis and risk assessment.
HARDENING CATEGORIES:
1. **System Limits**: ulimit settings, process limits
2. **Kernel Parameters**: sysctl security settings
3. **Log Management**: journald rotation, log security
4. **Service Security**: disable unnecessary services
5. **File Permissions**: secure sensitive files
EXECUTION MODES:
- **DRY-RUN (default)**: Show commands without execution
- **APPLY (High+ severity)**: Execute with confirmation
STANDARD HARDENING CHECKS:
- `ulimit -a` - Current limits
- `sysctl -a | grep -E "(net.ipv4|kernel.dmesg_restrict)"` - Security parameters
- `journalctl --disk-usage` - Log space usage
- `find /etc -perm -002 -type f` - World-writable files
IMPORTANT: Provide structured hardening report including:
1. Current security posture assessment
2. Recommended hardening measures
3. Commands for implementation (dry-run by default)
4. Risk reduction achieved by each measure
5. Potential compatibility impacts
6. Priority order for implementation
Execute changes only for High+ severity findings or with explicit approval. Always provide your hardening assessment before completing your task.
"""
    return create_react_agent(
        name="harmonizer_worker",
        model="openai:gpt-4o-mini",
        tools=shell_tools,
        prompt=hardening_prompt,
    )

View File

@@ -0,0 +1,41 @@
"""Logs Analysis Agent for investigating and diagnosing issues through log files."""
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from langchain_community.tools.shell.tool import ShellTool
from custom_tools import print_poem
def create_logs_analyzer_worker():
    """Build the log-analysis ReAct agent.

    The agent is given a ``ShellTool`` instance and the ``print_poem`` tool,
    runs on ``ChatOpenAI`` with model ``gpt-4o-mini`` at temperature 0, and is
    named ``logs_analyzer``.

    Returns:
        The agent object produced by ``create_react_agent``.
    """
    # Tools are created before the model, matching the original ordering.
    agent_tools = [ShellTool(), print_poem]
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    logs_prompt = """You are an expert Logs Analysis Agent specialized in investigating and diagnosing issues through log files.
Your capabilities:
1. **Log Discovery**: Find relevant log files in standard locations (/var/log, journalctl, application-specific)
2. **Pattern Recognition**: Identify errors, warnings, anomalies, and trends in logs
3. **Timeline Analysis**: Correlate events across different log sources
4. **Root Cause Analysis**: Trace issues back to their origin through log evidence
Analysis techniques:
- Use `tail`, `grep`, `awk`, and `sed` for efficient log parsing
- Leverage `journalctl` for systemd-based systems
- Check application-specific logs (nginx, apache, mysql, etc.)
- Look for patterns: timestamps, error codes, stack traces
- Identify cascading failures and their sequence
Best practices:
- Start with recent logs (`tail -n 100` or `journalctl -n 100`)
- Use time-based filtering to focus on relevant periods
- Search for keywords: error, fail, critical, warning, denied
- Check multiple log sources for a complete picture
- Summarize findings clearly with timestamps and context
Remember: Complex debugging sessions can be stressful. Use the poem tool when you need a morale boost!"""
    return create_react_agent(
        name="logs_analyzer",
        model=llm,
        tools=agent_tools,
        prompt=logs_prompt,
    )

View File

@@ -1,73 +0,0 @@
"""Network and security monitoring agents."""
from langgraph.prebuilt import create_react_agent
from custom_tools import get_shell_tool
def create_network_worker():
    """Build the network-diagnostics ReAct agent.

    The agent receives the shell tool from ``get_shell_tool()`` and a prompt
    listing connectivity, DNS and port-inspection commands. It is named
    ``network_diag``.

    Returns:
        The agent object produced by ``create_react_agent``.
    """
    shell_tools = [get_shell_tool()]
    network_prompt = """
You are a network diagnostics expert specializing in connectivity and DNS analysis.
TASK: Perform comprehensive network diagnostics.
STANDARD COMMANDS:
- `ping -c 4 8.8.8.8` - Test external connectivity
- `ping -c 4 localhost` - Test local connectivity
- `dig @8.8.8.8 google.com` - Test DNS resolution
- `netstat -tuln | head -20` - Check listening ports
- `ss -tuln | head -20` - Alternative port check
ADAPTIVE COMMANDS: Based on the user's query, run relevant commands like:
- `traceroute [target]` for routing issues
- `dig [domain]` for DNS problems
- `nslookup [domain]` for DNS verification
- `curl -I [url]` for HTTP connectivity
IMPORTANT: After diagnostics, provide a comprehensive summary including:
1. External connectivity status
2. DNS resolution functionality
3. Local services and open ports
4. Any network issues detected
5. Specific analysis related to user's query
6. Recommendations for network troubleshooting
Always provide your network analysis summary before completing your task.
"""
    return create_react_agent(
        name="network_diag",
        model="openai:gpt-4o-mini",
        tools=shell_tools,
        prompt=network_prompt,
    )
def create_cert_worker():
    """Build the TLS/SSL certificate-checking ReAct agent.

    The agent receives the shell tool from ``get_shell_tool()`` and a prompt
    describing ``find``/``openssl`` commands for locating certificates and
    reading their expiry dates. It is named ``cert_checker``.

    Returns:
        The agent object produced by ``create_react_agent``.
    """
    shell_tools = [get_shell_tool()]
    cert_prompt = """
You are a TLS/SSL certificate expert specializing in certificate validation and monitoring.
TASK: Check certificate status and expiration dates.
STANDARD COMMANDS:
- `find /etc/ssl /etc/nginx /etc/apache2 -name "*.crt" -o -name "*.pem" 2>/dev/null | head -10` - Find certificates
- For each found certificate: `openssl x509 -noout -enddate -subject -in [cert_file]`
- `openssl s_client -connect localhost:443 -servername localhost < /dev/null 2>/dev/null | openssl x509 -noout -enddate -subject` - Check web server cert
ADAPTIVE COMMANDS: Based on user query, check specific certificates or domains:
- `echo | openssl s_client -connect [domain]:443 2>/dev/null | openssl x509 -noout -enddate -subject`
IMPORTANT: After checking certificates, provide analysis including:
1. List of certificates found on system
2. Expiration dates and time remaining
3. Certificates expiring within 30 days (ALERT)
4. Certificate subjects and purposes
5. Any certificate validation issues
6. Recommendations for certificate renewal
Format with clear warnings for expiring certificates. Always provide your certificate analysis summary before completing your task.
"""
    return create_react_agent(
        name="cert_checker",
        model="openai:gpt-4o-mini",
        tools=shell_tools,
        prompt=cert_prompt,
    )

View File

@@ -0,0 +1,39 @@
"""OS Detection Agent for system identification and analysis."""
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from langchain_community.tools.shell.tool import ShellTool
from custom_tools import print_poem
def create_os_detector_worker():
    """Build the OS-detection ReAct agent.

    The agent is given a ``ShellTool`` instance and the ``print_poem`` tool,
    runs on ``ChatOpenAI`` with model ``gpt-4o-mini`` at temperature 0, and is
    named ``os_detector``. Its prompt instructs it to use read-only commands.

    Returns:
        The agent object produced by ``create_react_agent``.
    """
    # Tools are created before the model, matching the original ordering.
    agent_tools = [ShellTool(), print_poem]
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    detector_prompt = """You are an expert OS Detection Agent specialized in identifying and analyzing operating systems.
Your capabilities:
1. **System Identification**: Detect OS type, version, kernel, and architecture
2. **Environment Analysis**: Identify running services, installed packages, and system configuration
3. **Hardware Detection**: Gather CPU, memory, disk, and network interface information
4. **Security Assessment**: Check for security tools, firewall status, and SELinux/AppArmor status
Best practices:
- Start with basic commands like `uname -a`, `cat /etc/os-release`, `lsb_release -a`
- Use `systemctl` or `service` commands based on the init system
- Check for containerization (Docker, Kubernetes, LXC)
- Identify virtualization platforms if applicable
- Be thorough but efficient in your detection
Safety guidelines:
- Only run read-only commands for detection
- Never modify system configurations
- Avoid commands that could impact performance
Remember: You can also use the poem tool to boost morale when the debugging gets tough!"""
    return create_react_agent(
        name="os_detector",
        model=llm,
        tools=agent_tools,
        prompt=detector_prompt,
    )

View File

@@ -0,0 +1,47 @@
"""Performance Analysis Agent for monitoring and optimizing system performance."""
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from langchain_community.tools.shell.tool import ShellTool
from custom_tools import print_poem
def create_performance_analyzer_worker():
    """Build the performance-analysis ReAct agent.

    The agent is given a ``ShellTool`` instance and the ``print_poem`` tool,
    runs on ``ChatOpenAI`` with model ``gpt-4o-mini`` at temperature 0, and is
    named ``performance_analyzer``.

    Returns:
        The agent object produced by ``create_react_agent``.
    """
    # Tools are created before the model, matching the original ordering.
    agent_tools = [ShellTool(), print_poem]
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    performance_prompt = """You are an expert Performance Analysis Agent specialized in monitoring and optimizing system performance.
Your capabilities:
1. **Resource Monitoring**: CPU, memory, disk I/O, network throughput analysis
2. **Process Analysis**: Identify resource-hungry processes and bottlenecks
3. **Performance Metrics**: Load averages, response times, throughput measurements
4. **Optimization Recommendations**: Suggest tuning parameters and configuration changes
Analysis tools:
- System monitoring: `top`, `htop`, `vmstat`, `iostat`, `sar`
- Process inspection: `ps`, `pgrep`, `lsof`, `strace`
- Network analysis: `netstat`, `ss`, `iftop`, `tcpdump`
- Disk performance: `iotop`, `df`, `du`, `hdparm`
- Memory analysis: `free`, `pmap`, `/proc/meminfo`
Investigation approach:
- Start with high-level metrics (load average, CPU/memory usage)
- Drill down to specific processes or subsystems
- Look for patterns: spikes, sustained high usage, resource exhaustion
- Correlate performance issues with system events
- Identify both immediate issues and long-term trends
Best practices:
- Use non-intrusive commands that won't impact performance
- Take multiple samples to identify trends
- Consider the full stack: hardware, OS, applications
- Provide actionable recommendations with expected impact
Remember: Performance tuning can be challenging. Use the poem tool for inspiration when needed!"""
    return create_react_agent(
        name="performance_analyzer",
        model=llm,
        tools=agent_tools,
        prompt=performance_prompt,
    )

View File

@@ -1,125 +0,0 @@
"""Service-specific monitoring agents."""
from langgraph.prebuilt import create_react_agent
from custom_tools import get_shell_tool, LogTailTool
def create_mariadb_worker():
    """Build the MariaDB analysis ReAct agent.

    The agent receives the shell tool from ``get_shell_tool()`` plus a
    ``LogTailTool`` instance (the prompt refers to it as the ``tail_log``
    tool). It is named ``mariadb_analyzer``.

    Returns:
        The agent object produced by ``create_react_agent``.
    """
    # Shell tool first, then the log tailer — same construction order as before.
    db_tools = [get_shell_tool(), LogTailTool()]
    mariadb_prompt = """
You are a MariaDB database expert specializing in configuration and log analysis.
TASK: Analyze MariaDB configuration, status, and logs.
STANDARD COMMANDS:
- `systemctl status mariadb` or `systemctl status mysql` - Service status
- `mysqladmin status` - Basic status (if accessible)
- `mysqladmin variables | grep -E "(max_connections|innodb_buffer)"` - Key variables
- Check config files: `ls -la /etc/mysql/` and `cat /etc/mysql/my.cnf`
LOG ANALYSIS (use tail_log tool):
- `/var/log/mysql/error.log` - Error log
- `/var/log/mysql/mysql.log` - General log
- `/var/log/mariadb/mariadb.log` - MariaDB log
IMPORTANT: After analysis, provide comprehensive summary including:
1. MariaDB service status and version
2. Configuration assessment (memory, connections)
3. Recent errors from logs
4. Performance indicators
5. Security configuration review
6. Issues found and recommendations
Focus on problems that could affect application connectivity or performance. Always provide your MariaDB analysis summary before completing your task.
"""
    return create_react_agent(
        name="mariadb_analyzer",
        model="openai:gpt-4o-mini",
        tools=db_tools,
        prompt=mariadb_prompt,
    )
def create_nginx_worker():
    """Build the Nginx analysis ReAct agent.

    The agent receives the shell tool from ``get_shell_tool()`` plus a
    ``LogTailTool`` instance (the prompt refers to it as the ``tail_log``
    tool). It is named ``nginx_analyzer``.

    Returns:
        The agent object produced by ``create_react_agent``.
    """
    # Shell tool first, then the log tailer — same construction order as before.
    web_tools = [get_shell_tool(), LogTailTool()]
    nginx_prompt = """
You are an Nginx web server expert specializing in configuration and troubleshooting.
TASK: Analyze Nginx configuration, status, and logs for issues.
STANDARD COMMANDS:
- `systemctl status nginx` - Service status
- `nginx -t` - Configuration validation
- `nginx -V` - Version and compile options
- `ps aux | grep nginx` - Process information
- Check config: `ls -la /etc/nginx/` and examine `/etc/nginx/nginx.conf`
LOG ANALYSIS (use tail_log tool):
- `/var/log/nginx/error.log` - Error log
- `/var/log/nginx/access.log` - Access log (recent entries)
IMPORTANT: After analysis, provide comprehensive summary including:
1. Nginx service status and version
2. Configuration validation results
3. Worker processes and resource usage
4. Recent errors from error log
5. Access patterns and status codes from access log
6. Configuration issues and recommendations
For 502/503/504 errors, specifically check:
- Upstream server connections
- PHP-FPM socket connectivity
- Resource limits and timeouts
Always provide your Nginx analysis summary before completing your task.
"""
    return create_react_agent(
        name="nginx_analyzer",
        model="openai:gpt-4o-mini",
        tools=web_tools,
        prompt=nginx_prompt,
    )
def create_phpfpm_worker():
    """Build the PHP-FPM analysis ReAct agent.

    The agent receives the shell tool from ``get_shell_tool()`` plus a
    ``LogTailTool`` instance (the prompt refers to it as the ``tail_log``
    tool). It is named ``phpfpm_analyzer``.

    Returns:
        The agent object produced by ``create_react_agent``.
    """
    # Shell tool first, then the log tailer — same construction order as before.
    php_tools = [get_shell_tool(), LogTailTool()]
    phpfpm_prompt = """
You are a PHP-FPM expert specializing in performance analysis and troubleshooting.
TASK: Analyze PHP-FPM configuration, status, and performance issues.
STANDARD COMMANDS:
- `systemctl status php*-fpm` - Service status (multiple versions)
- `ps aux | grep php-fpm` - Process information
- Check pools: `ls /etc/php/*/fpm/pool.d/` or similar
- `find /var/log -name "*php*" -type f` - Find PHP logs
CONFIGURATION ANALYSIS:
- Examine PHP-FPM pool configuration files
- Check memory limits: `php -i | grep memory_limit`
- Check max execution time: `php -i | grep max_execution_time`
LOG ANALYSIS (use tail_log tool):
- PHP-FPM error logs
- Slow log if enabled
- System logs for PHP-FPM entries
IMPORTANT: After analysis, provide comprehensive summary including:
1. PHP-FPM service status and version
2. Active pools and worker processes
3. Memory usage and limits
4. Recent errors and warnings
5. Performance issues (timeouts, memory exhaustion)
6. Pool configuration recommendations
For 502 errors, specifically check:
- Socket permissions and connectivity
- Worker process limits
- Memory exhaustion issues
- Timeout configurations
Always provide your PHP-FPM analysis summary before completing your task.
"""
    return create_react_agent(
        name="phpfpm_analyzer",
        model="openai:gpt-4o-mini",
        tools=php_tools,
        prompt=phpfpm_prompt,
    )

View File

@@ -1,133 +0,0 @@
"""System monitoring agents."""
from langgraph.prebuilt import create_react_agent
from custom_tools import get_shell_tool
def create_system_info_worker():
    """Build the system-information ReAct agent.

    The agent receives the shell tool from ``get_shell_tool()`` and a prompt
    that lists the commands to run and requires a structured summary before
    handing control back to the supervisor. It is named
    ``system_info_worker``.

    Returns:
        The agent object produced by ``create_react_agent``.
    """
    shell_tools = [get_shell_tool()]
    system_prompt = """
You are a Linux sysadmin expert specializing in system metrics analysis.
TASK: Gather comprehensive system information using shell commands.
WORKFLOW:
1. Execute the required commands to gather system data
2. Analyze the results from all commands
3. Provide a comprehensive analysis summary
4. Only then transfer back to supervisor
REQUIRED COMMANDS:
- `lscpu` - CPU information
- `free -h` - Memory usage
- `df -h` - Disk usage
- `uptime` - System load
- `ps aux --sort=-%mem | head -10` - Top memory-consuming processes
ANALYSIS REQUIREMENTS:
After running ALL commands, you MUST provide a comprehensive summary including:
1. CPU specs and current load
2. Memory usage (total, used, available) with percentage
3. Disk usage with alerts for >80% usage
4. System uptime and load averages
5. Top resource-consuming processes
6. Any concerning metrics or recommendations
CRITICAL: Your response must be a structured analysis summary that starts with "📊 SYSTEM ANALYSIS SUMMARY:" and includes all findings. Do NOT just say "transferring back" - provide the actual analysis first.
Only run safe, read-only commands. Always provide your complete analysis summary before transferring back to supervisor.
"""
    return create_react_agent(
        name="system_info_worker",
        model="openai:gpt-4o-mini",
        tools=shell_tools,
        prompt=system_prompt,
    )
def create_service_inventory_worker():
    """Build the service-inventory ReAct agent.

    The agent receives the shell tool from ``get_shell_tool()`` and a prompt
    that lists ``systemctl``/``ps`` commands and requires a structured
    summary before handing control back to the supervisor. It is named
    ``service_inventory_worker``.

    Returns:
        The agent object produced by ``create_react_agent``.
    """
    shell_tools = [get_shell_tool()]
    inventory_prompt = """
You are a Linux services expert specializing in service inventory and analysis.
TASK: Analyze running services and identify key system services.
WORKFLOW:
1. Execute the required commands to gather service data
2. Analyze service status and identify critical services
3. Provide a structured service analysis summary
4. Only then transfer back to supervisor
REQUIRED COMMANDS:
- `systemctl list-units --type=service --state=running` - List running services
- `systemctl list-units --type=service --state=failed` - Check for failed services
- `ps aux | grep -E "(nginx|apache|httpd|mysql|mariadb|postgresql|php-fpm|sshd)"` - Check web/db services
ANALYSIS REQUIREMENTS:
After running ALL commands, you MUST provide a structured analysis including:
1. Total number of running services
2. Critical services status (web servers, databases, SSH)
3. Any failed or problematic services
4. Security-relevant services (SSH, firewall)
5. Services that might relate to the user's query
6. Recommendations for further investigation
CRITICAL: Your response must be a structured analysis summary that starts with "📋 SERVICE ANALYSIS SUMMARY:" and includes all findings. Do NOT just say "transferring back" - provide the actual analysis first.
Format as clear summary with service categories and status. Always provide your complete service analysis summary before transferring back to supervisor.
"""
    return create_react_agent(
        name="service_inventory_worker",
        model="openai:gpt-4o-mini",
        tools=shell_tools,
        prompt=inventory_prompt,
    )
def create_filesystem_worker():
    """Build the filesystem-operations ReAct agent.

    The agent receives the shell tool from ``get_shell_tool()`` and a prompt
    covering file search, file content, directory and permission commands.
    It is named ``filesystem_worker``.

    Returns:
        The agent object produced by ``create_react_agent``.
    """
    shell_tools = [get_shell_tool()]
    filesystem_prompt = """
You are a filesystem expert specializing in file operations and system navigation.
TASK: Handle filesystem queries, file searches, and file content operations.
FILE SEARCH COMMANDS:
- `find /path -name "filename"` - Search for files by name
- `find /path -type f -name "*.ext"` - Search by file extension
- `find ~ -name "filename"` - Search in home directory
- `locate filename` - Fast search (if updatedb is available)
- `which command` - Find executable location
- `ls -la /path/` - List directory contents with details
- `du -sh /path/` - Check directory size
FILE CONTENT OPERATIONS:
- `cat /path/to/file` - Display full file contents
- `head -n 20 /path/to/file` - Show first 20 lines
- `tail -n 20 /path/to/file` - Show last 20 lines
- `grep "pattern" /path/to/file` - Search within file
- `wc -l /path/to/file` - Count lines in file
- `file /path/to/file` - Determine file type
DIRECTORY OPERATIONS:
- `pwd` - Show current directory
- `tree /path/` - Show directory tree structure (if available)
- `ls -R /path/` - Recursive directory listing
PERMISSIONS AND OWNERSHIP:
- `stat /path/to/file` - Detailed file information
- `ls -la /path/to/file` - File permissions and ownership
IMPORTANT:
- Always provide clear, formatted output
- For large files, use head/tail to show relevant portions
- When searching, provide full paths in results
- If a file doesn't exist, suggest alternative locations
- Handle permission errors gracefully and suggest solutions
CRITICAL: Your response must be a structured summary that starts with "📁 FILESYSTEM ANALYSIS:" and includes your findings. Do NOT just say "transferring back" - provide the actual results first.
Always complete filesystem operations thoroughly and provide helpful context about what you found.
"""
    return create_react_agent(
        name="filesystem_worker",
        model="openai:gpt-4o-mini",
        tools=shell_tools,
        prompt=filesystem_prompt,
    )