Back to Blog
Python Automation for DevOps: Essential Scripts Every SRE Should Know
By Ankit Bhardwaj
Tags: Python, Automation, DevOps, SRE
Python Automation for DevOps: Essential Scripts Every SRE Should Know
As a Site Reliability Engineer, automation is key to managing infrastructure efficiently. Python is my go-to language for automation tasks due to its simplicity and powerful libraries. Here are some essential automation scripts I use daily.
1. Automated Health Check Script
Monitor multiple services and get instant notifications when something goes wrong.
import requests
import json
from datetime import datetime
def check_service_health(url, expected_status=200):
    """Probe *url* with an HTTP GET and report its health as a dict.

    Args:
        url: Endpoint to check (e.g. a /health route).
        expected_status: HTTP status code that counts as healthy.

    Returns:
        A dict with the url, an UP/DOWN/ERROR marker, and either the
        observed status code + response time or the error message,
        plus an ISO-8601 timestamp.
    """
    try:
        response = requests.get(url, timeout=10)
        status = "✓ UP" if response.status_code == expected_status else "✗ DOWN"
        return {
            "url": url,
            "status": status,
            "status_code": response.status_code,
            "response_time": response.elapsed.total_seconds(),
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        # Any failure (DNS, timeout, connection refused, ...) is returned
        # as data rather than raised, so one bad service doesn't stop a sweep.
        return {
            "url": url,
            "status": "✗ ERROR",
            "error": str(e),
            "timestamp": datetime.now().isoformat()
        }
# Check multiple services and print one JSON report per service.
services = [
    "https://api.example.com/health",
    "https://app.example.com/status",
    "https://db.example.com/ping"
]
for service in services:
    result = check_service_health(service)
    print(json.dumps(result, indent=2))
2. AWS Resource Inventory
Automatically generate an inventory of your AWS resources across regions.
import boto3
from collections import defaultdict
def get_aws_inventory(regions=('us-east-1', 'us-west-2')):
    """Build an inventory of AWS resources across the given regions.

    Args:
        regions: Iterable of AWS region names to scan.  (A tuple default
            avoids the mutable-default-argument pitfall; callers passing
            lists still work.)

    Returns:
        defaultdict mapping a resource kind (currently only 'ec2') to a
        list of dicts with id/type/state/region per instance.
    """
    inventory = defaultdict(list)
    for region in regions:
        ec2 = boto3.client('ec2', region_name=region)
        # describe_instances is paginated; iterating the paginator ensures
        # accounts with many instances are fully inventoried instead of
        # silently stopping after the first page.
        paginator = ec2.get_paginator('describe_instances')
        for page in paginator.paginate():
            for reservation in page['Reservations']:
                for instance in reservation['Instances']:
                    inventory['ec2'].append({
                        'id': instance['InstanceId'],
                        'type': instance['InstanceType'],
                        'state': instance['State']['Name'],
                        'region': region
                    })
    return inventory
# Usage: scan the default regions and summarize the EC2 count.
inventory = get_aws_inventory()
print(f"Total EC2 instances: {len(inventory['ec2'])}")
3. Log Analyzer for Error Detection
Parse logs and identify patterns that indicate problems.
import re
from collections import Counter
def analyze_logs(log_file):
    """Count known error signatures in a log file.

    Args:
        log_file: Path to a plain-text log file.

    Returns:
        Counter mapping an error category ('timeout', 'connection',
        'memory', 'auth') to the number of matching lines.  A single
        line may increment several categories.
    """
    # Compile each pattern once, outside the per-line loop, instead of
    # re-matching raw strings on every line.
    error_patterns = {
        'timeout': re.compile(r'timeout|timed out', re.IGNORECASE),
        'connection': re.compile(r'connection refused|connection reset', re.IGNORECASE),
        'memory': re.compile(r'out of memory|memory exceeded', re.IGNORECASE),
        'auth': re.compile(r'authentication failed|unauthorized', re.IGNORECASE)
    }
    errors = Counter()
    with open(log_file, 'r') as f:
        for line in f:
            for error_type, pattern in error_patterns.items():
                if pattern.search(line):
                    errors[error_type] += 1
    return errors
# Usage: summarize error categories found in the application log,
# most frequent first.
errors = analyze_logs('/var/log/application.log')
print("Error Summary:")
for error_type, count in errors.most_common():
    print(f"  {error_type}: {count}")
4. Kubernetes Pod Restart Monitor
Track pod restarts and identify problematic deployments.
from kubernetes import client, config
def monitor_pod_restarts(namespace='default', threshold=5):
    """List containers whose restart count has reached *threshold*.

    Args:
        namespace: Kubernetes namespace to inspect.
        threshold: Minimum restart count that flags a container.

    Returns:
        List of dicts (pod, container, restarts, status) — one entry
        per offending container.
    """
    config.load_kube_config()
    v1 = client.CoreV1Api()
    pods = v1.list_namespaced_pod(namespace)
    problematic_pods = []
    for pod in pods.items:
        # container_statuses can be None (e.g. while a pod is Pending),
        # hence the `or []` guard.
        for container_status in pod.status.container_statuses or []:
            restart_count = container_status.restart_count
            if restart_count >= threshold:
                problematic_pods.append({
                    'pod': pod.metadata.name,
                    'container': container_status.name,
                    'restarts': restart_count,
                    'status': pod.status.phase
                })
    return problematic_pods
# Usage: report any pods at or above the restart threshold.
pods = monitor_pod_restarts(threshold=5)
if pods:
    print("Pods with high restart counts:")
    for pod in pods:
        print(f"  {pod['pod']}: {pod['restarts']} restarts")
5. Automated Backup Verification
Verify that your backups are actually restorable.
import subprocess
import tempfile
import os
def verify_backup(backup_file, test_restore_dir=None):
    """Verify a .tar.gz backup by actually restoring it to a scratch dir.

    Args:
        backup_file: Path to the gzipped tar archive to check.
        test_restore_dir: Directory to restore into; a temporary one is
            created when omitted.  NOTE: the directory is removed on exit
            even when supplied by the caller.

    Returns:
        Dict with 'status' ('success' | 'failed' | 'error') plus either
        files_restored/size or the error text.
    """
    import shutil  # local import keeps the snippet self-contained

    if test_restore_dir is None:
        test_restore_dir = tempfile.mkdtemp()
    try:
        # Attempt a real restore — a backup you cannot extract is no backup.
        result = subprocess.run(
            ['tar', '-xzf', backup_file, '-C', test_restore_dir],
            capture_output=True,
            text=True,
            timeout=300
        )
        if result.returncode == 0:
            # Sanity check: something was actually extracted.
            files = os.listdir(test_restore_dir)
            return {
                'status': 'success',
                'files_restored': len(files),
                'size': os.path.getsize(backup_file)
            }
        else:
            return {
                'status': 'failed',
                'error': result.stderr
            }
    except Exception as e:
        return {
            'status': 'error',
            'error': str(e)
        }
    finally:
        # shutil.rmtree is portable and avoids shelling out to `rm -rf`;
        # ignore_errors keeps cleanup from masking the real result.
        shutil.rmtree(test_restore_dir, ignore_errors=True)
# Usage: check that last night's backup is actually restorable.
result = verify_backup('/backups/daily-backup.tar.gz')
print(f"Backup verification: {result['status']}")
Best Practices for DevOps Automation
- Error Handling: Always include proper error handling and logging
- Timeouts: Set appropriate timeouts for network operations
- Idempotency: Make scripts safe to run multiple times
- Logging: Log all actions for audit trails
- Testing: Test scripts in non-production environments first
Taking It Further
These scripts are starting points. Enhance them by:
- Adding Slack/email notifications
- Integrating with monitoring systems
- Creating dashboards with the collected data
- Scheduling with cron or Kubernetes CronJobs
Need Python Automation Help?
I offer training workshops on Python automation and DevOps. Contact me to learn more about customized training for your team.
