Python Automation for DevOps: Essential Scripts Every SRE Should Know

As a Site Reliability Engineer, I rely on automation to manage infrastructure efficiently. Python is my go-to language for automation tasks because of its simplicity and powerful libraries. Here are some essential automation scripts I use daily.

1. Automated Health Check Script

Monitor multiple services and see immediately when something goes wrong.

import requests
import json
from datetime import datetime

def check_service_health(url, expected_status=200):
    """Probe a single HTTP endpoint and describe its health as a dict.

    Issues a GET request to ``url`` with a 10-second timeout and compares
    the response status code against ``expected_status``.

    Args:
        url: Endpoint to request.
        expected_status: Status code that counts as healthy (default 200).

    Returns:
        On a completed request, a dict with the keys ``url``, ``status``
        ("✓ UP" or "✗ DOWN"), ``status_code``, ``response_time`` (seconds,
        taken from ``response.elapsed``) and ``timestamp`` (ISO-8601 local
        time). If anything raises along the way, a dict with ``url``,
        ``status`` ("✗ ERROR"), ``error`` (the exception text) and
        ``timestamp`` instead.
    """
    try:
        reply = requests.get(url, timeout=10)

        if reply.status_code == expected_status:
            verdict = "✓ UP"
        else:
            verdict = "✗ DOWN"

        # Keys are inserted in the order they appear in the printed report.
        report = {"url": url, "status": verdict}
        report["status_code"] = reply.status_code
        report["response_time"] = reply.elapsed.total_seconds()
        report["timestamp"] = datetime.now().isoformat()
        return report
    except Exception as exc:
        # Any failure (connection, timeout, bad URL, ...) is reported as
        # data rather than raised, so one bad endpoint cannot stop a sweep.
        failure = {"url": url, "status": "✗ ERROR"}
        failure["error"] = str(exc)
        failure["timestamp"] = datetime.now().isoformat()
        return failure

# Endpoints to probe; each one gets its own report below
services = [
    "https://api.example.com/health",
    "https://app.example.com/status",
    "https://db.example.com/ping",
]

# Probe the endpoints one at a time, printing each report as indented JSON
# as soon as its check has finished
for service in services:
    result = check_service_health(service)
    print(json.dumps(result, indent=2))

2. AWS Resource Inventory

Automatically generate an inventory of your AWS resources across regions.

import boto3
from collections import defaultdict

def get_aws_inventory(regions=None):
    """Build an inventory of AWS resources across one or more regions.

    Only EC2 instances are collected at present; they are stored under the
    ``'ec2'`` key of the returned mapping.

    Args:
        regions: Iterable of AWS region names to query. ``None`` (the
            default) queries ``us-east-1`` and ``us-west-2``. An empty
            iterable queries nothing and yields an empty inventory.

    Returns:
        A ``defaultdict(list)`` keyed by service name. Every EC2 entry is a
        dict with the keys ``id``, ``type``, ``state`` and ``region``.
    """
    if regions is None:
        # Resolved per call instead of using a list literal as the default
        # argument, which would be one shared, mutable object for all calls.
        regions = ('us-east-1', 'us-west-2')

    inventory = defaultdict(list)

    for region in regions:
        ec2 = boto3.client('ec2', region_name=region)

        # DescribeInstances is a paginated API: the paginator keeps
        # following NextToken so instances beyond the first response page
        # are included as well.
        paginator = ec2.get_paginator('describe_instances')
        for page in paginator.paginate():
            for reservation in page['Reservations']:
                for instance in reservation['Instances']:
                    inventory['ec2'].append({
                        'id': instance['InstanceId'],
                        'type': instance['InstanceType'],
                        'state': instance['State']['Name'],
                        'region': region
                    })

    return inventory

# Usage: gather the inventory for the default regions and report the EC2 total
inventory = get_aws_inventory()
ec2_total = len(inventory['ec2'])
print(f"Total EC2 instances: {ec2_total}")

3. Log Analyzer for Error Detection

Parse logs and identify patterns that indicate problems.

import re
from collections import Counter

def analyze_logs(log_file):
    """Count the lines of a log file that match known error categories.

    Each line is tested against every category's pattern,
    case-insensitively. A line adds at most one to a given category, but
    may count towards several categories at once.

    Args:
        log_file: Path of the text log to scan.

    Returns:
        A ``collections.Counter`` mapping category name (``'timeout'``,
        ``'connection'``, ``'memory'``, ``'auth'``) to the number of
        matching lines. Categories without a match are simply absent.

    Raises:
        OSError: If ``log_file`` cannot be opened.
    """
    error_patterns = {
        'timeout': r'timeout|timed out',
        'connection': r'connection refused|connection reset',
        'memory': r'out of memory|memory exceeded',
        'auth': r'authentication failed|unauthorized'
    }
    # Compile once up front so the per-line loop only has to run searches.
    compiled_patterns = {
        error_type: re.compile(pattern, re.IGNORECASE)
        for error_type, pattern in error_patterns.items()
    }

    errors = Counter()

    # errors='replace': logs can contain bytes that are invalid in the
    # platform encoding (binary payloads, mixed encodings). Substitute them
    # instead of aborting the whole scan with UnicodeDecodeError.
    with open(log_file, 'r', errors='replace') as f:
        for line in f:
            for error_type, regex in compiled_patterns.items():
                if regex.search(line):
                    errors[error_type] += 1

    return errors

# Usage: summarise the application log, most frequent category first
errors = analyze_logs('/var/log/application.log')
summary_lines = ["Error Summary:"]
summary_lines.extend(
    f"  {error_type}: {count}" for error_type, count in errors.most_common()
)
print("\n".join(summary_lines))

4. Kubernetes Pod Restart Monitor

Track pod restarts and identify problematic deployments.

from kubernetes import client, config

def monitor_pod_restarts(namespace='default', threshold=5):
    """Report containers whose restart count has reached a threshold.

    Loads cluster access settings with ``config.load_kube_config()``, lists
    the pods of ``namespace`` and inspects the status of each container.

    Args:
        namespace: Namespace whose pods are inspected (default 'default').
        threshold: Minimum restart count for a container to be reported;
            the comparison is inclusive (``>=``).

    Returns:
        A list with one dict per offending container, holding the keys
        ``pod`` (pod name), ``container`` (container name), ``restarts``
        (restart count) and ``status`` (the pod's phase).
    """
    config.load_kube_config()
    core_api = client.CoreV1Api()
    pod_list = core_api.list_namespaced_pod(namespace)

    # `container_statuses` can be falsy (e.g. None); such a pod is treated
    # as having no containers to inspect.
    return [
        {
            'pod': pod.metadata.name,
            'container': container.name,
            'restarts': container.restart_count,
            'status': pod.status.phase,
        }
        for pod in pod_list.items
        for container in (pod.status.container_statuses or [])
        if container.restart_count >= threshold
    ]

# Usage: list every container that has restarted at least five times;
# nothing is printed when no container qualifies
pods = monitor_pod_restarts(threshold=5)
if pods:
    print("Pods with high restart counts:")
    for pod in pods:
        pod_name, restarts = pod['pod'], pod['restarts']
        print(f"  {pod_name}: {restarts} restarts")

5. Automated Backup Verification

Verify that your backups are actually restorable.

import subprocess
import tempfile
import os

def verify_backup(backup_file, test_restore_dir=None):
    """Test-restore a gzip-compressed tar backup and report the outcome.

    The archive is extracted with ``tar -xzf`` into a scratch directory,
    the result is inspected, and the scratch directory is removed again.

    Args:
        backup_file: Path of the ``.tar.gz`` archive to verify.
        test_restore_dir: Directory to extract into. When ``None``, a fresh
            temporary directory is created. NOTE: this directory is deleted
            once verification finishes, even when the caller supplied it.

    Returns:
        One of three dicts:

        * ``{'status': 'success', 'files_restored': n, 'size': bytes}`` when
          ``tar`` exits with 0. ``files_restored`` counts the top-level
          entries of the restore directory and ``size`` is the size of the
          archive file itself.
        * ``{'status': 'failed', 'error': stderr}`` when ``tar`` exits with
          a non-zero code.
        * ``{'status': 'error', 'error': message}`` when anything raises
          (``tar`` missing, the 300-second timeout expiring, ...).
    """
    # Function-scope import: shutil is only needed for the cleanup step.
    import shutil

    if test_restore_dir is None:
        test_restore_dir = tempfile.mkdtemp()

    try:
        # Attempt restore
        result = subprocess.run(
            ['tar', '-xzf', backup_file, '-C', test_restore_dir],
            capture_output=True,
            text=True,
            timeout=300
        )

        if result.returncode != 0:
            return {
                'status': 'failed',
                'error': result.stderr
            }

        # Extraction succeeded: report what landed in the restore directory.
        files = os.listdir(test_restore_dir)
        return {
            'status': 'success',
            'files_restored': len(files),
            'size': os.path.getsize(backup_file)
        }
    except Exception as e:
        # Boundary handler: every failure is turned into a result dict so
        # callers can always read result['status'].
        return {
            'status': 'error',
            'error': str(e)
        }
    finally:
        # Cleanup in-process instead of shelling out to `rm -rf`: a missing
        # `rm` binary would raise from this `finally` block and replace the
        # result being returned. ignore_errors keeps cleanup best-effort.
        shutil.rmtree(test_restore_dir, ignore_errors=True)

# Usage: verify the daily backup archive and print the resulting status
result = verify_backup('/backups/daily-backup.tar.gz')
outcome = result['status']
print(f"Backup verification: {outcome}")

Best Practices for DevOps Automation

Error Handling: Always include proper error handling and logging
Timeouts: Set appropriate timeouts for network operations
Idempotency: Make scripts safe to run multiple times
Logging: Log all actions for audit trails
Testing: Test scripts in non-production environments first

Taking It Further

These scripts are starting points. Enhance them by:

Adding Slack/email notifications
Integrating with monitoring systems
Creating dashboards with the collected data
Scheduling with cron or Kubernetes CronJobs

Need Python Automation Help?

I offer training workshops on Python automation and DevOps. Contact me to learn more about customized training for your team.