Skip to main content
โšก Calmops

High Availability & Disaster Recovery: Multi-Region Strategies

High Availability & Disaster Recovery: Multi-Region Strategies

Building systems that withstand failures requires redundancy, automated failover, and tested recovery procedures.


Availability Levels

Uptime Tiers

99%      (3.65 days downtime/year)     - Single server, basic backups
99.9%    (8.76 hours downtime/year)    - Multi-AZ, failover
99.99%   (52.6 min downtime/year)      - Multi-region, active-active
99.999%  (5.26 min downtime/year)      - Massive redundancy (5 nines)

Calculating Availability

class AvailabilityCalculator:
    """Calculate system availability from component reliability.

    Availabilities are fractions in [0, 1] (e.g. 0.999 for "three nines").
    """

    def series_availability(self, components: list[float]) -> float:
        """Series: all components must work.

        A_total = A1 x A2 x ... x An.  An empty list yields 1.0
        (vacuously available), matching the original loop's identity value.
        """
        import math  # local import keeps the snippet self-contained
        return math.prod(components)

    def parallel_availability(self, components: list[float]) -> float:
        """Parallel: at least one component must work.

        A_total = 1 - (1-A1) x (1-A2) x ... x (1-An).  An empty list
        yields 0.0 (nothing redundant is available).
        """
        import math
        return 1 - math.prod(1 - availability for availability in components)

    def calculate_downtime_minutes(self, uptime_percentage: float) -> float:
        """Expected downtime in minutes per year for a given uptime percent."""
        MINUTES_PER_YEAR = 525600  # 365 * 24 * 60
        return (100 - uptime_percentage) / 100 * MINUTES_PER_YEAR

# Example: API with database and cache
calc = AvailabilityCalculator()

# Series: all three components must be up for a request to succeed.
api_server = 0.9999
database = 0.99999
cache = 0.99

series_avail = calc.series_availability([api_server, database, cache])
# Print the computed percentage instead of a hard-coded label: the product
# 0.9999 * 0.99999 * 0.99 is ~0.98899 (~98.9%), NOT 99.99% -- the weakest
# component (the 99% cache) dominates a series chain.
print(f"Series (all needed): {series_avail:.6f} ({series_avail:.2%})")
print(f"Downtime: {calc.calculate_downtime_minutes(series_avail * 100):.1f} min/year")

# Parallel: either redundant node alone can serve traffic.
availability = calc.parallel_availability([0.95, 0.95])
print(f"Parallel (redundant): {availability:.4f} ({availability:.2%})")

Multi-AZ Architecture

Synchronized Replication

# Kubernetes headless Service + StatefulSet running a primary/standby
# Postgres pair (the self-managed analogue of a Multi-AZ database).
apiVersion: v1
kind: Service
metadata:
  name: database
spec:
  ports:
  - port: 5432
    name: postgres
  clusterIP: None  # headless: each StatefulSet pod gets a stable DNS name
  selector:
    app: postgres

---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: postgres
spec:
  serviceName: postgres
  replicas: 2  # Primary + standby
  selector:
    matchLabels:
      app: postgres
  template:
    metadata:
      labels:
        app: postgres
    spec:
      containers:
      - name: postgres
        image: postgres:15
        ports:
        - containerPort: 5432
        env:
        - name: POSTGRES_REPLICATION_MODE
          # NOTE(review): the official postgres image does not read this
          # variable (it is a Bitnami-image convention), and streaming
          # replication is asynchronous unless synchronous_standby_names
          # is configured -- verify against the actual image entrypoint.
          value: "streaming"
        volumeMounts:
        - name: data
          mountPath: /var/lib/postgresql
        livenessProbe:
          exec:
            command: ["pg_isready"]
          initialDelaySeconds: 30
          periodSeconds: 10

Automatic Failover

import asyncio
from pymongo import MongoClient
from pymongo.errors import ServerSelectionTimeoutError

class HighAvailabilityCluster:
    """MongoDB replica set with automatic failover.

    Writes use w='majority' plus retryWrites so an acknowledged write
    survives a primary election; on repeated server-selection timeouts the
    client is rebuilt with identical options.
    """

    # One URI constant so every (re)connect uses the same node list.
    _URI = 'mongodb://node1:27017,node2:27017,node3:27017'

    def __init__(self):
        self.client = self._new_client()
        self.db = self.client['app_database']

    def _new_client(self) -> MongoClient:
        """Build a client pinned to replica set rs0 with majority writes."""
        return MongoClient(
            self._URI,
            replicaSet='rs0',
            retryWrites=True,
            w='majority'  # Wait for majority replication
        )

    async def write_with_failover(self, collection: str, document: dict) -> bool:
        """Insert `document` into `collection`, retrying across failover.

        Returns True on success, False after 3 failed attempts.
        """
        max_retries = 3
        retry_count = 0

        while retry_count < max_retries:
            try:
                # Write with majority acknowledgment
                result = self.db[collection].insert_one(document)
                print(f"Written to {self.client.primary}: {result.inserted_id}")
                return True

            except ServerSelectionTimeoutError:
                retry_count += 1
                print(f"Primary unavailable, retrying... ({retry_count}/{max_retries})")
                await asyncio.sleep(2 ** retry_count)  # Exponential backoff

                # BUGFIX: rebuild the client with the SAME options -- the
                # original reconnect silently dropped retryWrites and
                # w='majority' -- and rebind self.db, which otherwise kept
                # pointing at the closed client.
                self.client.close()
                self.client = self._new_client()
                self.db = self.client['app_database']

        return False

# Usage
# NOTE: blocks until a real MongoDB replica set at node1..node3 is
# reachable; write_with_failover retries 3 times before returning False.
cluster = HighAvailabilityCluster()
success = asyncio.run(cluster.write_with_failover(
    'transactions',
    {'amount': 100, 'status': 'pending'}
))

Multi-Region Architecture

Active-Active Setup

        โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
        โ”‚           Global Load Balancer              โ”‚
        โ”‚         (Route53, Cloudflare)               โ”‚
        โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
                         โ”‚
          โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
          โ”‚              โ”‚              โ”‚
       US-East        EU-West       Asia-Pacific
      โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”    โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”    โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
      โ”‚   API  โ”‚    โ”‚   API  โ”‚    โ”‚   API  โ”‚
      โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค    โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค    โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
      โ”‚   DB   โ”‚โ—„โ”€โ”€โ–บโ”‚   DB   โ”‚โ—„โ”€โ”€โ–บโ”‚   DB   โ”‚
      โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜    โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜    โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
       Replication across regions

Multi-Region Deployment

import boto3
from typing import List

class MultiRegionHA:
    """Deploy and manage multi-region EC2 capacity with Route53 failover."""

    def __init__(self, regions: List[str]):
        self.regions = regions
        # One EC2 client per region, keyed by region name.
        self.clients = {region: boto3.client('ec2', region_name=region)
                        for region in regions}

    def deploy_across_regions(self, ami_id: str,
                              instance_type: str = 't3.medium') -> dict:
        """Launch a pair of instances in the first AZ of every region.

        Args:
            ami_id: AMI to launch (must exist in every target region).
            instance_type: EC2 instance type (default keeps prior behavior).
        Returns:
            Mapping of region name -> list of launched instance IDs.
        """
        deployments = {}

        for region in self.regions:
            ec2 = self.clients[region]

            # Pin to the region's "a" AZ; a fuller deployment would spread
            # instances across multiple AZs.
            response = ec2.run_instances(
                ImageId=ami_id,
                MinCount=2,
                MaxCount=2,
                InstanceType=instance_type,
                Placement={'AvailabilityZone': f'{region}a'},
            )

            deployments[region] = [i['InstanceId']
                                   for i in response['Instances']]

        return deployments

    def setup_global_failover(self, hosted_zone_id: str = 'Z123456') -> None:
        """Configure Route53 health checks and DNS failover records.

        Args:
            hosted_zone_id: Route53 hosted zone to write records into
                (default preserves the original hard-coded value).

        NOTE(review): Route53 failover routing supports exactly one PRIMARY
        and one SECONDARY record set per name; with more than two regions,
        latency- or weighted-based routing is the appropriate policy.
        """
        route53 = boto3.client('route53')

        # Create an HTTPS health check per regional endpoint.
        health_checks = {}

        for region in self.regions:
            health_check = route53.create_health_check(
                HealthCheckConfig={
                    'Type': 'HTTPS',
                    'ResourcePath': '/health',
                    'FullyQualifiedDomainName': f'api.{region}.example.com',
                    'Port': 443,
                    'RequestInterval': 30,
                    'FailureThreshold': 3
                }
            )
            health_checks[region] = health_check['HealthCheck']['Id']

        # Create failover routing records; the first region is PRIMARY.
        for idx, region in enumerate(self.regions):
            route53.change_resource_record_sets(
                HostedZoneId=hosted_zone_id,
                ChangeBatch={
                    'Changes': [{
                        'Action': 'UPSERT',
                        'ResourceRecordSet': {
                            'Name': 'api.example.com',
                            'Type': 'A',
                            # BUGFIX: no TTL here -- the API rejects TTL on
                            # alias records; the alias target's TTL applies.
                            'Failover': 'PRIMARY' if idx == 0 else 'SECONDARY',
                            'SetIdentifier': region,
                            'HealthCheckId': health_checks[region],
                            'AliasTarget': {
                                'HostedZoneId': 'Z456',
                                'DNSName': f'elb.{region}.amazonaws.com',
                                'EvaluateTargetHealth': True
                            }
                        }
                    }]
                }
            )

# Deploy to 3 regions with automatic failover
# NOTE: requires AWS credentials/permissions; the AMI ID and the hosted
# zone IDs used inside MultiRegionHA are placeholders.
ha = MultiRegionHA(['us-east-1', 'eu-west-1', 'ap-southeast-1'])
ha.deploy_across_regions(ami_id='ami-0123456789')
ha.setup_global_failover()

RTO & RPO Strategies

Recovery Time Objective (RTO)

RTO = Time to restore service after failure

Strategies by RTO:
- 4 hours  : Daily snapshots + restore (low cost)
- 1 hour   : Hot standby + manual failover
- 15 min   : Automated failover + warm standby
- 1 min    : Active-active replication
- < 30 sec : Instant failover (5 nines)

Recovery Point Objective (RPO)

RPO = Data loss acceptable after failure

Strategies by RPO:
- 24 hours : Nightly backups (acceptable data loss)
- 1 hour   : Hourly snapshots
- 15 min   : Frequent incremental replication (async)
- < 1 sec  : Real-time sync + write-ahead logs

RTO/RPO Calculator

class DisasterRecoveryPlanning:
    """Plan RTO/RPO strategy based on business needs."""

    # Simplified model constants for data-loss costing.
    RECORDS_PER_HOUR = 1000  # assumed data creation rate
    COST_PER_RECORD = 50     # $ impact per lost record

    def __init__(self, business_impact_per_hour: float):
        # Revenue/penalty impact of one hour of downtime, in dollars.
        self.impact_per_hour = business_impact_per_hour

    def analyze_strategy(self, rto_hours: float, rpo_hours: float) -> dict:
        """Estimate annual risk for a given RTO/RPO pair.

        Args:
            rto_hours: time to restore service after a failure.
            rpo_hours: window of data loss accepted after a failure.
        Returns:
            Dict with the inputs, the annual failure cost, the data-loss
            cost, and their sum as 'total_risk'.
        """
        # Simplified model: one outage per day, each lasting rto_hours.
        # (Equivalent to the original (365*24) / (24/rto_hours) form.)
        annual_failure_hours = 365 * rto_hours
        annual_failure_cost = annual_failure_hours * self.impact_per_hour

        # Data loss cost: records created during the RPO window are lost.
        avg_data_loss_records = rpo_hours * self.RECORDS_PER_HOUR
        data_loss_cost = avg_data_loss_records * self.COST_PER_RECORD

        return {
            'rto_hours': rto_hours,
            'rpo_hours': rpo_hours,
            'annual_failure_cost': annual_failure_cost,
            'data_loss_cost': data_loss_cost,
            'total_risk': annual_failure_cost + data_loss_cost
        }

# Example: E-commerce losing $50k per hour of downtime
dr = DisasterRecoveryPlanning(business_impact_per_hour=50000)

# Each entry pairs a strategy label with its RTO in hours; the RPO is
# assumed to be half the RTO for every tier.
scenarios = [
    ('Daily backups (24h RTO)', 24),
    ('Hourly snapshots (1h RTO)', 1),
    ('Replication (15min RTO)', 0.25),
]

for label, rto_h in scenarios:
    analysis = dr.analyze_strategy(rto_hours=rto_h, rpo_hours=rto_h / 2)
    print(f"{label}: ${analysis['total_risk']:,.0f} annual risk")

Backup Strategies

Incremental Backups

import hashlib
from datetime import datetime
from pathlib import Path

class IncrementalBackup:
    """Efficient incremental backup with hash-based change tracking.

    Files are compared by SHA-256 against an in-memory manifest and only
    changed files are copied.  The source directory structure is mirrored
    under backup_dir, so same-named files in different subdirectories no
    longer overwrite each other (the original flattened to file names).
    """

    def __init__(self, backup_dir: str):
        self.backup_dir = Path(backup_dir)
        self.manifest = {}  # source path (str) -> last backed-up SHA-256

    def calculate_file_hash(self, filepath: str) -> str:
        """Return the SHA-256 hex digest of the file, read in 4 KiB chunks."""
        sha256 = hashlib.sha256()
        with open(filepath, 'rb') as f:
            for chunk in iter(lambda: f.read(4096), b''):
                sha256.update(chunk)
        return sha256.hexdigest()

    def backup_incrementally(self, data_dir: str) -> dict:
        """Copy files changed since the last run; return backup metadata.

        Returns a dict with a 'timestamp' and a 'files' mapping of the
        source paths copied this run to their hash and size.
        """
        backup_meta = {
            # NOTE(review): utcnow() is naive and deprecated in 3.12+;
            # datetime.now(timezone.utc) is the modern form.
            'timestamp': datetime.utcnow().isoformat(),
            'files': {}
        }

        for filepath in Path(data_dir).rglob('*'):
            if not filepath.is_file():
                continue

            file_hash = self.calculate_file_hash(str(filepath))
            if self.manifest.get(str(filepath)) == file_hash:
                continue  # unchanged since the last backup

            # BUGFIX: mirror the path relative to data_dir instead of
            # flattening to filepath.name, which silently clobbered
            # same-named files from different subdirectories.
            backup_path = self.backup_dir / filepath.relative_to(data_dir)
            backup_path.parent.mkdir(parents=True, exist_ok=True)
            backup_path.write_bytes(filepath.read_bytes())

            backup_meta['files'][str(filepath)] = {
                'hash': file_hash,
                'size': filepath.stat().st_size
            }

            # Update manifest so the next run skips this version.
            self.manifest[str(filepath)] = file_hash

        return backup_meta

    def restore_from_backup(self, target_dir: str):
        """Recreate the mirrored backup tree inside target_dir."""
        for backup_file in self.backup_dir.rglob('*'):
            if not backup_file.is_file():
                continue
            relative = backup_file.relative_to(self.backup_dir)
            restore_path = Path(target_dir) / relative
            restore_path.parent.mkdir(parents=True, exist_ok=True)
            restore_path.write_bytes(backup_file.read_bytes())
            print(f"Restored {relative}")

# Usage
# NOTE: '/backups' and '/data' must exist and be readable/writable; the
# manifest lives only in memory, so change tracking resets per instance.
backup = IncrementalBackup('/backups')
meta = backup.backup_incrementally('/data')
print(f"Backed up {len(meta['files'])} changed files")

Cross-Region Backup

# AWS S3 with cross-region replication
apiVersion: storage.cnpg.io/v1
kind: Cluster
metadata:
  name: postgres-multi-region
spec:
  instances: 3
  
  # Primary backup
  backup:
    barmanObjectStore:
      destinationPath: s3://backups-us-east
      s3Credentials:
        accessKeyId:
          name: aws-creds
          key: access_key
        secretAccessKey:
          name: aws-creds
          key: secret_key
  
  # Secondary region backup
  externalClusters:
  - name: backup-eu
    barmanObjectStore:
      destinationPath: s3://backups-eu-west
      s3Credentials:
        accessKeyId:
          name: aws-creds
          key: access_key
        secretAccessKey:
          name: aws-creds
          key: secret_key

Testing Failover

Chaos Engineering

import time
from datetime import datetime

from chaos_monkey import ChaosMonkey

class FailoverTesting:
    """Chaos-engineering drills that regularly exercise failover paths.

    Each test injects a fault via ChaosMonkey, then polls until the system
    recovers or a timeout (the target RTO) expires.  Several helpers used
    below (get_db_role, verify_data_integrity, traffic_routed_to) are not
    defined in this class -- presumably supplied by a subclass or monitoring
    integration; confirm before running.
    """

    def __init__(self):
        # Fault-injection client used by all the drills below.
        self.chaos = ChaosMonkey()

    def test_instance_failure(self):
        """Kill a random instance and verify failover within a 5-min RTO.

        Raises:
            TimeoutError: if the system is not healthy within 300 seconds.
        """

        print(f"[{datetime.now()}] Starting instance failure test")

        # Kill instance
        victim = self.chaos.kill_random_instance()
        print(f"Killed instance: {victim}")

        # Poll health every 5 s until recovery, enforcing the RTO budget.
        # NOTE: time.sleep requires the time module to be imported.
        start_time = datetime.now()
        while not self.is_healthy():
            elapsed = (datetime.now() - start_time).total_seconds()
            if elapsed > 300:  # 5 minute timeout
                raise TimeoutError("Failover took too long")
            time.sleep(5)

        failover_time = (datetime.now() - start_time).total_seconds()
        print(f"Service recovered in {failover_time:.1f} seconds (RTO: 5min)")

    def test_database_failure(self):
        """Fail the primary database and verify standby promotion.

        Asserts that a secondary was promoted and that no data was lost.
        """

        print("Starting database failover test")

        # Fail primary
        self.chaos.kill_db_primary()

        # Verify secondary promoted (get_db_role assumed to query the
        # surviving node's replication role -- confirm semantics).
        assert self.get_db_role() == 'primary'
        print("Secondary promoted to primary โœ“")

        # Verify no data loss
        assert self.verify_data_integrity()
        print("All data intact โœ“")

    def test_region_failure(self):
        """Simulate a full region outage and verify DNS-level failover.

        Raises:
            TimeoutError: if traffic is not rerouted within 60 seconds.
        """

        print("Starting region failure test")

        # Disable entire region by partitioning its network segment.
        self.chaos.partition_network_segment('us-east-1')

        # Verify traffic routed to other regions within the DNS TTL budget.
        start = datetime.now()
        while not self.traffic_routed_to('eu-west-1'):
            if (datetime.now() - start).seconds > 60:
                raise TimeoutError("DNS failover delayed")
            time.sleep(1)

        print("Traffic failed over to eu-west-1 โœ“")

    def is_healthy(self) -> bool:
        """Check system health.

        Stub: currently returns None (falsy), so pollers loop until their
        timeout fires.  Intended checks:
        """
        # Verify API responds
        # Check database is writable
        # Validate data consistency
        pass

Glossary

  • RTO: Recovery Time Objective - time to restore after failure
  • RPO: Recovery Point Objective - acceptable data loss
  • Multi-AZ: Multiple Availability Zones in same region
  • Multi-region: Active deployment across geographic regions
  • Failover: Automatic switch to backup resource
  • MTBF: Mean Time Between Failures
  • MTTR: Mean Time To Repair

Resources

Comments