High Availability & Disaster Recovery: Multi-Region Strategies
Building systems that withstand failures requires redundancy, automated failover, and tested recovery procedures.
Availability Levels
Uptime Tiers
99% (52.6 min downtime/year) - Single server, basic backups
99.9% (8.8 min downtime/year) - Multi-AZ, failover
99.99% (52.6 sec downtime/year) - Multi-region, active-active
99.999% (5.26 sec downtime/year) - Massive redundancy (5 nines)
Calculating Availability
class AvailabilityCalculator:
"""Calculate system availability from component reliability"""
def series_availability(self, components: list[float]) -> float:
"""
Series: All components must work
A_total = A1 ร A2 ร A3
"""
result = 1.0
for availability in components:
result *= availability
return result
def parallel_availability(self, components: list[float]) -> float:
"""
Parallel: At least one must work
A_total = 1 - (1-A1) ร (1-A2)
"""
unavailability = 1.0
for availability in components:
unavailability *= (1 - availability)
return 1 - unavailability
def calculate_downtime_minutes(self, uptime_percentage: float) -> float:
"""Minutes per year"""
return (100 - uptime_percentage) / 100 * 525600 # minutes/year
# Example: API with database and cache
calc = AvailabilityCalculator()
# Series: All needed
api_server = 0.9999
database = 0.99999
cache = 0.99
series_avail = calc.series_availability([api_server, database, cache])
print(f"Series (all needed): {series_avail:.6f} (99.99%)")
print(f"Downtime: {calc.calculate_downtime_minutes(series_avail * 100):.1f} min/year")
# Parallel: Fallback possible
availability = calc.parallel_availability([0.95, 0.95])
print(f"Parallel (redundant): {availability:.4f} (99.75%)")
Multi-AZ Architecture
Synchronized Replication
# AWS RDS with Multi-AZ failover
apiVersion: v1
kind: Service
metadata:
name: database
spec:
ports:
- port: 5432
name: postgres
clusterIP: None
selector:
app: postgres
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: postgres
spec:
serviceName: postgres
replicas: 2 # Primary + standby
selector:
matchLabels:
app: postgres
template:
metadata:
labels:
app: postgres
spec:
containers:
- name: postgres
image: postgres:15
ports:
- containerPort: 5432
env:
- name: POSTGRES_REPLICATION_MODE
value: "streaming" # Synchronous replication
volumeMounts:
- name: data
mountPath: /var/lib/postgresql
livenessProbe:
exec:
command: ["pg_isready"]
initialDelaySeconds: 30
periodSeconds: 10
Automatic Failover
import asyncio
from pymongo import MongoClient
from pymongo.errors import ServerSelectionTimeoutError
class HighAvailabilityCluster:
"""MongoDB replica set with automatic failover"""
def __init__(self):
# Connection string with multiple nodes
self.client = MongoClient(
'mongodb://node1:27017,node2:27017,node3:27017',
replicaSet='rs0',
retryWrites=True,
w='majority' # Wait for majority replication
)
self.db = self.client['app_database']
async def write_with_failover(self, collection: str, document: dict) -> bool:
"""Write with automatic failover to secondary"""
max_retries = 3
retry_count = 0
while retry_count < max_retries:
try:
# Write with majority acknowledgment
result = self.db[collection].insert_one(document)
print(f"Written to {self.client.primary}: {result.inserted_id}")
return True
except ServerSelectionTimeoutError:
retry_count += 1
print(f"Primary unavailable, retrying... ({retry_count}/{max_retries})")
await asyncio.sleep(2 ** retry_count) # Exponential backoff
# Trigger failover
self.client.close()
self.client = MongoClient(
'mongodb://node1:27017,node2:27017,node3:27017',
replicaSet='rs0'
)
return False
# Usage
cluster = HighAvailabilityCluster()
success = asyncio.run(cluster.write_with_failover(
'transactions',
{'amount': 100, 'status': 'pending'}
))
Multi-Region Architecture
Active-Active Setup
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ Global Load Balancer โ
โ (Route53, Cloudflare) โ
โโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ
โโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโ
โ โ โ
US-East EU-West Asia-Pacific
โโโโโโโโโโ โโโโโโโโโโ โโโโโโโโโโ
โ API โ โ API โ โ API โ
โโโโโโโโโโค โโโโโโโโโโค โโโโโโโโโโค
โ DB โโโโโบโ DB โโโโโบโ DB โ
โโโโโโโโโโ โโโโโโโโโโ โโโโโโโโโโ
Replication across regions
Multi-Region Deployment
import boto3
from typing import List
class MultiRegionHA:
"""Deploy and manage multi-region infrastructure"""
def __init__(self, regions: List[str]):
self.regions = regions
self.clients = {region: boto3.client('ec2', region_name=region)
for region in regions}
def deploy_across_regions(self, ami_id: str):
"""Deploy instances to all regions"""
deployments = {}
for region in self.regions:
ec2 = self.clients[region]
# Launch instances
response = ec2.run_instances(
ImageId=ami_id,
MinCount=2,
MaxCount=2,
InstanceType='t3.medium',
Placement={'AvailabilityZone': f'{region}a'},
)
instance_ids = [i['InstanceId'] for i in response['Instances']]
deployments[region] = instance_ids
return deployments
def setup_global_failover(self):
"""Configure Route53 health checks and failover"""
route53 = boto3.client('route53')
# Create health checks for each region
health_checks = {}
for region in self.regions:
health_check = route53.create_health_check(
HealthCheckConfig={
'Type': 'HTTPS',
'ResourcePath': '/health',
'FullyQualifiedDomainName': f'api.{region}.example.com',
'Port': 443,
'RequestInterval': 30,
'FailureThreshold': 3
}
)
health_checks[region] = health_check['HealthCheck']['Id']
# Create failover routing records
for idx, region in enumerate(self.regions):
route53.change_resource_record_sets(
HostedZoneId='Z123456',
ChangeBatch={
'Changes': [{
'Action': 'UPSERT',
'ResourceRecordSet': {
'Name': 'api.example.com',
'Type': 'A',
'TTL': 60,
'Failover': 'PRIMARY' if idx == 0 else 'SECONDARY',
'SetIdentifier': region,
'HealthCheckId': health_checks[region],
'AliasTarget': {
'HostedZoneId': 'Z456',
'DNSName': f'elb.{region}.amazonaws.com',
'EvaluateTargetHealth': True
}
}
}]
}
)
# Deploy to 3 regions with automatic failover
ha = MultiRegionHA(['us-east-1', 'eu-west-1', 'ap-southeast-1'])
ha.deploy_across_regions(ami_id='ami-0123456789')
ha.setup_global_failover()
RTO & RPO Strategies
Recovery Time Objective (RTO)
RTO = Time to restore service after failure
Strategies by RTO:
- 4 hours : Daily snapshots + restore (low cost)
- 1 hour : Hot standby + manual failover
- 15 min : Automated failover + warm standby
- 1 min : Active-active replication
- < 30 sec : Instant failover (5 nines)
Recovery Point Objective (RPO)
RPO = Data loss acceptable after failure
Strategies by RPO:
- 24 hours : Nightly backups (acceptable data loss)
- 1 hour : Hourly snapshots
- 15 min : Continuous replication (sync)
- < 1 sec : Real-time sync + write-ahead logs
RTO/RPO Calculator
class DisasterRecoveryPlanning:
"""Plan RTO/RPO strategy based on business needs"""
def __init__(self, business_impact_per_hour: float):
self.impact_per_hour = business_impact_per_hour
def analyze_strategy(self, rto_hours: float, rpo_hours: float):
"""Analyze cost vs business impact"""
recovery_cost = {
'daily_backup': 5000, # Annual cost
'hourly_snapshot': 15000,
'continuous_replication': 50000,
'active_active': 150000,
'instant_failover': 250000
}
# Calculate failure cost
annual_failure_hours = (365 * 24) / (24/rto_hours) # Simplified
annual_failure_cost = annual_failure_hours * self.impact_per_hour
# Data loss cost
avg_data_loss_records = (rpo_hours * 1000) # 1k records/hour
data_loss_cost = avg_data_loss_records * 50 # $50 per record
return {
'rto_hours': rto_hours,
'rpo_hours': rpo_hours,
'annual_failure_cost': annual_failure_cost,
'data_loss_cost': data_loss_cost,
'total_risk': annual_failure_cost + data_loss_cost
}
# Example: E-commerce losing $50k per hour of downtime
dr = DisasterRecoveryPlanning(business_impact_per_hour=50000)
for strategy, cost in [
('Daily backups (24h RTO)', 24),
('Hourly snapshots (1h RTO)', 1),
('Replication (15min RTO)', 0.25),
]:
result = dr.analyze_strategy(rto_hours=cost, rpo_hours=cost/2)
print(f"{strategy}: ${result['total_risk']:,.0f} annual risk")
Backup Strategies
Incremental Backups
import hashlib
from datetime import datetime
from pathlib import Path
class IncrementalBackup:
"""Efficient incremental backup with change tracking"""
def __init__(self, backup_dir: str):
self.backup_dir = Path(backup_dir)
self.manifest = {} # Track file hashes
def calculate_file_hash(self, filepath: str) -> str:
"""Calculate SHA256 of file"""
sha256 = hashlib.sha256()
with open(filepath, 'rb') as f:
for chunk in iter(lambda: f.read(4096), b''):
sha256.update(chunk)
return sha256.hexdigest()
def backup_incrementally(self, data_dir: str) -> dict:
"""Only backup changed files"""
backup_meta = {
'timestamp': datetime.utcnow().isoformat(),
'files': {}
}
for filepath in Path(data_dir).rglob('*'):
if filepath.is_file():
file_hash = self.calculate_file_hash(str(filepath))
# Check if file changed since last backup
prev_hash = self.manifest.get(str(filepath))
if prev_hash != file_hash:
# Only copy changed files
backup_path = self.backup_dir / filepath.name
backup_path.write_bytes(filepath.read_bytes())
backup_meta['files'][str(filepath)] = {
'hash': file_hash,
'size': filepath.stat().st_size
}
# Update manifest
self.manifest[str(filepath)] = file_hash
return backup_meta
def restore_from_backup(self, target_dir: str):
"""Restore from backup"""
for backup_file in self.backup_dir.glob('*'):
restore_path = Path(target_dir) / backup_file.name
restore_path.write_bytes(backup_file.read_bytes())
print(f"Restored {backup_file.name}")
# Usage
backup = IncrementalBackup('/backups')
meta = backup.backup_incrementally('/data')
print(f"Backed up {len(meta['files'])} changed files")
Cross-Region Backup
# AWS S3 with cross-region replication
apiVersion: storage.cnpg.io/v1
kind: Cluster
metadata:
name: postgres-multi-region
spec:
instances: 3
# Primary backup
backup:
barmanObjectStore:
destinationPath: s3://backups-us-east
s3Credentials:
accessKeyId:
name: aws-creds
key: access_key
secretAccessKey:
name: aws-creds
key: secret_key
# Secondary region backup
externalClusters:
- name: backup-eu
barmanObjectStore:
destinationPath: s3://backups-eu-west
s3Credentials:
accessKeyId:
name: aws-creds
key: access_key
secretAccessKey:
name: aws-creds
key: secret_key
Testing Failover
Chaos Engineering
from chaos_monkey import ChaosMonkey
from datetime import datetime
class FailoverTesting:
"""Regularly test failover capabilities"""
def __init__(self):
self.chaos = ChaosMonkey()
def test_instance_failure(self):
"""Kill random instance and verify failover"""
print(f"[{datetime.now()}] Starting instance failure test")
# Kill instance
victim = self.chaos.kill_random_instance()
print(f"Killed instance: {victim}")
# Monitor failover
start_time = datetime.now()
while not self.is_healthy():
elapsed = (datetime.now() - start_time).total_seconds()
if elapsed > 300: # 5 minute timeout
raise TimeoutError("Failover took too long")
time.sleep(5)
failover_time = (datetime.now() - start_time).total_seconds()
print(f"Service recovered in {failover_time:.1f} seconds (RTO: 5min)")
def test_database_failure(self):
"""Fail primary database and verify promotion"""
print("Starting database failover test")
# Fail primary
self.chaos.kill_db_primary()
# Verify secondary promoted
assert self.get_db_role() == 'primary'
print("Secondary promoted to primary โ")
# Verify no data loss
assert self.verify_data_integrity()
print("All data intact โ")
def test_region_failure(self):
"""Simulate entire region failure"""
print("Starting region failure test")
# Disable entire region
self.chaos.partition_network_segment('us-east-1')
# Verify traffic routed to other regions
start = datetime.now()
while not self.traffic_routed_to('eu-west-1'):
if (datetime.now() - start).seconds > 60:
raise TimeoutError("DNS failover delayed")
time.sleep(1)
print("Traffic failed over to eu-west-1 โ")
def is_healthy(self) -> bool:
"""Check system health"""
# Verify API responds
# Check database is writable
# Validate data consistency
pass
Glossary
- RTO: Recovery Time Objective - time to restore after failure
- RPO: Recovery Point Objective - acceptable data loss
- Multi-AZ: Multiple Availability Zones in same region
- Multi-region: Active deployment across geographic regions
- Failover: Automatic switch to backup resource
- MTBF: Mean Time Between Failures
- MTTR: Mean Time To Repair
Resources
- AWS Well-Architected Framework: Reliability
- Google Cloud High Availability
- Azure Disaster Recovery
- Chaos Engineering Handbook
Comments