Introduction
SaaS applications require enterprise-grade disaster recovery. With customers relying on 24/7 availability, having robust backup and DR strategies is essential for customer trust and business continuity.
Key Statistics:
- 60% of companies without DR plans fail within 6 months of disaster
- Average cost of a single disaster incident: $100K–$1M
- Multi-region reduces outage risk by 85%
- Automated DR reduces recovery time by 90%
DR Strategy Framework
┌───────────────────────────────────────────────────────────────────┐
│                      DR Strategy Components                       │
├───────────────────────────────────────────────────────────────────┤
│                                                                   │
│  RTO (Recovery Time Objective)                                    │
│  ├── Critical: 15 minutes (financial, healthcare)                 │
│  ├── Standard: 1 hour (most SaaS)                                 │
│  └── Basic: 4 hours (non-mission critical)                        │
│                                                                   │
│  RPO (Recovery Point Objective)                                   │
│  ├── Near-zero: Synchronous replication                           │
│  ├── 5 minutes: Async with frequent snapshots                     │
│  └── 1 hour: Daily backups with async replication                 │
│                                                                   │
│  Architecture Options                                             │
│  ├── Active-Active: Full redundancy, instant failover             │
│  ├── Active-Passive: Standby ready to promote                     │
│  └── Backup-Restore: Recovery from backups                        │
│                                                                   │
└───────────────────────────────────────────────────────────────────┘
Multi-Region Architecture
Active-Active Setup
# Terraform multi-region deployment
# Two aliased AWS providers: "primary" carries production traffic,
# "secondary" hosts the DR resources. Each resource below selects one
# explicitly via `provider = aws.<alias>`.
provider "aws" {
alias = "primary"
region = "us-east-1"
}
provider "aws" {
alias = "secondary"
region = "us-west-2"
}
# Primary database with cross-region read replica
resource "aws_db_instance" "primary" {
  provider       = aws.primary
  identifier     = "saas-primary"
  engine         = "postgres"
  engine_version = "15.4"
  instance_class = "db.r6g.xlarge"

  # Synchronous standby in a second AZ for in-region failover.
  multi_az = true

  # The argument is backup_retention_period (in days); the original
  # "backup_retention_days" is not a valid aws_db_instance argument
  # and fails `terraform validate`.
  backup_retention_period = 30
  backup_window           = "03:00-04:00"
  maintenance_window      = "sun:04:00-sun:05:00"
}
# Cross-region read replica
resource "aws_db_instance" "replica" {
  provider   = aws.secondary
  identifier = "saas-dr-replica"

  # A read replica is declared via replicate_source_db; the original
  # "source_db_instance_identifier" is not a valid aws_db_instance
  # argument. For a CROSS-REGION replica the source must be referenced
  # by ARN, not by identifier. Engine and version are inherited from
  # the source, so they are not repeated here.
  replicate_source_db = aws_db_instance.primary.arn
  instance_class      = "db.r6g.xlarge"
}
# Global Accelerator for DNS failover
# Anycast entry point with static IPs: clients keep one address while
# traffic is steered between regional endpoint groups.
resource "aws_globalaccelerator_accelerator" "main" {
name = "saas-accelerator"
enabled = true
}
# TCP/443 listener. The accelerator forwards TCP; TLS terminates at the
# regional load balancers behind it, not here.
resource "aws_globalaccelerator_listener" "tcp" {
accelerator_arn = aws_globalaccelerator_accelerator.main.id
protocol = "TCP"
port_range {
from_port = 443
to_port = 443
}
}
# Endpoint group dialing 100% of listener traffic to the primary ALB
# in us-east-1.
# NOTE(review): only the primary group is defined in this snippet —
# failover presumably requires a matching secondary endpoint group (or
# runtime dial changes) defined elsewhere; confirm.
resource "aws_globalaccelerator_endpoint_group" "primary" {
listener_arn = aws_globalaccelerator_listener.tcp.id
endpoint_group_region = "us-east-1"
traffic_dial_percentage = 100
endpoint_configuration {
endpoint_id = aws_lb.primary.arn
weight = 100
}
}
Backup Strategies
Database Backups
#!/usr/bin/env python3
"""Automated backup system."""
import boto3
from datetime import datetime, timedelta
class DatabaseBackupManager:
    """Create, copy, restore, and schedule RDS snapshots across regions.

    All methods call AWS APIs directly via boto3; credentials and default
    region come from the environment.
    """

    def __init__(self):
        self.rds = boto3.client('rds')
        self.s3 = boto3.client('s3')

    def create_snapshot(self, db_instance_id, prefix='daily'):
        """Create a manual, tagged snapshot of *db_instance_id*.

        Returns the snapshot identifier reported by RDS.
        """
        snapshot_id = f"{prefix}-{db_instance_id}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
        response = self.rds.create_db_snapshot(
            DBSnapshotIdentifier=snapshot_id,
            DBInstanceIdentifier=db_instance_id,
            Tags=[
                {'Key': 'BackupType', 'Value': prefix},
                {'Key': 'CreatedBy', 'Value': 'automated-backup'}
            ]
        )
        return response['DBSnapshot']['DBSnapshotIdentifier']

    def copy_to_region(self, snapshot_id, target_region):
        """Copy *snapshot_id* into *target_region*; return the new identifier.

        copy_db_snapshot must be issued against a client bound to the
        DESTINATION region with SourceRegion pointing at the origin; the
        original called it on the source-region client, which does not
        perform a cross-region copy.

        NOTE(review): for cross-region copies AWS expects the SOURCE
        snapshot to be referenced by ARN — confirm callers pass one.
        """
        target_rds = boto3.client('rds', region_name=target_region)
        response = target_rds.copy_db_snapshot(
            SourceDBSnapshotIdentifier=snapshot_id,
            TargetDBSnapshotIdentifier=f"{snapshot_id}-{target_region}",
            SourceRegion='us-east-1',
            KmsKeyId=f'arn:aws:kms:{target_region}:123456789012:key/backup-key'
        )
        return response['DBSnapshot']['DBSnapshotIdentifier']

    def restore_from_snapshot(self, snapshot_id, instance_id):
        """Restore a new instance *instance_id* from *snapshot_id*."""
        response = self.rds.restore_db_instance_from_db_snapshot(
            DBInstanceIdentifier=instance_id,
            DBSnapshotIdentifier=snapshot_id,
            # The API parameter is DBInstanceClass; the original passed
            # "InstanceClass", which raises ParamValidationError.
            DBInstanceClass='db.r6g.xlarge',
            MultiAZ=False,
            PubliclyAccessible=False
        )
        return response['DBInstance']['DBInstanceIdentifier']

    def schedule_backups(self, schedule='cron(0 5 * * ? *)'):
        """Wire an EventBridge rule to the backup Lambda on *schedule*."""
        events = boto3.client('events')
        events.put_rule(
            Name='daily-backup',
            ScheduleExpression=schedule,
            State='ENABLED'
        )
        events.put_targets(
            Rule='daily-backup',
            Targets=[
                {
                    'Id': 'BackupLambda',
                    'Arn': 'arn:aws:lambda:us-east-1:123456789:function:DatabaseBackup'
                }
            ]
        )
Application Data Backups
#!/usr/bin/env python3
"""Application state backup."""
import boto3
import json
class ApplicationBackup:
    """Back up application state (DynamoDB, S3 objects, Redis) into one S3 bucket."""

    def __init__(self, bucket_name):
        self.s3 = boto3.client('s3')
        self.bucket = bucket_name  # destination bucket for all backups

    def backup_dynamodb(self, table_name):
        """Export *table_name* to S3 via point-in-time export; return the export ARN.

        export_table_to_point_in_time lives on the 'dynamodb' client (the
        original used 'dynamodbstreams', which lacks the operation) and its
        parameters are S3Bucket/S3Prefix with a capital S.
        """
        dynamodb = boto3.client('dynamodb')
        response = dynamodb.export_table_to_point_in_time(
            TableArn=f'arn:aws:dynamodb:us-east-1:123456789:table/{table_name}',
            S3Bucket=self.bucket,
            S3Prefix=f'backups/{table_name}/',
            ExportFormat='DYNAMODB_JSON'
        )
        return response['ExportDescription']['ExportArn']

    def backup_s3_bucket(self, source_bucket, prefix=''):
        """Copy every object under *prefix* into a dated backups/ folder."""
        from datetime import datetime  # not imported at module level in this script
        # Hoisted out of the loop: one date stamp per backup run.
        date_stamp = datetime.now().strftime('%Y%m%d')
        paginator = self.s3.get_paginator('list_objects_v2')
        for page in paginator.paginate(Bucket=source_bucket, Prefix=prefix):
            for obj in page.get('Contents', []):
                self.s3.copy_object(
                    CopySource={'Bucket': source_bucket, 'Key': obj['Key']},
                    Bucket=self.bucket,
                    Key=f"backups/{date_stamp}/{obj['Key']}"
                )

    def backup_redis(self, redis_client):
        """Dump string/hash/list keys to a timestamped JSON object in S3.

        NOTE(review): the type comparisons assume *redis_client* was built
        with decode_responses=True — otherwise type() returns bytes and no
        branch matches. Confirm against the caller. Keys of other types
        (sets, zsets, streams) are silently skipped, as in the original.
        """
        from datetime import datetime  # not imported at module level in this script
        backup = {}
        # scan_iter walks the keyspace incrementally instead of the
        # blocking KEYS * command, which can stall a busy instance.
        for key in redis_client.scan_iter('*'):
            key_type = redis_client.type(key)
            if key_type == 'string':
                backup[key] = redis_client.get(key)
            elif key_type == 'hash':
                backup[key] = redis_client.hgetall(key)
            elif key_type == 'list':
                backup[key] = redis_client.lrange(key, 0, -1)
        self.s3.put_object(
            Bucket=self.bucket,
            Key=f"backups/redis/{datetime.now().strftime('%Y%m%d-%H%M%S')}.json",
            Body=json.dumps(backup)
        )
Failover Automation
#!/usr/bin/env python3
"""Automated failover system."""
import boto3
import time
from datetime import datetime
class FailoverManager:
    """Orchestrate regional failover: promote the DR replica, flip DNS, notify."""

    def __init__(self):
        self.rds = boto3.client('rds')
        self.route53 = boto3.client('route53')
        self.sns = boto3.client('sns')

    def trigger_failover(self, app_name, primary_region, secondary_region):
        """Execute failover of *app_name* to *secondary_region*.

        *primary_region* is accepted for interface compatibility; the
        current steps only need the secondary region.
        """
        replica_id = f'{app_name}-replica-{secondary_region}'
        print(f"[{datetime.now()}] Starting failover for {app_name}")

        # 1. Notify team that failover has begun.
        self.sns.publish(
            TopicArn='arn:aws:sns:us-east-1:123456789:alerts',
            Subject=f'FAILOVER STARTED: {app_name}',
            Message=f'Failover initiated at {datetime.now()}'
        )

        # 2. Promote the cross-region read replica to a standalone primary.
        print("Promoting database...")
        self.rds.promote_read_replica(
            DBInstanceIdentifier=replica_id,
            BackupRetentionPeriod=7
        )
        # Wait on the instance actually being promoted (the replica id).
        # The original waited on f'{app_name}-{secondary_region}' — a
        # different identifier — so it could proceed (or fail) before
        # promotion completed.
        waiter = self.rds.get_waiter('db_instance_available')
        waiter.wait(DBInstanceIdentifier=replica_id)

        # 3. Point the app CNAME at the secondary region (low TTL so
        # clients converge quickly).
        print("Updating DNS...")
        self.route53.change_resource_record_sets(
            HostedZoneId='Z1234567890ABC',
            ChangeBatch={
                'Changes': [{
                    'Action': 'UPSERT',
                    'ResourceRecordSet': {
                        'Name': f'{app_name}.example.com',
                        'Type': 'CNAME',
                        'TTL': 60,
                        'ResourceRecords': [{'Value': f'{app_name}.{secondary_region}.example.com'}]
                    }
                }]
            }
        )

        # 4. Notify completion.
        self.sns.publish(
            TopicArn='arn:aws:sns:us-east-1:123456789:alerts',
            Subject=f'FAILOVER COMPLETED: {app_name}',
            Message=f'Failover completed at {datetime.now()}'
        )
        print(f"[{datetime.now()}] Failover completed")

    def verify_health(self, region):
        """Return True if the DR database instance in *region* is describable."""
        try:
            self.rds.describe_db_instances(
                DBInstanceIdentifier=f'app-{region}'
            )
        except Exception:
            # Narrowed from the original bare `except:`, which also
            # swallowed KeyboardInterrupt/SystemExit. (botocore's
            # ClientError would be tighter but needs an extra import.)
            return False
        return True
Testing DR
#!/bin/bash
# dr-test.sh - Test disaster recovery end to end: restore a snapshot
# into a throwaway instance, deploy the test stack, smoke-test, tear down.
set -euo pipefail

# Snapshot to restore; override with DR_SNAPSHOT_ID for ad-hoc runs
# instead of editing the hard-coded date.
SNAPSHOT_ID="${DR_SNAPSHOT_ID:-daily-production-2026-01-18}"
RESTORE_ID="dr-test-restore"
REGION="us-west-2"

# Always delete the restored instance, even when a later step fails —
# with plain `set -e` the original script leaked the instance on any
# smoke-test failure. Also pass --region: the restore targets us-west-2,
# so a region-less delete would look in the default region and miss it.
cleanup() {
    aws rds delete-db-instance \
        --db-instance-identifier "$RESTORE_ID" \
        --skip-final-snapshot \
        --region "$REGION" || true
}
trap cleanup EXIT

echo "Starting DR test..."

# 1. Verify backups exist (set -e aborts on failure; `|| exit 1` is redundant)
echo "Checking backups..."
aws s3 ls s3://backup-bucket/daily/

# 2. Restore to test environment
echo "Restoring to test..."
aws rds restore-db-instance-from-db-snapshot \
    --db-instance-identifier "$RESTORE_ID" \
    --db-snapshot-identifier "$SNAPSHOT_ID" \
    --db-instance-class db.t3.medium \
    --region "$REGION"

# Block until the restored instance is available; restore-db-instance
# returns immediately, so deploying without this races the restore.
aws rds wait db-instance-available \
    --db-instance-identifier "$RESTORE_ID" \
    --region "$REGION"

# 3. Deploy test application
echo "Deploying test app..."
kubectl config use-context test
kubectl apply -f k8s/test/

# 4. Run smoke tests
echo "Running smoke tests..."
curl -f https://dr-test.example.com/health
curl -f https://dr-test.example.com/api/users

# 5. Verify data integrity
echo "Verifying data..."
# TODO: add data verification queries

echo "DR test passed!"
Comments