Introduction
SaaS applications require enterprise-grade disaster recovery. With customers relying on 24/7 availability, having robust backup and DR strategies is essential for customer trust and business continuity.
Key Statistics:
- 60% of companies without DR plans fail within 6 months of disaster
- Average cost of a single disaster incident: $100K–$1M
- Multi-region reduces outage risk by 85%
- Automated DR reduces recovery time by 90%
DR Strategy Framework
┌───────────────────────────────────────────────────────────────────┐
│                      DR Strategy Components                       │
├───────────────────────────────────────────────────────────────────┤
│                                                                   │
│  RTO (Recovery Time Objective)                                    │
│  ├── Critical: 15 minutes (financial, healthcare)                 │
│  ├── Standard: 1 hour (most SaaS)                                 │
│  └── Basic: 4 hours (non-mission critical)                        │
│                                                                   │
│  RPO (Recovery Point Objective)                                   │
│  ├── Near-zero: Synchronous replication                           │
│  ├── 5 minutes: Async with frequent snapshots                     │
│  └── 1 hour: Daily backups with async replication                 │
│                                                                   │
│  Architecture Options                                             │
│  ├── Active-Active: Full redundancy, instant failover             │
│  ├── Active-Passive: Standby ready to promote                     │
│  └── Backup-Restore: Recovery from backups                        │
│                                                                   │
└───────────────────────────────────────────────────────────────────┘
Multi-Region Architecture
Active-Active Setup
# Terraform multi-region deployment
# Two aliased AWS providers: "primary" carries production traffic,
# "secondary" hosts the DR resources. Each resource below selects one
# explicitly via `provider = aws.<alias>`.
provider "aws" {
alias = "primary"
region = "us-east-1"
}
provider "aws" {
alias = "secondary"
region = "us-west-2"
}
# Primary database with cross-region read replica
resource "aws_db_instance" "primary" {
  provider       = aws.primary
  identifier     = "saas-primary"
  engine         = "postgres"
  engine_version = "15.4"
  instance_class = "db.r6g.xlarge"

  # Synchronous standby in a second AZ for in-region failover.
  multi_az = true

  # The argument is backup_retention_period (in days); the original
  # "backup_retention_days" is not a valid aws_db_instance argument
  # and fails `terraform validate`.
  backup_retention_period = 30
  backup_window           = "03:00-04:00"
  maintenance_window      = "sun:04:00-sun:05:00"
}
# Cross-region read replica
resource "aws_db_instance" "replica" {
  provider   = aws.secondary
  identifier = "saas-dr-replica"

  # A read replica is declared via replicate_source_db; the original
  # "source_db_instance_identifier" is not a valid aws_db_instance
  # argument. For a CROSS-REGION replica the source must be referenced
  # by ARN, not by identifier. Engine and version are inherited from
  # the source, so they are not repeated here.
  replicate_source_db = aws_db_instance.primary.arn
  instance_class      = "db.r6g.xlarge"
}
# Global Accelerator for DNS failover
# Anycast entry point with static IPs: clients keep one address while
# traffic is steered between regional endpoint groups.
resource "aws_globalaccelerator_accelerator" "main" {
name = "saas-accelerator"
enabled = true
}
# TCP/443 listener. The accelerator forwards TCP; TLS terminates at the
# regional load balancers behind it, not here.
resource "aws_globalaccelerator_listener" "tcp" {
accelerator_arn = aws_globalaccelerator_accelerator.main.id
protocol = "TCP"
port_range {
from_port = 443
to_port = 443
}
}
# Endpoint group dialing 100% of listener traffic to the primary ALB
# in us-east-1.
# NOTE(review): only the primary group is defined in this snippet —
# failover presumably requires a matching secondary endpoint group (or
# runtime dial changes) defined elsewhere; confirm.
resource "aws_globalaccelerator_endpoint_group" "primary" {
listener_arn = aws_globalaccelerator_listener.tcp.id
endpoint_group_region = "us-east-1"
traffic_dial_percentage = 100
endpoint_configuration {
endpoint_id = aws_lb.primary.arn
weight = 100
}
}
Backup Strategies
Database Backups
#!/usr/bin/env python3
"""Automated backup system."""
import boto3
from datetime import datetime, timedelta
class DatabaseBackupManager:
    """Create, copy, restore, and schedule RDS snapshots across regions.

    All methods call AWS APIs directly via boto3; credentials and default
    region come from the environment.
    """

    def __init__(self):
        self.rds = boto3.client('rds')
        self.s3 = boto3.client('s3')

    def create_snapshot(self, db_instance_id, prefix='daily'):
        """Create a manual, tagged snapshot of *db_instance_id*.

        Returns the snapshot identifier reported by RDS.
        """
        snapshot_id = f"{prefix}-{db_instance_id}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
        response = self.rds.create_db_snapshot(
            DBSnapshotIdentifier=snapshot_id,
            DBInstanceIdentifier=db_instance_id,
            Tags=[
                {'Key': 'BackupType', 'Value': prefix},
                {'Key': 'CreatedBy', 'Value': 'automated-backup'}
            ]
        )
        return response['DBSnapshot']['DBSnapshotIdentifier']

    def copy_to_region(self, snapshot_id, target_region):
        """Copy *snapshot_id* into *target_region*; return the new identifier.

        copy_db_snapshot must be issued against a client bound to the
        DESTINATION region with SourceRegion pointing at the origin; the
        original called it on the source-region client, which does not
        perform a cross-region copy.

        NOTE(review): for cross-region copies AWS expects the SOURCE
        snapshot to be referenced by ARN — confirm callers pass one.
        """
        target_rds = boto3.client('rds', region_name=target_region)
        response = target_rds.copy_db_snapshot(
            SourceDBSnapshotIdentifier=snapshot_id,
            TargetDBSnapshotIdentifier=f"{snapshot_id}-{target_region}",
            SourceRegion='us-east-1',
            KmsKeyId=f'arn:aws:kms:{target_region}:123456789012:key/backup-key'
        )
        return response['DBSnapshot']['DBSnapshotIdentifier']

    def restore_from_snapshot(self, snapshot_id, instance_id):
        """Restore a new instance *instance_id* from *snapshot_id*."""
        response = self.rds.restore_db_instance_from_db_snapshot(
            DBInstanceIdentifier=instance_id,
            DBSnapshotIdentifier=snapshot_id,
            # The API parameter is DBInstanceClass; the original passed
            # "InstanceClass", which raises ParamValidationError.
            DBInstanceClass='db.r6g.xlarge',
            MultiAZ=False,
            PubliclyAccessible=False
        )
        return response['DBInstance']['DBInstanceIdentifier']

    def schedule_backups(self, schedule='cron(0 5 * * ? *)'):
        """Wire an EventBridge rule to the backup Lambda on *schedule*."""
        events = boto3.client('events')
        events.put_rule(
            Name='daily-backup',
            ScheduleExpression=schedule,
            State='ENABLED'
        )
        events.put_targets(
            Rule='daily-backup',
            Targets=[
                {
                    'Id': 'BackupLambda',
                    'Arn': 'arn:aws:lambda:us-east-1:123456789:function:DatabaseBackup'
                }
            ]
        )
Application Data Backups
#!/usr/bin/env python3
"""Application state backup."""
import boto3
import json
class ApplicationBackup:
    """Back up application state (DynamoDB, S3 objects, Redis) into one S3 bucket."""

    def __init__(self, bucket_name):
        self.s3 = boto3.client('s3')
        self.bucket = bucket_name  # destination bucket for all backups

    def backup_dynamodb(self, table_name):
        """Export *table_name* to S3 via point-in-time export; return the export ARN.

        export_table_to_point_in_time lives on the 'dynamodb' client (the
        original used 'dynamodbstreams', which lacks the operation) and its
        parameters are S3Bucket/S3Prefix with a capital S.
        """
        dynamodb = boto3.client('dynamodb')
        response = dynamodb.export_table_to_point_in_time(
            TableArn=f'arn:aws:dynamodb:us-east-1:123456789:table/{table_name}',
            S3Bucket=self.bucket,
            S3Prefix=f'backups/{table_name}/',
            ExportFormat='DYNAMODB_JSON'
        )
        return response['ExportDescription']['ExportArn']

    def backup_s3_bucket(self, source_bucket, prefix=''):
        """Copy every object under *prefix* into a dated backups/ folder."""
        from datetime import datetime  # not imported at module level in this script
        # Hoisted out of the loop: one date stamp per backup run.
        date_stamp = datetime.now().strftime('%Y%m%d')
        paginator = self.s3.get_paginator('list_objects_v2')
        for page in paginator.paginate(Bucket=source_bucket, Prefix=prefix):
            for obj in page.get('Contents', []):
                self.s3.copy_object(
                    CopySource={'Bucket': source_bucket, 'Key': obj['Key']},
                    Bucket=self.bucket,
                    Key=f"backups/{date_stamp}/{obj['Key']}"
                )

    def backup_redis(self, redis_client):
        """Dump string/hash/list keys to a timestamped JSON object in S3.

        NOTE(review): the type comparisons assume *redis_client* was built
        with decode_responses=True — otherwise type() returns bytes and no
        branch matches. Confirm against the caller. Keys of other types
        (sets, zsets, streams) are silently skipped, as in the original.
        """
        from datetime import datetime  # not imported at module level in this script
        backup = {}
        # scan_iter walks the keyspace incrementally instead of the
        # blocking KEYS * command, which can stall a busy instance.
        for key in redis_client.scan_iter('*'):
            key_type = redis_client.type(key)
            if key_type == 'string':
                backup[key] = redis_client.get(key)
            elif key_type == 'hash':
                backup[key] = redis_client.hgetall(key)
            elif key_type == 'list':
                backup[key] = redis_client.lrange(key, 0, -1)
        self.s3.put_object(
            Bucket=self.bucket,
            Key=f"backups/redis/{datetime.now().strftime('%Y%m%d-%H%M%S')}.json",
            Body=json.dumps(backup)
        )
Failover Automation
#!/usr/bin/env python3
"""Automated failover system."""
import boto3
import time
from datetime import datetime
class FailoverManager:
    """Orchestrate regional failover: promote the DR replica, flip DNS, notify."""

    def __init__(self):
        self.rds = boto3.client('rds')
        self.route53 = boto3.client('route53')
        self.sns = boto3.client('sns')

    def trigger_failover(self, app_name, primary_region, secondary_region):
        """Execute failover of *app_name* to *secondary_region*.

        *primary_region* is accepted for interface compatibility; the
        current steps only need the secondary region.
        """
        replica_id = f'{app_name}-replica-{secondary_region}'
        print(f"[{datetime.now()}] Starting failover for {app_name}")

        # 1. Notify team that failover has begun.
        self.sns.publish(
            TopicArn='arn:aws:sns:us-east-1:123456789:alerts',
            Subject=f'FAILOVER STARTED: {app_name}',
            Message=f'Failover initiated at {datetime.now()}'
        )

        # 2. Promote the cross-region read replica to a standalone primary.
        print("Promoting database...")
        self.rds.promote_read_replica(
            DBInstanceIdentifier=replica_id,
            BackupRetentionPeriod=7
        )
        # Wait on the instance actually being promoted (the replica id).
        # The original waited on f'{app_name}-{secondary_region}' — a
        # different identifier — so it could proceed (or fail) before
        # promotion completed.
        waiter = self.rds.get_waiter('db_instance_available')
        waiter.wait(DBInstanceIdentifier=replica_id)

        # 3. Point the app CNAME at the secondary region (low TTL so
        # clients converge quickly).
        print("Updating DNS...")
        self.route53.change_resource_record_sets(
            HostedZoneId='Z1234567890ABC',
            ChangeBatch={
                'Changes': [{
                    'Action': 'UPSERT',
                    'ResourceRecordSet': {
                        'Name': f'{app_name}.example.com',
                        'Type': 'CNAME',
                        'TTL': 60,
                        'ResourceRecords': [{'Value': f'{app_name}.{secondary_region}.example.com'}]
                    }
                }]
            }
        )

        # 4. Notify completion.
        self.sns.publish(
            TopicArn='arn:aws:sns:us-east-1:123456789:alerts',
            Subject=f'FAILOVER COMPLETED: {app_name}',
            Message=f'Failover completed at {datetime.now()}'
        )
        print(f"[{datetime.now()}] Failover completed")

    def verify_health(self, region):
        """Return True if the DR database instance in *region* is describable."""
        try:
            self.rds.describe_db_instances(
                DBInstanceIdentifier=f'app-{region}'
            )
        except Exception:
            # Narrowed from the original bare `except:`, which also
            # swallowed KeyboardInterrupt/SystemExit. (botocore's
            # ClientError would be tighter but needs an extra import.)
            return False
        return True
Testing DR
#!/bin/bash
# dr-test.sh - Test disaster recovery end to end: restore a snapshot
# into a throwaway instance, deploy the test stack, smoke-test, tear down.
set -euo pipefail

# Snapshot to restore; override with DR_SNAPSHOT_ID for ad-hoc runs
# instead of editing the hard-coded date.
SNAPSHOT_ID="${DR_SNAPSHOT_ID:-daily-production-2026-01-18}"
RESTORE_ID="dr-test-restore"
REGION="us-west-2"

# Always delete the restored instance, even when a later step fails —
# with plain `set -e` the original script leaked the instance on any
# smoke-test failure. Also pass --region: the restore targets us-west-2,
# so a region-less delete would look in the default region and miss it.
cleanup() {
    aws rds delete-db-instance \
        --db-instance-identifier "$RESTORE_ID" \
        --skip-final-snapshot \
        --region "$REGION" || true
}
trap cleanup EXIT

echo "Starting DR test..."

# 1. Verify backups exist (set -e aborts on failure; `|| exit 1` is redundant)
echo "Checking backups..."
aws s3 ls s3://backup-bucket/daily/

# 2. Restore to test environment
echo "Restoring to test..."
aws rds restore-db-instance-from-db-snapshot \
    --db-instance-identifier "$RESTORE_ID" \
    --db-snapshot-identifier "$SNAPSHOT_ID" \
    --db-instance-class db.t3.medium \
    --region "$REGION"

# Block until the restored instance is available; restore-db-instance
# returns immediately, so deploying without this races the restore.
aws rds wait db-instance-available \
    --db-instance-identifier "$RESTORE_ID" \
    --region "$REGION"

# 3. Deploy test application
echo "Deploying test app..."
kubectl config use-context test
kubectl apply -f k8s/test/

# 4. Run smoke tests
echo "Running smoke tests..."
curl -f https://dr-test.example.com/health
curl -f https://dr-test.example.com/api/users

# 5. Verify data integrity
echo "Verifying data..."
# TODO: add data verification queries

echo "DR test passed!"
Comments