Designing Resilient Systems: Architecture Patterns for High Availability
In today's interconnected world, system failures are not a matter of if, but when. After designing systems that handle millions of requests per second across multiple regions, I've learned that true resilience comes from embracing failure as a design constraint, not an afterthought.
The Resilience Mindset
Defining System Resilience
Resilience isn't just about staying up—it's about graceful degradation, quick recovery, and learning from failures. A resilient system exhibits:
- Fault Tolerance: Continues operating despite component failures
- Graceful Degradation: Reduces functionality rather than complete failure
- Quick Recovery: Minimizes downtime through automated healing
- Adaptive Capacity: Learns and improves from incidents
interface ResilienceMetrics {
availability: number; // 99.99%
meanTimeToRecovery: number; // minutes
errorBudget: number; // allowed downtime
blastRadius: number; // impact scope of failures
}
class ResilienceMonitor {
private metrics: ResilienceMetrics;
private alertManager: AlertManager;
constructor() {
this.metrics = this.initializeMetrics();
this.setupContinuousMonitoring();
}
async assessSystemHealth(): Promise<HealthStatus> {
const components = await this.checkAllComponents();
const overallHealth = this.calculateOverallHealth(components);
if (overallHealth.score < 0.95) {
await this.triggerDegradationMode();
}
return overallHealth;
}
private async triggerDegradationMode() {
// Implement graceful degradation strategies
await this.disableNonEssentialFeatures();
await this.increaseResourceLimits();
await this.activateBackupSystems();
}
}
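
To make the `errorBudget` figure concrete: it falls straight out of the availability target. A minimal sketch, assuming a 30-day window and an SLO expressed as a fraction:

```typescript
// Hypothetical helper: converts an availability SLO into an error budget in minutes.
// Assumes a fixed measurement window; not tied to any particular monitoring library.
function errorBudgetMinutes(sloAvailability: number, windowDays = 30): number {
  const windowMinutes = windowDays * 24 * 60;
  return windowMinutes * (1 - sloAvailability);
}

// 99.99% over 30 days leaves roughly 4.3 minutes of allowed downtime
console.log(errorBudgetMinutes(0.9999).toFixed(1));
```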
Multi-Region Architecture Patterns
Active-Active Multi-Region Setup
# Terraform configuration for multi-region deployment
resource "aws_route53_health_check" "primary_region" {
fqdn = "api-us-east-1.example.com"
port = 443
type = "HTTPS"
resource_path = "/health"
failure_threshold = 3
request_interval = 30
tags = {
Name = "Primary Region Health Check"
}
}
resource "aws_route53_record" "api_failover" {
zone_id = aws_route53_zone.main.zone_id
name = "api.example.com"
type = "A"
set_identifier = "primary"
failover_routing_policy {
type = "PRIMARY"
}
health_check_id = aws_route53_health_check.primary_region.id
ttl = 60
records = [aws_eip.primary_lb.public_ip]
}
resource "aws_route53_record" "api_failover_secondary" {
zone_id = aws_route53_zone.main.zone_id
name = "api.example.com"
type = "A"
set_identifier = "secondary"
failover_routing_policy {
type = "SECONDARY"
}
ttl = 60
records = [aws_eip.secondary_lb.public_ip]
}
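
The health checks above are only as good as what `/health` actually verifies. A minimal sketch of such an endpoint, assuming Express and two hypothetical dependency probes (`checkDatabase`, `checkCache`); a non-2xx response is what counts against the `failure_threshold` configured in Route 53:

```typescript
import express from 'express';

// Hypothetical dependency probes; replace with real connectivity checks.
async function checkDatabase(): Promise<boolean> { return true; }
async function checkCache(): Promise<boolean> { return true; }

const app = express();

app.get('/health', async (_req, res) => {
  try {
    const [db, cache] = await Promise.all([checkDatabase(), checkCache()]);
    if (db && cache) {
      res.status(200).json({ status: 'healthy' });
    } else {
      // Non-2xx responses count toward the Route 53 failure_threshold
      res.status(503).json({ status: 'degraded', db, cache });
    }
  } catch {
    res.status(503).json({ status: 'unhealthy' });
  }
});

app.listen(8080);
```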
Cross-Region Data Replication
import { CreateGlobalTableCommand, DynamoDBClient } from '@aws-sdk/client-dynamodb';
import { EventBridgeClient, PutRuleCommand, PutTargetsCommand } from '@aws-sdk/client-eventbridge';
import { S3Client } from '@aws-sdk/client-s3';

class CrossRegionReplicator {
  private primaryRegion: string = 'us-east-1';
  private secondaryRegions: string[] = ['us-west-2', 'eu-west-1'];
constructor(
private dynamoClient: DynamoDBClient,
private s3Client: S3Client,
private eventBridge: EventBridgeClient
) {}
  async setupGlobalTables(tableName: string) {
    // CreateGlobalTable expects a table with the same name to already exist in every
    // listed region; replicas are identified by RegionName only.
    const globalTableConfig = {
      GlobalTableName: tableName,
      ReplicationGroup: [this.primaryRegion, ...this.secondaryRegions].map(region => ({
        RegionName: region
      }))
    };
    await this.dynamoClient.send(new CreateGlobalTableCommand(globalTableConfig));
    // Set up cross-region event replication
    await this.setupEventReplication(tableName);
  }
  private async setupEventReplication(tableName: string) {
    const ruleName = `${tableName}-cross-region-replication`;
    // PutRule only defines the rule; targets are attached separately with PutTargets
    await this.eventBridge.send(new PutRuleCommand({
      Name: ruleName,
      EventPattern: JSON.stringify({
        source: ['aws.dynamodb'],
        'detail-type': ['DynamoDB Stream Record'],
        detail: {
          eventSourceARN: [`arn:aws:dynamodb:${this.primaryRegion}:*:table/${tableName}/stream/*`]
        }
      })
    }));
    await this.eventBridge.send(new PutTargetsCommand({
      Rule: ruleName,
      Targets: this.secondaryRegions.map(region => ({
        Id: `target-${region}`,
        Arn: `arn:aws:events:${region}:${process.env.AWS_ACCOUNT_ID}:event-bus/default`,
        RoleArn: process.env.CROSS_REGION_ROLE_ARN
      }))
    }));
  }
}
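
A usage sketch, assuming AWS SDK v3 clients scoped to the primary region and a hypothetical `orders` table that already exists in each region:

```typescript
import { DynamoDBClient } from '@aws-sdk/client-dynamodb';
import { S3Client } from '@aws-sdk/client-s3';
import { EventBridgeClient } from '@aws-sdk/client-eventbridge';

// All clients are scoped to the primary region; replica regions come from the class config
const replicator = new CrossRegionReplicator(
  new DynamoDBClient({ region: 'us-east-1' }),
  new S3Client({ region: 'us-east-1' }),
  new EventBridgeClient({ region: 'us-east-1' })
);

// Creates the global table replicas and wires up cross-region event forwarding
await replicator.setupGlobalTables('orders');
```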
Chaos Engineering Implementation
Automated Failure Injection
import { ChaosMonkey } from './chaos-monkey';
class ChaosEngineeringPlatform {
private experiments: Map<string, ChaosExperiment> = new Map();
private safetyChecks: SafetyCheck[] = [];
constructor(private monitoring: MonitoringService) {
this.initializeSafetyChecks();
}
async runExperiment(experimentId: string): Promise<ExperimentResult> {
const experiment = this.experiments.get(experimentId);
if (!experiment) {
throw new Error(`Experiment ${experimentId} not found`);
}
// Pre-flight safety checks
const safetyResult = await this.runSafetyChecks();
if (!safetyResult.safe) {
throw new Error(`Safety checks failed: ${safetyResult.reasons.join(', ')}`);
}
const startTime = Date.now();
let rollbackExecuted = false;
try {
// Execute the chaos experiment
await experiment.execute();
// Monitor system behavior
const metrics = await this.monitorSystemDuringExperiment(
experiment.duration,
experiment.expectedImpact
);
// Automatic rollback if system degrades beyond acceptable limits
if (metrics.degradation > experiment.maxAcceptableDegradation) {
await this.rollbackExperiment(experiment);
rollbackExecuted = true;
}
return {
experimentId,
duration: Date.now() - startTime,
success: !rollbackExecuted,
metrics,
insights: this.generateInsights(metrics)
};
} catch (error) {
await this.rollbackExperiment(experiment);
throw error;
}
}
private async monitorSystemDuringExperiment(
duration: number,
expectedImpact: ExpectedImpact
): Promise<ExperimentMetrics> {
const metrics: ExperimentMetrics = {
errorRate: [],
latency: [],
throughput: [],
degradation: 0
};
const monitoringInterval = setInterval(async () => {
const currentMetrics = await this.monitoring.getCurrentMetrics();
metrics.errorRate.push(currentMetrics.errorRate);
metrics.latency.push(currentMetrics.p99Latency);
metrics.throughput.push(currentMetrics.requestsPerSecond);
// Calculate degradation score
metrics.degradation = this.calculateDegradation(
currentMetrics,
expectedImpact.baseline
);
}, 5000); // Monitor every 5 seconds
// Wait for experiment duration
await new Promise(resolve => setTimeout(resolve, duration));
clearInterval(monitoringInterval);
return metrics;
}
}
// Example chaos experiments
const networkLatencyExperiment: ChaosExperiment = {
id: 'network-latency-spike',
name: 'Network Latency Spike',
description: 'Inject 500ms of latency into 10% of requests',
duration: 300000, // 5 minutes
maxAcceptableDegradation: 0.15, // 15% degradation
// Baseline used by calculateDegradation(); these values are illustrative assumptions
expectedImpact: { baseline: { errorRate: 0.01, p99Latency: 250, requestsPerSecond: 1000 } },
execute: async () => {
// Inject latency using service mesh or proxy
await injectNetworkLatency({
percentage: 10,
latency: '500ms',
target: 'user-service'
});
},
rollback: async () => {
await removeNetworkLatency('user-service');
}
};
Circuit Breaker and Bulkhead Patterns
Advanced Circuit Breaker with Metrics
class AdvancedCircuitBreaker {
private state: CircuitState = CircuitState.CLOSED;
private metrics: CircuitBreakerMetrics;
private slidingWindow: SlidingWindow;
constructor(
private config: CircuitBreakerConfig,
private metricsCollector: MetricsCollector
) {
this.metrics = new CircuitBreakerMetrics();
this.slidingWindow = new SlidingWindow(config.windowSize, config.windowDuration);
}
async execute<T>(operation: () => Promise<T>): Promise<T> {
    // Check the breaker state first so fast-fail rejections are not recorded as backend failures
    this.checkCircuitState();
    const startTime = Date.now();
    try {
const result = await operation();
this.recordSuccess(Date.now() - startTime);
return result;
} catch (error) {
this.recordFailure(Date.now() - startTime, error);
throw error;
}
}
private checkCircuitState() {
const windowStats = this.slidingWindow.getStats();
switch (this.state) {
case CircuitState.CLOSED:
if (this.shouldOpenCircuit(windowStats)) {
this.openCircuit();
}
break;
case CircuitState.OPEN:
if (this.shouldAttemptReset()) {
this.state = CircuitState.HALF_OPEN;
} else {
throw new CircuitBreakerOpenError('Circuit breaker is OPEN');
}
break;
case CircuitState.HALF_OPEN:
// Allow limited requests through
if (windowStats.requestCount >= this.config.halfOpenMaxRequests) {
throw new CircuitBreakerOpenError('Half-open request limit exceeded');
}
break;
}
}
private shouldOpenCircuit(stats: WindowStats): boolean {
if (stats.requestCount < this.config.minimumRequests) {
return false;
}
const errorRate = stats.errorCount / stats.requestCount;
const slowCallRate = stats.slowCallCount / stats.requestCount;
return errorRate >= this.config.errorThreshold ||
slowCallRate >= this.config.slowCallThreshold;
}
}
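
Callers wrap each remote call in `execute()`. A usage sketch with illustrative thresholds; the config values and the service URL are assumptions for the example, not recommendations:

```typescript
const breaker = new AdvancedCircuitBreaker(
  {
    windowSize: 100,          // calls tracked in the sliding window
    windowDuration: 60_000,   // ms
    minimumRequests: 20,      // don't open the circuit on a tiny sample
    errorThreshold: 0.5,      // open at >= 50% failures
    slowCallThreshold: 0.3,   // or >= 30% slow calls
    halfOpenMaxRequests: 5
  },
  new MetricsCollector()
);

try {
  const profile = await breaker.execute(() =>
    fetch('https://user-service.internal/profiles/42').then(r => r.json())
  );
  console.log(profile);
} catch (err) {
  // A CircuitBreakerOpenError here means the call was rejected without touching the backend
  console.warn('user-service call failed or was short-circuited', err);
}
```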
Bulkhead Pattern Implementation
class ResourceBulkhead {
private pools: Map<string, ResourcePool> = new Map();
constructor(private config: BulkheadConfig) {
this.initializePools();
}
private initializePools() {
// Critical operations get the largest pool
this.pools.set('critical', new ResourcePool({
maxSize: this.config.totalResources * 0.6,
minSize: this.config.totalResources * 0.4,
priority: Priority.HIGH
}));
// Important operations get a medium pool
this.pools.set('important', new ResourcePool({
maxSize: this.config.totalResources * 0.3,
minSize: this.config.totalResources * 0.2,
priority: Priority.MEDIUM
}));
// Best-effort operations get remaining resources
this.pools.set('best-effort', new ResourcePool({
maxSize: this.config.totalResources * 0.1,
minSize: 1,
priority: Priority.LOW
}));
}
async executeWithBulkhead<T>(
category: string,
operation: () => Promise<T>
): Promise<T> {
const pool = this.pools.get(category);
if (!pool) {
throw new Error(`Unknown bulkhead category: ${category}`);
}
const resource = await pool.acquire();
try {
return await operation();
} finally {
pool.release(resource);
}
}
}
// Usage example
const bulkhead = new ResourceBulkhead({
totalResources: 100
});
// Critical user operations
await bulkhead.executeWithBulkhead('critical', async () => {
return await processPayment(paymentData);
});
// Analytics operations (best-effort)
await bulkhead.executeWithBulkhead('best-effort', async () => {
return await updateAnalytics(eventData);
});
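
`ResourcePool` is referenced above but not shown. A minimal semaphore-style sketch of the acquire/release contract it is assumed to provide (priority handling omitted):

```typescript
// Sketch of the pool contract assumed by ResourceBulkhead: acquire() resolves when a slot
// is free, release() hands the slot to the next waiter. Priority is accepted but ignored here.
class ResourcePool {
  private inUse = 0;
  private waiters: Array<() => void> = [];

  constructor(private options: { maxSize: number; minSize: number; priority: Priority }) {}

  async acquire(): Promise<number> {
    if (this.inUse < this.options.maxSize) {
      this.inUse++;
      return this.inUse;
    }
    // Queue until release() hands us a slot directly
    await new Promise<void>(resolve => this.waiters.push(resolve));
    return this.inUse;
  }

  release(_resource: number): void {
    const next = this.waiters.shift();
    if (next) {
      next(); // hand the slot off without freeing it
      return;
    }
    this.inUse--;
  }
}
```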
Observability and Incident Response
Comprehensive Monitoring Stack
class ObservabilityPlatform {
private metrics: MetricsCollector;
private traces: TracingService;
private logs: LoggingService;
private alerts: AlertManager;
  constructor() {
    // Assumes no-arg constructors for these collaborators; inject them if yours differ
    this.metrics = new MetricsCollector();
    this.traces = new TracingService();
    this.logs = new LoggingService();
    this.alerts = new AlertManager();
    this.setupGoldenSignals();
    // Async setup is intentionally fire-and-forget from the constructor
    void this.setupSLIMonitoring();
    this.setupIncidentResponse();
  }
private setupGoldenSignals() {
// The four golden signals of monitoring
this.metrics.registerGauge('latency_p99', {
description: '99th percentile latency',
labels: ['service', 'endpoint']
});
this.metrics.registerCounter('request_total', {
description: 'Total requests',
labels: ['service', 'endpoint', 'status']
});
this.metrics.registerGauge('error_rate', {
description: 'Error rate percentage',
labels: ['service', 'endpoint']
});
this.metrics.registerGauge('saturation', {
description: 'Resource utilization',
labels: ['service', 'resource_type']
});
}
async setupSLIMonitoring() {
const slis = [
{
name: 'api_availability',
query: 'sum(rate(http_requests_total{status!~"5.."}[5m])) / sum(rate(http_requests_total[5m]))',
threshold: 0.999, // 99.9% availability
window: '30d'
},
{
name: 'api_latency',
query: 'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))',
threshold: 0.5, // 500ms
window: '30d'
}
];
for (const sli of slis) {
await this.alerts.createSLIAlert(sli);
}
}
}
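
Golden signals only have value if every request feeds them. A sketch of request-boundary instrumentation, assuming Express and a hypothetical `metrics` handle with `increment`/`observe` methods standing in for the `MetricsCollector` above:

```typescript
import express from 'express';

// Hypothetical metrics handle; in practice this would forward to the MetricsCollector
const metrics = {
  increment: (_name: string, _labels: Record<string, string>) => { /* forward to the collector */ },
  observe: (_name: string, _value: number, _labels: Record<string, string>) => { /* forward to the collector */ }
};

const app = express();

// Feeds the traffic and latency golden signals at the request boundary
app.use((req, res, next) => {
  const start = process.hrtime.bigint();
  res.on('finish', () => {
    const seconds = Number(process.hrtime.bigint() - start) / 1e9;
    metrics.increment('request_total', {
      service: 'api',
      endpoint: req.path,
      status: String(res.statusCode)
    });
    metrics.observe('request_duration_seconds', seconds, { service: 'api', endpoint: req.path });
  });
  next();
});
```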
Automated Incident Response
class IncidentResponseAutomation {
private runbooks: Map<string, Runbook> = new Map();
private escalationPolicies: EscalationPolicy[] = [];
constructor(
private pagerDuty: PagerDutyService,
private slack: SlackService,
private kubernetes: KubernetesClient
) {
this.loadRunbooks();
}
async handleIncident(alert: Alert): Promise<IncidentResponse> {
const incident = await this.createIncident(alert);
// Execute automated remediation
const runbook = this.runbooks.get(alert.type);
if (runbook && runbook.autoRemediation) {
try {
await this.executeRunbook(runbook, incident);
if (await this.verifyResolution(incident)) {
await this.resolveIncident(incident);
return { status: 'auto-resolved', incident };
}
} catch (error) {
console.error('Auto-remediation failed:', error);
}
}
// Escalate to human responders
await this.escalateIncident(incident);
return { status: 'escalated', incident };
}
private async executeRunbook(runbook: Runbook, incident: Incident) {
for (const step of runbook.steps) {
switch (step.type) {
case 'scale_deployment':
await this.kubernetes.scaleDeployment(
step.deployment,
step.replicas
);
break;
case 'restart_pods':
await this.kubernetes.restartPods(step.selector);
break;
case 'toggle_feature_flag':
await this.toggleFeatureFlag(step.flag, step.enabled);
break;
case 'drain_traffic':
await this.drainTrafficFromRegion(step.region);
break;
}
// Wait for step to take effect
await new Promise(resolve => setTimeout(resolve, step.waitTime || 30000));
}
}
}
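
The runbooks driving this loop are plain data keyed by alert type. An illustrative example using the step types handled above; the alert type, flag name, deployment, and concrete values are assumptions:

```typescript
// Illustrative runbook registered by loadRunbooks() under the alert type it handles.
const highErrorRateRunbook: Runbook = {
  autoRemediation: true,
  steps: [
    // Shed optional work first, then add capacity, then clear any wedged pods
    { type: 'toggle_feature_flag', flag: 'recommendations-v2', enabled: false, waitTime: 15000 },
    { type: 'scale_deployment', deployment: 'api-gateway', replicas: 12, waitTime: 60000 },
    { type: 'restart_pods', selector: 'app=api-gateway', waitTime: 30000 }
  ]
};
```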
Performance Under Load
Adaptive Load Balancing
class AdaptiveLoadBalancer {
private backends: Backend[] = [];
private healthChecker: HealthChecker;
private metricsCollector: MetricsCollector;
constructor() {
this.healthChecker = new HealthChecker();
this.metricsCollector = new MetricsCollector();
this.startAdaptiveAlgorithm();
}
async selectBackend(request: Request): Promise<Backend> {
const healthyBackends = this.backends.filter(b => b.isHealthy);
if (healthyBackends.length === 0) {
throw new Error('No healthy backends available');
}
// Use weighted round-robin with dynamic weights
const weights = await this.calculateDynamicWeights(healthyBackends);
return this.weightedSelection(healthyBackends, weights);
}
private async calculateDynamicWeights(backends: Backend[]): Promise<number[]> {
const metrics = await Promise.all(
backends.map(b => this.metricsCollector.getBackendMetrics(b.id))
);
return metrics.map(metric => {
// Weight based on inverse of response time and error rate
const responseTimeFactor = 1 / (metric.avgResponseTime + 1);
const errorRateFactor = 1 - metric.errorRate;
const cpuFactor = 1 - metric.cpuUtilization;
return responseTimeFactor * errorRateFactor * cpuFactor;
});
}
private startAdaptiveAlgorithm() {
setInterval(async () => {
await this.adjustBackendWeights();
await this.scaleBasedOnDemand();
}, 30000); // Adjust every 30 seconds
}
}
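
`weightedSelection` is referenced but not shown. A minimal weighted-random sketch, assuming the weights align index-for-index with the backend list and are non-negative:

```typescript
// Weighted random selection: backends with larger weights are proportionally more likely to be picked.
function weightedSelection<T>(backends: T[], weights: number[]): T {
  const total = weights.reduce((sum, w) => sum + w, 0);
  if (total <= 0) {
    // Degenerate case: every weight is zero, fall back to a uniform pick
    return backends[Math.floor(Math.random() * backends.length)];
  }
  let threshold = Math.random() * total;
  for (let i = 0; i < backends.length; i++) {
    threshold -= weights[i];
    if (threshold <= 0) {
      return backends[i];
    }
  }
  return backends[backends.length - 1];
}
```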
Disaster Recovery Strategies
Automated Backup and Recovery
class DisasterRecoveryOrchestrator {
private backupScheduler: BackupScheduler;
private recoveryProcedures: Map<string, RecoveryProcedure> = new Map();
constructor() {
this.setupBackupSchedules();
this.setupRecoveryProcedures();
}
async executeDisasterRecovery(scenario: DisasterScenario): Promise<RecoveryResult> {
const procedure = this.recoveryProcedures.get(scenario.type);
if (!procedure) {
throw new Error(`No recovery procedure for scenario: ${scenario.type}`);
}
const recoveryPlan = await this.generateRecoveryPlan(scenario, procedure);
// Execute recovery in phases
for (const phase of recoveryPlan.phases) {
await this.executeRecoveryPhase(phase);
// Validate phase completion
const validation = await this.validatePhase(phase);
if (!validation.success) {
throw new Error(`Recovery phase failed: ${validation.error}`);
}
}
return {
scenario: scenario.type,
duration: recoveryPlan.estimatedDuration,
success: true,
dataLoss: recoveryPlan.estimatedDataLoss
};
}
private async generateRecoveryPlan(
scenario: DisasterScenario,
procedure: RecoveryProcedure
): Promise<RecoveryPlan> {
const plan: RecoveryPlan = {
phases: [],
estimatedDuration: 0,
estimatedDataLoss: 0
};
// Phase 1: Assess damage and stop traffic
plan.phases.push({
name: 'assessment',
steps: [
'stop_incoming_traffic',
'assess_data_integrity',
'identify_affected_services'
],
estimatedDuration: 300 // 5 minutes
});
// Phase 2: Restore from backups
plan.phases.push({
name: 'restore',
steps: [
'restore_database_from_backup',
'restore_file_systems',
'validate_data_integrity'
],
estimatedDuration: 1800 // 30 minutes
});
// Phase 3: Restart services
plan.phases.push({
name: 'restart',
steps: [
'start_core_services',
'run_health_checks',
'gradually_restore_traffic'
],
estimatedDuration: 600 // 10 minutes
});
    // Roll the per-phase estimates up into the overall plan estimate
    plan.estimatedDuration = plan.phases.reduce((total, phase) => total + phase.estimatedDuration, 0);
    return plan;
}
}
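
Regular drills should exercise the same code path as a real event. A usage sketch, assuming a `region_failure` scenario type with a matching procedure registered in `setupRecoveryProcedures()`; the scenario shape is an assumption for the example:

```typescript
const orchestrator = new DisasterRecoveryOrchestrator();

// 'region_failure' and affectedRegion are illustrative; executeDisasterRecovery throws
// if no procedure is registered for the scenario type.
const result = await orchestrator.executeDisasterRecovery({
  type: 'region_failure',
  affectedRegion: 'us-east-1'
} as DisasterScenario);

console.log(`Recovery completed in ~${result.duration}s, estimated data loss: ${result.dataLoss}`);
```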
Conclusion
Building resilient systems is an ongoing journey that requires embracing failure as a design constraint. The patterns and practices outlined here have been proven in production environments handling millions of users and petabytes of data.
Key takeaways for 2025:
- Design for Failure: Assume components will fail and design accordingly
- Embrace Chaos Engineering: Proactively test your assumptions about system behavior
- Implement Comprehensive Observability: You can't fix what you can't see
- Automate Recovery: Human response time is too slow for modern systems
- Practice Disaster Recovery: Regular drills ensure procedures work when needed
Remember: resilience is not a destination but a continuous practice of improvement and adaptation.
These patterns have been refined through managing systems with 99.99% uptime requirements across financial services, healthcare, and e-commerce platforms. Each technique has been battle-tested through real production incidents.