Modern DevOps: Infrastructure as Code with Terraform and Pulumi
Infrastructure as Code (IaC) has evolved from a nice-to-have to an absolute necessity in modern software development. After managing infrastructure for systems serving millions of users, I've learned that the choice between Terraform and Pulumi isn't just about syntax—it's about team capabilities, organizational needs, and long-term maintainability.
The Evolution of Infrastructure Management
From ClickOps to GitOps
The journey from manual infrastructure management to fully automated GitOps workflows represents one of the most significant improvements in operational efficiency I've witnessed.
// Modern IaC workflow with Pulumi
import * as aws from "@pulumi/aws";
import * as awsx from "@pulumi/awsx";
import * as kubernetes from "@pulumi/kubernetes";
class ModernInfrastructure {
private cluster: aws.eks.Cluster;
private vpc: awsx.ec2.Vpc;
private nodeGroup: aws.eks.NodeGroup;
constructor(private config: InfrastructureConfig) {
this.createNetworking();
this.createCompute();
this.setupMonitoring();
}
private createNetworking() {
this.vpc = new awsx.ec2.Vpc("main-vpc", {
cidrBlock: "10.0.0.0/16",
numberOfAvailabilityZones: 3,
enableDnsHostnames: true,
enableDnsSupport: true,
subnets: [
{
type: "public",
cidrMask: 24,
tags: { "kubernetes.io/role/elb": "1" }
},
{
type: "private",
cidrMask: 24,
tags: { "kubernetes.io/role/internal-elb": "1" }
}
]
});
}
private createCompute() {
// EKS Cluster with managed node groups
this.cluster = new aws.eks.Cluster("main-cluster", {
  version: "1.28",
  // roleArn is required; the cluster IAM role helper is omitted for brevity
  roleArn: this.createClusterRole().arn,
  vpcConfig: {
    subnetIds: this.vpc.privateSubnetIds,
    endpointPrivateAccess: true,
    endpointPublicAccess: true,
    publicAccessCidrs: ["0.0.0.0/0"] // flagged by the security scanner later in this post; restrict in production
},
enabledClusterLogTypes: [
"api", "audit", "authenticator", "controllerManager", "scheduler"
]
});
// Managed node group with spot instances
this.nodeGroup = new aws.eks.NodeGroup("main-nodes", {
clusterName: this.cluster.name,
nodeRoleArn: this.createNodeRole().arn,
subnetIds: this.vpc.privateSubnetIds,
capacityType: "SPOT",
instanceTypes: ["t3.medium", "t3.large"],
scalingConfig: {
desiredSize: 3,
maxSize: 10,
minSize: 1
},
updateConfig: {
maxUnavailablePercentage: 25
}
});
}
}
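To put the class to work, you instantiate it once at the top of a Pulumi program. Here is a minimal sketch; the InfrastructureConfig shape below is my assumption, since the interface isn't shown above:

// Hypothetical entry point wiring the class into a Pulumi program
import * as pulumi from "@pulumi/pulumi";

interface InfrastructureConfig {
  environment: string; // assumed fields; adjust to your own config shape
  region: string;
}

const cfg = new pulumi.Config();
const infra = new ModernInfrastructure({
  environment: cfg.require("environment"),
  region: cfg.get("region") ?? "us-east-1",
});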
Terraform vs Pulumi: A Practical Comparison
Terraform: The Declarative Approach
## Terraform configuration for multi-environment setup
terraform {
required_version = ">= 1.0"
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
}
backend "s3" {
bucket = "terraform-state-bucket"
key = "infrastructure/terraform.tfstate"
region = "us-east-1"
encrypt = true
dynamodb_table = "terraform-locks"
}
}
## Reusable module for EKS cluster
module "eks_cluster" {
source = "./modules/eks"
cluster_name = var.cluster_name
cluster_version = var.cluster_version
vpc_id = module.vpc.vpc_id
subnet_ids = module.vpc.private_subnets
node_groups = {
main = {
instance_types = ["t3.medium", "t3.large"]
capacity_type = "SPOT"
min_size = 1
max_size = 10
desired_size = 3
k8s_labels = {
Environment = var.environment
NodeGroup = "main"
}
tags = {
"kubernetes.io/cluster/${var.cluster_name}" = "owned"
}
}
}
# IRSA roles for service accounts
irsa_roles = [
{
name = "aws-load-balancer-controller"
namespace = "kube-system"
policy_arns = [
"arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess"
]
},
{
name = "external-dns"
namespace = "kube-system"
policy_arns = [
aws_iam_policy.external_dns.arn
]
}
]
}
## Custom IAM policy for external-dns
resource "aws_iam_policy" "external_dns" {
name_prefix = "external-dns-"
policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Effect = "Allow"
Action = [
"route53:ChangeResourceRecordSets",
"route53:ListHostedZones",
"route53:ListResourceRecordSets"
]
Resource = "*"
}
]
})
}
Pulumi: The Programmatic Approach
// Pulumi configuration with advanced patterns
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
import * as kubernetes from "@pulumi/kubernetes";
interface ClusterConfig {
name: string;
version: string;
nodeGroups: NodeGroupConfig[];
addons: AddonConfig[];
}
class EKSClusterBuilder {
  private cluster: aws.eks.Cluster;
  private provider: kubernetes.Provider;
  // Assumed: an IAM OIDC provider registered for the cluster's issuer URL.
  // IRSA requires one; its creation is omitted here for brevity.
  private oidcProvider: aws.iam.OpenIdConnectProvider;
  constructor(private config: ClusterConfig) {}
async build(): Promise<EKSCluster> {
// Create cluster with advanced configuration
this.cluster = new aws.eks.Cluster(this.config.name, {
version: this.config.version,
vpcConfig: await this.getVpcConfig(),
encryptionConfig: [{
provider: {
keyArn: await this.createKMSKey()
},
resources: ["secrets"]
}],
enabledClusterLogTypes: [
"api", "audit", "authenticator", "controllerManager", "scheduler"
]
});
    // Create a Kubernetes provider. aws.eks.Cluster does not export a
    // kubeconfig; one must be assembled from the cluster endpoint and CA data
    // (the higher-level @pulumi/eks Cluster component provides this directly).
    this.provider = new kubernetes.Provider("k8s-provider", {
      kubeconfig: this.buildKubeconfig() // hypothetical helper, omitted for brevity
    });
    // Install essential addons
    this.installAddons();
    // Create node groups
    this.createNodeGroups();
return new EKSCluster(this.cluster, this.provider);
}
  private installAddons(): void {
    // AWS Load Balancer Controller (Helm repo URLs belong under fetchOpts.repo)
    const albController = new kubernetes.helm.v3.Chart("aws-load-balancer-controller", {
      chart: "aws-load-balancer-controller",
      fetchOpts: { repo: "https://aws.github.io/eks-charts" },
      namespace: "kube-system",
      values: {
        clusterName: this.cluster.name,
        serviceAccount: {
          create: true,
          annotations: {
            "eks.amazonaws.com/role-arn": this.createIRSARole("aws-load-balancer-controller")
          }
        }
      }
    }, { provider: this.provider });
    // External DNS
    const externalDns = new kubernetes.helm.v3.Chart("external-dns", {
      chart: "external-dns",
      fetchOpts: { repo: "https://kubernetes-sigs.github.io/external-dns/" },
      namespace: "kube-system",
      values: {
        serviceAccount: {
          annotations: {
            "eks.amazonaws.com/role-arn": this.createIRSARole("external-dns")
          }
        },
        domainFilters: ["example.com"],
        policy: "sync"
      }
    }, { provider: this.provider });
  }
  private createIRSARole(serviceName: string): pulumi.Output<string> {
    const role = new aws.iam.Role(`${serviceName}-role`, {
      assumeRolePolicy: pulumi
        .all([this.oidcProvider.arn, this.oidcProvider.url])
        .apply(([providerArn, providerUrl]) => {
          // Condition keys use the issuer URL without the scheme
          const issuer = providerUrl.replace(/^https:\/\//, "");
          return JSON.stringify({
            Version: "2012-10-17",
            Statement: [{
              Effect: "Allow",
              // The federated principal must be the IAM OIDC provider ARN,
              // not the issuer URL itself
              Principal: { Federated: providerArn },
              Action: "sts:AssumeRoleWithWebIdentity",
              Condition: {
                StringEquals: {
                  [`${issuer}:sub`]: `system:serviceaccount:kube-system:${serviceName}`,
                  [`${issuer}:aud`]: "sts.amazonaws.com"
                }
              }
            }]
          });
        })
    });
    return role.arn;
  }
}
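Because build() is async, the builder slots into the promise-based entry point Pulumi supports. A minimal usage sketch; the destructured cluster field assumes EKSCluster exposes its underlying aws.eks.Cluster:

// Hypothetical usage; Pulumi accepts a promise of stack outputs from the entry module
module.exports = (async () => {
  const builder = new EKSClusterBuilder({
    name: "main-cluster",
    version: "1.28",
    nodeGroups: [],
    addons: [],
  });
  const { cluster } = await builder.build(); // assumes EKSCluster exposes `cluster`
  return { clusterEndpoint: cluster.endpoint };
})();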
Advanced GitOps Workflows
Multi-Environment Pipeline
## GitHub Actions workflow for infrastructure deployment
name: Infrastructure Deployment
on:
push:
branches: [main, develop]
paths: ['infrastructure/**']
pull_request:
paths: ['infrastructure/**']
permissions:
  id-token: write # required for aws-actions/configure-aws-credentials OIDC role assumption
  contents: read
env:
  AWS_REGION: us-east-1
  TERRAFORM_VERSION: 1.6.0
  PULUMI_VERSION: 3.90.0
jobs:
plan:
name: Plan Infrastructure Changes
runs-on: ubuntu-latest
strategy:
matrix:
environment: [dev, staging, prod]
tool: [terraform, pulumi]
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
aws-region: ${{ env.AWS_REGION }}
- name: Setup Terraform
if: matrix.tool == 'terraform'
uses: hashicorp/setup-terraform@v3
with:
terraform_version: ${{ env.TERRAFORM_VERSION }}
- name: Setup Pulumi
if: matrix.tool == 'pulumi'
uses: pulumi/actions@v4
with:
pulumi-version: ${{ env.PULUMI_VERSION }}
- name: Terraform Plan
if: matrix.tool == 'terraform'
working-directory: infrastructure/terraform
run: |
terraform init -backend-config="key=environments/${{ matrix.environment }}/terraform.tfstate"
terraform plan -var-file="environments/${{ matrix.environment }}.tfvars" -out=tfplan
- name: Pulumi Preview
if: matrix.tool == 'pulumi'
working-directory: infrastructure/pulumi
run: |
  pulumi stack select ${{ matrix.environment }}
  pulumi preview --diff
  pulumi preview --json > pulumi-preview.json
- name: Upload Plan Artifacts
uses: actions/upload-artifact@v3
with:
name: ${{ matrix.tool }}-plan-${{ matrix.environment }}
path: |
infrastructure/${{ matrix.tool }}/tfplan
infrastructure/${{ matrix.tool }}/pulumi-preview.json
security-scan:
name: Security Scanning
runs-on: ubuntu-latest
needs: plan
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Run Checkov
uses: bridgecrewio/checkov-action@master
with:
directory: infrastructure/
framework: terraform,kubernetes
output_format: sarif
output_file_path: checkov-results.sarif
- name: Run tfsec
uses: aquasecurity/tfsec-action@v1.0.3
with:
working_directory: infrastructure/terraform
- name: Upload SARIF file
uses: github/codeql-action/upload-sarif@v2
with:
sarif_file: checkov-results.sarif
deploy:
name: Deploy Infrastructure
runs-on: ubuntu-latest
needs: [plan, security-scan]
if: github.ref == 'refs/heads/main'
environment: production
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Deploy to Production
run: |
# Implement blue-green deployment strategy
./scripts/blue-green-deploy.sh
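The referenced ./scripts/blue-green-deploy.sh is not shown here; one way to implement the same idea is to drive the stacks programmatically with Pulumi's Automation API. A minimal sketch, with the stack names and the health-check step as my assumptions:

// blue-green-deploy.ts — hypothetical sketch using Pulumi's Automation API
import { LocalWorkspace } from "@pulumi/pulumi/automation";

async function blueGreenDeploy(): Promise<void> {
  // Bring up (or update) the idle "green" stack alongside the serving "blue"
  const green = await LocalWorkspace.createOrSelectStack({
    stackName: "prod-green",
    workDir: "./infrastructure/pulumi",
  });
  await green.up({ onOutput: console.log });

  // In a real script: smoke-test green's endpoint before shifting traffic;
  // if checks fail, blue simply keeps serving
  const outputs = await green.outputs();
  console.log(`green endpoint: ${outputs.endpoint?.value}`);
}

blueGreenDeploy().catch((err) => {
  console.error(err);
  process.exit(1);
});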
Infrastructure Testing Strategies
Terratest for Infrastructure Validation
package test
import (
	"testing"
	"time"

	"github.com/gruntwork-io/terratest/modules/aws"
	"github.com/gruntwork-io/terratest/modules/k8s"
	"github.com/gruntwork-io/terratest/modules/random"
	"github.com/gruntwork-io/terratest/modules/terraform"
	"github.com/stretchr/testify/assert"
)
func TestEKSCluster(t *testing.T) {
t.Parallel()
// Configure Terraform options
terraformOptions := &terraform.Options{
TerraformDir: "../infrastructure/terraform",
VarFiles: []string{"test.tfvars"},
Vars: map[string]interface{}{
"cluster_name": "test-cluster-" + randomString(8),
"environment": "test",
},
BackendConfig: map[string]interface{}{
"bucket": "terraform-test-state",
"key": "test/terraform.tfstate",
"region": "us-east-1",
},
}
// Clean up resources after test
defer terraform.Destroy(t, terraformOptions)
// Deploy infrastructure
terraform.InitAndApply(t, terraformOptions)
// Validate cluster creation
clusterName := terraform.Output(t, terraformOptions, "cluster_name")
awsRegion := terraform.Output(t, terraformOptions, "aws_region")
// Test cluster is accessible
cluster := aws.GetEksCluster(t, awsRegion, clusterName)
assert.Equal(t, "ACTIVE", *cluster.Status)
// Test node groups are healthy
nodeGroups := aws.GetEksNodeGroups(t, awsRegion, clusterName)
assert.True(t, len(nodeGroups) > 0)
for _, nodeGroup := range nodeGroups {
assert.Equal(t, "ACTIVE", *nodeGroup.Status)
}
// Test Kubernetes API accessibility
testKubernetesConnectivity(t, terraformOptions)
}
func testKubernetesConnectivity(t *testing.T, terraformOptions *terraform.Options) {
kubeconfigPath := terraform.Output(t, terraformOptions, "kubeconfig_path")
// Test basic kubectl commands
kubectl := k8s.NewKubectlOptions("", kubeconfigPath, "default")
// Wait for nodes to be ready
k8s.WaitUntilAllNodesReady(t, kubectl, 10, 30*time.Second)
// Deploy test application
k8s.KubectlApply(t, kubectl, "test-manifests/")
defer k8s.KubectlDelete(t, kubectl, "test-manifests/")
// Wait for deployment to be ready
k8s.WaitUntilDeploymentAvailable(t, kubectl, "test-app", 10, 30*time.Second)
}
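Terratest covers the Terraform side; for Pulumi programs, the equivalent fast feedback comes from unit tests that run against mocked resource calls instead of a live cloud. A minimal jest-style sketch, where the test harness and the infra.cluster export are my assumptions:

// cluster.spec.ts — hypothetical unit test with mocked Pulumi resources
// (jest globals describe/it/expect assumed via ts-jest or similar)
import * as pulumi from "@pulumi/pulumi";

pulumi.runtime.setMocks({
  // Return a fake ID and echo inputs back as outputs for every resource
  newResource: (args) => ({ id: `${args.name}-id`, state: args.inputs }),
  call: (args) => args.inputs,
});

describe("EKS cluster", () => {
  it("enables control-plane audit logging", async () => {
    // Import after mocks are installed so resource creation is intercepted
    const infra = await import("./index"); // assumes index exports `cluster`
    const logTypes = await new Promise<string[]>((resolve) =>
      infra.cluster.enabledClusterLogTypes.apply(resolve)
    );
    expect(logTypes).toContain("audit");
  });
});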
Cost Optimization Strategies
Automated Cost Management
// Pulumi automation for cost optimization
import * as aws from "@pulumi/aws";
import * as kubernetes from "@pulumi/kubernetes";
import * as pulumi from "@pulumi/pulumi";
class CostOptimizer {
private config: pulumi.Config;
constructor() {
this.config = new pulumi.Config();
}
createSpotNodeGroup(clusterName: pulumi.Output<string>): aws.eks.NodeGroup {
return new aws.eks.NodeGroup("spot-nodes", {
  clusterName: clusterName,
  // nodeRoleArn and subnetIds are also required; omitted here for brevity
  capacityType: "SPOT",
  instanceTypes: ["t3.medium", "t3.large", "t3.xlarge"],
  // Mixed instance types improve spot availability; a launch template
  // reference must also include an id or name, omitted here for brevity
  launchTemplate: {
    version: "$Latest"
  },
scalingConfig: {
desiredSize: 3,
maxSize: 20,
minSize: 1
},
// Taints for spot instances
taints: [{
key: "spot-instance",
value: "true",
effect: "NO_SCHEDULE"
}],
tags: {
"k8s.io/cluster-autoscaler/enabled": "true",
"k8s.io/cluster-autoscaler/node-template/taint/spot-instance": "true:NoSchedule"
}
});
}
setupAutoscaling(clusterName: string): void {
// Cluster Autoscaler configuration
const clusterAutoscaler = new kubernetes.apps.v1.Deployment("cluster-autoscaler", {
metadata: {
name: "cluster-autoscaler",
namespace: "kube-system"
},
spec: {
selector: {
matchLabels: {
app: "cluster-autoscaler"
}
},
template: {
metadata: {
labels: {
app: "cluster-autoscaler"
}
},
spec: {
containers: [{
name: "cluster-autoscaler",
image: "k8s.gcr.io/autoscaling/cluster-autoscaler:v1.21.0",
command: [
"./cluster-autoscaler",
`--v=4`,
`--stderrthreshold=info`,
`--cloud-provider=aws`,
`--skip-nodes-with-local-storage=false`,
`--expander=least-waste`,
`--node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/${clusterName}`,
`--balance-similar-node-groups`,
`--skip-nodes-with-system-pods=false`
],
resources: {
limits: {
cpu: "100m",
memory: "300Mi"
},
requests: {
cpu: "100m",
memory: "300Mi"
}
}
}]
}
}
}
});
}
// Automated resource cleanup
createCleanupLambda(): aws.lambda.Function {
return new aws.lambda.Function("resource-cleanup", {
  runtime: aws.lambda.Runtime.Python3d9,
  // role is required; the execution-role helper is omitted for brevity
  role: this.createLambdaRole().arn,
code: new pulumi.asset.AssetArchive({
".": new pulumi.asset.FileArchive("./lambda/cleanup")
}),
handler: "cleanup.handler",
environment: {
variables: {
ENVIRONMENT: this.config.require("environment")
}
},
// Invoked on a daily schedule; see the EventBridge wiring sketched below
timeout: 300
});
}
}
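One piece the class above leaves dangling is the daily trigger its comment promises. Here is a sketch of that wiring with EventBridge, assuming the function object returned by createCleanupLambda() is in scope as cleanupFn:

// Hypothetical EventBridge wiring for the daily cleanup run
const schedule = new aws.cloudwatch.EventRule("cleanup-schedule", {
  scheduleExpression: "rate(1 day)",
});

// Allow EventBridge to invoke the function
new aws.lambda.Permission("cleanup-permission", {
  action: "lambda:InvokeFunction",
  function: cleanupFn.name, // cleanupFn = optimizer.createCleanupLambda()
  principal: "events.amazonaws.com",
  sourceArn: schedule.arn,
});

// Point the rule at the function
new aws.cloudwatch.EventTarget("cleanup-target", {
  rule: schedule.name,
  arn: cleanupFn.arn,
});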
Monitoring and Observability
Comprehensive Infrastructure Monitoring
## Prometheus configuration for infrastructure monitoring
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: monitoring
data:
prometheus.yml: |
global:
scrape_interval: 15s
evaluation_interval: 15s
rule_files:
- "/etc/prometheus/rules/*.yml"
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
scrape_configs:
# Kubernetes API server
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
# Node metrics
- job_name: 'kubernetes-nodes'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
# AWS Load Balancer Controller metrics
- job_name: 'aws-load-balancer-controller'
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- kube-system
relabel_configs:
- source_labels: [__meta_kubernetes_service_name]
action: keep
regex: aws-load-balancer-webhook-service
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: infrastructure-alerts
namespace: monitoring
spec:
groups:
- name: infrastructure.rules
rules:
- alert: HighNodeCPUUsage
expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "High CPU usage on node {{ $labels.instance }}"
description: "Node {{ $labels.instance }} has CPU usage above 80% for more than 5 minutes."
- alert: HighNodeMemoryUsage
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
for: 5m
labels:
severity: warning
annotations:
summary: "High memory usage on node {{ $labels.instance }}"
description: "Node {{ $labels.instance }} has memory usage above 85% for more than 5 minutes."
- alert: KubernetesNodeNotReady
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 10m
labels:
severity: critical
annotations:
summary: "Kubernetes node not ready"
description: "Node {{ $labels.node }} has been not ready for more than 10 minutes."
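Rather than hand-applying these manifests, the monitoring stack itself can be provisioned from the IaC program. A sketch using Pulumi's Helm support to install kube-prometheus-stack; the chart values are illustrative, not a vetted production configuration:

// Hypothetical Pulumi deployment of the monitoring stack via Helm
import * as kubernetes from "@pulumi/kubernetes";

const monitoring = new kubernetes.helm.v3.Release("kube-prometheus-stack", {
  chart: "kube-prometheus-stack",
  namespace: "monitoring",
  createNamespace: true,
  repositoryOpts: {
    repo: "https://prometheus-community.github.io/helm-charts",
  },
  values: {
    prometheus: {
      prometheusSpec: {
        // Pick up PrometheusRule objects like the one above, not just
        // the rules shipped with the chart
        ruleSelectorNilUsesHelmValues: false,
      },
    },
  },
});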
Security and Compliance
Infrastructure Security Scanning
## Python script for automated security compliance checking
import boto3
import json
import sys
from typing import Dict, List, Any
class InfrastructureSecurityScanner:
def __init__(self):
self.ec2 = boto3.client('ec2')
self.iam = boto3.client('iam')
self.eks = boto3.client('eks')
self.s3 = boto3.client('s3')
def scan_security_groups(self) -> List[Dict[str, Any]]:
"""Scan for overly permissive security groups"""
violations = []
response = self.ec2.describe_security_groups()
for sg in response['SecurityGroups']:
for rule in sg.get('IpPermissions', []):
# Check for 0.0.0.0/0 access
for ip_range in rule.get('IpRanges', []):
if ip_range.get('CidrIp') == '0.0.0.0/0':
violations.append({
'type': 'overly_permissive_sg',
'resource': sg['GroupId'],
'description': f"Security group {sg['GroupId']} allows access from 0.0.0.0/0",
'severity': 'HIGH' if rule.get('FromPort') in [22, 3389] else 'MEDIUM'
})
return violations
def scan_iam_policies(self) -> List[Dict[str, Any]]:
"""Scan for overly permissive IAM policies"""
violations = []
# Check for policies with * actions
paginator = self.iam.get_paginator('list_policies')
for page in paginator.paginate(Scope='Local'):
for policy in page['Policies']:
policy_version = self.iam.get_policy_version(
PolicyArn=policy['Arn'],
VersionId=policy['DefaultVersionId']
)
document = policy_version['PolicyVersion']['Document']
for statement in document.get('Statement', []):
if isinstance(statement.get('Action'), str):
actions = [statement['Action']]
else:
actions = statement.get('Action', [])
if '*' in actions and statement.get('Effect') == 'Allow':
violations.append({
'type': 'overly_permissive_iam',
'resource': policy['Arn'],
'description': f"IAM policy {policy['PolicyName']} allows all actions (*)",
'severity': 'HIGH'
})
return violations
def scan_eks_clusters(self) -> List[Dict[str, Any]]:
"""Scan EKS clusters for security best practices"""
violations = []
clusters = self.eks.list_clusters()
for cluster_name in clusters['clusters']:
cluster = self.eks.describe_cluster(name=cluster_name)['cluster']
# Check if cluster endpoint is public
vpc_config = cluster.get('resourcesVpcConfig', {})
if vpc_config.get('endpointConfigPublicAccess'):
public_cidrs = vpc_config.get('publicAccessCidrs', [])
if '0.0.0.0/0' in public_cidrs:
violations.append({
'type': 'public_eks_endpoint',
'resource': cluster_name,
'description': f"EKS cluster {cluster_name} has public endpoint accessible from anywhere",
'severity': 'MEDIUM'
})
# Check if logging is enabled
logging = cluster.get('logging', {})
enabled_logs = [log['type'] for log in logging.get('clusterLogging', []) if log.get('enabled')]
required_logs = ['api', 'audit', 'authenticator']
missing_logs = set(required_logs) - set(enabled_logs)
if missing_logs:
violations.append({
'type': 'missing_eks_logs',
'resource': cluster_name,
'description': f"EKS cluster {cluster_name} missing required log types: {', '.join(missing_logs)}",
'severity': 'MEDIUM'
})
return violations
def generate_compliance_report(self) -> Dict[str, Any]:
"""Generate comprehensive compliance report"""
all_violations = []
all_violations.extend(self.scan_security_groups())
all_violations.extend(self.scan_iam_policies())
all_violations.extend(self.scan_eks_clusters())
# Categorize by severity
high_severity = [v for v in all_violations if v['severity'] == 'HIGH']
medium_severity = [v for v in all_violations if v['severity'] == 'MEDIUM']
low_severity = [v for v in all_violations if v['severity'] == 'LOW']
return {
'total_violations': len(all_violations),
'high_severity_count': len(high_severity),
'medium_severity_count': len(medium_severity),
'low_severity_count': len(low_severity),
'violations': all_violations,
'compliance_score': max(0, 100 - (len(high_severity) * 10 + len(medium_severity) * 5 + len(low_severity) * 1))
}
if __name__ == "__main__":
scanner = InfrastructureSecurityScanner()
report = scanner.generate_compliance_report()
print(json.dumps(report, indent=2))
# Fail the CI/CD run if high-severity violations are found
if report['high_severity_count'] > 0:
    sys.exit(1)
Future of Infrastructure as Code
Emerging Trends for 2025
- AI-Powered Infrastructure Optimization
- Policy as Code Integration (see the sketch after this list)
- Serverless Infrastructure Patterns
- Multi-Cloud Abstraction Layers
- GitOps for Everything
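Of these, policy as code is the most immediately actionable. A minimal sketch with Pulumi CrossGuard; the pack name and rule are illustrative, echoing the EKS endpoint check from the security scanner above:

// Hypothetical CrossGuard policy pack enforcing guardrails at deploy time
import * as aws from "@pulumi/aws";
import { PolicyPack, validateResourceOfType } from "@pulumi/policy";

new PolicyPack("infrastructure-guardrails", {
  policies: [{
    name: "eks-no-open-public-endpoint",
    description: "EKS public endpoints must not allow 0.0.0.0/0.",
    enforcementLevel: "mandatory",
    validateResource: validateResourceOfType(aws.eks.Cluster, (cluster, args, reportViolation) => {
      const cidrs = cluster.vpcConfig?.publicAccessCidrs ?? [];
      if (cidrs.includes("0.0.0.0/0")) {
        reportViolation("Restrict publicAccessCidrs to known CIDR ranges.");
      }
    }),
  }],
});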
Conclusion
The choice between Terraform and Pulumi ultimately depends on your team's expertise and organizational needs. Terraform excels in declarative simplicity and ecosystem maturity, while Pulumi offers programmatic flexibility and familiar development patterns.
Key takeaways for modern IaC:
- Embrace GitOps workflows for better collaboration and auditability
- Implement comprehensive testing at all levels
- Prioritize security scanning in your CI/CD pipeline
- Monitor infrastructure costs proactively
- Plan for disaster recovery from day one
The future of infrastructure management is code-driven, automated, and observable. By adopting these practices, you'll build more reliable, secure, and cost-effective systems.
These patterns and practices have been refined through managing infrastructure for applications serving millions of users across fintech, healthcare, and e-commerce platforms. Each approach has been battle-tested in production environments.