All Articles

Infrastructure as Code for AI Agent Deployments: Terraform Patterns and Best Practices

Managing AI agent infrastructure manually becomes unsustainable as your deployment complexity grows. Infrastructure as Code (IaC) provides reproducible, version-controlled, and auditable infrastructure management that scales with your AI operations. This article explores Terraform patterns specifically designed for AI agent workloads across AWS and GCP.

The AI Infrastructure Challenge

AI agents require specialized infrastructure considerations that traditional web applications don’t face:

  • GPU-enabled compute resources with specific driver requirements
  • High-memory instances for model loading and inference
  • Auto-scaling policies adapted to AI workload characteristics
  • Storage solutions optimized for large model files
  • Network configurations for model serving and data pipelines

Modular Terraform Architecture for AI Workloads

Structure your Terraform code with reusable modules that encapsulate AI-specific infrastructure patterns:

# environments/production/main.tf
# Root configuration for the production environment: pins the Terraform
# CLI version and the cloud providers used by the modules below.
terraform {
  required_version = ">= 1.0"
  required_providers {
    # AWS provider for the EKS-based cluster module.
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
    # Google provider for the GKE-based cluster module.
    google = {
      source  = "hashicorp/google"
      version = "~> 4.0"
    }
  }
}

# Multi-cloud AI infrastructure: AWS side (EKS).
module "aws_ai_infrastructure" {
  source = "../../modules/aws-ai-cluster"

  environment  = "production"
  cluster_name = "production-ai-agents"

  # Networking
  vpc_cidr           = "10.0.0.0/16"
  availability_zones = ["us-west-2a", "us-west-2b", "us-west-2c"]

  # Compute: mixed CPU/GPU node groups for agent workloads
  enable_gpu_nodes   = true
  gpu_instance_types = ["g4dn.xlarge", "g4dn.2xlarge"]
  cpu_instance_types = ["m5.large", "m5.xlarge", "c5.xlarge"]

  # Cost optimization: run most capacity on Spot
  use_spot_instances = true
  spot_percentage    = 70

  # Shared EFS volume for model artifacts (GB)
  efs_storage_size = 1000

  tags = local.common_tags
}

# Multi-cloud AI infrastructure: GCP side (GKE).
module "gcp_ai_infrastructure" {
  source = "../../modules/gcp-ai-cluster"

  project_id   = var.gcp_project_id
  region       = "us-central1"
  cluster_name = "production-ai-agents"

  # GPU accelerators attached to the GPU node pool
  enable_gpu_nodes = true
  gpu_type         = "nvidia-tesla-t4"
  gpu_count        = 1

  # Machine shape per node pool
  machine_types = {
    cpu_pool    = "n1-standard-4"
    gpu_pool    = "n1-standard-4"
    memory_pool = "n1-highmem-4"
  }

  # Node auto-scaling bounds
  min_node_count = 1
  max_node_count = 10

  # Boot disk per node
  disk_size_gb = 100
  disk_type    = "pd-ssd"

  labels = local.common_labels
}

AWS EKS Module for AI Agents

# modules/aws-ai-cluster/main.tf
# EKS control plane for the AI agent cluster.
resource "aws_eks_cluster" "ai_cluster" {
  name     = var.cluster_name
  role_arn = aws_iam_role.cluster_role.arn
  version  = var.kubernetes_version

  vpc_config {
    # Workers live on private subnets; public API access is opt-in.
    subnet_ids              = aws_subnet.private[*].id
    endpoint_private_access = true
    endpoint_public_access  = var.enable_public_access
    security_group_ids      = [aws_security_group.cluster.id]
  }

  # Ship every control-plane log stream to CloudWatch for auditability.
  enabled_cluster_log_types = ["api", "audit", "authenticator", "controllerManager", "scheduler"]
  
  # IAM policies must be attached before EKS can assume the role.
  depends_on = [
    aws_iam_role_policy_attachment.cluster_policy,
    aws_iam_role_policy_attachment.vpc_cni_policy,
  ]
}

# CPU-optimized node group for general agent workloads.
resource "aws_eks_node_group" "cpu_nodes" {
  cluster_name    = aws_eks_cluster.ai_cluster.name
  node_group_name = "${var.cluster_name}-cpu-nodes"
  node_role_arn   = aws_iam_role.node_role.arn
  subnet_ids      = aws_subnet.private[*].id

  instance_types = var.cpu_instance_types
  # Spot vs on-demand is a per-environment choice (see tfvars files).
  capacity_type  = var.use_spot_instances ? "SPOT" : "ON_DEMAND"
  
  scaling_config {
    desired_size = var.cpu_desired_capacity
    max_size     = var.cpu_max_capacity
    min_size     = var.cpu_min_capacity
  }

  # Launch template carries AMI/user-data details; tracking latest_version
  # rolls nodes when the template changes.
  launch_template {
    id      = aws_launch_template.cpu_nodes.id
    version = aws_launch_template.cpu_nodes.latest_version
  }

  labels = {
    NodeType = "cpu"
    Workload = "ai-agents"
  }

  # Only pods that explicitly tolerate ai-agents/cpu schedule here.
  taint {
    key    = "ai-agents/cpu"
    value  = "true"
    effect = "NO_SCHEDULE"
  }
}

# GPU-enabled node group (created only when var.enable_gpu_nodes is true).
resource "aws_eks_node_group" "gpu_nodes" {
  count = var.enable_gpu_nodes ? 1 : 0
  
  cluster_name    = aws_eks_cluster.ai_cluster.name
  node_group_name = "${var.cluster_name}-gpu-nodes"
  node_role_arn   = aws_iam_role.node_role.arn
  subnet_ids      = aws_subnet.private[*].id

  instance_types = var.gpu_instance_types
  # On-demand only: Spot GPU capacity is scarce, and interruptions would
  # disrupt long-running inference workloads.
  capacity_type  = "ON_DEMAND"
  
  scaling_config {
    desired_size = var.gpu_desired_capacity
    max_size     = var.gpu_max_capacity
    min_size     = var.gpu_min_capacity
  }

  # Indexed [0] because this resource itself uses count.
  launch_template {
    id      = aws_launch_template.gpu_nodes[0].id
    version = aws_launch_template.gpu_nodes[0].latest_version
  }

  labels = {
    NodeType = "gpu"
    Workload = "ai-agents"
    "nvidia.com/gpu" = "true"
  }

  # Reserve GPU capacity for pods that explicitly tolerate this taint.
  taint {
    key    = "ai-agents/gpu"
    value  = "true"
    effect = "NO_SCHEDULE"
  }
}

# Shared EFS file system so every node can mount the same model files.
resource "aws_efs_file_system" "model_storage" {
  creation_token = "${var.cluster_name}-models"

  # Provisioned throughput keeps model-load latency predictable
  # regardless of how much data is stored.
  performance_mode                = "generalPurpose"
  throughput_mode                 = "provisioned"
  provisioned_throughput_in_mibps = var.efs_throughput

  # Files untouched for 30 days move to the cheaper IA storage class.
  lifecycle_policy {
    transition_to_ia = "AFTER_30_DAYS"
  }

  tags = merge(var.tags, {
    Name = "${var.cluster_name}-model-storage"
    Type = "AI-Models"
  })
}

GCP GKE Module for AI Workloads

# modules/gcp-ai-cluster/main.tf
# GKE control plane; dedicated node pools are defined separately below.
resource "google_container_cluster" "ai_cluster" {
  name     = var.cluster_name
  location = var.region
  project  = var.project_id

  # Drop the default pool so CPU/GPU pools can be managed independently
  # (GKE still requires an initial node count of 1 at creation).
  remove_default_node_pool = true
  initial_node_count       = 1

  # Networking
  network    = google_compute_network.ai_vpc.name
  subnetwork = google_compute_subnetwork.ai_subnet.name

  # Enable features for AI workloads
  addons_config {
    horizontal_pod_autoscaling {
      disabled = false
    }
    http_load_balancing {
      disabled = false
    }
    network_policy_config {
      disabled = false
    }
  }

  # Node auto-provisioning: GKE may create node pools on demand within
  # these cluster-wide CPU (cores) and memory (GB) ceilings.
  cluster_autoscaling {
    enabled = true
    resource_limits {
      resource_type = "cpu"
      minimum       = 1
      maximum       = 100
    }
    resource_limits {
      resource_type = "memory"
      minimum       = 1
      maximum       = 400
    }
  }

  # Collect metrics and logs for both system components and workloads.
  monitoring_config {
    enable_components = ["SYSTEM_COMPONENTS", "WORKLOADS"]
  }

  logging_config {
    enable_components = ["SYSTEM_COMPONENTS", "WORKLOADS"]
  }
}

# CPU node pool for general AI workloads
resource "google_container_node_pool" "cpu_pool" {
  name       = "${var.cluster_name}-cpu-pool"
  location   = var.region
  cluster    = google_container_cluster.ai_cluster.name
  project    = var.project_id

  # Auto-scaling configuration
  autoscaling {
    min_node_count = var.min_node_count
    max_node_count = var.max_node_count
  }

  # Let GKE repair unhealthy nodes and keep them on supported versions.
  management {
    auto_repair  = true
    auto_upgrade = true
  }

  node_config {
    # Preemptible nodes trade availability for cost (per environment).
    preemptible  = var.use_preemptible_nodes
    machine_type = var.machine_types.cpu_pool
    disk_size_gb = var.disk_size_gb
    disk_type    = var.disk_type

    # Harden node metadata access.
    metadata = {
      disable-legacy-endpoints = "true"
    }

    labels = merge(var.labels, {
      node-type = "cpu"
      workload  = "ai-agents"
    })

    # Only pods that tolerate ai-agents/cpu schedule on this pool.
    taint {
      key    = "ai-agents/cpu"
      value  = "true"
      effect = "NO_SCHEDULE"
    }

    oauth_scopes = [
      "https://www.googleapis.com/auth/cloud-platform",
      "https://www.googleapis.com/auth/devstorage.read_only",
      "https://www.googleapis.com/auth/logging.write",
      "https://www.googleapis.com/auth/monitoring",
    ]
  }
}

# GPU node pool for inference workloads (created only when GPUs enabled).
resource "google_container_node_pool" "gpu_pool" {
  count = var.enable_gpu_nodes ? 1 : 0
  
  name       = "${var.cluster_name}-gpu-pool"
  location   = var.region
  cluster    = google_container_cluster.ai_cluster.name
  project    = var.project_id

  # Scale to zero when no GPU work is scheduled.
  autoscaling {
    min_node_count = 0
    max_node_count = var.gpu_max_nodes
  }

  management {
    auto_repair  = true
    auto_upgrade = false  # GPU drivers require careful upgrade management
  }

  node_config {
    preemptible  = false  # GPU nodes should be stable
    machine_type = var.machine_types.gpu_pool
    disk_size_gb = var.disk_size_gb
    disk_type    = "pd-ssd"

    # Attach the configured accelerators to each node.
    guest_accelerator {
      type  = var.gpu_type
      count = var.gpu_count
    }

    labels = merge(var.labels, {
      node-type = "gpu"
      workload  = "ai-agents"
      gpu-type  = var.gpu_type
    })

    # Reserve GPU capacity for pods that tolerate this taint.
    taint {
      key    = "ai-agents/gpu"
      value  = "true"
      effect = "NO_SCHEDULE"
    }

    oauth_scopes = [
      "https://www.googleapis.com/auth/cloud-platform",
    ]
  }
}

Environment-Specific Configuration

Use Terraform workspaces and variable files for environment separation:

# environments/dev/terraform.tfvars
cluster_name = "dev-ai-agents"
kubernetes_version = "1.28"

# Cost optimization for development: run almost everything on Spot.
use_spot_instances = true
spot_percentage = 90

# Reduced capacity for a small development footprint.
cpu_min_capacity = 1
cpu_max_capacity = 3
cpu_desired_capacity = 1

# Disable GPU for cost savings
enable_gpu_nodes = false

# Smaller storage (GB)
efs_storage_size = 100

tags = {
  Environment = "development"
  CostCenter  = "ai-research"
  Project     = "ai-agents"
}
# environments/production/terraform.tfvars
cluster_name = "prod-ai-agents"
kubernetes_version = "1.28"

# Production reliability over cost: keep half the fleet on-demand.
use_spot_instances = true
spot_percentage = 50  # More conservative than dev's 90%

# Production scaling
cpu_min_capacity = 3
cpu_max_capacity = 20
cpu_desired_capacity = 5

# Enable GPU for inference
enable_gpu_nodes = true
gpu_min_capacity = 1
gpu_max_capacity = 5

# Production storage (GB)
efs_storage_size = 1000

tags = {
  Environment = "production"
  CostCenter  = "ai-platform"
  Project     = "ai-agents"
  Backup      = "required"
}

GitOps Workflow with Terraform

Implement automated infrastructure deployment with proper safeguards:

# .github/workflows/terraform-deploy.yml
# Plans every environment on PRs; applies dev/staging on merge to main.
# Production requires manual approval (notice posted on the commit).
name: Deploy AI Infrastructure

on:
  push:
    branches: [main]
    paths: ['environments/**', 'modules/**']
  pull_request:
    paths: ['environments/**', 'modules/**']

jobs:
  terraform:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        environment: [dev, staging, production]

    steps:
      - uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: 1.6.0

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-west-2

      - name: Configure GCP Credentials
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GCP_SA_KEY }}

      - name: Terraform Init
        working-directory: ./environments/${{ matrix.environment }}
        run: terraform init

      - name: Terraform Validate
        working-directory: ./environments/${{ matrix.environment }}
        run: terraform validate

      - name: Terraform Plan
        working-directory: ./environments/${{ matrix.environment }}
        run: terraform plan -out=tfplan

      - name: Terraform Apply
        if: github.ref == 'refs/heads/main' && matrix.environment != 'production'
        working-directory: ./environments/${{ matrix.environment }}
        run: terraform apply tfplan

      # This step runs on push events, where there is no issue/PR in the
      # payload, so context.issue.number is undefined. Attach the notice
      # to the pushed commit instead of an issue comment.
      - name: Production Approval Required
        if: github.ref == 'refs/heads/main' && matrix.environment == 'production'
        uses: actions/github-script@v7
        with:
          script: |
            github.rest.repos.createCommitComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              commit_sha: context.sha,
              body: '🚀 Production deployment ready. Manual approval required for Terraform apply.'
            });

Cost Management and Governance

Implement cost controls through Terraform policies and monitoring:

# modules/cost-controls/main.tf
# Monthly cost budget scoped to resources tagged Project = ai-agents.
resource "aws_budgets_budget" "ai_infrastructure" {
  name         = "ai-infrastructure-budget"
  budget_type  = "COST"
  limit_amount = var.monthly_budget
  limit_unit   = "USD"
  time_unit    = "MONTHLY"

  # AWS provider v5 expresses filters as cost_filter blocks; tag filters
  # use name "TagKeyValue" with values of the form "user:<key>$<value>".
  # (The original nested `cost_filters { tag { ... } }` form is invalid.)
  cost_filter {
    name   = "TagKeyValue"
    values = ["user:Project$ai-agents"]
  }

  # Warn when actual spend passes 80% of the budget.
  notification {
    comparison_operator        = "GREATER_THAN"
    threshold                  = 80
    threshold_type             = "PERCENTAGE"
    notification_type          = "ACTUAL"
    subscriber_email_addresses = var.alert_emails
  }

  # Alert when forecasted spend exceeds the full budget.
  notification {
    comparison_operator        = "GREATER_THAN"
    threshold                  = 100
    threshold_type             = "PERCENTAGE"
    notification_type          = "FORECASTED"
    subscriber_email_addresses = var.alert_emails
  }
}

# Auto-shutdown Lambda for development environments; the deployment
# package (cost_optimizer.zip) is built outside Terraform.
resource "aws_lambda_function" "cost_optimizer" {
  filename         = "cost_optimizer.zip"
  function_name    = "ai-infrastructure-cost-optimizer"
  role            = aws_iam_role.lambda_role.arn
  handler         = "lambda_function.lambda_handler"
  # NOTE(review): python3.9 is an aging Lambda runtime — confirm its
  # support window and bump when the function code is next updated.
  runtime         = "python3.9"
  timeout         = 300

  # Tells the function which cluster/environment to act on.
  environment {
    variables = {
      CLUSTER_NAME = var.cluster_name
      ENVIRONMENT = var.environment
    }
  }
}

# Schedule to shut down dev clusters after hours (dev environment only).
resource "aws_cloudwatch_event_rule" "shutdown_schedule" {
  count = var.environment == "dev" ? 1 : 0

  name        = "ai-cluster-shutdown"
  description = "Shutdown AI clusters after hours for cost savings"

  # EventBridge cron fields: minutes hours day-of-month month day-of-week year.
  # One of day-of-month/day-of-week MUST be '?'; '*' in both is rejected,
  # so the original "cron(0 22 * * MON-FRI *)" would fail to create.
  schedule_expression = "cron(0 22 ? * MON-FRI *)" # 10 PM weekdays
}

Security and Compliance Configuration

# KMS key used to encrypt AI infrastructure data at rest.
resource "aws_kms_key" "ai_encryption" {
  description = "KMS key for AI infrastructure encryption"

  # Longer recovery window in production guards against accidental deletion.
  deletion_window_in_days = var.environment == "production" ? 30 : 7

  # Root-account statement delegates key administration to IAM policies.
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = "Enable IAM User Permissions"
        Effect = "Allow"
        Principal = {
          AWS = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:root"
        }
        Action   = "kms:*"
        Resource = "*"
      }
    ]
  })

  tags = merge(var.tags, {
    Name = "${var.cluster_name}-encryption-key"
  })
}

# Network security
resource "aws_security_group" "cluster" {
  name_prefix = "${var.cluster_name}-cluster-"
  vpc_id      = aws_vpc.ai_vpc.id
  description = "Security group for AI cluster"

  # Allow HTTPS to the cluster API server from approved networks only.
  ingress {
    from_port   = 443
    to_port     = 443
    protocol    = "tcp"
    cidr_blocks = var.allowed_cidrs
  }

  # Allow all outbound traffic (image pulls, cloud APIs, model downloads).
  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = merge(var.tags, {
    Name = "${var.cluster_name}-cluster-sg"
  })
}

State Management and Team Collaboration

Configure remote state with proper locking and versioning:

# backend.tf
# Remote state in S3 with DynamoDB-based locking for team collaboration.
terraform {
  backend "s3" {
    bucket         = "ai-infrastructure-terraform-state"
    key            = "environments/production/terraform.tfstate"
    region         = "us-west-2"
    encrypt        = true
    dynamodb_table = "ai-terraform-locks"

    # NOTE: "versioning" is not a valid S3 backend argument and would
    # fail `terraform init`. Object versioning for state-file recovery
    # is enabled on the bucket itself via aws_s3_bucket_versioning.
  }
}

# State bucket configuration (managed in a separate bootstrap config,
# since the backend bucket must exist before `terraform init`).
resource "aws_s3_bucket" "terraform_state" {
  bucket = "ai-infrastructure-terraform-state"
}

# Versioning enables recovery of earlier state-file revisions.
resource "aws_s3_bucket_versioning" "terraform_state" {
  bucket = aws_s3_bucket.terraform_state.id
  versioning_configuration {
    status = "Enabled"
  }
}

# Encrypt state at rest — state files can contain sensitive values.
resource "aws_s3_bucket_server_side_encryption_configuration" "terraform_state" {
  bucket = aws_s3_bucket.terraform_state.id

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "AES256"
    }
  }
}

# DynamoDB table used by the S3 backend for state locking.
resource "aws_dynamodb_table" "terraform_locks" {
  name           = "ai-terraform-locks"
  billing_mode   = "PAY_PER_REQUEST"
  hash_key       = "LockID"

  attribute {
    name = "LockID"
    type = "S"
  }
}

Testing Infrastructure Code

# tests/terraform_test.go
package test

import (
    "testing"
    "github.com/gruntwork-io/terratest/modules/terraform"
    "github.com/stretchr/testify/assert"
)

func TestAIInfrastructure(t *testing.T) {
    terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
        TerraformDir: "../environments/test",
        Vars: map[string]interface{}{
            "cluster_name": "test-ai-cluster",
            "environment": "test",
        },
    })

    defer terraform.Destroy(t, terraformOptions)

    terraform.InitAndApply(t, terraformOptions)

    // Verify cluster was created
    clusterName := terraform.Output(t, terraformOptions, "cluster_name")
    assert.Equal(t, "test-ai-cluster", clusterName)

    # Verify node groups were created
    nodeGroups := terraform.OutputList(t, terraformOptions, "node_groups")
    assert.Contains(t, nodeGroups, "test-ai-cluster-cpu-nodes")
}

Best Practices Checklist

✅ Modular Design: Reusable modules for different AI workload patterns
✅ Environment Separation: Clear separation between dev/staging/production
✅ State Management: Remote state with locking and versioning
✅ Cost Controls: Budgets, alerts, and automated cost optimization
✅ Security: Encryption, network policies, and IAM best practices
✅ Testing: Automated testing of infrastructure changes
✅ Documentation: Clear variable descriptions and usage examples

Next Steps

Infrastructure as Code provides the foundation for reliable, scalable AI agent deployments. The next article explores production monitoring and observability—how to gain visibility into your newly automated infrastructure and the AI agents running on it.

Remember: infrastructure is not the goal—it’s the reliable foundation that enables your AI agents to deliver value consistently and cost-effectively.