Infrastructure as Code for AI Agent Deployments: Terraform Patterns and Best Practices
Managing AI agent infrastructure manually becomes unsustainable as your deployment complexity grows. Infrastructure as Code (IaC) provides reproducible, version-controlled, and auditable infrastructure management that scales with your AI operations. This article explores Terraform patterns specifically designed for AI agent workloads across AWS and GCP.
The AI Infrastructure Challenge
AI agents require specialized infrastructure considerations that traditional web applications don’t face:
- GPU-enabled compute resources with specific driver requirements
- High-memory instances for model loading and inference
- Auto-scaling policies adapted to AI workload characteristics
- Storage solutions optimized for large model files
- Network configurations for model serving and data pipelines
Modular Terraform Architecture for AI Workloads
Structure your Terraform code with reusable modules that encapsulate AI-specific infrastructure patterns:
# environments/production/main.tf
# Pin the Terraform CLI and both cloud providers so every engineer and CI run
# resolves identical provider builds ("~>" allows patch/minor updates only).
terraform {
required_version = ">= 1.0"
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
google = {
source = "hashicorp/google"
version = "~> 4.0"
}
}
}
# Multi-cloud AI infrastructure: the AWS leg of the deployment.
# All EKS/VPC/EFS details live in the module; only workload-level knobs are set here.
module "aws_ai_infrastructure" {
source = "../../modules/aws-ai-cluster"
cluster_name = "production-ai-agents"
environment = "production"
# AI-specific configuration: a GPU node group alongside general-purpose CPU types
enable_gpu_nodes = true
gpu_instance_types = ["g4dn.xlarge", "g4dn.2xlarge"]
cpu_instance_types = ["m5.large", "m5.xlarge", "c5.xlarge"]
# Cost optimization: run up to 70% of CPU capacity on spot instances
use_spot_instances = true
spot_percentage = 70
# Shared EFS volume for model artifacts
efs_storage_size = 1000 # GB
# Networking: /16 VPC spread across three AZs for node-group resilience
vpc_cidr = "10.0.0.0/16"
availability_zones = ["us-west-2a", "us-west-2b", "us-west-2c"]
tags = local.common_tags
}
# The GCP leg of the deployment: a GKE cluster with CPU, GPU, and high-memory pools.
module "gcp_ai_infrastructure" {
source = "../../modules/gcp-ai-cluster"
project_id = var.gcp_project_id
cluster_name = "production-ai-agents"
region = "us-central1"
# GKE configuration for AI workloads: one T4 per GPU node
enable_gpu_nodes = true
gpu_type = "nvidia-tesla-t4"
gpu_count = 1
# Machine type per node pool (keys consumed by the module's node_config blocks)
machine_types = {
cpu_pool = "n1-standard-4"
gpu_pool = "n1-standard-4"
memory_pool = "n1-highmem-4"
}
# Auto-scaling bounds applied to the node pools
min_node_count = 1
max_node_count = 10
# Per-node boot disk
disk_size_gb = 100
disk_type = "pd-ssd"
labels = local.common_labels
}
AWS EKS Module for AI Agents
# modules/aws-ai-cluster/main.tf
# EKS control plane. Worker traffic stays on private subnets; public API
# endpoint access is opt-in via var.enable_public_access.
resource "aws_eks_cluster" "ai_cluster" {
name = var.cluster_name
role_arn = aws_iam_role.cluster_role.arn
version = var.kubernetes_version
vpc_config {
subnet_ids = aws_subnet.private[*].id
endpoint_private_access = true
endpoint_public_access = var.enable_public_access
security_group_ids = [aws_security_group.cluster.id]
}
# Ship every control-plane log stream to CloudWatch for audit/debugging.
enabled_cluster_log_types = ["api", "audit", "authenticator", "controllerManager", "scheduler"]
# Policies must be attached before EKS assumes the role, or creation fails.
depends_on = [
aws_iam_role_policy_attachment.cluster_policy,
aws_iam_role_policy_attachment.vpc_cni_policy,
]
}
# CPU-optimized node group for general agent workloads.
# Switches the whole group to SPOT capacity when var.use_spot_instances is set.
resource "aws_eks_node_group" "cpu_nodes" {
cluster_name = aws_eks_cluster.ai_cluster.name
node_group_name = "${var.cluster_name}-cpu-nodes"
node_role_arn = aws_iam_role.node_role.arn
subnet_ids = aws_subnet.private[*].id
instance_types = var.cpu_instance_types
capacity_type = var.use_spot_instances ? "SPOT" : "ON_DEMAND"
scaling_config {
desired_size = var.cpu_desired_capacity
max_size = var.cpu_max_capacity
min_size = var.cpu_min_capacity
}
# AMI/bootstrap details come from a launch template defined elsewhere in this module.
launch_template {
id = aws_launch_template.cpu_nodes.id
version = aws_launch_template.cpu_nodes.latest_version
}
labels = {
NodeType = "cpu"
Workload = "ai-agents"
}
# Dedicate these nodes: only pods tolerating ai-agents/cpu are scheduled here.
taint {
key = "ai-agents/cpu"
value = "true"
effect = "NO_SCHEDULE"
}
}
# GPU-enabled node group, created only when var.enable_gpu_nodes is true.
resource "aws_eks_node_group" "gpu_nodes" {
count = var.enable_gpu_nodes ? 1 : 0
cluster_name = aws_eks_cluster.ai_cluster.name
node_group_name = "${var.cluster_name}-gpu-nodes"
node_role_arn = aws_iam_role.node_role.arn
subnet_ids = aws_subnet.private[*].id
instance_types = var.gpu_instance_types
capacity_type = "ON_DEMAND" # keep GPU capacity stable; spot reclaims would interrupt in-flight inference
scaling_config {
desired_size = var.gpu_desired_capacity
max_size = var.gpu_max_capacity
min_size = var.gpu_min_capacity
}
# Indexed [0] because this resource itself uses count.
launch_template {
id = aws_launch_template.gpu_nodes[0].id
version = aws_launch_template.gpu_nodes[0].latest_version
}
labels = {
NodeType = "gpu"
Workload = "ai-agents"
"nvidia.com/gpu" = "true"
}
# Reserve GPU nodes for pods that explicitly tolerate the ai-agents/gpu taint.
taint {
key = "ai-agents/gpu"
value = "true"
effect = "NO_SCHEDULE"
}
}
# EFS for shared model storage: all nodes mount the same filesystem so large
# model files are downloaded once and shared across pods.
resource "aws_efs_file_system" "model_storage" {
creation_token = "${var.cluster_name}-models"
performance_mode = "generalPurpose"
# Provisioned throughput decouples read bandwidth from stored data size.
throughput_mode = "provisioned"
provisioned_throughput_in_mibps = var.efs_throughput
# Tier files untouched for 30 days into Infrequent Access to cut storage cost.
lifecycle_policy {
transition_to_ia = "AFTER_30_DAYS"
}
tags = merge(var.tags, {
Name = "${var.cluster_name}-model-storage"
Type = "AI-Models"
})
}
GCP GKE Module for AI Workloads
# modules/gcp-ai-cluster/main.tf
# GKE cluster shell. Node pools are managed as separate resources below so they
# can be resized/replaced without recreating the cluster.
resource "google_container_cluster" "ai_cluster" {
name = var.cluster_name
location = var.region
project = var.project_id
# Remove the default node pool; initial_node_count = 1 is required at create
# time even though the pool is deleted immediately afterwards.
remove_default_node_pool = true
initial_node_count = 1
# Dedicated VPC/subnet defined elsewhere in this module.
network = google_compute_network.ai_vpc.name
subnetwork = google_compute_subnetwork.ai_subnet.name
# Enable HPA, HTTP load balancing, and NetworkPolicy enforcement.
addons_config {
horizontal_pod_autoscaling {
disabled = false
}
http_load_balancing {
disabled = false
}
network_policy_config {
disabled = false
}
}
# Node auto-provisioning: GKE may create pools on demand within these
# cluster-wide CPU/memory ceilings.
cluster_autoscaling {
enabled = true
resource_limits {
resource_type = "cpu"
minimum = 1
maximum = 100
}
resource_limits {
resource_type = "memory"
minimum = 1
maximum = 400
}
}
# Collect metrics and logs for workloads as well as system components.
monitoring_config {
enable_components = ["SYSTEM_COMPONENTS", "WORKLOADS"]
}
logging_config {
enable_components = ["SYSTEM_COMPONENTS", "WORKLOADS"]
}
}
# CPU node pool for general AI workloads.
resource "google_container_node_pool" "cpu_pool" {
name = "${var.cluster_name}-cpu-pool"
location = var.region
cluster = google_container_cluster.ai_cluster.name
project = var.project_id
# Scale between the module-level bounds.
autoscaling {
min_node_count = var.min_node_count
max_node_count = var.max_node_count
}
# Let GKE replace unhealthy nodes and roll node versions automatically.
management {
auto_repair = true
auto_upgrade = true
}
node_config {
preemptible = var.use_preemptible_nodes
machine_type = var.machine_types.cpu_pool
disk_size_gb = var.disk_size_gb
disk_type = var.disk_type
# Harden node metadata access (legacy endpoints expose instance secrets).
metadata = {
disable-legacy-endpoints = "true"
}
labels = merge(var.labels, {
node-type = "cpu"
workload = "ai-agents"
})
# Only pods tolerating ai-agents/cpu land on this pool.
taint {
key = "ai-agents/cpu"
value = "true"
effect = "NO_SCHEDULE"
}
oauth_scopes = [
"https://www.googleapis.com/auth/cloud-platform",
"https://www.googleapis.com/auth/devstorage.read_only",
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
]
}
}
# GPU node pool for inference workloads; created only when GPUs are enabled.
# Scales to zero when idle so accelerators are not billed around the clock.
resource "google_container_node_pool" "gpu_pool" {
count = var.enable_gpu_nodes ? 1 : 0
name = "${var.cluster_name}-gpu-pool"
location = var.region
cluster = google_container_cluster.ai_cluster.name
project = var.project_id
autoscaling {
min_node_count = 0
max_node_count = var.gpu_max_nodes
}
management {
auto_repair = true
auto_upgrade = false # GPU drivers require careful upgrade management
}
node_config {
preemptible = false # GPU nodes should be stable
machine_type = var.machine_types.gpu_pool
disk_size_gb = var.disk_size_gb
disk_type = "pd-ssd"
# Attach var.gpu_count accelerators of the configured type to each node.
guest_accelerator {
type = var.gpu_type
count = var.gpu_count
}
labels = merge(var.labels, {
node-type = "gpu"
workload = "ai-agents"
gpu-type = var.gpu_type
})
# Keep non-GPU pods off these expensive nodes.
taint {
key = "ai-agents/gpu"
value = "true"
effect = "NO_SCHEDULE"
}
oauth_scopes = [
"https://www.googleapis.com/auth/cloud-platform",
]
}
}
Environment-Specific Configuration
Use Terraform workspaces and variable files for environment separation:
# environments/dev/terraform.tfvars
# Development sizing: minimize cost, accept interruptions and reduced capacity.
cluster_name = "dev-ai-agents"
kubernetes_version = "1.28"
# Cost optimization for development: almost everything on spot capacity.
use_spot_instances = true
spot_percentage = 90
# Reduced capacity
cpu_min_capacity = 1
cpu_max_capacity = 3
cpu_desired_capacity = 1
# Disable GPU for cost savings
enable_gpu_nodes = false
# Smaller storage
efs_storage_size = 100
tags = {
Environment = "development"
CostCenter = "ai-research"
Project = "ai-agents"
}
# environments/production/terraform.tfvars
# Production sizing: reliability first, with a bounded spot share for savings.
cluster_name = "prod-ai-agents"
kubernetes_version = "1.28"

# At most half of CPU capacity on spot, so a broad spot reclaim cannot take
# out the whole node group.
use_spot_instances = true
spot_percentage = 50 # More conservative

# Production scaling
cpu_min_capacity = 3
cpu_max_capacity = 20
cpu_desired_capacity = 5

# Enable GPU for inference. The EKS module's gpu_nodes scaling_config also
# reads gpu_desired_capacity, so all three values must be provided.
enable_gpu_nodes = true
gpu_min_capacity = 1
gpu_desired_capacity = 1
gpu_max_capacity = 5

# Production storage
efs_storage_size = 1000

tags = {
Environment = "production"
CostCenter = "ai-platform"
Project = "ai-agents"
Backup = "required"
}
GitOps Workflow with Terraform
Implement automated infrastructure deployment with proper safeguards:
# .github/workflows/terraform-deploy.yml
# Plans every environment on PRs; applies dev/staging automatically on main.
# Production apply is gated behind a manual approval, signalled via a commit comment.
name: Deploy AI Infrastructure

on:
  push:
    branches: [main]
    paths: ['environments/**', 'modules/**']
  pull_request:
    paths: ['environments/**', 'modules/**']

jobs:
  terraform:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        environment: [dev, staging, production]
    steps:
      - uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: 1.6.0

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-west-2

      - name: Configure GCP Credentials
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GCP_SA_KEY }}

      - name: Terraform Init
        working-directory: ./environments/${{ matrix.environment }}
        run: terraform init

      - name: Terraform Validate
        working-directory: ./environments/${{ matrix.environment }}
        run: terraform validate

      # Save the plan so the apply step executes exactly what was reviewed.
      - name: Terraform Plan
        working-directory: ./environments/${{ matrix.environment }}
        run: terraform plan -out=tfplan

      - name: Terraform Apply
        if: github.ref == 'refs/heads/main' && matrix.environment != 'production'
        working-directory: ./environments/${{ matrix.environment }}
        run: terraform apply tfplan

      # This step only runs on push events, where there is no issue/PR context
      # (context.issue.number is undefined), so comment on the commit instead
      # of calling issues.createComment.
      - name: Production Approval Required
        if: github.ref == 'refs/heads/main' && matrix.environment == 'production'
        uses: actions/github-script@v7
        with:
          script: |
            github.rest.repos.createCommitComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              commit_sha: context.sha,
              body: '🚀 Production deployment ready. Manual approval required for Terraform apply.'
});
Cost Management and Governance
Implement cost controls through Terraform policies and monitoring:
# modules/cost-controls/main.tf
# Monthly cost budget scoped to resources tagged Project=ai-agents.
resource "aws_budgets_budget" "ai_infrastructure" {
  name         = "ai-infrastructure-budget"
  budget_type  = "COST"
  limit_amount = var.monthly_budget
  limit_unit   = "USD"
  time_unit    = "MONTHLY"

  # The provider expects repeated cost_filter blocks with name/values; there is
  # no "cost_filters { tag { ... } }" syntax. Tag scoping uses the TagKeyValue
  # dimension with "user:<key>$<value>" entries.
  cost_filter {
    name   = "TagKeyValue"
    values = ["user:Project$ai-agents"]
  }

  # Alert when actual spend crosses 80% of the budget...
  notification {
    comparison_operator        = "GREATER_THAN"
    threshold                  = 80
    threshold_type             = "PERCENTAGE"
    notification_type          = "ACTUAL"
    subscriber_email_addresses = var.alert_emails
  }

  # ...and when the forecast projects exceeding 100%.
  notification {
    comparison_operator        = "GREATER_THAN"
    threshold                  = 100
    threshold_type             = "PERCENTAGE"
    notification_type          = "FORECASTED"
    subscriber_email_addresses = var.alert_emails
  }
}
# Auto-shutdown Lambda for development environments; the cluster/environment
# it manages are passed in via environment variables.
# NOTE(review): runtime "python3.9" is near end of Lambda support — confirm the
# packaged code works on a newer runtime and bump.
resource "aws_lambda_function" "cost_optimizer" {
filename = "cost_optimizer.zip"
function_name = "ai-infrastructure-cost-optimizer"
role = aws_iam_role.lambda_role.arn
handler = "lambda_function.lambda_handler"
runtime = "python3.9"
timeout = 300
environment {
variables = {
CLUSTER_NAME = var.cluster_name
ENVIRONMENT = var.environment
}
}
}
# Scheduled shutdown of dev clusters after hours for cost savings.
# NOTE(review): a matching aws_cloudwatch_event_target (e.g. the cost_optimizer
# Lambda) is still needed for this rule to invoke anything.
resource "aws_cloudwatch_event_rule" "shutdown_schedule" {
  count               = var.environment == "dev" ? 1 : 0
  name                = "ai-cluster-shutdown"
  description         = "Shutdown AI clusters after hours for cost savings"
  # AWS cron requires "?" in day-of-month when day-of-week is specified;
  # "cron(0 22 * * MON-FRI *)" is rejected. Fires 10 PM UTC on weekdays.
  schedule_expression = "cron(0 22 ? * MON-FRI *)"
}
Security and Compliance Configuration
# Security module for AI infrastructure.
# CMK used to encrypt AI-infrastructure data. Production keys get the maximum
# 30-day deletion window; other environments use the 7-day minimum.
resource "aws_kms_key" "ai_encryption" {
description = "KMS key for AI infrastructure encryption"
deletion_window_in_days = var.environment == "production" ? 30 : 7
# Root-account policy delegates key access control to IAM; without a policy
# granting the account access, the key could become unmanageable.
policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Sid = "Enable IAM User Permissions"
Effect = "Allow"
Principal = {
AWS = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:root"
}
Action = "kms:*"
Resource = "*"
}
]
})
tags = merge(var.tags, {
Name = "${var.cluster_name}-encryption-key"
})
}
# Network security group for the cluster control plane.
resource "aws_security_group" "cluster" {
name_prefix = "${var.cluster_name}-cluster-"
vpc_id = aws_vpc.ai_vpc.id
description = "Security group for AI cluster"
# Allow HTTPS to the API server only from approved ranges.
# NOTE(review): keep var.allowed_cidrs restrictive — never 0.0.0.0/0 here.
ingress {
from_port = 443
to_port = 443
protocol = "tcp"
cidr_blocks = var.allowed_cidrs
}
# Allow all outbound traffic (image pulls, provider APIs, model downloads).
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
tags = merge(var.tags, {
Name = "${var.cluster_name}-cluster-sg"
})
}
State Management and Team Collaboration
Configure remote state with proper locking and versioning:
# backend.tf
# Remote state in S3 with DynamoDB locking, so concurrent applies from the
# team and CI are serialized against a single encrypted state file.
terraform {
  backend "s3" {
    bucket         = "ai-infrastructure-terraform-state"
    key            = "environments/production/terraform.tfstate"
    region         = "us-west-2"
    encrypt        = true
    dynamodb_table = "ai-terraform-locks"
    # "versioning" is not a valid s3 backend argument — state-file versioning
    # is enabled on the bucket itself (see aws_s3_bucket_versioning below).
  }
}
# State bucket configuration. prevent_destroy guards the team's only copy of
# the state history against an accidental `terraform destroy`.
resource "aws_s3_bucket" "terraform_state" {
  bucket = "ai-infrastructure-terraform-state"

  lifecycle {
    prevent_destroy = true
  }
}

# Keep every version of the state file so a corrupted or bad state can be
# rolled back to a previous revision.
resource "aws_s3_bucket_versioning" "terraform_state" {
  bucket = aws_s3_bucket.terraform_state.id
  versioning_configuration {
    status = "Enabled"
  }
}

# Encrypt state at rest — Terraform state can contain secrets in plain text.
resource "aws_s3_bucket_server_side_encryption_configuration" "terraform_state" {
  bucket = aws_s3_bucket.terraform_state.id
  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "AES256"
    }
  }
}
# DynamoDB table backing the S3 backend's state locking; the backend writes a
# single item keyed by "LockID". Pay-per-request fits the tiny, bursty traffic.
resource "aws_dynamodb_table" "terraform_locks" {
name = "ai-terraform-locks"
billing_mode = "PAY_PER_REQUEST"
hash_key = "LockID"
attribute {
name = "LockID"
type = "S"
}
}
Testing Infrastructure Code
# tests/terraform_test.go
package test
import (
"testing"
"github.com/gruntwork-io/terratest/modules/terraform"
"github.com/stretchr/testify/assert"
)
func TestAIInfrastructure(t *testing.T) {
terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
TerraformDir: "../environments/test",
Vars: map[string]interface{}{
"cluster_name": "test-ai-cluster",
"environment": "test",
},
})
defer terraform.Destroy(t, terraformOptions)
terraform.InitAndApply(t, terraformOptions)
// Verify cluster was created
clusterName := terraform.Output(t, terraformOptions, "cluster_name")
assert.Equal(t, "test-ai-cluster", clusterName)
# Verify node groups were created
nodeGroups := terraform.OutputList(t, terraformOptions, "node_groups")
assert.Contains(t, nodeGroups, "test-ai-cluster-cpu-nodes")
}
Best Practices Checklist
✅ Modular Design: Reusable modules for different AI workload patterns
✅ Environment Separation: Clear separation between dev/staging/production
✅ State Management: Remote state with locking and versioning
✅ Cost Controls: Budgets, alerts, and automated cost optimization
✅ Security: Encryption, network policies, and IAM best practices
✅ Testing: Automated testing of infrastructure changes
✅ Documentation: Clear variable descriptions and usage examples
Next Steps
Infrastructure as Code provides the foundation for reliable, scalable AI agent deployments. The next article explores production monitoring and observability—how to gain visibility into your newly automated infrastructure and the AI agents running on it.
Remember: infrastructure is not the goal—it’s the reliable foundation that enables your AI agents to deliver value consistently and cost-effectively.