Loading content...
Platform engineering has emerged as one of the most transformative movements in software development. By creating self-service internal developer platforms (IDPs), organizations are dramatically reducing the time from idea to production while improving reliability and developer satisfaction.
Traditional DevOps Challenges:
Platform Engineering Solution:
An IDP is a curated set of tools, services, and workflows that enable developers to:
Key Characteristics:
Infrastructure as Code (IaC) Layer:
// platform/infrastructure/templates/web-service.ts
import * as pulumi from '@pulumi/pulumi'
import * as azure from '@pulumi/azure-native'
/**
 * Settings accepted by the `WebService` component.
 *
 * Only `name` and `environment` are required; every other field falls back to
 * a default chosen inside the component.
 */
export interface WebServiceConfig {
  /** Service name; used to derive resource names, the container name and the image path. */
  name: string

  /** Target environment; also selects the shared `platform-<environment>` resources. */
  environment: 'dev' | 'staging' | 'production'

  /** Upper bound for autoscaling (the component uses it as `maxReplicas`). */
  replicas?: number

  /** CPU cores as a numeric string, e.g. `'0.5'`; parsed with `parseFloat`. */
  cpu?: string

  /** Container memory, e.g. `'1Gi'`. */
  memory?: string

  /** Plain name/value pairs injected into the container environment. */
  environmentVariables?: Record<string, string>
}
/**
 * Pulumi component that provisions one containerized web service on Azure
 * Container Apps.
 *
 * It creates a resource group and a Log Analytics workspace for the service,
 * looks up the shared `platform-env-<environment>` managed environment, and
 * deploys a Container App with external ingress on port 3000 and HTTP-based
 * autoscaling.
 */
export class WebService extends pulumi.ComponentResource {
  /** `https://<fqdn>` of the app's ingress, or an empty string when no FQDN is reported. */
  public readonly url: pulumi.Output<string>

  /** The Container App resource created for this service. */
  public readonly containerApp: azure.app.ContainerApp

  /**
   * @param name   Pulumi logical name of the component.
   * @param config Service settings; see `WebServiceConfig`.
   * @param opts   Standard Pulumi component options.
   * @throws Error when `config.cpu` does not parse to a positive number.
   */
  constructor(name: string, config: WebServiceConfig, opts?: pulumi.ComponentResourceOptions) {
    super('platform:web-service', name, {}, opts)

    // Validate before creating anything: parseFloat('abc') is NaN, which would
    // otherwise only fail later, at deployment time.
    const cpu = parseFloat(config.cpu || '0.5')
    if (!Number.isFinite(cpu) || cpu <= 0) {
      throw new Error(
        `Invalid cpu "${config.cpu}" for service "${config.name}": expected a positive number such as "0.5"`
      )
    }

    // Production normally keeps two replicas warm, but the lower bound must
    // never exceed the upper bound (e.g. production with `replicas: 1`).
    const maxReplicas = config.replicas || 10
    const minReplicas = Math.min(config.environment === 'production' ? 2 : 1, maxReplicas)

    // Resource group (shared or dedicated)
    const resourceGroup = new azure.resources.ResourceGroup(`${config.name}-rg`, {
      resourceGroupName: `${config.name}-${config.environment}`,
      location: 'eastus',
      tags: {
        environment: config.environment,
        managedBy: 'platform',
        service: config.name
      }
    }, { parent: this })

    // Container Apps Environment (shared across services). Passing the parent
    // lets the lookup inherit the component's provider configuration.
    const managedEnvironment = azure.app.getManagedEnvironment({
      resourceGroupName: `platform-${config.environment}`,
      environmentName: `platform-env-${config.environment}`
    }, { parent: this })

    // Log Analytics Workspace.
    // NOTE(review): nothing below references this workspace - confirm whether
    // it should be wired into the app or the shared environment.
    new azure.operationalinsights.Workspace(`${config.name}-logs`, {
      resourceGroupName: resourceGroup.name,
      workspaceName: `${config.name}-logs`,
      location: resourceGroup.location,
      sku: { name: 'PerGB2018' },
      retentionInDays: 30
    }, { parent: this })

    // Container App
    this.containerApp = new azure.app.ContainerApp(`${config.name}-app`, {
      containerAppName: config.name,
      resourceGroupName: resourceGroup.name,
      managedEnvironmentId: managedEnvironment.then(env => env.id),
      configuration: {
        ingress: {
          external: true,
          targetPort: 3000,
          traffic: [{
            weight: 100,
            latestRevision: true
          }]
        },
        registries: [{
          server: 'myregistry.azurecr.io',
          identity: 'system'
        }]
      },
      template: {
        containers: [{
          name: config.name,
          image: `myregistry.azurecr.io/${config.name}:latest`,
          resources: {
            cpu,
            memory: config.memory || '1Gi'
          },
          env: Object.entries(config.environmentVariables || {}).map(([name, value]) => ({
            name,
            value
          }))
        }],
        scale: {
          minReplicas,
          maxReplicas,
          rules: [{
            name: 'http-scaling',
            http: {
              metadata: {
                concurrentRequests: '100'
              }
            }
          }]
        }
      },
      identity: {
        type: 'SystemAssigned'
      }
    }, { parent: this })

    // Assign outputs
    this.url = this.containerApp.configuration.apply(c =>
      c?.ingress?.fqdn ? `https://${c.ingress.fqdn}` : ''
    )

    this.registerOutputs({
      url: this.url
    })
  }
}
// Usage in service repository: declare the production `my-api` service and
// export its public URL as a stack output.
const serviceName = 'my-api'

const webService = new WebService(serviceName, {
  name: serviceName,
  environment: 'production',
  replicas: 5,
  cpu: '1.0',
  memory: '2Gi',
  environmentVariables: {
    NODE_ENV: 'production',
    LOG_LEVEL: 'info'
  }
})

export const serviceUrl = webService.url
Platform CLI Tool:
// platform-cli/src/commands/create.ts
import { Command } from 'commander'
import inquirer from 'inquirer'
import { generateFromTemplate } from '../templates'
import { initializeGitRepo } from '../git'
import { setupCI } from '../ci'
/**
 * Handler for `platform create`.
 *
 * Asks for the service type, name, description, database and monitoring
 * preference, then runs the scaffolding steps in order: template generation,
 * Git initialisation, CI setup and infrastructure stub creation.
 */
async function runCreate(): Promise<void> {
  const answers = await inquirer.prompt([
    {
      type: 'list',
      name: 'type',
      message: 'What type of service?',
      choices: [
        'Web API (Node.js)',
        'Web API (Python)',
        'Frontend (Next.js)',
        'Background Worker',
        'Scheduled Job'
      ]
    },
    {
      type: 'input',
      name: 'name',
      message: 'Service name:',
      // Lowercase letter first, then lowercase letters, digits or dashes.
      validate: (input) => /^[a-z][a-z0-9-]*$/.test(input) || 'Invalid name format'
    },
    {
      type: 'input',
      name: 'description',
      message: 'Service description:'
    },
    {
      type: 'list',
      name: 'database',
      message: 'Need a database?',
      choices: ['None', 'PostgreSQL', 'MongoDB', 'Redis']
    },
    {
      type: 'confirm',
      name: 'monitoring',
      message: 'Enable monitoring and alerts?',
      default: true
    }
  ])
  const { type, name, description, database, monitoring } = answers

  console.log('\n📦 Creating service...')

  // Generate from template
  await generateFromTemplate({ type, name, description, database, monitoring })

  // Initialize Git repository
  await initializeGitRepo(name)

  // Setup CI/CD pipeline
  await setupCI(name, type)

  // Create infrastructure (receives the full answers object)
  await createInfrastructure(answers)

  console.log('\n✅ Service created successfully!')
  console.log(`\n📁 cd ${name}`)
  console.log('🚀 platform deploy --env dev')
}

/** The `create` sub-command of the platform CLI. */
export const createCommand = new Command('create')
  .description('Create a new service from template')
  .action(runCreate)
/**
 * Writes the Pulumi entry point `<name>/infrastructure/index.ts` for a newly
 * scaffolded service.
 *
 * The generated file instantiates `WebService` for the `dev` environment and
 * adds a `database` line (unless the choice is `'None'` or missing) and a
 * `monitoring` line (when enabled).
 *
 * @param config Answers from the `create` prompt; reads `name`, `database`
 *               and `monitoring`. Kept as `any` because the prompt result is
 *               loosely typed.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
async function createInfrastructure(config: any): Promise<void> {
  // Loaded locally so the function does not depend on a file-level import.
  const { mkdir, writeFile } = await import('node:fs/promises')

  // A missing database is treated like 'None' instead of crashing on toLowerCase().
  const databaseLine =
    config.database && config.database !== 'None'
      ? `database: '${config.database.toLowerCase()}',`
      : ''
  const monitoringLine = config.monitoring ? 'monitoring: { enabled: true },' : ''

  const infraCode = [
    '',
    "import { WebService } from '@company/platform'",
    `const service = new WebService('${config.name}', {`,
    `name: '${config.name}',`,
    "environment: 'dev',",
    databaseLine,
    monitoringLine,
    '})',
    'export const url = service.url',
    ''
  ].join('\n')

  // writeFile does not create parent directories, so make sure the folder exists.
  const targetDir = `${config.name}/infrastructure`
  await mkdir(targetDir, { recursive: true })
  await writeFile(`${targetDir}/index.ts`, infraCode)
}
Automated Deployment Pipeline:
# .platform/deploy.yml
# Builds, tests, scans and deploys the service, then verifies and reports.
name: Deploy Service

on:
  # Every push to main triggers a deployment.
  push:
    branches:
      - main
  # Manual runs choose the target environment explicitly.
  workflow_dispatch:
    inputs:
      environment:
        description: Environment
        required: true
        type: choice
        options:
          - dev
          - staging
          - production

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Platform Auth
        uses: company/platform-auth@v1
        with:
          token: ${{ secrets.PLATFORM_TOKEN }}

      - name: Build
        uses: company/platform-build@v1
        with:
          cache: true

      - name: Run Tests
        uses: company/platform-test@v1
        with:
          coverage: true

      - name: Security Scan
        uses: company/platform-security@v1

      - name: Deploy
        uses: company/platform-deploy@v1
        with:
          # Falls back to dev when no environment input was supplied.
          environment: ${{ github.event.inputs.environment || 'dev' }}
          auto-rollback: true

      - name: Health Check
        uses: company/platform-healthcheck@v1
        with:
          timeout: 300

      # Runs even when an earlier step failed.
      - name: Notify
        if: always()
        uses: company/platform-notify@v1
        with:
          # NOTE(review): GitHub Actions passes `with:` values to actions as
          # strings - confirm platform-notify accepts a YAML sequence here.
          channels:
            - slack
            - email
Infrastructure Provisioning Portal:
// platform-portal/src/pages/provision.tsx
import { useState } from 'react'
import { useMutation } from '@tanstack/react-query'
/** Form state that is posted to the provisioning API. */
interface ProvisionConfig {
  resourceType: string
  name: string
  size: string
  backup: boolean
  highAvailability: boolean
}

/** Fields of the provisioning response that the success panel renders. */
interface ProvisionResult {
  connectionString: string
  adminUrl: string
}

/**
 * Self-service page for provisioning a database.
 *
 * Collects the database type, name, size and options, shows an estimated
 * monthly cost, and POSTs the configuration to `/api/platform/provision`.
 * On success it shows the connection string and admin URL; on failure it
 * shows the error message.
 */
export default function ProvisionResource() {
  const [config, setConfig] = useState<ProvisionConfig>({
    // Must equal one of the <option> values below, otherwise the select shows
    // PostgreSQL while the submitted state carries a different value.
    resourceType: 'postgresql',
    name: '',
    size: 'small',
    backup: true,
    highAvailability: false
  })

  // Functional update so successive changes never start from a stale snapshot.
  const updateConfig = (patch: Partial<ProvisionConfig>) =>
    setConfig(previous => ({ ...previous, ...patch }))

  const provision = useMutation({
    mutationFn: async (payload: ProvisionConfig): Promise<ProvisionResult> => {
      const response = await fetch('/api/platform/provision', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload)
      })
      // fetch resolves on HTTP errors too; without this check a 4xx/5xx
      // response would be rendered as a successful provisioning.
      if (!response.ok) {
        throw new Error(`Provisioning request failed: ${response.status} ${response.statusText}`)
      }
      return response.json()
    },
    onSuccess: (data) => {
      // Show connection details
      console.log('Resource provisioned:', data)
    }
  })

  return (
    <div className="max-w-2xl mx-auto p-8">
      <h1 className="text-3xl font-bold mb-8">Provision Database</h1>
      <form onSubmit={(e) => {
        e.preventDefault()
        provision.mutate(config)
      }}>
        <div className="space-y-6">
          <div>
            <label className="block text-sm font-medium mb-2">
              Database Type
            </label>
            <select
              value={config.resourceType}
              onChange={(e) => updateConfig({ resourceType: e.target.value })}
              className="w-full p-2 border rounded"
            >
              <option value="postgresql">PostgreSQL</option>
              <option value="mongodb">MongoDB</option>
              <option value="redis">Redis</option>
            </select>
          </div>
          <div>
            <label className="block text-sm font-medium mb-2">
              Database Name
            </label>
            <input
              type="text"
              value={config.name}
              onChange={(e) => updateConfig({ name: e.target.value })}
              className="w-full p-2 border rounded"
              placeholder="my-database"
              required
            />
          </div>
          <div>
            <label className="block text-sm font-medium mb-2">
              Size
            </label>
            <div className="grid grid-cols-3 gap-4">
              {['small', 'medium', 'large'].map(size => (
                <button
                  key={size}
                  type="button"
                  onClick={() => updateConfig({ size })}
                  className={`p-4 border rounded ${
                    config.size === size ? 'bg-blue-500 text-white' : ''
                  }`}
                >
                  <div className="font-medium capitalize">{size}</div>
                  <div className="text-sm">
                    {size === 'small' && '2 vCPU, 4GB'}
                    {size === 'medium' && '4 vCPU, 16GB'}
                    {size === 'large' && '8 vCPU, 32GB'}
                  </div>
                </button>
              ))}
            </div>
          </div>
          <div className="space-y-2">
            <label className="flex items-center">
              <input
                type="checkbox"
                checked={config.backup}
                onChange={(e) => updateConfig({ backup: e.target.checked })}
                className="mr-2"
              />
              <span>Automated backups (recommended)</span>
            </label>
            <label className="flex items-center">
              <input
                type="checkbox"
                checked={config.highAvailability}
                onChange={(e) => updateConfig({ highAvailability: e.target.checked })}
                className="mr-2"
              />
              <span>High availability (production)</span>
            </label>
          </div>
          <div className="bg-gray-50 p-4 rounded">
            <h3 className="font-medium mb-2">Estimated Cost</h3>
            <div className="text-2xl font-bold">
              ${calculateCost(config)}/month
            </div>
          </div>
          <button
            type="submit"
            disabled={provision.isPending}
            className="w-full bg-blue-500 text-white py-3 rounded font-medium hover:bg-blue-600 disabled:opacity-50"
          >
            {provision.isPending ? 'Provisioning...' : 'Provision Database'}
          </button>
        </div>
      </form>
      {provision.isError && (
        <div className="mt-8 p-4 bg-red-50 border border-red-200 rounded" role="alert">
          <h3 className="font-medium text-red-900 mb-2">
            Provisioning failed
          </h3>
          <div className="text-sm">{provision.error?.message}</div>
        </div>
      )}
      {provision.isSuccess && (
        <div className="mt-8 p-4 bg-green-50 border border-green-200 rounded">
          <h3 className="font-medium text-green-900 mb-2">
            ✅ Database Provisioned!
          </h3>
          <div className="space-y-2 text-sm">
            <div>
              <strong>Connection String:</strong>
              <code className="block mt-1 p-2 bg-white rounded">
                {provision.data.connectionString}
              </code>
            </div>
            <div>
              <strong>Admin Portal:</strong>
              <a href={provision.data.adminUrl} className="text-blue-600">
                {provision.data.adminUrl}
              </a>
            </div>
          </div>
        </div>
      )}
    </div>
  )
}
/**
 * Estimates the monthly price shown on the provisioning form.
 *
 * Starts from a per-size base price, adds a flat 20 when backups are enabled,
 * and doubles the running total when high availability is enabled.
 *
 * @param config Reads `size` ('small' | 'medium' | 'large'), `backup` and
 *               `highAvailability`.
 * @returns The estimated monthly cost.
 */
function calculateCost(config: any): number {
  const monthlyBaseBySize = { small: 50, medium: 150, large: 400 }
  const basePrice = monthlyBaseBySize[config.size as keyof typeof monthlyBaseBySize]

  const withBackup = config.backup ? basePrice + 20 : basePrice

  // Doubling is applied last, so it also doubles the backup surcharge.
  return config.highAvailability ? withBackup * 2 : withBackup
}
Service Template Structure:
service-template/
|-- .platform/
| |-- deploy.yml # Deployment pipeline
| |-- monitoring.yml # Monitoring configuration
| +-- alerts.yml # Alert rules
|-- src/
| |-- index.ts # Application entry
| |-- health.ts # Health check endpoint
| +-- metrics.ts # Custom metrics
|-- infrastructure/
| |-- index.ts # Pulumi infrastructure
| +-- database.ts # Database setup
|-- tests/
| |-- unit/
| +-- integration/
|-- Dockerfile # Optimized container image
|-- docker-compose.yml # Local development
|-- package.json
+-- README.md # Generated documentation
Template Generation:
// platform/templates/web-api-nodejs.ts
/**
 * Produces the file set for a Node.js web API service.
 *
 * @param config Service settings; `name` is embedded in the generated root
 *               route and `database` decides whether database wiring is added.
 * @returns A map from relative file path to generated file content.
 */
export function generateWebApiTemplate(config: ServiceConfig) {
  // The CLI's database prompt offers the literal choice 'None', which is a
  // truthy string. Treat it as "no database", matching createInfrastructure,
  // so the generated server does not import a database module it lacks.
  const database: unknown = config.database
  const usesDatabase = Boolean(database) && database !== 'None'

  const databaseImport = usesDatabase ? `import { db } from './database'` : ''

  // With a database, the server only starts listening once the connection is up.
  const startServer = usesDatabase
    ? `
db.connect().then(() => {
app.listen(port, () => {
console.log(\`Server running on port \${port}\`)
})
}).catch(err => {
console.error('Failed to connect to database:', err)
process.exit(1)
})
`
    : `
app.listen(port, () => {
console.log(\`Server running on port \${port}\`)
})
`

  return {
    'src/index.ts': `
import express from 'express'
import { registerHealthChecks } from './health'
import { registerMetrics } from './metrics'
import { logger } from '@company/platform-logger'
${databaseImport}
const app = express()
const port = process.env.PORT || 3000
// Middleware
app.use(express.json())
app.use(logger.middleware())
// Health checks
registerHealthChecks(app)
// Metrics
registerMetrics(app)
// Routes
app.get('/', (req, res) => {
res.json({
service: '${config.name}',
version: process.env.VERSION || 'dev',
environment: process.env.NODE_ENV
})
})
// Start server
${startServer}
`,
    'src/health.ts': generateHealthCheck(config),
    'src/metrics.ts': generateMetrics(config),
    'Dockerfile': generateDockerfile(config),
    'infrastructure/index.ts': generateInfrastructure(config),
    'README.md': generateReadme(config)
  }
}
Built-in Monitoring Stack:
// platform/monitoring/dashboard.ts
import * as azure from '@pulumi/azure-native'
/**
 * Creates the Azure portal dashboard `<serviceName>-dashboard`.
 *
 * The dashboard has a single lens containing a 2x2 grid of line charts:
 * request count, response-time percentiles (50/95/99), error count, and
 * average CPU/memory percentage. All metrics are read from
 * `serviceResourceId`.
 *
 * @param serviceName Used for both the Pulumi resource name and the dashboard name.
 * @returns The dashboard resource.
 */
export function createServiceDashboard(serviceName: string) {
  const dashboardName = `${serviceName}-dashboard`
  const chartPartType = 'Extension/HubsExtension/PartType/MonitorChartPart'

  // One metric series bound to the service resource.
  const metric = (name: string, aggregationType: string) => ({
    resourceId: serviceResourceId,
    name,
    aggregationType
  })

  // Same as `metric`, with a percentile attached.
  const percentileMetric = (name: string, percentile: number) => ({
    ...metric(name, 'Percentile'),
    percentile
  })

  // Grid origin of each 6x4 tile and the series it plots.
  const charts = [
    // Request Rate
    { x: 0, y: 0, metrics: [metric('Requests', 'Count')] },
    // Response Time (P50, P95, P99)
    { x: 6, y: 0, metrics: [50, 95, 99].map(p => percentileMetric('ResponseTime', p)) },
    // Error Rate
    { x: 0, y: 4, metrics: [metric('Errors', 'Count')] },
    // CPU & Memory
    {
      x: 6,
      y: 4,
      metrics: [metric('CpuPercentage', 'Average'), metric('MemoryPercentage', 'Average')]
    }
  ]

  return new azure.portal.Dashboard(dashboardName, {
    dashboardName,
    lenses: [
      {
        order: 0,
        parts: charts.map(({ x, y, metrics }) => ({
          position: { x, y, colSpan: 6, rowSpan: 4 },
          metadata: {
            type: chartPartType,
            settings: {
              content: {
                chartType: 'Line',
                metrics
              }
            }
          }
        }))
      }
    ]
  })
}
Automated Alerting:
// platform/monitoring/alerts.ts
/**
 * Registers the standard metric alerts for a service: high error count,
 * slow average response time, and high average CPU.
 *
 * All alerts are scoped to `serviceResourceId` and notify
 * `platformActionGroup`.
 *
 * @param serviceName Prefix for the alert resource and rule names.
 * @param config      Optional overrides: `errorThreshold` (default 10) and
 *                    `responseTimeThreshold` (default 1000). An explicit `0`
 *                    is honoured.
 */
export function createServiceAlerts(serviceName: string, config: AlertConfig) {
  // Creates one single-criterion "GreaterThan" alert; only the fields that
  // differ between the three alerts are passed in.
  const createAlert = (spec: {
    resourceSuffix: string
    ruleSuffix: string
    metricName: string
    threshold: number
    timeAggregation: string
    windowSize: string
    evaluationFrequency: string
    severity: number
  }) =>
    new azure.insights.MetricAlert(`${serviceName}-${spec.resourceSuffix}`, {
      ruleName: `${serviceName}-${spec.ruleSuffix}`,
      scopes: [serviceResourceId],
      criteria: {
        allOf: [{
          metricName: spec.metricName,
          operator: 'GreaterThan',
          threshold: spec.threshold,
          timeAggregation: spec.timeAggregation,
          dimensions: []
        }]
      },
      windowSize: spec.windowSize,
      evaluationFrequency: spec.evaluationFrequency,
      severity: spec.severity,
      actions: [{
        actionGroupId: platformActionGroup.id
      }]
    })

  // High error rate. `??` (not `||`) so that a threshold of 0, meaning
  // "alert on any error", is not silently replaced by the default.
  createAlert({
    resourceSuffix: 'error-rate',
    ruleSuffix: 'high-error-rate',
    metricName: 'Errors',
    threshold: config.errorThreshold ?? 10,
    timeAggregation: 'Count',
    windowSize: 'PT5M',
    evaluationFrequency: 'PT1M',
    severity: 2
  })

  // Slow response time
  createAlert({
    resourceSuffix: 'slow-response',
    ruleSuffix: 'slow-response',
    metricName: 'ResponseTime',
    threshold: config.responseTimeThreshold ?? 1000,
    timeAggregation: 'Average',
    windowSize: 'PT5M',
    evaluationFrequency: 'PT1M',
    severity: 3
  })

  // High CPU usage
  createAlert({
    resourceSuffix: 'high-cpu',
    ruleSuffix: 'high-cpu',
    metricName: 'CpuPercentage',
    threshold: 80,
    timeAggregation: 'Average',
    windowSize: 'PT15M',
    evaluationFrequency: 'PT5M',
    severity: 3
  })
}
Developer Productivity:
Time to First Deploy:
Before Platform: 2-3 weeks
After Platform: 2-3 hours
Improvement: 95% reduction
Deployment Frequency:
Before: 2-3 times/week
After: 15-20 times/day
Improvement: 35x increase
Mean Time to Recovery (MTTR):
Before: 4-6 hours
After: 15-30 minutes
Improvement: 90% reduction
Service Creation Time:
Before: 1-2 days
After: 15 minutes
Improvement: 97% reduction
Platform Adoption:
// platform/metrics/adoption.ts
/**
 * Snapshot of platform adoption and health figures, as produced by
 * `calculateAdoption`.
 */
export interface PlatformMetrics {
  /** Number of currently active services. */
  activeServices: number

  /** Deployments in the last seven days divided by 7. */
  deploymentsPerDay: number

  /** Average of the deployments' `duration` values (unit not shown here - TODO confirm). */
  averageDeployTime: number

  /** Developer survey score. */
  developerSatisfaction: number

  /** Incident rate as reported by the incident source. */
  incidentRate: number

  /** Platform uptime figure. */
  platformUptime: number
}
/**
 * Collects the current platform adoption metrics.
 *
 * The five underlying reads do not depend on each other, so they are issued
 * concurrently instead of one after another. If any of them rejects, the
 * returned promise rejects with that error.
 *
 * @returns Active service count, deployments per day over the last seven
 *          days, average deployment duration, and the survey, incident and
 *          uptime figures.
 */
export async function calculateAdoption(): Promise<PlatformMetrics> {
  // Must stay in step with the window requested from last7Days().
  const windowDays = 7

  const [services, deployments, developerSatisfaction, incidentRate, platformUptime] =
    await Promise.all([
      getActiveServices(),
      getDeployments(last7Days()),
      getSurveyScore(),
      getIncidentRate(),
      getPlatformUptime()
    ])

  return {
    activeServices: services.length,
    deploymentsPerDay: deployments.length / windowDays,
    averageDeployTime: calculateAverage(
      deployments.map(d => d.duration)
    ),
    developerSatisfaction,
    incidentRate,
    platformUptime
  }
}
Cognitive Load Reduction:
Self-Service Success Rate:
Tasks Completed Without Help:
Infrastructure provisioning: 95%
Deployment: 98%
Monitoring setup: 92%
Secret management: 97%
Database creation: 93%
// platform/deployment/progressive-rollout.ts
/**
 * Shifts traffic to a new version in four fixed stages (5%, 25%, 75%, 100%).
 *
 * After each traffic change the service is monitored for the stage's duration
 * and its error rate is compared against `config.maxErrorRate`. The first
 * failed check triggers a rollback and aborts the rollout.
 *
 * @param config Reads `service` and `maxErrorRate`.
 * @throws Error when a health check fails or the error rate exceeds the limit
 *         (after the rollback has been attempted).
 */
export async function progressiveRollout(config: RolloutConfig) {
  // [stage name, % of traffic on the new version, observation time in seconds]
  const plan: Array<[string, number, number]> = [
    ['canary', 5, 300],   // 5% for 5 min
    ['early', 25, 600],   // 25% for 10 min
    ['majority', 75, 900], // 75% for 15 min
    ['full', 100, 0]      // 100%
  ]

  // Logs the reason, rolls the service back, then fails the rollout.
  const rollBackAndFail = async (logLine: string, failure: string): Promise<never> => {
    console.log(logLine)
    await rollback(config.service)
    throw new Error(failure)
  }

  for (const [name, traffic, duration] of plan) {
    console.log(`Rolling out to ${name}: ${traffic}% traffic`)

    // Update traffic split
    await updateTrafficSplit(config.service, {
      newVersion: traffic,
      oldVersion: 100 - traffic
    })

    // Monitor health
    const health = await monitorHealth(config.service, duration)
    if (!health.healthy) {
      await rollBackAndFail(
        'Health check failed, rolling back...',
        `Rollout failed at ${name} stage`
      )
    }

    // Check error rate
    const errorRate = await getErrorRate(config.service, duration)
    if (errorRate > config.maxErrorRate) {
      await rollBackAndFail(
        'Error rate exceeded threshold, rolling back...',
        `High error rate at ${name} stage: ${errorRate}%`
      )
    }

    console.log(`✅ ${name} stage successful`)
  }

  console.log('🎉 Rollout completed successfully!')
}
// platform/cost/optimizer.ts
/**
 * Reviews every service's usage metrics and collects cost recommendations.
 *
 * A service can receive up to three recommendations, in this order:
 * downsize CPU, shut down, and enable autoscaling.
 *
 * @returns All recommendations, grouped by service in iteration order.
 */
export async function analyzeCosts() {
  const allServices = await getAllServices()
  const recommendations: CostRecommendation[] = []

  for (const svc of allServices) {
    const metrics = await getUsageMetrics(svc.id)

    // Over-provisioned: average CPU below 30 and peak CPU below 50.
    const cpuIsOversized = metrics.cpu.average < 30 && metrics.cpu.max < 50
    if (cpuIsOversized) {
      recommendations.push({
        service: svc.name,
        type: 'downsize',
        resource: 'CPU',
        current: svc.resources.cpu,
        recommended: calculateOptimalCpu(metrics),
        savings: calculateSavings(svc, 'cpu')
      })
    }

    // Unused: shutting down saves the entire monthly cost.
    const hadNoTraffic = metrics.requests.last7Days === 0
    if (hadNoTraffic) {
      recommendations.push({
        service: svc.name,
        type: 'shutdown',
        reason: 'No traffic in last 7 days',
        savings: svc.monthlyCost
      })
    }

    // Fixed-size service with variable traffic: savings estimated at 30%.
    const shouldAutoscale = !svc.autoscaling && metrics.variability > 0.5
    if (shouldAutoscale) {
      recommendations.push({
        service: svc.name,
        type: 'enable-autoscaling',
        reason: 'High traffic variability detected',
        savings: svc.monthlyCost * 0.3
      })
    }
  }

  return recommendations
}
// platform/security/scanner.ts
/**
 * Runs the platform security checks for one service.
 *
 * Scans the container image, scans for exposed secrets, and runs the fixed
 * list of compliance checks, in that order. A recommendation is added when
 * any image vulnerability has severity `'critical'`.
 *
 * @param service Identifier passed to each scanner.
 * @returns An object with `vulnerabilities`, `secrets`, `compliance` and
 *          `recommendations` arrays.
 */
export async function securityScan(service: string) {
  // Scan container image
  const imageVulns = await scanContainerImage(service)

  // Check for exposed secrets
  const secrets = await scanForSecrets(service)

  // Compliance checks
  const compliance = await checkCompliance(service, [
    'encryption-at-rest',
    'encryption-in-transit',
    'authentication-required',
    'audit-logging-enabled',
    'backup-configured'
  ])

  // Generate recommendations. Typed explicitly: an untyped `[]` inside an
  // object literal is inferred as `never[]` under `strict` and rejects push().
  const recommendations: Array<{ type: string; message: string }> = []
  if (imageVulns.some(v => v.severity === 'critical')) {
    recommendations.push({
      type: 'critical',
      message: 'Critical vulnerabilities detected. Update base image immediately.'
    })
  }

  // Copies keep the result independent of the scanners' own arrays.
  return {
    vulnerabilities: [...imageVulns],
    secrets: [...secrets],
    compliance: [...compliance],
    recommendations
  }
}
Challenge: 1000+ microservices, fragmented tooling
Solution:
Results:
Challenge: Enable developers to own entire lifecycle
Solution:
Results:
Challenge: Rapid growth straining infrastructure team
Solution:
Results:
Week 1-2: Discovery
- Interview development teams
- Document pain points
- Identify common patterns
- Define platform requirements
Week 3-4: Core Infrastructure
- Set up IaC foundation
- Create first service template
- Build basic CI/CD pipeline
- Deploy pilot service
Week 5-8: Observability
- Implement monitoring stack
- Create dashboards
- Set up alerting
- Add logging aggregation
Week 9-12: Self-Service
- Build platform CLI
- Create documentation
- Onboard pilot teams
- Gather feedback
- Add more service templates
- Implement progressive rollouts
- Build developer portal
- Add cost tracking
- Expand to more teams
- Advanced automation
- AI-powered recommendations
- Cost optimization
- Security hardening
- Platform as Product mindset
Platform engineering represents a fundamental shift in how we build and operate software. By creating self-service internal developer platforms, organizations can dramatically improve developer productivity, reduce operational toil, and accelerate time to market.
The key is treating your platform as a product: understand your users (developers), iterate based on feedback, provide excellent documentation, and continuously improve the experience. When done right, platform engineering doesn't just make developers faster - it makes them happier.
At Cortara Labs, we help organizations design and build internal developer platforms tailored to their specific needs and culture. Whether you're starting from scratch or modernizing existing infrastructure, we can guide you on your platform engineering journey.
Discover how a well-designed internal developer platform can revolutionize your team's productivity. Contact us for a platform engineering assessment, or explore our services to see how we can help.
Follow @cortaralabs for insights on platform engineering, DevOps, and developer experience.