Loading content...
Modern applications are complex distributed systems with hundreds of services, thousands of dependencies, and millions of users. Traditional monitoring isn't enough - we need comprehensive observability and site reliability engineering (SRE) practices to ensure our systems are reliable, performant, and maintainable.
Traditional Monitoring:
Modern Observability:
1. Metrics: Quantitative measurements over time
2. Logs: Discrete events with context
3. Traces: Request flows through distributed systems
Plus Modern Additions:
Prometheus for Metrics:
// application/metrics.ts
import { Registry, Counter, Histogram, Gauge } from 'prom-client'

// Single registry every metric below attaches to; the /metrics endpoint
// serialises this registry for Prometheus to scrape.
export const register = new Registry()

// Label set shared by the HTTP request counter and the duration histogram.
const httpLabelNames = ['method', 'path', 'status']

/** Total HTTP requests, partitioned by method, route and status code. */
export const httpRequestsTotal = new Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: httpLabelNames,
  registers: [register]
})

/** HTTP request latency distribution in seconds (5ms–10s buckets). */
export const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration in seconds',
  labelNames: httpLabelNames,
  buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
  registers: [register]
})

/** Currently open connections; a gauge because the value moves both ways. */
export const activeConnections = new Gauge({
  name: 'active_connections',
  help: 'Number of active connections',
  registers: [register]
})

/** Database query latency distribution in seconds (1ms–1s buckets). */
export const dbQueryDuration = new Histogram({
  name: 'db_query_duration_seconds',
  help: 'Database query duration in seconds',
  labelNames: ['operation', 'table', 'status'],
  buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1],
  registers: [register]
})

// --- Business metrics --------------------------------------------------

/** Orders created, labelled by outcome status and payment method. */
export const ordersCreated = new Counter({
  name: 'orders_created_total',
  help: 'Total number of orders created',
  labelNames: ['status', 'payment_method'],
  registers: [register]
})

/** Revenue accumulated in dollars, labelled by product type. */
export const revenueGenerated = new Counter({
  name: 'revenue_generated_dollars',
  help: 'Total revenue generated in dollars',
  labelNames: ['product_type'],
  registers: [register]
})
Instrumentation Middleware:
// application/middleware/metrics.ts
import { Request, Response, NextFunction } from 'express'
import { httpRequestsTotal, httpRequestDuration } from '../metrics'

/**
 * Express middleware that records one count and one duration sample per
 * request, once the response has finished streaming.
 */
export function metricsMiddleware(req: Request, res: Response, next: NextFunction) {
  const startedAt = Date.now()
  res.on('finish', () => {
    const elapsedSeconds = (Date.now() - startedAt) / 1000
    // Prefer the matched route pattern over the raw URL to keep label
    // cardinality bounded; fall back to req.path when no route matched.
    const labelValues = {
      method: req.method,
      path: req.route?.path || req.path,
      status: String(res.statusCode)
    }
    httpRequestsTotal.inc(labelValues)
    httpRequestDuration.observe(labelValues, elapsedSeconds)
  })
  next()
}
// Expose metrics endpoint for Prometheus to scrape.
// NOTE(review): assumes an Express `app` instance and the `register`
// Registry from ../metrics are in scope in the full file — confirm.
app.get('/metrics', async (req, res) => {
// Content type must match the registry's exposition format.
res.set('Content-Type', register.contentType)
res.end(await register.metrics())
})
Custom Business Metrics:
// application/services/order-service.ts
import { ordersCreated, revenueGenerated } from '../metrics'

/** Order creation workflow instrumented with business-level metrics. */
export class OrderService {
  /**
   * Persists a new order, then records the metrics that track order
   * volume and revenue for that order.
   */
  async createOrder(orderData: OrderData) {
    const order = await this.db.orders.create(orderData)

    // Count the order, labelled by outcome and payment method.
    ordersCreated.inc({ status: order.status, payment_method: order.paymentMethod })

    // Accumulate revenue in dollars for this product type.
    revenueGenerated.inc({ product_type: order.productType }, order.total)

    return order
  }
}
OpenTelemetry Implementation:
// application/tracing.ts
import { NodeSDK } from '@opentelemetry/sdk-node'
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node'
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'
import { Resource } from '@opentelemetry/resources'
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions'

// Allow the collector endpoint to be overridden per environment; the
// previously hard-coded Jaeger URL stays as the default, so deployments
// with the variable unset behave exactly as before.
const traceEndpoint =
  process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT || 'http://jaeger:4318/v1/traces'

const sdk = new NodeSDK({
  // Resource attributes identify this process on every exported span.
  resource: new Resource({
    [SemanticResourceAttributes.SERVICE_NAME]: 'api-service',
    [SemanticResourceAttributes.SERVICE_VERSION]: process.env.VERSION || 'dev',
    [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: process.env.NODE_ENV
  }),
  traceExporter: new OTLPTraceExporter({ url: traceEndpoint }),
  // Auto-instrument the layers this service actually uses.
  instrumentations: [
    getNodeAutoInstrumentations({
      '@opentelemetry/instrumentation-http': { enabled: true },
      '@opentelemetry/instrumentation-express': { enabled: true },
      '@opentelemetry/instrumentation-pg': { enabled: true },
      '@opentelemetry/instrumentation-redis': { enabled: true }
    })
  ]
})

sdk.start()

// Graceful shutdown: flush buffered spans before the process exits.
process.on('SIGTERM', () => {
  sdk.shutdown()
    .then(() => console.log('Tracing terminated'))
    // Errors belong on stderr (the original used console.log here).
    .catch((error) => console.error('Error terminating tracing', error))
    .finally(() => process.exit(0))
})
Custom Spans for Business Logic:
// application/services/payment-service.ts
import { trace, context, SpanStatusCode } from '@opentelemetry/api'

export class PaymentService {
  private tracer = trace.getTracer('payment-service')

  /**
   * Charges a payment through the gateway, wrapping each phase in its own
   * span. Child spans are ended in `finally` blocks so they are not leaked
   * when validation or the gateway call throws — the original ended them
   * only on the success path.
   */
  async processPayment(paymentData: PaymentData) {
    return await this.tracer.startActiveSpan('process-payment', async (span) => {
      try {
        // Attributes describing this payment attempt.
        span.setAttribute('payment.amount', paymentData.amount)
        span.setAttribute('payment.method', paymentData.method)
        span.setAttribute('payment.currency', paymentData.currency)

        // Validate payment — span always ends, even if validation throws.
        await this.tracer.startActiveSpan('validate-payment', async (validateSpan) => {
          try {
            await this.validatePayment(paymentData)
          } finally {
            validateSpan.end()
          }
        })

        // Process with payment gateway.
        const result = await this.tracer.startActiveSpan('gateway-request', async (gatewaySpan) => {
          try {
            gatewaySpan.setAttribute('gateway.provider', 'stripe')
            const charge = await this.stripe.charges.create({
              amount: paymentData.amount,
              currency: paymentData.currency,
              source: paymentData.token
            })
            gatewaySpan.setAttribute('gateway.transaction_id', charge.id)
            return charge
          } finally {
            gatewaySpan.end()
          }
        })

        // Record success on the parent span.
        span.setStatus({ code: SpanStatusCode.OK })
        span.setAttribute('payment.transaction_id', result.id)
        return result
      } catch (error) {
        // `error` is `unknown` under strict mode; narrow before use.
        const message = error instanceof Error ? error.message : String(error)
        span.setStatus({ code: SpanStatusCode.ERROR, message })
        if (error instanceof Error) {
          span.recordException(error)
        }
        throw error
      } finally {
        span.end()
      }
    })
  }
}
Structured Logger Setup:
// application/logger.ts
import winston from 'winston'
import { trace, context } from '@opentelemetry/api'

// Injects the active OpenTelemetry trace/span ids into every log record so
// logs can be correlated with distributed traces.
const addTraceContext = winston.format((info) => {
  const span = trace.getSpan(context.active())
  if (span) {
    const spanContext = span.spanContext()
    info.trace_id = spanContext.traceId
    info.span_id = spanContext.spanId
  }
  return info
})

// Structured JSON logger. Trace context is part of the initial format chain
// (the original mutated `logger.format` after creation to prepend it — same
// effective ordering, but defining it up front avoids post-hoc mutation).
const logger = winston.createLogger({
  level: process.env.LOG_LEVEL || 'info',
  format: winston.format.combine(
    addTraceContext(),
    winston.format.timestamp(),
    winston.format.errors({ stack: true }),
    winston.format.json()
  ),
  // Attached to every record emitted by this logger.
  defaultMeta: {
    service: 'api-service',
    environment: process.env.NODE_ENV,
    version: process.env.VERSION
  },
  transports: [
    new winston.transports.Console(),
    // Errors additionally go to a dedicated file.
    new winston.transports.File({ filename: 'logs/error.log', level: 'error' }),
    new winston.transports.File({ filename: 'logs/combined.log' })
  ]
})

export default logger
Contextual Logging:
// application/middleware/request-logger.ts
import logger from '../logger'
import { v4 as uuidv4 } from 'uuid'
export function requestLogger(req: Request, res: Response, next: NextFunction) {
const requestId = req.header('X-Request-ID') || uuidv4()
const start = Date.now()
// Add request ID to request object
req.requestId = requestId
res.setHeader('X-Request-ID', requestId)
// Log request
logger.info('Incoming request', {
request_id: requestId,
method: req.method,
path: req.path,
query: req.query,
ip: req.ip,
user_agent: req.get('user-agent')
})
res.on('finish', () => {
const duration = Date.now() - start
logger.info('Request completed', {
request_id: requestId,
method: req.method,
path: req.path,
status: res.statusCode,
duration_ms: duration
})
})
next()
}
Application Logging Best Practices:
// application/services/user-service.ts
import logger from '../logger'

export class UserService {
  /**
   * Creates a user after rejecting duplicate email addresses.
   * Throws ConflictError when the email is already registered.
   */
  async createUser(userData: UserData) {
    logger.info('Creating new user', {
      operation: 'create_user',
      email: userData.email,
      source: userData.registrationSource
    })
    try {
      // Reject duplicates before attempting the insert.
      const existing = await this.db.users.findByEmail(userData.email)
      if (existing) {
        logger.warn('User creation failed: email already exists', {
          operation: 'create_user',
          email: userData.email,
          reason: 'duplicate_email'
        })
        throw new ConflictError('Email already registered')
      }

      const user = await this.db.users.create(userData)
      logger.info('User created successfully', {
        operation: 'create_user',
        user_id: user.id,
        email: user.email
      })
      return user
    } catch (error) {
      // The duplicate-email case was already logged at warn level above;
      // the original logged it a second time at error level, double-counting
      // an expected failure. Only unexpected errors are logged here.
      if (!(error instanceof ConflictError)) {
        // `error` is `unknown` under strict mode; narrow before reading fields.
        const err = error instanceof Error ? error : new Error(String(error))
        logger.error('User creation failed', {
          operation: 'create_user',
          email: userData.email,
          error: err.message,
          stack: err.stack
        })
      }
      throw error
    }
  }
}
Client-Side Performance Tracking:
// frontend/monitoring/rum.ts
import { trace, context } from '@opentelemetry/api'
// BatchSpanProcessor was used below but never imported in the original;
// @opentelemetry/sdk-trace-web re-exports it from sdk-trace-base.
import { WebTracerProvider, BatchSpanProcessor } from '@opentelemetry/sdk-trace-web'
import { getWebAutoInstrumentations } from '@opentelemetry/auto-instrumentations-web'
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'
// Core Web Vitals collectors plus the Metric type (also previously missing).
import { getCLS, getFID, getFCP, getLCP, getTTFB, Metric } from 'web-vitals'

const provider = new WebTracerProvider()

const exporter = new OTLPTraceExporter({
  url: 'https://api.myapp.com/v1/traces'
})

// Batch spans before export to keep network overhead low in the browser.
provider.addSpanProcessor(new BatchSpanProcessor(exporter))

provider.register({
  instrumentations: [
    getWebAutoInstrumentations({
      '@opentelemetry/instrumentation-document-load': {},
      '@opentelemetry/instrumentation-user-interaction': {},
      '@opentelemetry/instrumentation-fetch': {},
      '@opentelemetry/instrumentation-xml-http-request': {}
    })
  ]
})

// Track Core Web Vitals: POST each metric to the analytics endpoint.
// `keepalive` lets the request survive page unload.
function sendToAnalytics(metric: Metric) {
  fetch('/api/analytics/web-vitals', {
    method: 'POST',
    body: JSON.stringify({
      name: metric.name,
      value: metric.value,
      id: metric.id,
      delta: metric.delta,
      rating: metric.rating
    }),
    keepalive: true
  }).catch(() => {
    // Analytics delivery is best-effort; never surface errors to the page.
  })
}

getCLS(sendToAnalytics)
getFID(sendToAnalytics)
getFCP(sendToAnalytics)
getLCP(sendToAnalytics)
getTTFB(sendToAnalytics)

/**
 * Records a custom user interaction as a one-shot span carrying the given
 * properties as span attributes.
 */
export function trackEvent(name: string, properties: Record<string, any>) {
  const tracer = trace.getTracer('user-interaction')
  tracer.startActiveSpan(name, (span) => {
    Object.entries(properties).forEach(([key, value]) => {
      span.setAttribute(key, value)
    })
    span.end()
  })
}

// Usage (assumes a `currentUser` object is in scope at the call site)
trackEvent('button_click', {
  button_id: 'checkout',
  page: '/cart',
  user_id: currentUser.id
})
Defining SLOs:
// sre/slos.ts
/** A service-level objective: a measurable reliability target over a rolling window. */
export interface SLO {
name: string
description: string
target: number // percentage (e.g., 99.9)
window: string // time window (e.g., '30d')
sli: ServiceLevelIndicator // how the objective is measured
}
/** The measurement behind an SLO: a Prometheus query plus an optional threshold. */
export interface ServiceLevelIndicator {
type: 'availability' | 'latency' | 'error_rate'
query: string // PromQL expression producing the indicator value
threshold?: number // only meaningful for latency SLIs (seconds)
}
// SLOs actively tracked for this service. Each entry pairs a numeric target
// over a rolling window with the PromQL query used as its SLI.
export const slos: SLO[] = [
// Availability: share of requests answered with a 2xx or 3xx status.
{
name: 'API Availability',
description: '99.9% of API requests return 2xx or 3xx status',
target: 99.9,
window: '30d',
sli: {
type: 'availability',
query: `
sum(rate(http_requests_total{status=~"2..|3.."}[5m]))
/
sum(rate(http_requests_total[5m]))
`
}
},
// Latency: P95 of the request-duration histogram, compared to `threshold`.
{
name: 'API Latency (P95)',
description: '95% of requests complete within 500ms',
target: 95,
window: '30d',
sli: {
type: 'latency',
query: `
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
)
`,
threshold: 0.5 // 500ms
}
},
// Database latency: P99 of query duration, compared to `threshold`.
{
name: 'Database Query Performance',
description: '99% of queries complete within 100ms',
target: 99,
window: '30d',
sli: {
type: 'latency',
query: `
histogram_quantile(0.99,
sum(rate(db_query_duration_seconds_bucket[5m])) by (le)
)
`,
threshold: 0.1 // 100ms
}
},
// Error rate: fraction of requests that returned a 5xx status.
{
name: 'Error Budget',
description: 'Less than 0.1% error rate',
target: 99.9,
window: '30d',
sli: {
type: 'error_rate',
query: `
sum(rate(http_requests_total{status=~"5.."}[5m]))
/
sum(rate(http_requests_total[5m]))
`
}
}
]
Error Budget Calculation:
// sre/error-budget.ts

/**
 * Computes how much of an SLO's error budget has been consumed given the
 * measured SLI (both expressed as percentages, e.g. 99.9).
 *
 * Consumption is the ratio of the actual failure rate to the allowed
 * failure rate: exactly at the target it is 100%, and it exceeds 100%
 * (remaining goes negative) when the target is being missed.
 */
export function calculateErrorBudget(slo: SLO, actualSLI: number): ErrorBudget {
  const allowedFailureRate = (100 - slo.target) / 100
  const actualFailureRate = (100 - actualSLI) / 100

  // Guard the degenerate 100% target: the original divided by zero here,
  // producing NaN (0/0) or Infinity instead of a meaningful percentage.
  const errorBudgetConsumed =
    allowedFailureRate === 0
      ? (actualFailureRate > 0 ? Infinity : 0)
      : (actualFailureRate / allowedFailureRate) * 100
  const errorBudgetRemaining = 100 - errorBudgetConsumed

  return {
    sloName: slo.name,
    target: slo.target,
    actual: actualSLI,
    allowedFailureRate,
    actualFailureRate,
    errorBudgetConsumed,
    errorBudgetRemaining,
    // Exactly-zero remaining budget counts as exhausted.
    status: errorBudgetRemaining > 0 ? 'healthy' : 'exhausted',
    recommendation: getRecommendation(errorBudgetRemaining)
  }
}

/** Maps remaining budget (percent) to an operational recommendation. */
function getRecommendation(remaining: number): string {
  if (remaining > 50) {
    return 'Healthy - Continue development'
  } else if (remaining > 20) {
    return 'Warning - Monitor closely'
  } else if (remaining > 0) {
    return 'Critical - Pause feature work, focus on reliability'
  } else {
    return 'Budget exhausted - Emergency reliability focus'
  }
}
// Example usage
const availability = calculateErrorBudget(slos[0], 99.85)
console.log(availability)
// Expected output shape. Numeric values are shown rounded — IEEE-754
// arithmetic actually yields e.g. ~150.0000000000007 for errorBudgetConsumed:
// {
//   sloName: 'API Availability',
//   target: 99.9,
//   actual: 99.85,
//   allowedFailureRate: 0.001,
//   actualFailureRate: 0.0015,
//   errorBudgetConsumed: 150,
//   errorBudgetRemaining: -50,
//   status: 'exhausted',
//   recommendation: 'Budget exhausted - Emergency reliability focus'
// }
Alert Pyramid:
// sre/alerts.ts
/** Alert severity ladder ("alert pyramid"): how urgently a human must react. */
export enum AlertSeverity {
Critical = 'critical', // Page immediately
High = 'high', // Alert within 1 hour
Medium = 'medium', // Alert within 1 day
Low = 'low' // No alert, log only
}
/** A Prometheus-style alerting rule plus the annotations shown to responders. */
export interface Alert {
name: string
severity: AlertSeverity
condition: string // PromQL expression; the alert fires when it holds for `duration`
duration: string // how long the condition must hold before firing (e.g. '5m')
annotations: {
summary: string
description: string
runbook: string // link to the remediation runbook
}
}
// Alert catalogue, ordered roughly by severity. Conditions are PromQL;
// `duration` is the for-clause (the condition must hold that long to fire).
export const alerts: Alert[] = [
// Critical: Service down (the `up` metric is exported by Prometheus itself)
{
name: 'ServiceDown',
severity: AlertSeverity.Critical,
condition: `
up{job="api-service"} == 0
`,
duration: '1m',
annotations: {
summary: 'Service {{ $labels.instance }} is down',
description: 'Service has been down for more than 1 minute',
runbook: 'https://runbooks.myapp.com/service-down'
}
},
// Critical: High error rate (5xx share of all requests above 5%)
{
name: 'HighErrorRate',
severity: AlertSeverity.Critical,
condition: `
(
sum(rate(http_requests_total{status=~"5.."}[5m]))
/
sum(rate(http_requests_total[5m]))
) > 0.05
`,
duration: '5m',
annotations: {
summary: 'High error rate detected',
description: 'Error rate is {{ $value | humanizePercentage }} (threshold: 5%)',
runbook: 'https://runbooks.myapp.com/high-error-rate'
}
},
// High: Slow response time (P95 latency above 1 second)
{
name: 'SlowResponseTime',
severity: AlertSeverity.High,
condition: `
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
) > 1.0
`,
duration: '10m',
annotations: {
summary: 'Slow response time detected',
description: 'P95 latency is {{ $value }}s (threshold: 1s)',
runbook: 'https://runbooks.myapp.com/slow-response'
}
},
// High: Database connection pool exhausted (above 90% utilisation)
{
name: 'DatabasePoolExhausted',
severity: AlertSeverity.High,
condition: `
(
db_connections_used
/
db_connections_max
) > 0.9
`,
duration: '5m',
annotations: {
summary: 'Database connection pool nearly exhausted',
description: 'Using {{ $value | humanizePercentage }} of available connections',
runbook: 'https://runbooks.myapp.com/db-pool-exhausted'
}
},
// Medium: High memory usage (above 85% of total, via node_exporter metrics)
{
name: 'HighMemoryUsage',
severity: AlertSeverity.Medium,
condition: `
(
node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
) / node_memory_MemTotal_bytes > 0.85
`,
duration: '15m',
annotations: {
summary: 'High memory usage',
description: 'Memory usage is {{ $value | humanizePercentage }}',
runbook: 'https://runbooks.myapp.com/high-memory'
}
},
// Medium: Error budget at risk (over 80% of the 30d 0.1% budget consumed)
{
name: 'ErrorBudgetAtRisk',
severity: AlertSeverity.Medium,
condition: `
(
1 - (sum(rate(http_requests_total{status=~"2..|3.."}[30d]))
/ sum(rate(http_requests_total[30d])))
) / 0.001 > 0.8
`,
duration: '1h',
annotations: {
summary: 'Error budget consumption at 80%',
description: 'Error budget will be exhausted soon if trend continues',
runbook: 'https://runbooks.myapp.com/error-budget-risk'
}
}
]
Alert Routing:
# alertmanager.yml
route:
receiver: 'default'
group_by: ['alertname', 'cluster']
group_wait: 10s
group_interval: 5m
repeat_interval: 4h
routes:
# Critical alerts: Page on-call engineer
- match:
severity: critical
receiver: 'pagerduty'
continue: true
# High priority: Slack + Email
- match:
severity: high
receiver: 'slack-high'
continue: true
# Medium priority: Slack only
- match:
severity: medium
receiver: 'slack-medium'
receivers:
- name: 'pagerduty'
pagerduty_configs:
- service_key: '<key>'
description: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
- name: 'slack-high'
slack_configs:
- api_url: '<webhook>'
channel: '#alerts-high'
title: '{{ .GroupLabels.alertname }}'
text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
- name: 'slack-medium'
slack_configs:
- api_url: '<webhook>'
channel: '#alerts-medium'
title: '{{ .GroupLabels.alertname }}'
text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
Incident Management Workflow:
// sre/incident-management.ts
/** Incident severity: SEV1 pages immediately; SEV3 can wait for business hours. */
export enum IncidentSeverity {
SEV1 = 'sev1', // Critical outage
SEV2 = 'sev2', // Major degradation
SEV3 = 'sev3', // Minor issue
}
/** Full lifecycle record for an incident, from detection through review. */
export interface Incident {
id: string
severity: IncidentSeverity
title: string
description: string
status: 'investigating' | 'identified' | 'monitoring' | 'resolved'
startTime: Date
endTime?: Date // unset while the incident is still open
affectedServices: string[]
impactedUsers?: number // best-effort estimate, when known
commander: string // single owner coordinating the response
responders: string[]
timeline: IncidentEvent[] // append-only record of what happened when
rootCause?: string // filled in during or after investigation
actionItems: ActionItem[] // follow-ups from the post-incident review
}
/** One timestamped entry in an incident's timeline. */
export interface IncidentEvent {
timestamp: Date
type: 'detected' | 'escalated' | 'update' | 'resolved'
description: string
author: string
}
/** A follow-up task produced by the post-incident review. */
export interface ActionItem {
id: string
description: string
assignee: string
priority: 'high' | 'medium' | 'low'
status: 'open' | 'in-progress' | 'completed'
dueDate?: Date
}
// Automated incident creation from alerts

/**
 * Opens an incident record for a firing alert: assigns the current on-call
 * engineer as commander, files a tracking ticket, notifies the team, and
 * pages on-call for SEV1 incidents.
 */
export async function createIncidentFromAlert(alert: Alert) {
  const incident: Incident = {
    id: generateId(),
    severity: mapAlertSeverityToIncidentSeverity(alert.severity),
    title: alert.annotations.summary,
    description: alert.annotations.description,
    status: 'investigating',
    startTime: new Date(),
    affectedServices: extractServicesFromAlert(alert),
    commander: await getOnCallEngineer(),
    responders: [],
    // Seed the timeline with the automated detection event.
    timeline: [{
      timestamp: new Date(),
      type: 'detected',
      description: 'Incident automatically created from alert',
      author: 'system'
    }],
    actionItems: []
  }

  // Ticket creation and the team notification are independent — run them
  // concurrently instead of awaiting them back to back.
  await Promise.all([
    createJiraTicket(incident),
    notifySlack(incident)
  ])

  // Page on-call only for critical incidents.
  if (incident.severity === IncidentSeverity.SEV1) {
    await pageOnCall(incident)
  }

  return incident
}
Post-Incident Review Template:
# Post-Incident Review: [Incident Title]
**Incident ID**: INC-2026-001
**Date**: 2026-02-07
**Duration**: 45 minutes
**Severity**: SEV2
**Status**: Resolved
## Summary
Brief description of what happened.
## Impact
- **Users Affected**: ~5,000 users
- **Services Affected**: Payment API, Order Processing
- **Revenue Impact**: $10,000 estimated
- **Availability**: 99.85% (below 99.9% SLO)
## Timeline
- **14:23 UTC**: Alert triggered - High error rate on payment API
- **14:25 UTC**: On-call engineer acknowledged
- **14:30 UTC**: Root cause identified - Database connection pool exhausted
- **14:35 UTC**: Mitigation deployed - Increased connection pool size
- **14:45 UTC**: Service recovered - Error rate back to normal
- **15:00 UTC**: Monitoring confirmed - Incident resolved
## Root Cause
Database connection pool was sized for normal load (50 connections) but traffic spike from marketing campaign caused connection exhaustion.
## Detection
- Automated alert triggered after 5 minutes of elevated error rate
- Time to detection: 5 minutes from first elevated errors (acceptable)
- Alert fired correctly
## Response
**What Went Well**:
- Quick identification of root cause
- Effective communication in incident channel
- Clear runbooks helped responders
**What Went Wrong**:
- No capacity planning for traffic spikes
- Monitoring didn't catch connection pool filling up
- Manual intervention required (no auto-scaling)
## Action Items
1. **[P0]** Implement auto-scaling for database connection pool - @john - Due: 2026-02-14
2. **[P1]** Add monitoring for connection pool utilization - @sarah - Due: 2026-02-10
3. **[P1]** Create capacity planning process for marketing campaigns - @mike - Due: 2026-02-21
4. **[P2]** Document connection pool sizing guidelines - @lisa - Due: 2026-02-28
5. **[P2]** Run load test with 2x expected traffic - @team - Due: 2026-03-07
## Lessons Learned
- Need better coordination between marketing and engineering
- Capacity planning should include traffic spike scenarios
- Auto-scaling critical for resilience
**Questions for Discussion**:
1. Should we implement circuit breakers?
2. Do we need a dedicated load testing environment?
3. Should marketing campaigns require engineering review?
// chaos/experiments.ts
import { ChaosExperiment } from './framework'
// Chaos experiments: each injects a fault into a target for `duration` and
// fails if the named SLO's threshold is breached while the fault is active.
// NOTE(review): ChaosExperiment's exact semantics live in ./framework — the
// descriptions below reflect only the configuration visible here.
export const experiments = [
// Latency injection
new ChaosExperiment({
name: 'database-latency',
description: 'Inject 500ms latency to database queries',
target: { service: 'api', component: 'database' },
fault: {
type: 'latency',
duration: 500,
percentage: 10 // Affect 10% of requests
},
duration: '5m',
successCriteria: {
slo: 'API Latency (P95)',
threshold: 1000 // Should stay below 1s
}
}),
// Instance failure
new ChaosExperiment({
name: 'instance-failure',
description: 'Terminate one instance',
target: { service: 'api', instances: 1 },
fault: {
type: 'terminate'
},
duration: '10m',
successCriteria: {
slo: 'API Availability',
threshold: 99.9 // Should maintain availability
}
}),
// Network partition
new ChaosExperiment({
name: 'network-partition',
description: 'Block network traffic between services',
target: {
from: 'api-service',
to: 'cache-service'
},
fault: {
type: 'network-block'
},
duration: '3m',
successCriteria: {
slo: 'API Availability',
threshold: 99.9 // Should degrade gracefully
}
})
]
// monitoring/synthetic.ts
import playwright from 'playwright'

/**
 * Synthetic monitor: drives a real browser through the critical user
 * journey (home page → login → dashboard) and checks the health API,
 * reporting duration and pass/fail as a metric.
 *
 * Fixes vs. the original: context/page creation now happens inside the
 * try block (a failure there previously leaked the browser process), and
 * the debug screenshot is best-effort so it cannot mask the real error.
 */
export async function runSyntheticTest() {
  const browser = await playwright.chromium.launch()
  const start = Date.now()
  let page
  try {
    const context = await browser.newContext()
    page = await context.newPage()

    // Navigate to homepage and wait for the network to settle.
    await page.goto('https://myapp.com')
    await page.waitForLoadState('networkidle')

    // Exercise the login flow.
    // NOTE(review): credentials are hard-coded test values — confirm they
    // belong to a dedicated synthetic-monitoring account, not a real user.
    await page.click('#login-button')
    await page.fill('#email', 'test@example.com')
    await page.fill('#password', 'test123')
    await page.click('#submit')

    // Verify dashboard loads.
    await page.waitForSelector('#dashboard')

    // Check critical API health endpoint.
    const response = await page.request.get('https://api.myapp.com/health')
    if (!response.ok()) {
      throw new Error('Health check failed')
    }

    // Report success with end-to-end duration.
    await reportMetric({
      name: 'synthetic_test_duration',
      value: Date.now() - start,
      status: 'success'
    })
  } catch (error) {
    // `error` is `unknown` under strict mode; narrow before reading .message.
    const message = error instanceof Error ? error.message : String(error)
    await reportMetric({
      name: 'synthetic_test_duration',
      value: Date.now() - start,
      status: 'failure',
      error: message
    })
    // Screenshot for debugging — best-effort; must not mask the original error.
    if (page) {
      await page.screenshot({ path: 'failure.png' }).catch(() => {})
    }
    throw error
  } finally {
    await browser.close()
  }
}

// Run every 5 minutes. The wrapper catches rejections so a failing run is
// logged instead of becoming an unhandled promise rejection in setInterval.
setInterval(() => {
  void runSyntheticTest().catch((error) => console.error('Synthetic test failed', error))
}, 5 * 60 * 1000)
Mean Time to Detect (MTTD):
Before: 25 minutes
After: 3 minutes
Improvement: 88% reduction
Mean Time to Resolve (MTTR):
Before: 4 hours
After: 45 minutes
Improvement: 81% reduction
Incident Frequency:
Before: 12 per month
After: 3 per month
Improvement: 75% reduction
SLO Achievement:
Availability: 99.95% (target: 99.9%)
Latency P95: 380ms (target: 500ms)
Error Rate: 0.03% (target: 0.1%)
Observability and SRE practices are essential for operating reliable systems at scale. By implementing comprehensive monitoring, distributed tracing, structured logging, and SLO-driven development, organizations can build systems that are both highly reliable and easy to debug when issues occur.
The key is treating observability as a first-class concern from day one, not as an afterthought. Instrument your code, define clear SLOs, automate alerting, and continuously improve based on incidents and metrics.
At Cortara Labs, we help organizations build observable, reliable systems with proper SRE practices. From instrumentation to incident response, we ensure your systems can handle production demands.
Learn how comprehensive observability and SRE practices can transform your operations. Contact us for an observability assessment, or explore our services to see how we can help.
Follow @cortaralabs for insights on observability, SRE, and building reliable systems.