Skip to content

Observability

Observability in TinySystems covers logging, metrics, and tracing. This guide helps you understand and monitor your modules in production.

Overview

┌─────────────────────────────────────────────────────────────────────────────┐
│                         OBSERVABILITY STACK                                  │
└─────────────────────────────────────────────────────────────────────────────┘

  Module Pods                  Collectors                    Backends
      │                           │                             │
      │  ┌─────────────────────┐  │                             │
      │  │ Logs (stdout/stderr)│──┼──▶ Fluentd/Fluent Bit ─────▶ Loki/ES
      │  └─────────────────────┘  │                             │
      │                           │                             │
      │  ┌─────────────────────┐  │                             │
      │  │ Metrics (Prometheus)│──┼──▶ Prometheus Scrape ──────▶ Prometheus
      │  └─────────────────────┘  │                             │
      │                           │                             │
      │  ┌─────────────────────┐  │                             │
      │  │ Traces (OTLP)       │──┼──▶ OTEL Collector ─────────▶ Tempo/Jaeger
      │  └─────────────────────┘  │                             │
      │                           │                             │

Logging

Structured Logging

Use structured logging for better searchability:

go
import "sigs.k8s.io/controller-runtime/pkg/log"

// Handle processes one message received on port, logging the start of the
// work and any failure through the logger carried by ctx.
//
// It returns the processing error unchanged, or nil on success.
func (c *Component) Handle(ctx context.Context, output module.Handler, port string, msg any) error {
    logger := log.FromContext(ctx)

    logger.Info("processing message",
        "port", port,
        "type", fmt.Sprintf("%T", msg),
    )

    // Run the component's processing step so there is a concrete error to
    // report; c.process stands in for whatever work the component does.
    if err := c.process(ctx, output, msg); err != nil {
        logger.Error(err, "processing failed",
            "port", port,
            "input", msg,
        )
        return err
    }

    return nil
}

Log Levels

| Level | Use Case | Example |
|-------|----------|---------|
| Error | Failures requiring attention | Connection failed, data corruption |
| Info | Normal operations | Message processed, state changed |
| Debug | Development details | Full payload, timing |
| V(1) | Verbose | Every function entry/exit |

Log Context

Add context to all logs:

go
// Handle shows how to derive a logger that carries fixed key/value pairs:
// every entry written through it includes the component name and node name
// in addition to the per-call fields.
func (c *Component) Handle(ctx context.Context, output module.Handler, port string, msg any) error {
    logger := log.FromContext(ctx).WithValues(
        "component", c.GetInfo().Name,
        "node", c.nodeName,
    )

    logger.Info("handling message", "port", port)
    // All subsequent logs include component and node

    return nil
}

Metrics

Prometheus Metrics

Define metrics for your component:

go
import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
)

// messagesProcessed counts processed messages, partitioned by component,
// port and status.
var messagesProcessed = promauto.NewCounterVec(
    prometheus.CounterOpts{
        Name: "tinysystems_messages_processed_total",
        Help: "Total number of messages processed",
    },
    []string{"component", "port", "status"},
)

// processingDuration records how long processing takes, in seconds, using
// the client library's default buckets, partitioned by component and port.
var processingDuration = promauto.NewHistogramVec(
    prometheus.HistogramOpts{
        Name:    "tinysystems_processing_duration_seconds",
        Help:    "Message processing duration",
        Buckets: prometheus.DefBuckets,
    },
    []string{"component", "port"},
)

// activeConnections reports the current number of active connections per
// component.
var activeConnections = promauto.NewGaugeVec(
    prometheus.GaugeOpts{
        Name: "tinysystems_active_connections",
        Help: "Number of active connections",
    },
    []string{"component"},
)

Using Metrics

go
// Handle runs the component's processing step and records two metrics for
// the call: its duration and whether it ended in success or error.
func (c *Component) Handle(ctx context.Context, output module.Handler, port string, msg any) error {
    began := time.Now()
    // Deferred so the duration is observed on every return path.
    defer func() {
        processingDuration.WithLabelValues(c.name, port).Observe(time.Since(began).Seconds())
    }()

    err := c.process(ctx, output, msg)

    // Start from the failure label and switch it only for a clean result.
    outcome := "error"
    if err == nil {
        outcome = "success"
    }
    messagesProcessed.WithLabelValues(c.name, port, outcome).Inc()

    return err
}

Standard Metrics

Recommended metrics for all components:

| Metric | Type | Labels | Description |
|--------|------|--------|-------------|
| `_messages_total` | Counter | component, port, status | Messages processed |
| `_duration_seconds` | Histogram | component, port | Processing time |
| `_errors_total` | Counter | component, error_type | Errors by type |
| `_active_operations` | Gauge | component | In-flight operations |

Tracing

OpenTelemetry Integration

go
import (
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/codes"
    "go.opentelemetry.io/otel/trace"
)

// tracer is the package-level tracer obtained from otel.Tracer under the
// name "my-module".
var tracer = otel.Tracer("my-module")

// Handle wraps message processing in a span named "Handle" that is tagged
// with the component name and port.
//
// On failure the error is recorded on the span, the span status is set to
// Error, and the error is returned. On success the status is set to Ok.
func (c *Component) Handle(ctx context.Context, output module.Handler, port string, msg any) error {
    ctx, span := tracer.Start(ctx, "Handle",
        trace.WithAttributes(
            attribute.String("component", c.name),
            attribute.String("port", port),
        ),
    )
    defer span.End()

    // Pass the span-carrying ctx down so nested work can join this trace;
    // c.process stands in for whatever work the component does.
    if err := c.process(ctx, output, msg); err != nil {
        span.RecordError(err)
        span.SetStatus(codes.Error, err.Error())
        return err
    }

    span.SetStatus(codes.Ok, "")
    return nil
}

Trace Context Propagation

Traces propagate across modules:

go
// Automatic via gRPC metadata
// Manual for HTTP:
// doRequest writes the trace context held by ctx into req's headers and then
// sends req with c.client, returning that call's response and error.
func (c *HTTPClient) doRequest(ctx context.Context, req *http.Request) (*http.Response, error) {
    // View the outgoing headers as a carrier the propagator can write into.
    carrier := propagation.HeaderCarrier(req.Header)
    otel.GetTextMapPropagator().Inject(ctx, carrier)

    return c.client.Do(req)
}

Health Endpoints

Liveness Probe

go
// Liveness endpoint: always answers 200 with the body "ok".
http.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) {
    w.WriteHeader(http.StatusOK)
    _, _ = w.Write([]byte("ok"))
})

Readiness Probe

go
// readinessHandler answers 200 "ready" when isReady reports true and
// 503 "not ready" otherwise.
func (c *Component) readinessHandler(w http.ResponseWriter, r *http.Request) {
    status, body := http.StatusOK, "ready"
    if !c.isReady() {
        status, body = http.StatusServiceUnavailable, "not ready"
    }

    w.WriteHeader(status)
    w.Write([]byte(body))
}

// isReady reports whether the settings are marked initialized and every
// connection is healthy. Connection health is only consulted once the
// settings check has passed.
func (c *Component) isReady() bool {
    if !c.settings.Initialized {
        return false
    }
    return c.connections.AllHealthy()
}

Kubernetes Configuration

Pod Annotations

yaml
# Pod annotations that describe where the pod's metrics can be scraped.
# NOTE(review): these are a convention, not built into Prometheus — they only
# take effect if the scrape configuration relabels on them; confirm for your
# cluster.
metadata:
  annotations:
    # Opt the pod in to scraping.
    prometheus.io/scrape: "true"
    # Port and HTTP path of the metrics endpoint.
    prometheus.io/port: "8080"
    prometheus.io/path: "/metrics"

ServiceMonitor

yaml
# ServiceMonitor that selects objects labelled app=my-module and scrapes
# their "metrics" port every 15 seconds.
# NOTE(review): `port` presumably names a port on the selected Service —
# confirm the Service defines one called "metrics".
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: my-module
spec:
  selector:
    matchLabels:
      app: my-module
  endpoints:
    - port: metrics
      interval: 15s

Dashboards

Grafana Dashboard

json
{
  "title": "Module Overview",
  "panels": [
    {
      "title": "Messages/sec",
      "type": "graph",
      "targets": [
        {
          "expr": "sum by (component, port) (rate(tinysystems_messages_processed_total[5m]))",
          "legendFormat": "{{component}} - {{port}}"
        }
      ]
    },
    {
      "title": "Processing Latency p99",
      "type": "graph",
      "targets": [
        {
          "expr": "histogram_quantile(0.99, sum by (le, component) (rate(tinysystems_processing_duration_seconds_bucket[5m])))",
          "legendFormat": "{{component}}"
        }
      ]
    }
  ]
}

Alerting

Prometheus Alerts

yaml
groups:
  - name: tinysystems
    rules:
      # Fires when more than 5% of a component's messages end in error for
      # five minutes. Both sides are summed per component before dividing:
      # without aggregation the division pairs only series with identical
      # labels, so the status="error" series would be divided by themselves
      # and the ratio would always be 1.
      - alert: HighErrorRate
        expr: |
          sum by (component) (rate(tinysystems_messages_processed_total{status="error"}[5m]))
          / sum by (component) (rate(tinysystems_messages_processed_total[5m])) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate in {{ $labels.component }}"

      # Fires when the p99 processing latency stays above 5 seconds for ten
      # minutes.
      - alert: HighLatency
        expr: |
          histogram_quantile(0.99, rate(tinysystems_processing_duration_seconds_bucket[5m])) > 5
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High latency in {{ $labels.component }}"

Best Practices

1. Use Labels Wisely

go
// Good: Low cardinality
messagesProcessed.WithLabelValues(componentName, port, status)

// Bad: High cardinality (unbounded values)
messagesProcessed.WithLabelValues(componentName, userID, requestID)

2. Log at Appropriate Levels

go
// Development
log.V(1).Info("detailed debug info", "data", fullPayload)

// Production
log.Info("message processed", "count", 1)
log.Error(err, "processing failed")

3. Include Trace IDs in Logs

go
logger.Info("processing",
    "traceID", trace.SpanFromContext(ctx).SpanContext().TraceID().String(),
)

4. Monitor Business Metrics

go
var (
    // ordersCreated counts orders created.
    ordersCreated = promauto.NewCounter(prometheus.CounterOpts{
        Name: "orders_created_total",
        Help: "Total number of orders created",
    })
    // orderValue records the distribution of order values in dollars,
    // using buckets at 10, 50, 100, 500 and 1000.
    orderValue = promauto.NewHistogram(prometheus.HistogramOpts{
        Name:    "order_value_dollars",
        Help:    "Distribution of order values in dollars",
        Buckets: []float64{10, 50, 100, 500, 1000},
    })
)

Next Steps

Build flow-based applications on Kubernetes