diff options
27 files changed, 1556 insertions, 1 deletions
diff --git a/f3s/loki/alloy-values.yaml b/f3s/loki/alloy-values.yaml index 09da220..f53fd12 100644 --- a/f3s/loki/alloy-values.yaml +++ b/f3s/loki/alloy-values.yaml @@ -1,4 +1,17 @@ alloy: + service: + ports: + otlp-grpc: + enabled: true + port: 4317 + targetPort: 4317 + protocol: TCP + otlp-http: + enabled: true + port: 4318 + targetPort: 4318 + protocol: TCP + configMap: content: | discovery.kubernetes "pods" { @@ -39,3 +52,57 @@ alloy: url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push" } } + + // ======================================== + // TRACES COLLECTION + // ======================================== + + // OTLP receiver for traces via gRPC and HTTP + // Accepts traces from applications instrumented with OpenTelemetry + otelcol.receiver.otlp "default" { + // Accept OTLP over gRPC on port 4317 (standard OTLP port) + grpc { + endpoint = "0.0.0.0:4317" + } + + // Accept OTLP over HTTP on port 4318 (standard OTLP HTTP port) + http { + endpoint = "0.0.0.0:4318" + } + + output { + traces = [otelcol.processor.batch.default.input] + } + } + + // Batch processor for efficient trace forwarding to Tempo + // Reduces network calls by batching spans before sending + otelcol.processor.batch "default" { + // Send batch every 5 seconds + timeout = "5s" + + // Or when 100 spans have accumulated + send_batch_size = 100 + + // Maximum batch size as safety limit + send_batch_max_size = 200 + + output { + traces = [otelcol.exporter.otlp.tempo.input] + } + } + + // OTLP exporter to send traces to Tempo + otelcol.exporter.otlp "tempo" { + client { + endpoint = "tempo.monitoring.svc.cluster.local:4317" + + // Tempo doesn't use TLS for internal cluster communication + tls { + insecure = true + } + + // Enable compression for efficiency + compression = "gzip" + } + } diff --git a/f3s/prometheus/persistence-values.yaml b/f3s/prometheus/persistence-values.yaml index b5e13e4..7e115a9 100644 --- a/f3s/prometheus/persistence-values.yaml +++ 
b/f3s/prometheus/persistence-values.yaml @@ -55,4 +55,32 @@ grafana: podSecurityContext: fsGroup: 911 runAsUser: 911 - runAsGroup: 911
\ No newline at end of file + runAsGroup: 911 + + additionalDataSources: + - name: Tempo + type: tempo + uid: tempo + url: http://tempo.monitoring.svc.cluster.local:3200 + access: proxy + isDefault: false + editable: true + jsonData: + httpMethod: GET + tracesToLogsV2: + datasourceUid: 'loki' + spanStartTimeShift: '-1h' + spanEndTimeShift: '1h' + filterByTraceID: false + filterBySpanID: false + tags: ['cluster', 'namespace', 'pod', 'app'] + tracesToMetrics: + datasourceUid: 'prometheus' + serviceMap: + datasourceUid: 'prometheus' + nodeGraph: + enabled: true + search: + hide: false + lokiSearch: + datasourceUid: 'loki'
\ No newline at end of file diff --git a/f3s/tempo/Justfile b/f3s/tempo/Justfile new file mode 100644 index 0000000..361f5c6 --- /dev/null +++ b/f3s/tempo/Justfile @@ -0,0 +1,33 @@ +# Grafana Tempo deployment automation +# Following the pattern from Loki Justfile + +install: + helm repo add grafana https://grafana.github.io/helm-charts || true + helm repo update + kubectl apply -f persistent-volumes.yaml + helm install tempo grafana/tempo --namespace monitoring -f values.yaml + kubectl apply -f datasource-configmap.yaml + +uninstall: + kubectl delete -f datasource-configmap.yaml || true + helm uninstall tempo --namespace monitoring || true + kubectl delete -f persistent-volumes.yaml || true + +upgrade: + helm upgrade tempo grafana/tempo --namespace monitoring -f values.yaml + kubectl apply -f datasource-configmap.yaml + +status: + kubectl get pods -n monitoring -l app.kubernetes.io/name=tempo + kubectl get svc -n monitoring -l app.kubernetes.io/name=tempo + kubectl get pvc -n monitoring tempo-data-pvc + +logs: + kubectl logs -n monitoring -l app.kubernetes.io/name=tempo --tail=100 -f + +check: + @echo "Checking Tempo readiness..." + kubectl exec -n monitoring $(kubectl get pod -n monitoring -l app.kubernetes.io/name=tempo -o jsonpath='{.items[0].metadata.name}') -- wget -qO- http://localhost:3200/ready + @echo "" + @echo "Checking OTLP ports..." + kubectl exec -n monitoring $(kubectl get pod -n monitoring -l app.kubernetes.io/name=tempo -o jsonpath='{.items[0].metadata.name}') -- netstat -ln | grep -E ':(4317|4318|3200)' diff --git a/f3s/tempo/README.md b/f3s/tempo/README.md new file mode 100644 index 0000000..ee59311 --- /dev/null +++ b/f3s/tempo/README.md @@ -0,0 +1,182 @@ +# Grafana Tempo - Distributed Tracing + +Grafana Tempo deployment for the f3s Kubernetes cluster in monolithic mode. 
+ +## Overview + +- **Deployment Mode**: Monolithic (all components in one process) +- **Storage Backend**: Filesystem (local storage on hostPath) +- **Storage Size**: 10Gi +- **Retention**: 7 days (168h) +- **Namespace**: `monitoring` + +## Components + +- **Tempo**: Distributed tracing backend +- **OTLP Receivers**: Accepts traces via gRPC (4317) and HTTP (4318) +- **Query Frontend**: Query interface on port 3200 +- **Grafana Datasource**: Auto-discovered via ConfigMap label + +## Architecture + +``` +Applications → Alloy (OTLP collector) → Tempo → Grafana +``` + +## Installation + +```bash +just install +``` + +This will: +1. Add Grafana Helm repo and update +2. Create PersistentVolume and PersistentVolumeClaim +3. Install Tempo via Helm +4. Create Grafana datasource ConfigMap + +## Configuration + +### values.yaml + +- Monolithic mode configuration +- OTLP receivers on ports 4317 (gRPC) and 4318 (HTTP) +- Local filesystem storage at `/var/tempo/traces` +- Resource limits: 2Gi memory, 1 CPU + +### persistent-volumes.yaml + +- PV: `tempo-data-pv` at `/data/nfs/k3svolumes/tempo/data` +- PVC: `tempo-data-pvc` (10Gi, ReadWriteOnce) + +### datasource-configmap.yaml + +- Auto-discovered by Grafana sidecar +- Enables traces-to-logs correlation with Loki +- Enables traces-to-metrics correlation with Prometheus +- Enables service graph visualization + +## Grafana Integration + +The datasource is automatically discovered by Grafana through the ConfigMap with label `grafana_datasource: "1"`. + +To access traces in Grafana: +1. Navigate to Explore +2. Select "Tempo" datasource +3. 
Use Search or TraceQL queries + +### Example TraceQL Queries + +``` +# Find all traces from demo app +{ resource.service.namespace = "tracing-demo" } + +# Find slow requests (>200ms) +{ duration > 200ms } + +# Find errors +{ status = error } + +# Find traces from specific service +{ resource.service.name = "frontend" } +``` + +## Verification + +Check that Tempo is running: +```bash +just status +``` + +Check Tempo readiness and OTLP ports: +```bash +just check +``` + +View logs: +```bash +just logs +``` + +## Sending Traces + +Applications should send traces to Alloy's OTLP receivers: +- gRPC: `alloy.monitoring.svc.cluster.local:4317` +- HTTP: `alloy.monitoring.svc.cluster.local:4318` + +Alloy forwards traces to Tempo at `tempo.monitoring.svc.cluster.local:4317`. + +## Maintenance + +### Upgrade + +```bash +just upgrade +``` + +### Uninstall + +```bash +just uninstall +``` + +### Check Storage Usage + +```bash +kubectl exec -n monitoring $(kubectl get pod -n monitoring -l app.kubernetes.io/name=tempo -o jsonpath='{.items[0].metadata.name}') -- df -h /var/tempo +``` + +## Troubleshooting + +### Tempo pod not starting + +Check events: +```bash +kubectl describe pod -n monitoring -l app.kubernetes.io/name=tempo +``` + +Check PVC binding: +```bash +kubectl get pvc -n monitoring tempo-data-pvc +``` + +### No traces appearing + +1. Verify Alloy is forwarding traces: +```bash +kubectl logs -n monitoring -l app.kubernetes.io/name=alloy | grep -i tempo +``` + +2. Check Tempo logs: +```bash +just logs +``` + +3. Verify OTLP receivers are listening: +```bash +just check +``` + +### Grafana datasource not appearing + +1. Check ConfigMap exists: +```bash +kubectl get cm -n monitoring tempo-grafana-datasource --show-labels +``` + +2. Check Grafana sidecar logs: +```bash +kubectl logs -n monitoring $(kubectl get pod -n monitoring -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}') -c grafana-sc-datasources +``` + +3. 
Restart Grafana pod if needed: +```bash +kubectl delete pod -n monitoring -l app.kubernetes.io/name=grafana +``` + +## References + +- [Grafana Tempo Documentation](https://grafana.com/docs/tempo/latest/) +- [Tempo Helm Chart](https://github.com/grafana/helm-charts/tree/main/charts/tempo) +- [OpenTelemetry Protocol (OTLP)](https://opentelemetry.io/docs/specs/otlp/) +- [TraceQL Query Language](https://grafana.com/docs/tempo/latest/traceql/) diff --git a/f3s/tempo/datasource-configmap.yaml b/f3s/tempo/datasource-configmap.yaml new file mode 100644 index 0000000..00fb972 --- /dev/null +++ b/f3s/tempo/datasource-configmap.yaml @@ -0,0 +1,47 @@ +# Grafana Datasource ConfigMap for Tempo +# Auto-discovered by Grafana sidecar via label grafana_datasource: "1" +# Enables traces-to-logs and traces-to-metrics correlation + +apiVersion: v1 +kind: ConfigMap +metadata: + name: tempo-grafana-datasource + namespace: monitoring + labels: + grafana_datasource: "1" # Must be string "1" for auto-discovery +data: + tempo-datasource.yaml: |- + apiVersion: 1 + datasources: + - name: "Tempo" + type: tempo + uid: tempo + url: http://tempo.monitoring.svc.cluster.local:3200 + access: proxy + isDefault: false + editable: true + jsonData: + httpMethod: GET + # Enable traces-to-logs correlation with Loki + tracesToLogsV2: + datasourceUid: 'loki' + spanStartTimeShift: '-1h' + spanEndTimeShift: '1h' + filterByTraceID: false + filterBySpanID: false + tags: ['cluster', 'namespace', 'pod', 'app'] + # Enable traces-to-metrics correlation with Prometheus + tracesToMetrics: + datasourceUid: 'prometheus' + # Enable service graph visualization + serviceMap: + datasourceUid: 'prometheus' + # Enable node graph for visualization + nodeGraph: + enabled: true + # Enable search + search: + hide: false + # Enable Loki search integration + lokiSearch: + datasourceUid: 'loki' diff --git a/f3s/tempo/persistent-volumes.yaml b/f3s/tempo/persistent-volumes.yaml new file mode 100644 index 0000000..fc4c378 --- 
/dev/null +++ b/f3s/tempo/persistent-volumes.yaml @@ -0,0 +1,31 @@ +# Persistent Volume and Claim for Grafana Tempo trace storage +# Following the pattern from Loki deployment +# Storage: 10Gi at /data/nfs/k3svolumes/tempo/data + +apiVersion: v1 +kind: PersistentVolume +metadata: + name: tempo-data-pv +spec: + capacity: + storage: 10Gi + volumeMode: Filesystem + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + hostPath: + path: /data/nfs/k3svolumes/tempo/data + type: DirectoryOrCreate +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: tempo-data-pvc + namespace: monitoring +spec: + storageClassName: "" # Empty for manual binding to PV + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi diff --git a/f3s/tempo/values.yaml b/f3s/tempo/values.yaml new file mode 100644 index 0000000..d118b63 --- /dev/null +++ b/f3s/tempo/values.yaml @@ -0,0 +1,76 @@ +# Grafana Tempo - Monolithic Mode Configuration +# Following the pattern from Loki SingleBinary deployment + +tempo: + # Retention policy for traces (7 days) + retention: 168h + + # Storage configuration - Local filesystem backend + # This is required for monolithic mode + storage: + trace: + backend: local + local: + path: /var/tempo/traces + wal: + path: /var/tempo/wal + + # Distributor configuration with OTLP receivers + # Bind to 0.0.0.0 to avoid Tempo 2.7+ localhost-only binding issue + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + # Query frontend configuration + # Enabled by default in monolithic mode + +# Persistence configuration using hostPath PV +# Matches the pattern from Loki deployment +persistence: + enabled: true + size: 10Gi + storageClassName: "" # Empty string for manual PV binding + accessModes: + - ReadWriteOnce + +# Service configuration +# Expose OTLP ports and query endpoint +service: + type: ClusterIP + +# Resource limits to prevent runaway resource usage +# Adjusted for monolithic 
deployment +resources: + limits: + cpu: 1000m + memory: 2Gi + requests: + cpu: 500m + memory: 1Gi + +# Security context following best practices +# Using non-root user +securityContext: + fsGroup: 10001 + runAsUser: 10001 + runAsGroup: 10001 + runAsNonRoot: true + +# Disable components not needed in monolithic mode +gateway: + enabled: false + +# Monitoring integration with Prometheus +# Enables ServiceMonitor for automatic scraping +serviceMonitor: + enabled: true + labels: + release: prometheus + +# Test pod disabled to reduce overhead +test: + enabled: false diff --git a/f3s/tracing-demo/Justfile b/f3s/tracing-demo/Justfile new file mode 100644 index 0000000..d1bc474 --- /dev/null +++ b/f3s/tracing-demo/Justfile @@ -0,0 +1,93 @@ +# Tracing Demo Application deployment automation +# Three-tier Python application demonstrating distributed tracing + +NAMESPACE := "services" +RELEASE_NAME := "tracing-demo" +CHART_PATH := "./helm-chart" + +# Build all Docker images (use docker-image-Justfile for build/push to registry) +build: + just -f docker-image-Justfile build + +# Push images to private registry +push: + just -f docker-image-Justfile push + +# Build and push images +build-push: build push + +# Install Helm chart +install: + helm install {{RELEASE_NAME}} {{CHART_PATH}} --namespace {{NAMESPACE}} --create-namespace + +# Upgrade Helm chart +upgrade: + helm upgrade {{RELEASE_NAME}} {{CHART_PATH}} --namespace {{NAMESPACE}} + +# Delete Helm release +delete: + helm uninstall {{RELEASE_NAME}} --namespace {{NAMESPACE}} + +# Rebuild images, push them to the private registry, and upgrade deployment +rebuild: build push upgrade + +# Check deployment status +status: + kubectl get pods -n {{NAMESPACE}} | grep tracing-demo + kubectl get svc -n {{NAMESPACE}} | grep -E '(frontend|middleware|backend)-service' + kubectl get ingress -n {{NAMESPACE}} tracing-demo-ingress + +# View logs from all services +logs: + @echo "=== Frontend logs ===" + kubectl logs -n {{NAMESPACE}} -l app=tracing-demo-frontend
--tail=20 + @echo "" + @echo "=== Middleware logs ===" + kubectl logs -n {{NAMESPACE}} -l app=tracing-demo-middleware --tail=20 + @echo "" + @echo "=== Backend logs ===" + kubectl logs -n {{NAMESPACE}} -l app=tracing-demo-backend --tail=20 + +# Follow logs from frontend +logs-frontend: + kubectl logs -n {{NAMESPACE}} -l app=tracing-demo-frontend -f + +# Follow logs from middleware +logs-middleware: + kubectl logs -n {{NAMESPACE}} -l app=tracing-demo-middleware -f + +# Follow logs from backend +logs-backend: + kubectl logs -n {{NAMESPACE}} -l app=tracing-demo-backend -f + +# Test the application +test: + @echo "Testing frontend health endpoint..." + curl http://tracing-demo.f3s.buetow.org/ + @echo "" + @echo "Testing API process endpoint..." + curl http://tracing-demo.f3s.buetow.org/api/process + +# Load test - generate multiple traces (POSIX sh loop; just does not use $$ escaping) +load-test: + @echo "Generating 50 requests with 0.5s delay..." + @for i in $(seq 1 50); do \ + curl -s http://tracing-demo.f3s.buetow.org/api/process >/dev/null && echo "Request $i complete"; \ + sleep 0.5; \ + done + @echo "Load test complete!" + +# Port forward to services for local testing +port-forward-frontend: + kubectl port-forward -n {{NAMESPACE}} svc/frontend-service 5000:5000 + +port-forward-middleware: + kubectl port-forward -n {{NAMESPACE}} svc/middleware-service 5001:5001 + +port-forward-backend: + kubectl port-forward -n {{NAMESPACE}} svc/backend-service 5002:5002 + +# Check if traces are being generated +check-traces: + @echo "Check Grafana Tempo for traces with:" + @echo " { resource.service.namespace = \"tracing-demo\" }" diff --git a/f3s/tracing-demo/README.md b/f3s/tracing-demo/README.md new file mode 100644 index 0000000..5934c00 --- /dev/null +++ b/f3s/tracing-demo/README.md @@ -0,0 +1,250 @@ +# Tracing Demo Application + +Three-tier Python Flask application demonstrating distributed tracing with OpenTelemetry and Grafana Tempo.
+ +## Overview + +This demo application shows how distributed tracing works across multiple microservices: + +- **Frontend**: Receives HTTP requests, forwards to middleware +- **Middleware**: Transforms data, calls backend +- **Backend**: Returns data (simulates database queries) + +Each service is instrumented with OpenTelemetry and sends traces to Grafana Tempo via Alloy. + +## Architecture + +``` +User → Frontend (Flask:5000) → Middleware (Flask:5001) → Backend (Flask:5002) + ↓ ↓ ↓ + Alloy (OTLP:4317) → Tempo → Grafana +``` + +## Components + +### Frontend Service +- Port: 5000 +- Endpoints: + - `GET /` - Service info and health + - `GET /health` - Kubernetes health probe + - `GET|POST /api/process` - Main processing endpoint +- Calls: Middleware service + +### Middleware Service +- Port: 5001 +- Endpoints: + - `GET /` - Service info and health + - `GET /health` - Kubernetes health probe + - `POST /api/transform` - Data transformation endpoint +- Calls: Backend service + +### Backend Service +- Port: 5002 +- Endpoints: + - `GET /` - Service info and health + - `GET /health` - Kubernetes health probe + - `GET /api/data` - Data retrieval endpoint (simulates DB query) +- Calls: None (leaf service) + +## OpenTelemetry Instrumentation + +All services use: +- **Auto-instrumentation**: Flask and Requests libraries automatically create spans +- **Manual spans**: Custom spans for business logic with attributes +- **OTLP export**: Traces sent to Alloy via gRPC on port 4317 +- **Resource attributes**: Service name, namespace, version identify each service + +## Build and Deploy + +### Prerequisites + +1. Tempo must be deployed and running in `monitoring` namespace +2. Alloy must be configured with OTLP receivers +3. Docker installed for building images +4. 
Access to k3s cluster (SSH to r0) + +### Quick Start + +```bash +# Build Docker images +just build + +# Import images to k3s +just import + +# Deploy with Helm +just install + +# Check status +just status +``` + +### Rebuild and Update + +```bash +# Rebuild images, import, and upgrade deployment +just rebuild +``` + +## Testing + +### Basic Test + +```bash +# Test health endpoint +curl http://tracing-demo.f3s.buetow.org/ + +# Test API endpoint (generates a trace) +curl http://tracing-demo.f3s.buetow.org/api/process +``` + +### Load Test + +Generate 50 requests to create multiple traces: + +```bash +just load-test +``` + +### View Logs + +```bash +# View logs from all services +just logs + +# Follow frontend logs +just logs-frontend + +# Follow middleware logs +just logs-middleware + +# Follow backend logs +just logs-backend +``` + +## Viewing Traces in Grafana + +1. Navigate to Grafana: https://grafana.f3s.buetow.org +2. Go to Explore → Select "Tempo" datasource +3. Use TraceQL queries: + +``` +# All traces from demo app +{ resource.service.namespace = "tracing-demo" } + +# Slow requests (>200ms) +{ duration > 200ms } + +# Traces from specific service +{ resource.service.name = "frontend" } + +# Errors +{ status = error } +``` + +4. 
View Service Graph to see connections between services + +## Trace Features Demonstrated + +### Distributed Context Propagation +Traces automatically span all three services, showing: +- Frontend span (root) +- Middleware span (child of frontend) +- Backend span (child of middleware) + +### Custom Attributes +Each service adds custom attributes: +- `service.name` - Service identifier +- `service.namespace` - Application namespace +- Custom business logic attributes + +### Trace Correlation +- **Traces-to-Logs**: Click on a span to see related logs in Loki +- **Traces-to-Metrics**: View Prometheus metrics for services in the trace +- **Service Graph**: Visualize service dependencies + +## Development + +### Local Testing with Port Forwarding + +```bash +# Forward frontend +just port-forward-frontend +curl http://localhost:5000/ + +# Forward middleware +just port-forward-middleware +curl http://localhost:5001/ + +# Forward backend +just port-forward-backend +curl http://localhost:5002/ +``` + +### Modifying the Application + +1. Edit Python code in `docker/*/app.py` +2. Rebuild: `just build` +3. Import: `just import` +4. Upgrade: `just upgrade` + +Or use the combined command: `just rebuild` + +## Troubleshooting + +### No traces appearing in Grafana + +1. Check pods are running: +```bash +kubectl get pods -n services | grep tracing-demo +``` + +2. Check Alloy is receiving traces: +```bash +kubectl logs -n monitoring -l app.kubernetes.io/name=alloy | grep -i otlp +``` + +3. Check Tempo is storing traces: +```bash +kubectl logs -n monitoring -l app.kubernetes.io/name=tempo | grep -i trace +``` + +4. 
Verify OTLP endpoint is accessible: +```bash +kubectl exec -n services $(kubectl get pod -n services -l app=tracing-demo-frontend -o jsonpath='{.items[0].metadata.name}') -- wget -qO- http://alloy.monitoring.svc.cluster.local:4317 +``` + +### Pods not starting + +Check events and logs: +```bash +kubectl describe pod -n services -l app=tracing-demo-frontend +kubectl logs -n services -l app=tracing-demo-frontend +``` + +### Images not found + +Verify images are imported to k3s: +```bash +ssh r0 'k3s crictl images | grep tracing-demo' +``` + +If missing, run: +```bash +just import +``` + +## Cleanup + +Remove the demo application: + +```bash +just delete +``` + +## References + +- [OpenTelemetry Python Documentation](https://opentelemetry.io/docs/languages/python/) +- [Flask Instrumentation](https://opentelemetry-python-contrib.readthedocs.io/en/latest/instrumentation/flask/flask.html) +- [Grafana Tempo Documentation](https://grafana.com/docs/tempo/latest/) +- [TraceQL Query Language](https://grafana.com/docs/tempo/latest/traceql/) diff --git a/f3s/tracing-demo/docker-image-Justfile b/f3s/tracing-demo/docker-image-Justfile new file mode 100644 index 0000000..7b263b1 --- /dev/null +++ b/f3s/tracing-demo/docker-image-Justfile @@ -0,0 +1,38 @@ +# Docker image build and push automation for tracing demo +# Similar to anki-sync-server pattern + +REGISTRY := "r0.lan.buetow.org:30001" + +# Build all images +build: + docker build -t tracing-demo-frontend:latest ./docker/frontend + docker build -t tracing-demo-middleware:latest ./docker/middleware + docker build -t tracing-demo-backend:latest ./docker/backend + +# Tag and push all images to private registry +push: + docker tag tracing-demo-frontend:latest {{REGISTRY}}/tracing-demo-frontend:latest + docker tag tracing-demo-middleware:latest {{REGISTRY}}/tracing-demo-middleware:latest + docker tag tracing-demo-backend:latest {{REGISTRY}}/tracing-demo-backend:latest + docker push {{REGISTRY}}/tracing-demo-frontend:latest + docker 
push {{REGISTRY}}/tracing-demo-middleware:latest + docker push {{REGISTRY}}/tracing-demo-backend:latest + +# Build and push in one command +all: build push + +# Build and push specific service +frontend: + docker build -t tracing-demo-frontend:latest ./docker/frontend + docker tag tracing-demo-frontend:latest {{REGISTRY}}/tracing-demo-frontend:latest + docker push {{REGISTRY}}/tracing-demo-frontend:latest + +middleware: + docker build -t tracing-demo-middleware:latest ./docker/middleware + docker tag tracing-demo-middleware:latest {{REGISTRY}}/tracing-demo-middleware:latest + docker push {{REGISTRY}}/tracing-demo-middleware:latest + +backend: + docker build -t tracing-demo-backend:latest ./docker/backend + docker tag tracing-demo-backend:latest {{REGISTRY}}/tracing-demo-backend:latest + docker push {{REGISTRY}}/tracing-demo-backend:latest diff --git a/f3s/tracing-demo/docker/backend/Dockerfile b/f3s/tracing-demo/docker/backend/Dockerfile new file mode 100644 index 0000000..5018e8f --- /dev/null +++ b/f3s/tracing-demo/docker/backend/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Copy and install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY app.py . + +# Expose port for Flask application +EXPOSE 5002 + +# Run the application +CMD ["python", "app.py"] diff --git a/f3s/tracing-demo/docker/backend/app.py b/f3s/tracing-demo/docker/backend/app.py new file mode 100644 index 0000000..2c9e88a --- /dev/null +++ b/f3s/tracing-demo/docker/backend/app.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +""" +Tracing Demo - Backend Service +Final service in the chain that returns data. +Simulates database queries and demonstrates end-to-end tracing. 
+""" +from flask import Flask, jsonify +import os +import logging +import time +from datetime import datetime + +# OpenTelemetry imports for distributed tracing +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.flask import FlaskInstrumentor +from opentelemetry.sdk.resources import Resource + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Initialize OpenTelemetry tracing with resource attributes +# These attributes identify this service in traces +resource = Resource(attributes={ + "service.name": "backend", + "service.namespace": "tracing-demo", + "service.version": "1.0.0", + "deployment.environment": "production" +}) + +provider = TracerProvider(resource=resource) + +# Configure OTLP exporter to send traces to Alloy +otlp_exporter = OTLPSpanExporter( + endpoint=os.getenv('OTEL_EXPORTER_OTLP_ENDPOINT', + 'http://alloy.monitoring.svc.cluster.local:4317'), + insecure=True +) + +# Batch spans for efficient export +processor = BatchSpanProcessor(otlp_exporter) +provider.add_span_processor(processor) +trace.set_tracer_provider(provider) + +# Get tracer for manual instrumentation +tracer = trace.get_tracer(__name__) + +# Create Flask application +app = Flask(__name__) + +# Auto-instrument Flask +FlaskInstrumentor().instrument_app(app) + +@app.route('/') +def index(): + """ + Health check and service information endpoint. + Returns service metadata. + """ + return jsonify({ + "service": "backend", + "version": "1.0.0", + "message": "Tracing demo backend service" + }) + +@app.route('/health') +def health(): + """ + Kubernetes health check endpoint. + Used by readiness and liveness probes. 
+ """ + return jsonify({"status": "healthy"}), 200 + +@app.route('/api/data', methods=['GET']) +def get_data(): + """ + Return data endpoint that simulates a database query. + Creates custom spans to track query execution. + This is the final service in the trace chain. + """ + # Create a custom span for the database query simulation + with tracer.start_as_current_span("backend-get-data") as span: + # Add custom attributes to the span + span.set_attribute("backend.handler", "get_data") + + # Simulate database query delay + query_time = 0.1 + time.sleep(query_time) + + # Record query duration in span + span.set_attribute("backend.query.duration_ms", query_time * 1000) + span.set_attribute("backend.query.type", "simulated_database_query") + + # Prepare response data + data = { + "service": "backend", + "data": { + "id": 12345, + "value": "Sample data from backend service", + "timestamp": datetime.utcnow().isoformat(), + "query_time_ms": query_time * 1000 + } + } + + logger.info(f"Returning data: {data['data']['id']}") + + return jsonify(data), 200 + +if __name__ == '__main__': + logger.info("Starting backend service on port 5002") + logger.info(f"OTLP endpoint: {os.getenv('OTEL_EXPORTER_OTLP_ENDPOINT', 'default')}") + app.run(host='0.0.0.0', port=5002, debug=False) diff --git a/f3s/tracing-demo/docker/backend/requirements.txt b/f3s/tracing-demo/docker/backend/requirements.txt new file mode 100644 index 0000000..6022d6c --- /dev/null +++ b/f3s/tracing-demo/docker/backend/requirements.txt @@ -0,0 +1,4 @@ +flask==3.0.0 +opentelemetry-distro==0.49b0 +opentelemetry-exporter-otlp==1.28.0 +opentelemetry-instrumentation-flask==0.49b0 diff --git a/f3s/tracing-demo/docker/frontend/Dockerfile b/f3s/tracing-demo/docker/frontend/Dockerfile new file mode 100644 index 0000000..dd28e97 --- /dev/null +++ b/f3s/tracing-demo/docker/frontend/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Copy and install dependencies +COPY requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY app.py . + +# Expose port for Flask application +EXPOSE 5000 + +# Run the application +CMD ["python", "app.py"] diff --git a/f3s/tracing-demo/docker/frontend/app.py b/f3s/tracing-demo/docker/frontend/app.py new file mode 100644 index 0000000..65ab3f3 --- /dev/null +++ b/f3s/tracing-demo/docker/frontend/app.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +""" +Tracing Demo - Frontend Service +Receives user requests and forwards to middleware service. +Demonstrates OpenTelemetry auto-instrumentation with Flask. +""" +from flask import Flask, jsonify, request +import requests +import os +import logging + +# OpenTelemetry imports for distributed tracing +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.flask import FlaskInstrumentor +from opentelemetry.instrumentation.requests import RequestsInstrumentor +from opentelemetry.sdk.resources import Resource + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Initialize OpenTelemetry tracing with resource attributes +# These attributes identify this service in traces +resource = Resource(attributes={ + "service.name": "frontend", + "service.namespace": "tracing-demo", + "service.version": "1.0.0", + "deployment.environment": "production" +}) + +provider = TracerProvider(resource=resource) + +# Configure OTLP exporter to send traces to Alloy +otlp_exporter = OTLPSpanExporter( + endpoint=os.getenv('OTEL_EXPORTER_OTLP_ENDPOINT', + 'http://alloy.monitoring.svc.cluster.local:4317'), + insecure=True +) + +# Batch spans for efficient export +processor = BatchSpanProcessor(otlp_exporter) +provider.add_span_processor(processor) +trace.set_tracer_provider(provider) + +# Get tracer for 
manual instrumentation if needed +tracer = trace.get_tracer(__name__) + +# Create Flask application +app = Flask(__name__) + +# Auto-instrument Flask to create spans for HTTP requests +FlaskInstrumentor().instrument_app(app) + +# Auto-instrument requests library to propagate trace context +RequestsInstrumentor().instrument() + +# Configuration for downstream services +MIDDLEWARE_URL = os.getenv('MIDDLEWARE_URL', + 'http://middleware-service.services.svc.cluster.local:5001') + +@app.route('/') +def index(): + """ + Health check and service information endpoint. + Returns service metadata. + """ + return jsonify({ + "service": "frontend", + "version": "1.0.0", + "message": "Tracing demo frontend service", + "trace_enabled": True, + "middleware_url": MIDDLEWARE_URL + }) + +@app.route('/health') +def health(): + """ + Kubernetes health check endpoint. + Used by readiness and liveness probes. + """ + return jsonify({"status": "healthy"}), 200 + +@app.route('/api/process', methods=['GET', 'POST']) +def process(): + """ + Main processing endpoint that demonstrates distributed tracing. + Forwards request to middleware service and returns combined response. + Creates a custom span to track the processing logic.
+ """ + # Create a custom span for the processing logic + with tracer.start_as_current_span("frontend-process") as span: + # Add custom attributes to the span for better observability + span.set_attribute("frontend.handler", "process") + + # Get request data (GET and POST; a POST without a valid JSON body falls back to {}) + if request.method == 'POST': + data = request.get_json(silent=True) or {} + else: + data = {"source": "GET request"} + + span.set_attribute("frontend.request.method", request.method) + + try: + # Call middleware service + # The requests library auto-instrumentation will create a span + # and propagate the trace context via W3C Trace Context headers + logger.info(f"Calling middleware at {MIDDLEWARE_URL}/api/transform") + + response = requests.post( + f'{MIDDLEWARE_URL}/api/transform', + json=data, + timeout=10 + ) + + response.raise_for_status() + middleware_data = response.json() + + # Record successful call in span + span.set_attribute("frontend.middleware.status", response.status_code) + + return jsonify({ + "service": "frontend", + "status": "success", + "request_data": data, + "middleware_response": middleware_data + }), 200 + + except requests.exceptions.RequestException as e: + # Log error and record in span + logger.error(f"Error calling middleware: {e}") + span.set_attribute("frontend.error", str(e)) + + # Set span status to error + span.set_status(trace.Status(trace.StatusCode.ERROR, str(e))) + + return jsonify({ + "service": "frontend", + "status": "error", + "error": str(e) + }), 500 + +if __name__ == '__main__': + logger.info("Starting frontend service on port 5000") + logger.info(f"Middleware URL: {MIDDLEWARE_URL}") + logger.info(f"OTLP endpoint: {os.getenv('OTEL_EXPORTER_OTLP_ENDPOINT', 'default')}") + app.run(host='0.0.0.0', port=5000, debug=False) diff --git a/f3s/tracing-demo/docker/frontend/requirements.txt b/f3s/tracing-demo/docker/frontend/requirements.txt new file mode 100644 index 0000000..cb10687 --- /dev/null +++ b/f3s/tracing-demo/docker/frontend/requirements.txt @@
-0,0 +1,6 @@
+flask==3.0.0
+requests==2.31.0
+opentelemetry-distro==0.49b0
+opentelemetry-exporter-otlp==1.28.0
+opentelemetry-instrumentation-flask==0.49b0
+opentelemetry-instrumentation-requests==0.49b0
diff --git a/f3s/tracing-demo/docker/middleware/Dockerfile b/f3s/tracing-demo/docker/middleware/Dockerfile
new file mode 100644
index 0000000..60272f7
--- /dev/null
+++ b/f3s/tracing-demo/docker/middleware/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Copy the pinned dependency list on its own and install it (pip's download cache is disabled)
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the application entry point into /app
+COPY app.py .
+
+# Declare the port app.py binds to (app.run(..., port=5001))
+EXPOSE 5001
+
+# Start the service by running app.py directly with the Python interpreter
+CMD ["python", "app.py"]
diff --git a/f3s/tracing-demo/docker/middleware/app.py b/f3s/tracing-demo/docker/middleware/app.py
new file mode 100644
index 0000000..9c0ad30
--- /dev/null
+++ b/f3s/tracing-demo/docker/middleware/app.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+"""
+Tracing Demo - Middleware Service
+Transforms data and calls backend service.
+Demonstrates trace context propagation in a multi-tier architecture.
+"""
+from flask import Flask, jsonify, request
+import requests
+import os
+import logging
+import time
+
+# OpenTelemetry imports for distributed tracing
+from opentelemetry import trace
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+from opentelemetry.instrumentation.flask import FlaskInstrumentor
+from opentelemetry.instrumentation.requests import RequestsInstrumentor
+from opentelemetry.sdk.resources import Resource
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Initialize OpenTelemetry tracing with resource attributes
+# These attributes identify this service in traces
+resource = Resource(attributes={
+    "service.name": "middleware",
+    "service.namespace": "tracing-demo",
+    "service.version": "1.0.0",
+    "deployment.environment": "production"
+})
+
+provider = TracerProvider(resource=resource)
+
+# Configure OTLP exporter to send traces to Alloy; the endpoint is resolved
+# once so the startup log reports the value the exporter actually uses
+OTLP_ENDPOINT = os.getenv('OTEL_EXPORTER_OTLP_ENDPOINT',
+                          'http://alloy.monitoring.svc.cluster.local:4317')
+otlp_exporter = OTLPSpanExporter(endpoint=OTLP_ENDPOINT, insecure=True)
+
+# Batch spans for efficient export
+processor = BatchSpanProcessor(otlp_exporter)
+provider.add_span_processor(processor)
+trace.set_tracer_provider(provider)
+
+# Get tracer for manual instrumentation
+tracer = trace.get_tracer(__name__)
+
+# Create Flask application
+app = Flask(__name__)
+
+# Auto-instrument Flask and requests library
+FlaskInstrumentor().instrument_app(app)
+RequestsInstrumentor().instrument()
+
+# Configuration for downstream services
+BACKEND_URL = os.getenv('BACKEND_URL',
+                        'http://backend-service.services.svc.cluster.local:5002')
+
+@app.route('/')
+def index():
+    """
+    Service information endpoint.
+    Returns service name, version, a message and the configured backend URL.
+    """
+    return jsonify({
+        "service": "middleware",
+        "version": "1.0.0",
+        "message": "Tracing demo middleware service",
+        "backend_url": BACKEND_URL
+    })
+
+@app.route('/health')
+def health():
+    """
+    Kubernetes health check endpoint.
+    Used by readiness and liveness probes; always answers 200.
+    """
+    return jsonify({"status": "healthy"}), 200
+
+@app.route('/api/transform', methods=['POST'])
+def transform():
+    """
+    Combine the posted JSON with data fetched from the backend's /api/data.
+    Returns 200 with the merged payload, or 500 with an error body when the
+    backend call fails. Runs inside a custom "middleware-transform" span.
+    """
+    # Create a custom span for the transformation logic
+    with tracer.start_as_current_span("middleware-transform") as span:
+        # Add custom attributes to the span
+        span.set_attribute("middleware.handler", "transform")
+
+        # silent=True: a missing or non-JSON body yields None (-> {}) instead of a 4xx abort
+        data = request.get_json(silent=True) or {}
+        input_keys = list(data) if isinstance(data, dict) else []
+        span.set_attribute("middleware.input.keys", str(input_keys))
+
+        # Simulate some data transformation processing
+        time.sleep(0.05)
+
+        try:
+            # Call backend service to fetch additional data
+            # The trace context is automatically propagated via HTTP headers
+            logger.info(f"Calling backend at {BACKEND_URL}/api/data")
+
+            response = requests.get(
+                f'{BACKEND_URL}/api/data',
+                timeout=10
+            )
+
+            response.raise_for_status()
+            backend_data = response.json()
+
+            # Record successful call in span
+            span.set_attribute("middleware.backend.status", response.status_code)
+
+            # Transform and combine the data
+            transformed = {
+                "middleware_processed": True,
+                "original_data": data,
+                "backend_data": backend_data,
+                "transformation_time_ms": 50
+            }
+
+            return jsonify(transformed), 200
+
+        except requests.exceptions.RequestException as e:
+            # Log error and record in span
+            logger.error(f"Error calling backend: {e}")
+            span.set_attribute("middleware.error", str(e))
+
+            # Set span status to error
+            span.set_status(trace.Status(trace.StatusCode.ERROR, str(e)))
+
+            return jsonify({
+                "service": "middleware",
+                "status": "error",
+                "error": str(e)
+            }), 500
+
+if __name__ == '__main__':
+    logger.info("Starting middleware service on port 5001")
+    logger.info(f"Backend URL: {BACKEND_URL}")
+    logger.info(f"OTLP endpoint: {OTLP_ENDPOINT}")
+    app.run(host='0.0.0.0', port=5001, debug=False)
diff --git a/f3s/tracing-demo/docker/middleware/requirements.txt b/f3s/tracing-demo/docker/middleware/requirements.txt
new file mode 100644
index 0000000..cb10687
--- /dev/null
+++ b/f3s/tracing-demo/docker/middleware/requirements.txt
@@ -0,0 +1,6 @@
+flask==3.0.0
+requests==2.31.0
+opentelemetry-distro==0.49b0
+opentelemetry-exporter-otlp==1.28.0
+opentelemetry-instrumentation-flask==0.49b0
+opentelemetry-instrumentation-requests==0.49b0
diff --git a/f3s/tracing-demo/helm-chart/Chart.yaml b/f3s/tracing-demo/helm-chart/Chart.yaml
new file mode 100644
index 0000000..c884ea0
--- /dev/null
+++ b/f3s/tracing-demo/helm-chart/Chart.yaml
@@ -0,0 +1,5 @@
+apiVersion: v2
+name: tracing-demo
+description: A Helm chart for deploying distributed tracing demo application (Frontend, Middleware, Backend)
+version: 0.1.0
+appVersion: "1.0.0"
diff --git a/f3s/tracing-demo/helm-chart/templates/backend-deployment.yaml b/f3s/tracing-demo/helm-chart/templates/backend-deployment.yaml
new file mode 100644
index 0000000..0a1f831
--- /dev/null
+++ b/f3s/tracing-demo/helm-chart/templates/backend-deployment.yaml
@@ -0,0 +1,51 @@
+# Backend Service Deployment
+# Returns data (simulates database queries)
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: tracing-demo-backend
+  namespace: services
+  labels:
+    app: tracing-demo-backend
+    component: backend
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: tracing-demo-backend
+  template:
+    metadata:
+      labels:
+        app: tracing-demo-backend
+        component: backend
+    spec:
+      containers:
+        - name: backend
+          image: registry.lan.buetow.org:30001/tracing-demo-backend:latest
+          imagePullPolicy: Always  # re-pull the mutable :latest tag
+          ports:
+            - containerPort: 5002
+              name: http
+              protocol: TCP
+          env:
+            - name: OTEL_EXPORTER_OTLP_ENDPOINT
+              value: "http://alloy.monitoring.svc.cluster.local:4317"
+          resources:
+            limits:
+              cpu: 200m
+              memory: 256Mi
+            requests:
+              cpu: 100m
+              memory: 128Mi
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 5002
+            initialDelaySeconds: 10
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 5002
+            initialDelaySeconds: 5
+            periodSeconds: 5
diff --git a/f3s/tracing-demo/helm-chart/templates/backend-service.yaml b/f3s/tracing-demo/helm-chart/templates/backend-service.yaml
new file mode 100644
index 0000000..a7f6e61
--- /dev/null
+++ b/f3s/tracing-demo/helm-chart/templates/backend-service.yaml
@@ -0,0 +1,17 @@
+# Backend Service
+# Exposes the backend deployment within the cluster
+apiVersion: v1
+kind: Service
+metadata:
+  name: backend-service
+  namespace: services
+  labels:
+    app: tracing-demo-backend
+spec:
+  ports:
+    - name: http
+      port: 5002
+      protocol: TCP
+      targetPort: 5002
+  selector:
+    app: tracing-demo-backend
diff --git a/f3s/tracing-demo/helm-chart/templates/frontend-deployment.yaml b/f3s/tracing-demo/helm-chart/templates/frontend-deployment.yaml
new file mode 100644
index 0000000..f607b01
--- /dev/null
+++ b/f3s/tracing-demo/helm-chart/templates/frontend-deployment.yaml
@@ -0,0 +1,53 @@
+# Frontend Service Deployment
+# Receives HTTP requests and forwards to middleware
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: tracing-demo-frontend
+  namespace: services
+  labels:
+    app: tracing-demo-frontend
+    component: frontend
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: tracing-demo-frontend
+  template:
+    metadata:
+      labels:
+        app: tracing-demo-frontend
+        component: frontend
+    spec:
+      containers:
+        - name: frontend
+          image: registry.lan.buetow.org:30001/tracing-demo-frontend:latest
+          imagePullPolicy: Always  # re-pull the mutable :latest tag
+          ports:
+            - containerPort: 5000
+              name: http
+              protocol: TCP
+          env:
+            - name: MIDDLEWARE_URL
+              value: "http://middleware-service.services.svc.cluster.local:5001"
+            - name: OTEL_EXPORTER_OTLP_ENDPOINT
+              value: "http://alloy.monitoring.svc.cluster.local:4317"
+          resources:
+            limits:
+              cpu: 200m
+              memory: 256Mi
+            requests:
+              cpu: 100m
+              memory: 128Mi
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 5000
+            initialDelaySeconds: 10
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 5000
+            initialDelaySeconds: 5
+            periodSeconds: 5
diff --git a/f3s/tracing-demo/helm-chart/templates/frontend-service.yaml b/f3s/tracing-demo/helm-chart/templates/frontend-service.yaml
new file mode 100644
index 0000000..d45dd2a
--- /dev/null
+++ b/f3s/tracing-demo/helm-chart/templates/frontend-service.yaml
@@ -0,0 +1,17 @@
+# Frontend Service
+# Exposes the frontend deployment within the cluster
+apiVersion: v1
+kind: Service
+metadata:
+  name: frontend-service
+  namespace: services
+  labels:
+    app: tracing-demo-frontend
+spec:
+  ports:
+    - name: http
+      port: 5000
+      protocol: TCP
+      targetPort: 5000
+  selector:
+    app: tracing-demo-frontend
diff --git a/f3s/tracing-demo/helm-chart/templates/ingress.yaml b/f3s/tracing-demo/helm-chart/templates/ingress.yaml
new file mode 100644
index 0000000..f080761
--- /dev/null
+++ b/f3s/tracing-demo/helm-chart/templates/ingress.yaml
@@ -0,0 +1,22 @@
+# Ingress for Frontend Service
+# Exposes the tracing demo application at tracing-demo.f3s.buetow.org
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: tracing-demo-ingress
+  namespace: services
+  annotations:
+    traefik.ingress.kubernetes.io/router.entrypoints: web
+spec:
+  ingressClassName: traefik
+  rules:
+    - host: tracing-demo.f3s.buetow.org
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: frontend-service
+                port:
+                  number: 5000
diff --git a/f3s/tracing-demo/helm-chart/templates/middleware-deployment.yaml b/f3s/tracing-demo/helm-chart/templates/middleware-deployment.yaml
new file mode 100644
index 0000000..cae0c59
--- /dev/null
+++
b/f3s/tracing-demo/helm-chart/templates/middleware-deployment.yaml
@@ -0,0 +1,53 @@
+# Middleware Service Deployment
+# Transforms data and calls backend
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: tracing-demo-middleware
+  namespace: services
+  labels:
+    app: tracing-demo-middleware
+    component: middleware
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: tracing-demo-middleware
+  template:
+    metadata:
+      labels:
+        app: tracing-demo-middleware
+        component: middleware
+    spec:
+      containers:
+        - name: middleware
+          image: registry.lan.buetow.org:30001/tracing-demo-middleware:latest
+          imagePullPolicy: Always  # re-pull the mutable :latest tag
+          ports:
+            - containerPort: 5001
+              name: http
+              protocol: TCP
+          env:
+            - name: BACKEND_URL
+              value: "http://backend-service.services.svc.cluster.local:5002"
+            - name: OTEL_EXPORTER_OTLP_ENDPOINT
+              value: "http://alloy.monitoring.svc.cluster.local:4317"
+          resources:
+            limits:
+              cpu: 200m
+              memory: 256Mi
+            requests:
+              cpu: 100m
+              memory: 128Mi
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 5001
+            initialDelaySeconds: 10
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 5001
+            initialDelaySeconds: 5
+            periodSeconds: 5
diff --git a/f3s/tracing-demo/helm-chart/templates/middleware-service.yaml b/f3s/tracing-demo/helm-chart/templates/middleware-service.yaml
new file mode 100644
index 0000000..08c325b
--- /dev/null
+++ b/f3s/tracing-demo/helm-chart/templates/middleware-service.yaml
@@ -0,0 +1,17 @@
+# Middleware Service
+# Exposes the middleware deployment within the cluster
+apiVersion: v1
+kind: Service
+metadata:
+  name: middleware-service
+  namespace: services
+  labels:
+    app: tracing-demo-middleware
+spec:
+  ports:
+    - name: http
+      port: 5001
+      protocol: TCP
+      targetPort: 5001
+  selector:
+    app: tracing-demo-middleware |
