Kubernetes Expert Examples
Externalized from the agent definition per the few-shot-examples rule (#1587).
Kubernetes Expert — Worked Examples
Externalized from the agent definition per the few-shot-examples rule (#1587).
Process Sample Blocks
These are the full illustrative sample blocks for the agent's process steps. Each corresponds to a capability summarized inline in the agent definition's `## Your Process` section.
1. Cluster State Audit
# Check node health and resource pressure
kubectl get nodes -o wide
kubectl describe nodes | grep -A5 "Conditions:"
# Identify resource-constrained pods
kubectl top pods --all-namespaces --sort-by=cpu | head -20
kubectl top pods --all-namespaces --sort-by=memory | head -20
# Find pods without resource requests/limits
kubectl get pods --all-namespaces -o json \
| jq '.items[] | select(.spec.containers[].resources.requests == null)
| {name: .metadata.name, ns: .metadata.namespace}'
# Check for pods in non-Running states
kubectl get pods --all-namespaces --field-selector=status.phase!=Running
# Inspect cluster events for warnings
kubectl get events --all-namespaces --sort-by='.lastTimestamp' \
| grep -i "warning\|failed\|error" | tail -30
# Audit RBAC — who can do what
kubectl auth can-i --list --as=system:serviceaccount:default:my-sa
2. Production Deployment Manifests
# Deployment with all production fields set
apiVersion: apps/v1
kind: Deployment
metadata:
name: order-service
namespace: production
labels:
app: order-service
version: "2026.2.0"
app.kubernetes.io/part-of: commerce-platform
spec:
replicas: 3
revisionHistoryLimit: 5
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0 # Zero-downtime: always have full capacity
selector:
matchLabels:
app: order-service
template:
metadata:
labels:
app: order-service
version: "2026.2.0"
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/actuator/prometheus"
spec:
serviceAccountName: order-service
securityContext:
runAsNonRoot: true
runAsUser: 1000
fsGroup: 2000
seccompProfile:
type: RuntimeDefault
topologySpreadConstraints:
- maxSkew: 1
topologyKey: topology.kubernetes.io/zone
whenUnsatisfiable: DoNotSchedule
labelSelector:
matchLabels:
app: order-service
containers:
- name: order-service
image: registry.example.com/order-service:2026.2.0
imagePullPolicy: IfNotPresent
ports:
- name: http
containerPort: 8080
protocol: TCP
env:
- name: SPRING_PROFILES_ACTIVE
value: "prod"
- name: DB_URL
valueFrom:
secretKeyRef:
name: order-service-secrets
key: database-url
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
resources:
requests:
cpu: "250m"
memory: "512Mi"
limits:
cpu: "1000m"
memory: "1Gi"
livenessProbe:
httpGet:
path: /actuator/health/liveness
port: http
initialDelaySeconds: 30
periodSeconds: 10
failureThreshold: 3
timeoutSeconds: 5
readinessProbe:
httpGet:
path: /actuator/health/readiness
port: http
initialDelaySeconds: 20
periodSeconds: 5
failureThreshold: 3
timeoutSeconds: 3
startupProbe:
httpGet:
path: /actuator/health
port: http
failureThreshold: 30 # Allow 5 minutes for slow JVM startup
periodSeconds: 10
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop: ["ALL"]
volumeMounts:
- name: tmp
mountPath: /tmp
volumes:
- name: tmp
emptyDir: {}
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
topologyKey: kubernetes.io/hostname
labelSelector:
matchLabels:
app: order-service
3. Service, Ingress, and HPA
# Service
apiVersion: v1
kind: Service
metadata:
name: order-service
namespace: production
labels:
app: order-service
spec:
selector:
app: order-service
ports:
- name: http
port: 80
targetPort: http
protocol: TCP
type: ClusterIP
---
# Ingress with TLS and rate limiting annotations
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: order-service
namespace: production
annotations:
nginx.ingress.kubernetes.io/rate-limit: "100"
nginx.ingress.kubernetes.io/rate-limit-burst-multiplier: "3"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/proxy-body-size: "2m"
cert-manager.io/cluster-issuer: letsencrypt-prod
spec:
ingressClassName: nginx
tls:
- hosts:
- api.example.com
secretName: api-example-com-tls
rules:
- host: api.example.com
http:
paths:
- path: /api/v1/orders
pathType: Prefix
backend:
service:
name: order-service
port:
name: http
---
# HorizontalPodAutoscaler — scale on CPU and custom metrics
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: order-service
namespace: production
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: order-service
minReplicas: 3
maxReplicas: 20
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 60
- type: Pods
pods:
metric:
name: http_requests_per_second
target:
type: AverageValue
averageValue: "500"
behavior:
scaleDown:
stabilizationWindowSeconds: 300 # 5-minute cooldown before scale-down
policies:
- type: Pods
value: 2
periodSeconds: 60
scaleUp:
stabilizationWindowSeconds: 0
policies:
- type: Pods
value: 4
periodSeconds: 30
---
# PodDisruptionBudget — maintain minimum availability during disruptions
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: order-service
namespace: production
spec:
minAvailable: 2
selector:
matchLabels:
app: order-service
4. Helm Chart Structure
charts/order-service/
├── Chart.yaml
├── values.yaml
├── values-staging.yaml
├── values-prod.yaml
└── templates/
├── _helpers.tpl
├── deployment.yaml
├── service.yaml
├── ingress.yaml
├── hpa.yaml
├── pdb.yaml
├── serviceaccount.yaml
├── configmap.yaml
└── NOTES.txt
# Chart.yaml
apiVersion: v2
name: order-service
description: Order service for commerce platform
type: application
version: 1.0.0
appVersion: "2026.2.0"
dependencies:
- name: common
version: ">=1.0.0"
repository: https://charts.example.com
---
# values.yaml — defaults (development)
replicaCount: 1
image:
repository: registry.example.com/order-service
pullPolicy: IfNotPresent
tag: "" # Overridden by CI with image digest or tag
serviceAccount:
create: true
name: ""
annotations: {}
service:
type: ClusterIP
port: 80
ingress:
enabled: false
className: nginx
annotations: {}
hosts: []
tls: []
resources:
requests:
cpu: 250m
memory: 512Mi
limits:
cpu: 1000m
memory: 1Gi
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 10
targetCPUUtilizationPercentage: 60
podDisruptionBudget:
enabled: false
minAvailable: 1
env:
SPRING_PROFILES_ACTIVE: dev
secrets:
databaseUrl: "" # Injected via CI or External Secrets Operator
# values-prod.yaml — production overrides
replicaCount: 3
ingress:
enabled: true
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
hosts:
- host: api.example.com
paths:
- path: /api/v1/orders
pathType: Prefix
tls:
- secretName: api-tls
hosts:
- api.example.com
autoscaling:
enabled: true
minReplicas: 3
maxReplicas: 20
podDisruptionBudget:
enabled: true
minAvailable: 2
env:
SPRING_PROFILES_ACTIVE: prod
# Deploy with Helm
helm upgrade --install order-service ./charts/order-service \
--namespace production \
--create-namespace \
--values ./charts/order-service/values-prod.yaml \
--set image.tag="${IMAGE_TAG}" \
--set secrets.databaseUrl="${DB_URL}" \
--wait \
--timeout 5m
5. Kustomize Overlay Pattern
k8s/
├── base/
│ ├── kustomization.yaml
│ ├── deployment.yaml
│ ├── service.yaml
│ └── serviceaccount.yaml
└── overlays/
├── staging/
│ ├── kustomization.yaml
│ └── patches/
│ └── deployment-replicas.yaml
└── production/
├── kustomization.yaml
└── patches/
├── deployment-resources.yaml
└── hpa.yaml
# base/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- deployment.yaml
- service.yaml
- serviceaccount.yaml
commonLabels:
app.kubernetes.io/managed-by: kustomize
---
# overlays/production/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: production
resources:
- ../../base
- hpa.yaml
- pdb.yaml
patches:
- path: patches/deployment-resources.yaml
target:
kind: Deployment
name: order-service
images:
- name: registry.example.com/order-service
newTag: "2026.2.0"
configMapGenerator:
- name: order-service-config
literals:
- SPRING_PROFILES_ACTIVE=prod
- LOG_LEVEL=WARN
# overlays/production/patches/deployment-resources.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: order-service
spec:
replicas: 3
template:
spec:
containers:
- name: order-service
resources:
requests:
cpu: "500m"
memory: "768Mi"
limits:
cpu: "2000m"
memory: "1536Mi"
6. RBAC and Network Policies
# ServiceAccount with minimal permissions
apiVersion: v1
kind: ServiceAccount
metadata:
name: order-service
namespace: production
annotations:
eks.amazonaws.com/role-arn: arn:aws:iam::123456789:role/order-service-role # AWS IRSA
---
# Role — least-privilege namespace access
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: order-service
namespace: production
rules:
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources: ["secrets"]
resourceNames: ["order-service-secrets"] # Only specific secret
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: order-service
namespace: production
subjects:
- kind: ServiceAccount
name: order-service
namespace: production
roleRef:
kind: Role
name: order-service
apiGroup: rbac.authorization.k8s.io
---
# NetworkPolicy — default deny, explicit allow
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: order-service-netpol
namespace: production
spec:
podSelector:
matchLabels:
app: order-service
policyTypes:
- Ingress
- Egress
ingress:
- from:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: ingress-nginx
- podSelector:
matchLabels:
app: api-gateway
ports:
- port: 8080
egress:
- to:
- podSelector:
matchLabels:
app: postgres
ports:
- port: 5432
- to:
- podSelector:
matchLabels:
app: redis
ports:
- port: 6379
- to: # Allow DNS resolution
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: kube-system
ports:
- port: 53
protocol: UDP
7. GitOps with ArgoCD
# ArgoCD Application — declarative delivery
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: order-service-prod
namespace: argocd
finalizers:
- resources-finalizer.argocd.argoproj.io
spec:
project: commerce-platform
source:
repoURL: https://git.example.com/infra/k8s-manifests
targetRevision: main
path: overlays/production
destination:
server: https://kubernetes.default.svc
namespace: production
syncPolicy:
automated:
prune: true # Remove resources deleted from Git
selfHeal: true # Revert manual cluster changes
syncOptions:
- CreateNamespace=true
- PrunePropagationPolicy=foreground
- RespectIgnoreDifferences=true
retry:
limit: 3
backoff:
duration: 5s
factor: 2
maxDuration: 3m
ignoreDifferences:
- group: apps
kind: Deployment
jsonPointers:
- /spec/replicas # HPA manages replicas — ignore Git drift
---
# ArgoCD AppProject — RBAC boundaries
apiVersion: argoproj.io/v1alpha1
kind: AppProject
metadata:
name: commerce-platform
namespace: argocd
spec:
description: Commerce platform production applications
sourceRepos:
- https://git.example.com/infra/k8s-manifests
destinations:
- namespace: production
server: https://kubernetes.default.svc
clusterResourceWhitelist:
- group: ""
kind: Namespace
namespaceResourceBlacklist:
- group: ""
kind: ResourceQuota # Protect quota settings
roles:
- name: deployer
policies:
- p, proj:commerce-platform:deployer, applications, sync, commerce-platform/*, allow
8. Observability Stack
# PrometheusRule — alerts for order-service SLIs
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: order-service-alerts
namespace: production
labels:
prometheus: kube-prometheus
role: alert-rules
spec:
groups:
- name: order-service.slo
interval: 30s
rules:
# SLI: error rate
- record: job:order_service_errors:rate5m
expr: |
rate(http_server_requests_seconds_count{
service="order-service", status=~"5.."
}[5m])
# Alert: error rate > 1% over 5 minutes
- alert: OrderServiceHighErrorRate
expr: |
(
rate(http_server_requests_seconds_count{
service="order-service", status=~"5.."
}[5m])
/
rate(http_server_requests_seconds_count{
service="order-service"
}[5m])
) > 0.01
for: 5m
labels:
severity: warning
team: commerce
annotations:
summary: "Order service error rate above 1%"
description: "Error rate is {{ $value | humanizePercentage }} over the last 5 minutes"
runbook_url: "https://runbooks.example.com/order-service/high-error-rate"
# Alert: P99 latency > 500ms
- alert: OrderServiceHighLatency
expr: |
histogram_quantile(0.99,
rate(http_server_requests_seconds_bucket{
service="order-service"
}[5m])
) > 0.5
for: 10m
labels:
severity: warning
annotations:
summary: "Order service P99 latency above 500ms"
# ServiceMonitor — Prometheus scrape config
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: order-service
namespace: production
labels:
release: kube-prometheus-stack
spec:
selector:
matchLabels:
app: order-service
endpoints:
- port: http
path: /actuator/prometheus
interval: 15s
scrapeTimeout: 10s
9. Custom Resource Definitions and Operators
// CRD type using controller-runtime
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.ready"
// +kubebuilder:printcolumn:name="Replicas",type="integer",JSONPath=".status.readyReplicas"
type OrderProcessor struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Spec OrderProcessorSpec `json:"spec,omitempty"`
Status OrderProcessorStatus `json:"status,omitempty"`
}
type OrderProcessorSpec struct {
Replicas int32 `json:"replicas"`
Queue string `json:"queue"`
MaxRetries int `json:"maxRetries,omitempty"`
}
// Reconciler
type OrderProcessorReconciler struct {
client.Client
Scheme *runtime.Scheme
Log logr.Logger
}
func (r *OrderProcessorReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
log := r.Log.WithValues("orderprocessor", req.NamespacedName)
var op commercev1.OrderProcessor
if err := r.Get(ctx, req.NamespacedName, &op); err != nil {
return ctrl.Result{}, client.IgnoreNotFound(err)
}
// Ensure Deployment matches desired state
deployment := r.deploymentForOrderProcessor(&op)
if err := ctrl.SetControllerReference(&op, deployment, r.Scheme); err != nil {
return ctrl.Result{}, err
}
found := &appsv1.Deployment{}
err := r.Get(ctx, types.NamespacedName{Name: deployment.Name, Namespace: deployment.Namespace}, found)
if errors.IsNotFound(err) {
log.Info("Creating Deployment", "name", deployment.Name)
return ctrl.Result{}, r.Create(ctx, deployment)
}
if err != nil {
return ctrl.Result{}, err
}
// Reconcile spec if drifted
if *found.Spec.Replicas != op.Spec.Replicas {
found.Spec.Replicas = &op.Spec.Replicas
return ctrl.Result{}, r.Update(ctx, found)
}
return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
}
Few-Shot Examples
Example 1: Diagnosing CrashLoopBackOff
Input: "order-service pod keeps restarting with CrashLoopBackOff"
# Step 1: Get pod details
kubectl describe pod order-service-7d9f8c-x2k9p -n production
# Look for:
# - Last State exit code (1=app error, 137=OOMKilled, 143=SIGTERM)
# - Events section for scheduling or image pull errors
# - Liveness probe failures
# Step 2: View current and previous logs
kubectl logs order-service-7d9f8c-x2k9p -n production
kubectl logs order-service-7d9f8c-x2k9p -n production --previous
# Step 3: If OOMKilled (exit 137) — increase memory limit
# kubectl top pod shows actual usage near limit → bump limit 50%
kubectl top pod order-service-7d9f8c-x2k9p -n production
# Step 4: If startup probe failing — check initialDelaySeconds
# Application needs 45s to start but probe fires at 30s
Fix for OOMKilled:
resources:
requests:
memory: "512Mi"
limits:
memory: "1Gi" # Increased from 512Mi based on actual usage
Fix for slow startup:
startupProbe:
httpGet:
path: /actuator/health
port: http
failureThreshold: 30 # 30 * 10s = 5 minutes max startup time
periodSeconds: 10
livenessProbe:
# Only activates after startupProbe succeeds
initialDelaySeconds: 0
periodSeconds: 10
Example 2: Zero-Downtime Deployment with PodDisruptionBudget
Input: "Cluster node maintenance causes order service to go down during pod evictions"
# Problem: no PDB means all pods can be evicted simultaneously
# Solution 1: PodDisruptionBudget
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: order-service-pdb
namespace: production
spec:
minAvailable: 2 # At least 2 pods always available during disruptions
selector:
matchLabels:
app: order-service
# Solution 2: Topology spread across nodes and zones
spec:
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: DoNotSchedule
labelSelector:
matchLabels:
app: order-service
- maxSkew: 1
topologyKey: topology.kubernetes.io/zone
whenUnsatisfiable: DoNotSchedule
labelSelector:
matchLabels:
app: order-service
# Solution 3: Graceful shutdown — handle SIGTERM before pod is removed
spec:
terminationGracePeriodSeconds: 60 # Allow in-flight requests to complete
containers:
- name: order-service
lifecycle:
preStop:
exec:
command: ["sh", "-c", "sleep 5"] # Let load balancer drain first
Example 3: Canary Deployment with Argo Rollouts
Input: "We want to roll out the new checkout service version to 10% of traffic first"
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
name: checkout-service
namespace: production
spec:
replicas: 10
selector:
matchLabels:
app: checkout-service
template:
metadata:
labels:
app: checkout-service
spec:
containers:
- name: checkout-service
image: registry.example.com/checkout-service:2026.2.0
resources:
requests:
cpu: 250m
memory: 512Mi
strategy:
canary:
canaryService: checkout-service-canary
stableService: checkout-service-stable
trafficRouting:
nginx:
stableIngress: checkout-ingress
steps:
- setWeight: 10 # 10% traffic to canary
- pause: {duration: 5m} # Observe error rate and latency
- setWeight: 30
- pause: {duration: 5m}
- setWeight: 60
- pause: {duration: 5m}
- setWeight: 100
analysis:
templates:
- templateName: success-rate
startingStep: 1
args:
- name: service-name
value: checkout-service-canary
---
apiVersion: argoproj.io/v1alpha1
kind: AnalysisTemplate
metadata:
name: success-rate
spec:
args:
- name: service-name
metrics:
- name: success-rate
interval: 1m
successCondition: result[0] >= 0.99 # Abort if error rate > 1%
failureLimit: 2
provider:
prometheus:
address: http://prometheus:9090
query: |
sum(rate(http_requests_total{service="{{args.service-name}}", status!~"5.."}[5m]))
/
sum(rate(http_requests_total{service="{{args.service-name}}"}[5m]))
# Monitor rollout progress
kubectl argo rollouts get rollout checkout-service -n production --watch
# Manually promote or abort
kubectl argo rollouts promote checkout-service -n production
kubectl argo rollouts abort checkout-service -n production