From 13acfccd776760e98684957112b0f9e176a07c2e Mon Sep 17 00:00:00 2001 From: Jet Li Date: Wed, 3 Sep 2025 16:59:04 -0700 Subject: [PATCH] (feat): Basic docs on Freeleaps Infra --- docs/Azure_K8s_Node_Addition_Runbook.md | 283 +++++ docs/Current_Ingress_Analysis.md | 409 +++++++ docs/Custom_Resources_And_Operators_Guide.md | 573 ++++++++++ docs/Ingress_Setup_And_Redirects_Guide.md | 558 +++++++++ docs/Kubernetes_Bootstrap_Guide.md | 433 +++++++ docs/Kubernetes_Core_Concepts_Guide.md | 840 ++++++++++++++ ...netes_Fundamentals_For_Junior_Engineers.md | 832 ++++++++++++++ docs/PVC_Deep_Dive_Guide.md | 608 ++++++++++ docs/README.md | 406 +++++++ docs/RabbitMQ_Management_Analysis.md | 1015 +++++++++++++++++ docs/Reconciler_Architecture_Deep_Dive.md | 440 +++++++ docs/Reconciler_Framework_Analysis.md | 521 +++++++++ docs/add_k8s_node.sh | 397 +++++++ docs/bootstrap-k8s-cluster.sh | 394 +++++++ docs/examples/basic-pod.yaml | 141 +++ .../complete-application-example.yaml | 468 ++++++++ docs/examples/configmap-secret-example.yaml | 100 ++ docs/examples/deployment-example.yaml | 158 +++ docs/examples/ingress-example.yaml | 265 +++++ docs/examples/job-cronjob-example.yaml | 162 +++ docs/examples/kubectl-quick-reference.md | 381 +++++++ docs/examples/namespace-with-pvc.yaml | 44 + .../examples/resource-management-example.yaml | 150 +++ docs/examples/service-example.yaml | 54 + docs/node_config.env.template | 190 +++ 25 files changed, 9822 insertions(+) create mode 100644 docs/Azure_K8s_Node_Addition_Runbook.md create mode 100644 docs/Current_Ingress_Analysis.md create mode 100644 docs/Custom_Resources_And_Operators_Guide.md create mode 100644 docs/Ingress_Setup_And_Redirects_Guide.md create mode 100644 docs/Kubernetes_Bootstrap_Guide.md create mode 100644 docs/Kubernetes_Core_Concepts_Guide.md create mode 100644 docs/Kubernetes_Fundamentals_For_Junior_Engineers.md create mode 100644 docs/PVC_Deep_Dive_Guide.md create mode 100644 docs/README.md create mode 100644 docs/RabbitMQ_Management_Analysis.md create mode 100644 docs/Reconciler_Architecture_Deep_Dive.md create mode 100644 docs/Reconciler_Framework_Analysis.md create mode 100755 docs/add_k8s_node.sh create mode 100755 docs/bootstrap-k8s-cluster.sh create mode 100644 docs/examples/basic-pod.yaml create mode 100644 docs/examples/complete-application-example.yaml create mode 100644 docs/examples/configmap-secret-example.yaml create mode 100644 docs/examples/deployment-example.yaml create mode 100644 docs/examples/ingress-example.yaml create mode 100644 docs/examples/job-cronjob-example.yaml create mode 100644 docs/examples/kubectl-quick-reference.md create mode 100644 docs/examples/namespace-with-pvc.yaml create mode 100644 docs/examples/resource-management-example.yaml create mode 100644 docs/examples/service-example.yaml create mode 100644 docs/node_config.env.template diff --git a/docs/Azure_K8s_Node_Addition_Runbook.md b/docs/Azure_K8s_Node_Addition_Runbook.md new file mode 100644 index 00000000..413ed582 --- /dev/null +++ b/docs/Azure_K8s_Node_Addition_Runbook.md @@ -0,0 +1,283 @@ +# Azure Kubernetes Node Addition Runbook + +## Overview +This runbook provides step-by-step instructions for adding new Azure Virtual Machines to an existing Kubernetes cluster installed via Kubespray. + +## Prerequisites +- Access to Azure CLI with appropriate permissions +- SSH access to the new VM +- Access to the existing Kubernetes cluster +- Kubespray installation directory + +## Pre-Installation Checklist + +### 1. 
Verify New VM Details +```bash +# Get VM details from Azure +az vm show --resource-group --name --query "{name:name,ip:publicIps,privateIp:privateIps}" -o table +``` + +### 2. Verify SSH Access +```bash +# Test SSH connection to the new VM +ssh wwwadmin@mathmast.com@ +# You will be prompted for password +``` + +### 3. Verify Network Connectivity +```bash +# From the new VM, test connectivity to existing cluster +ping +``` + +## Step-by-Step Process + +### Step 1: Update Ansible Inventory + +1. **Navigate to Kubespray directory** +```bash +cd freeleaps-ops/3rd/kubespray +``` + +2. **Edit the inventory file** +```bash +vim ../cluster/ansible/manifests/inventory.ini +``` + +3. **Add the new node to the appropriate group** + +For a worker node: +```ini +[kube_node] +# Existing nodes... +prod-usw2-k8s-freeleaps-worker-nodes-06 ansible_host= ansible_user=wwwadmin@mathmast.com host_name=prod-usw2-k8s-freeleaps-worker-nodes-06 +``` + +For a master node: +```ini +[kube_control_plane] +# Existing nodes... +prod-usw2-k8s-freeleaps-master-03 ansible_host= ansible_user=wwwadmin@mathmast.com etcd_member_name=freeleaps-etcd-03 host_name=prod-usw2-k8s-freeleaps-master-03 +``` + +### Step 2: Verify Inventory Configuration + +1. **Check inventory syntax** +```bash +ansible-inventory -i ../cluster/ansible/manifests/inventory.ini --list +``` + +2. **Test connectivity to new node** +```bash +ansible -i ../cluster/ansible/manifests/inventory.ini kube_node -m ping -kK +``` + +### Step 3: Run Kubespray Scale Playbook + +1. **Execute the scale playbook** +```bash +cd ../cluster/ansible/manifests +ansible-playbook -i inventory.ini ../../3rd/kubespray/scale.yml -kK -b +``` + +**Note**: +- `-k` prompts for SSH password +- `-K` prompts for sudo password +- `-b` enables privilege escalation + +### Step 4: Verify Node Addition + +1. **Check node status** +```bash +kubectl get nodes +``` + +2. **Verify node is ready** +```bash +kubectl describe node +``` + +3. **Check node labels** +```bash +kubectl get nodes --show-labels +``` + +### Step 5: Post-Installation Verification + +1. **Test pod scheduling** +```bash +# Create a test pod to verify scheduling +kubectl run test-pod --image=nginx --restart=Never +kubectl get pod test-pod -o wide +``` + +2. **Check node resources** +```bash +kubectl top nodes +``` + +3. **Verify node components** +```bash +kubectl get pods -n kube-system -o wide | grep +``` + +## Troubleshooting + +### Common Issues + +#### 1. SSH Connection Failed +```bash +# Verify VM is running +az vm show --resource-group --name --query "powerState" + +# Check network security groups +az network nsg rule list --resource-group --nsg-name +``` + +#### 2. Ansible Connection Failed +```bash +# Test with verbose output +ansible -i ../cluster/ansible/manifests/inventory.ini kube_node -m ping -kK -vvv +``` + +#### 3. Node Not Ready +```bash +# Check node conditions +kubectl describe node + +# Check kubelet logs +kubectl logs -n kube-system kubelet- +``` + +#### 4. Pod Scheduling Issues +```bash +# Check node taints +kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints + +# Check node capacity +kubectl describe node | grep -A 10 "Capacity" +``` + +### Recovery Procedures + +#### If Scale Playbook Fails +1. **Clean up the failed node** +```bash +kubectl delete node +``` + +2. **Reset the VM** +```bash +# Reset VM to clean state +az vm restart --resource-group --name +``` + +3. 
**Retry the scale playbook** +```bash +ansible-playbook -i inventory.ini ../../3rd/kubespray/scale.yml -kK -b +``` + +#### If Node is Stuck in NotReady State +1. **Check kubelet service** +```bash +ssh wwwadmin@mathmast.com@ +sudo systemctl status kubelet +``` + +2. **Restart kubelet** +```bash +ssh wwwadmin@mathmast.com@ +sudo systemctl restart kubelet +``` + +## Security Considerations + +### 1. Network Security +- Ensure the new VM is in the correct subnet +- Verify network security group rules allow cluster communication +- Check firewall rules if applicable + +### 2. Access Control +- Use SSH key-based authentication when possible +- Limit sudo access to necessary commands +- Monitor node access logs + +### 3. Compliance +- Ensure the new node meets security requirements +- Verify all required security patches are applied +- Check compliance with organizational policies + +## Monitoring and Maintenance + +### 1. Node Health Monitoring +```bash +# Set up monitoring for the new node +kubectl get nodes -o wide +kubectl top nodes +``` + +### 2. Resource Monitoring +```bash +# Monitor resource usage +kubectl describe node | grep -A 5 "Allocated resources" +``` + +### 3. Log Monitoring +```bash +# Monitor kubelet logs +kubectl logs -n kube-system kubelet- --tail=100 -f +``` + +## Rollback Procedures + +### If Node Addition Causes Issues + +1. **Cordon the node** +```bash +kubectl cordon +``` + +2. **Drain the node** +```bash +kubectl drain --ignore-daemonsets --delete-emptydir-data +``` + +3. **Remove the node** +```bash +kubectl delete node +``` + +4. **Update inventory** +```bash +# Remove the node from inventory.ini +vim ../cluster/ansible/manifests/inventory.ini +``` + +## Documentation + +### Required Information +- VM name and IP address +- Resource group and subscription +- Node role (worker/master) +- Date and time of addition +- Person performing the addition + +### Post-Addition Checklist +- [ ] Node appears in `kubectl get nodes` +- [ ] Node status is Ready +- [ ] Pods can be scheduled on the node +- [ ] All node components are running +- [ ] Monitoring is configured +- [ ] Documentation is updated + +## Emergency Contacts + +- **Infrastructure Team**: [Contact Information] +- **Kubernetes Administrators**: [Contact Information] +- **Azure Support**: [Contact Information] + +--- + +**Last Updated**: [Date] +**Version**: 1.0 +**Author**: [Name] diff --git a/docs/Current_Ingress_Analysis.md b/docs/Current_Ingress_Analysis.md new file mode 100644 index 00000000..ed83b95c --- /dev/null +++ b/docs/Current_Ingress_Analysis.md @@ -0,0 +1,409 @@ +# Current Ingress Setup Analysis + +## 🎯 **Overview** + +This document analyzes your current Kubernetes ingress setup based on the codebase examination. It explains how your ingress infrastructure works, what components are involved, and how they interact. 
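+If you want to follow along against the live cluster, the commands below are a quick sketch (adjust namespaces if yours differ) that enumerate the main pieces referenced throughout this analysis — the ingress controller, the Ingress objects, the ClusterIssuer, and the issued certificates:
+
+```bash
+# Ingress controller pods (runs in freeleaps-controls-system in this setup)
+kubectl get pods -n freeleaps-controls-system -l app.kubernetes.io/name=ingress-nginx
+
+# All Ingress objects and the hosts they route
+kubectl get ingress --all-namespaces
+
+# cert-manager issuers and the certificates backing the TLS hosts
+kubectl get clusterissuers
+kubectl get certificates --all-namespaces
+```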
+ +--- + +## πŸ“Š **Your Current Ingress Architecture** + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ INTERNET β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Browser β”‚ β”‚ Mobile β”‚ β”‚ API β”‚ β”‚ Other β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ App β”‚ β”‚ Client β”‚ β”‚ Clients β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β–Ό β–Ό β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ AZURE LOAD BALANCER β”‚ β”‚ +β”‚ β”‚ IP: 4.155.160.32 (prod-usw2-k8s-freeleaps-lb-fe-ip) β”‚ β”‚ +β”‚ β”‚ Port: 80/443 β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ +β”‚ β–Ό β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ NGINX INGRESS CONTROLLER β”‚ β”‚ +β”‚ β”‚ Namespace: freeleaps-controls-system β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ β”‚ Pod: ingress-nginx-controller-abc123 β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ Image: ingress-nginx/controller:v1.12.0 β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ IP: 10.0.1.100 Port: 80/443 β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ +β”‚ β–Ό β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ INGRESS RULES β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ argo.mathmast.com β†’ argo-cd-server:80 β”‚ β”‚ +β”‚ β”‚ gitea.freeleaps.mathmast.com β†’ gitea-http:3000 β”‚ β”‚ +β”‚ β”‚ magicleaps.mathmast.com β†’ magicleaps-frontend-service:80 β”‚ β”‚ +β”‚ β”‚ alpha.magicleaps.mathmast.com β†’ magicleaps-frontend-service:80 β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ 
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ +β”‚ β–Ό β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ KUBERNETES SERVICES β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ β”‚argo-cd-svc β”‚ β”‚gitea-http β”‚ β”‚magic-front β”‚ β”‚magic-api β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ClusterIP β”‚ β”‚ClusterIP β”‚ β”‚ClusterIP β”‚ β”‚ClusterIP β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚10.0.1.10 β”‚ β”‚10.0.1.11 β”‚ β”‚10.0.1.12 β”‚ β”‚10.0.1.13 β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ +β”‚ β–Ό β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ APPLICATION PODS β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ β”‚argo-cd-pod β”‚ β”‚gitea-pod β”‚ β”‚magic-front β”‚ β”‚magic-api β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚10.0.1.101 β”‚ β”‚10.0.1.102 β”‚ β”‚10.0.1.103 β”‚ β”‚10.0.1.104 β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚argo-cd:v2.8 β”‚ β”‚gitea:1.20 β”‚ β”‚nginx:latest β”‚ β”‚api:v1.2 β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +## πŸ”§ **Components Analysis** + +### **1. 
Nginx Ingress Controller** + +**Location**: `freeleaps-ops/cluster/manifests/freeleaps-controls-system/ingress-nginx/values.yaml` + +**Key Configuration**: +```yaml +# Controller Configuration +controller: + name: controller + image: + image: ingress-nginx/controller + tag: "v1.12.0" # ← Specific version for stability + runAsNonRoot: true # ← Security: don't run as root + runAsUser: 101 # ← Security: run as nginx user + allowPrivilegeEscalation: false # ← Security: prevent privilege escalation + + # Ingress Class Configuration + ingressClassResource: + name: nginx # ← Ingress class name + enabled: true # ← Create the IngressClass resource + default: false # ← Not the default (allows multiple controllers) + controllerValue: k8s.io/ingress-nginx # ← Controller identifier + + # Service Configuration + service: + type: LoadBalancer # ← Azure Load Balancer for external access + ports: + http: 80 # ← HTTP port + https: 443 # ← HTTPS port +``` + +**What this means**: +- You have a production-grade nginx-ingress-controller +- It's configured with security best practices +- It uses Azure Load Balancer for external access +- It's not the default ingress class (allows flexibility) + +### **2. Cert-Manager Integration** + +**Location**: `freeleaps-ops/cluster/manifests/freeleaps-controls-system/godaddy-webhook/cluster-issuer.yaml` + +**Key Configuration**: +```yaml +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: mathmast-dot-com +spec: + acme: + email: acme@mathmast.com + server: https://acme-v02.api.letsencrypt.org/directory + solvers: + - dns01: + webhook: + config: + apiKeySecretRef: + name: mathmast-godaddy-api-key + groupName: acme.mathmast.com + solverName: godaddy + selector: + dnsZones: + - mathmast.com +``` + +**What this means**: +- You're using Let's Encrypt for SSL certificates +- DNS01 challenge for domain validation (more reliable than HTTP01) +- GoDaddy DNS API integration for automatic DNS record creation +- Certificates are automatically renewed + +### **3. Custom Ingress Manager** + +**Location**: `freeleaps-devops-reconciler/reconciler/controllers/ingress_resources/ingress_manager.py` + +**Key Features**: +```python +# Automatic Ingress Creation +annotations = { + "nginx.ingress.kubernetes.io/ssl-redirect": "true", + "nginx.ingress.kubernetes.io/force-ssl-redirect": "true", + "cert-manager.io/cluster-issuer": "letsencrypt-prod", + "nginx.ingress.kubernetes.io/proxy-body-size": "0", + "nginx.ingress.kubernetes.io/proxy-read-timeout": "600", + "nginx.ingress.kubernetes.io/proxy-send-timeout": "600" +} +``` + +**What this means**: +- You have a custom controller that automatically creates ingresses +- It enforces SSL redirect (HTTP β†’ HTTPS) +- It integrates with cert-manager for automatic certificates +- It sets performance optimizations (timeouts, body size) + +--- + +## πŸ”„ **Request Flow Analysis** + +### **1. 
External Request Flow** +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Browser β”‚ β”‚ Azure Load β”‚ β”‚ Nginx β”‚ β”‚ Application β”‚ +β”‚ β”‚ β”‚ Balancer β”‚ β”‚ Ingress β”‚ β”‚ Service β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ β”‚ β”‚ + β”‚ HTTPS Request β”‚ β”‚ β”‚ + │───────────────▢│ β”‚ β”‚ + β”‚ β”‚ Forward to β”‚ β”‚ + β”‚ β”‚ nginx β”‚ β”‚ + β”‚ │───────────────▢│ β”‚ + β”‚ β”‚ β”‚ Route based β”‚ + β”‚ β”‚ β”‚ on host/path β”‚ + β”‚ β”‚ │───────────────▢│ + β”‚ β”‚ β”‚ β”‚ Return response + β”‚ β”‚ │◀───────────────│ + β”‚ │◀───────────────│ β”‚ + │◀───────────────│ β”‚ β”‚ +``` + +### **2. SSL Certificate Flow** +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Ingress β”‚ β”‚ cert-manager β”‚ β”‚ Let's β”‚ β”‚ GoDaddy β”‚ +β”‚ Controller β”‚ β”‚ β”‚ β”‚ Encrypt β”‚ β”‚ DNS API β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ β”‚ β”‚ + β”‚ Check cert β”‚ β”‚ β”‚ + │───────────────▢│ β”‚ β”‚ + β”‚ β”‚ Request cert β”‚ β”‚ + β”‚ │───────────────▢│ β”‚ + β”‚ β”‚ β”‚ DNS Challenge β”‚ + β”‚ β”‚ │───────────────▢│ + β”‚ β”‚ β”‚ β”‚ Create TXT record + β”‚ β”‚ β”‚ │◀───────────────│ + β”‚ β”‚ β”‚ Cert Ready β”‚ + β”‚ β”‚ │◀───────────────│ + β”‚ β”‚ Cert Ready β”‚ β”‚ + β”‚ │◀───────────────│ β”‚ + β”‚ Cert Ready β”‚ β”‚ β”‚ + │◀───────────────│ β”‚ β”‚ +``` + +--- + +## πŸ› οΈ **Current Applications** + +Based on your codebase, you have these applications exposed via ingress: + +### **1. ArgoCD (GitOps)** +- **Domain**: `argo.mathmast.com` +- **Service**: `argo-cd-server` +- **Purpose**: GitOps deployment tool +- **Access**: Web UI for managing deployments +- **Namespace**: `freeleaps-devops-system` + +### **2. Gitea (Git Repository)** +- **Domain**: `gitea.freeleaps.mathmast.com` +- **Service**: `gitea-http` +- **Purpose**: Git repository hosting +- **Access**: Web UI for code management +- **Namespace**: `freeleaps-prod` +- **Port**: 3000 + +### **3. Magicleaps (Main Application)** +- **Production Domain**: `magicleaps.mathmast.com` +- **Alpha Domain**: `alpha.magicleaps.mathmast.com` +- **Service**: `magicleaps-frontend-service` +- **Purpose**: Main business application +- **Namespace**: `magicleaps` +- **Port**: 80 + +--- + +## πŸ”’ **Security Features** + +### **1. SSL/TLS Enforcement** +```yaml +# All traffic is forced to HTTPS +nginx.ingress.kubernetes.io/ssl-redirect: "true" +nginx.ingress.kubernetes.io/force-ssl-redirect: "true" +``` + +### **2. Automatic Certificate Management** +- Let's Encrypt certificates +- DNS01 challenge validation +- Automatic renewal +- GoDaddy DNS integration + +### **3. Performance Optimizations** +```yaml +# Handle large requests +nginx.ingress.kubernetes.io/proxy-body-size: "0" + +# Long-running requests +nginx.ingress.kubernetes.io/proxy-read-timeout: "600" +nginx.ingress.kubernetes.io/proxy-send-timeout: "600" +``` + +--- + +## πŸ“Š **Monitoring and Debugging** + +### **1. 
Check Ingress Status** +```bash +# Check all ingresses +kubectl get ingress --all-namespaces + +# Check specific ingress +kubectl describe ingress -n + +# Check ingress controller +kubectl get pods -n freeleaps-controls-system -l app.kubernetes.io/name=ingress-nginx +``` + +### **2. Check SSL Certificates** +```bash +# Check certificates +kubectl get certificates --all-namespaces + +# Check certificate status +kubectl describe certificate -n + +# Check cert-manager +kubectl get pods -n cert-manager +``` + +### **3. Check DNS Resolution** +```bash +# Test DNS resolution +nslookup argo.mathmast.com +nslookup gitea.freeleaps.mathmast.com +nslookup magicleaps.mathmast.com +nslookup alpha.magicleaps.mathmast.com +``` + +### **4. Check Azure Load Balancer** +```bash +# Your actual load balancer IP +curl -I http://4.155.160.32 + +# Check if load balancer is responding +telnet 4.155.160.32 80 +telnet 4.155.160.32 443 +``` + +--- + +## πŸš€ **How Your Setup Compares to Examples** + +### **Your Current Setup vs Example** + +| Feature | Your Setup | Example Setup | Notes | +|---------|------------|---------------|-------| +| **Ingress Controller** | nginx-ingress v1.12.0 | nginx-ingress | Same | +| **SSL Provider** | Let's Encrypt + GoDaddy | Let's Encrypt | You have DNS integration | +| **Certificate Validation** | DNS01 challenge | HTTP01 challenge | More reliable | +| **Automatic Creation** | Custom controller | Manual | You have automation | +| **Performance** | Optimized timeouts | Basic | You have better config | +| **Security** | SSL redirect enforced | SSL redirect | Same | + +### **Advantages of Your Setup** + +1. **Automation**: Custom controller automatically creates ingresses +2. **DNS Integration**: GoDaddy API for automatic DNS record creation +3. **Reliability**: DNS01 challenge is more reliable than HTTP01 +4. **Performance**: Optimized timeouts and body size limits +5. **Security**: Enforced SSL redirects + +--- + +## πŸ”§ **Troubleshooting Your Setup** + +### **1. Certificate Issues** +```bash +# Check certificate status +kubectl get certificates --all-namespaces + +# Check cert-manager logs +kubectl logs -n cert-manager deployment/cert-manager + +# Check DNS records +dig TXT _acme-challenge.mathmast.com +``` + +### **2. Ingress Issues** +```bash +# Check ingress controller +kubectl get pods -n freeleaps-controls-system -l app.kubernetes.io/name=ingress-nginx + +# Check ingress controller logs +kubectl logs -n freeleaps-controls-system deployment/ingress-nginx-controller + +# Check ingress status +kubectl describe ingress -n +``` + +### **3. DNS Issues** +```bash +# Test DNS resolution +nslookup + +# Check GoDaddy API key +kubectl get secret mathmast-godaddy-api-key -n cert-manager -o yaml +``` + +### **4. 
Load Balancer Issues** +```bash +# Check if your load balancer is accessible +curl -I http://4.155.160.32 + +# Check Azure load balancer health +az network lb show --name prod-usw2-k8s-freeleaps-lb --resource-group +``` + +--- + +## πŸ“š **Learn More** + +### **Your Specific Components** +- [nginx-ingress](https://kubernetes.github.io/ingress-nginx/) - Your ingress controller +- [cert-manager](https://cert-manager.io/docs/) - Your certificate manager +- [GoDaddy DNS01](https://cert-manager.io/docs/configuration/acme/dns01/godaddy/) - Your DNS provider +- [Let's Encrypt](https://letsencrypt.org/docs/) - Your certificate authority + +### **Related Documentation** +- [Kubernetes Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) +- [SSL/TLS in Kubernetes](https://kubernetes.io/docs/concepts/services-networking/ingress/#tls) +- [DNS01 Challenge](https://cert-manager.io/docs/configuration/acme/dns01/) + +--- + +**Last Updated**: September 3, 2025 +**Version**: 1.0 +**Maintainer**: Infrastructure Team diff --git a/docs/Custom_Resources_And_Operators_Guide.md b/docs/Custom_Resources_And_Operators_Guide.md new file mode 100644 index 00000000..3b181fdc --- /dev/null +++ b/docs/Custom_Resources_And_Operators_Guide.md @@ -0,0 +1,573 @@ +# Custom Resources & Operators Guide + +## 🎯 **Overview** + +This guide explains **Custom Resources (CRs)**, **Custom Resource Definitions (CRDs)**, **Kubernetes Operators**, and how your `freeleaps-devops-reconciler` works as an operator to manage your DevOps infrastructure. + +--- + +## πŸ“Š **What Are Custom Resources?** + +### **πŸ”„ CR vs CRD Relationship** + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ CRD vs CR RELATIONSHIP β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ CUSTOM RESOURCE DEFINITION (CRD) β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ apiVersion: apiextensions.k8s.io/v1 β”‚ β”‚ β”‚ +β”‚ β”‚ kind: CustomResourceDefinition β”‚ β”‚ β”‚ +β”‚ β”‚ metadata: β”‚ β”‚ β”‚ +β”‚ β”‚ name: devopsprojects.freeleaps.com β”‚ β”‚ β”‚ +β”‚ β”‚ spec: β”‚ β”‚ β”‚ +β”‚ β”‚ group: freeleaps.com β”‚ β”‚ β”‚ +β”‚ β”‚ names: β”‚ β”‚ β”‚ +β”‚ β”‚ kind: DevOpsProject β”‚ β”‚ β”‚ +β”‚ β”‚ plural: devopsprojects β”‚ β”‚ β”‚ +β”‚ β”‚ scope: Namespaced β”‚ β”‚ β”‚ +β”‚ β”‚ versions: β”‚ β”‚ β”‚ +β”‚ β”‚ - name: v1alpha1 β”‚ β”‚ β”‚ +β”‚ β”‚ schema: β”‚ β”‚ β”‚ +β”‚ β”‚ # Schema definition... 
β”‚ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β–Ό β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ CUSTOM RESOURCE (CR) β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ apiVersion: freeleaps.com/v1alpha1 β”‚ β”‚ β”‚ +β”‚ β”‚ kind: DevOpsProject β”‚ β”‚ β”‚ +β”‚ β”‚ metadata: β”‚ β”‚ β”‚ +β”‚ β”‚ name: my-project β”‚ β”‚ β”‚ +β”‚ β”‚ namespace: freeleaps-devops-system β”‚ β”‚ β”‚ +β”‚ β”‚ spec: β”‚ β”‚ β”‚ +β”‚ β”‚ projectName: "My Awesome Project" β”‚ β”‚ β”‚ +β”‚ β”‚ projectId: "my-awesome-project" β”‚ β”‚ β”‚ +β”‚ β”‚ git: β”‚ β”‚ β”‚ +β”‚ β”‚ url: "https://github.com/myorg/myproject" β”‚ β”‚ β”‚ +β”‚ β”‚ branch: "main" β”‚ β”‚ β”‚ +β”‚ β”‚ registry: β”‚ β”‚ β”‚ +β”‚ β”‚ url: "https://harbor.example.com" β”‚ β”‚ β”‚ +β”‚ β”‚ project: "myproject" β”‚ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +### **🎯 Why Custom Resources?** + +```yaml +# Instead of managing multiple resources manually: +# - Namespace +# - ServiceAccount +# - Role/RoleBinding +# - ConfigMap +# - Secret +# - Deployment +# - Service +# - Ingress + +# You can create ONE custom resource: +apiVersion: freeleaps.com/v1alpha1 +kind: DevOpsProject +metadata: + name: my-project +spec: + projectName: "My Project" + projectId: "my-project" + git: + url: "https://github.com/myorg/myproject" + branch: "main" + registry: + url: "https://harbor.example.com" + project: "myproject" +``` + +--- + +## 🏭 **Your DevOps Reconciler Architecture** + +### **πŸ“Š Reconciler vs DevOps Repo Relationship** + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ RECONCILER + DEVOPS ARCHITECTURE β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ FRELEAPS.COM PLATFORM β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ β”‚ User β”‚ β”‚ Project β”‚ β”‚ Git β”‚ β”‚ Registry β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ Creates β”‚ β”‚ Manager β”‚ β”‚ Webhook β”‚ β”‚ Manager β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ Project β”‚ β”‚ Creates β”‚ β”‚ Triggers β”‚ β”‚ Creates β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ DevOps β”‚ β”‚ Event β”‚ β”‚ Repo β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ Project β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ 
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β–Ό β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ RABBITMQ MESSAGE QUEUE β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ β”‚DevOpsInit β”‚ β”‚DevOpsReconcileβ”‚ β”‚DevOpsDeploy β”‚ β”‚DevOpsDelete β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚Event β”‚ β”‚Event β”‚ β”‚Event β”‚ β”‚Event β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β–Ό β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ FRELEAPS-DEVOPS-RECONCILER (OPERATOR) β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ CONTROLLERS β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β” β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚DevOpsProjectβ”‚ β”‚ArgoSettings β”‚ β”‚Jenkins β”‚ β”‚... β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚Controller β”‚ β”‚Controller β”‚ β”‚Settings β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚Controller β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β–Ό β”‚ β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ CUSTOM RESOURCES β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β” β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚DevOpsProjectβ”‚ β”‚ArgoSettings β”‚ β”‚Jenkins β”‚ β”‚... 
β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚CR β”‚ β”‚CR β”‚ β”‚Settings β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚CR β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β–Ό β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ KUBERNETES RESOURCES β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ β”‚ArgoCD β”‚ β”‚Jenkins β”‚ β”‚Harbor β”‚ β”‚Namespaces β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚Applications β”‚ β”‚Pipelines β”‚ β”‚Repositories β”‚ β”‚Services β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚Projects β”‚ β”‚Jobs β”‚ β”‚Credentials β”‚ β”‚Deployments β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚Ingresses β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +## πŸ”§ **Your Custom Resources** + +### **1. 
DevOpsProject CRD** + +```yaml +# 🏭 ACTUAL CRD FROM YOUR CODEBASE +# freeleaps-devops-reconciler/deploy/crds.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: devopsprojects.freeleaps.com +spec: + group: freeleaps.com + scope: Namespaced + names: + kind: DevOpsProject + plural: devopsprojects + shortNames: [dop, dops] + versions: + - name: v1alpha1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + required: ['spec'] + properties: + spec: + type: object + required: + - projectName + - projectId + - git + - registry + - environments + properties: + projectName: + type: string + description: "Human readable project name" + projectId: + type: string + description: "Unique project identifier" + pattern: "^[a-z0-9]([a-z0-9-]*[a-z0-9])?$" + git: + type: object + required: [url, branch] + properties: + url: + type: string + description: "Git repository URL" + branch: + type: string + description: "Default git branch" + default: "main" + registry: + type: object + required: [url, project] + properties: + url: + type: string + description: "Container registry URL" + project: + type: string + description: "Registry project name" +``` + +### **2. DevOpsProject CR Example** + +```yaml +# 🏭 ACTUAL CR EXAMPLE +apiVersion: freeleaps.com/v1alpha1 +kind: DevOpsProject +metadata: + name: magicleaps-frontend + namespace: freeleaps-devops-system + labels: + app.kubernetes.io/name: magicleaps-frontend + app.kubernetes.io/instance: magicleaps-frontend +spec: + projectName: "Magicleaps Frontend" + projectId: "magicleaps-frontend" + git: + url: "https://github.com/freeleaps/magicleaps-frontend" + branch: "main" + credentialsRef: + name: "github-credentials" + namespace: "freeleaps-devops-system" + registry: + url: "https://harbor.freeleaps.mathmast.com" + project: "magicleaps" + credentialsRef: + name: "harbor-credentials" + namespace: "freeleaps-devops-system" + environments: + - name: "production" + domain: "magicleaps.mathmast.com" + replicas: 3 + - name: "alpha" + domain: "alpha.magicleaps.mathmast.com" + replicas: 1 +``` + +### **3. 
Other Custom Resources** + +```yaml +# 🏭 YOUR COMPLETE CRD SET +# From freeleaps-devops-reconciler/docs/design/one-click-deployment.md + +# DevOpsProject - Main project configuration +# ArgoSettings - ArgoCD settings for the DevOpsProject +# JenkinsSettings - Jenkins settings and generated pipelines +# ContainerRegistry - Container registry information +# ContainerImage - Every image manufactured by Jenkins pipeline +# DeploymentRecord - Track deployment records +# GitCredential - Git repository credentials +# IngressResource - Ingress configuration +``` + +--- + +## πŸ€– **How Your Operator Works** + +### **πŸ”„ Reconciliation Loop** + +```python +# 🏭 ACTUAL CODE FROM YOUR RECONCILER +# freeleaps-devops-reconciler/reconciler/controllers/devops_projects/controller.py + +@kopf.on.create(group=consts.GROUP, version=consts.VERSION, kind=consts.DEVOPS_PROJECT_KIND) +def on_devops_proj_created(name: str, namespace: Optional[str], body: Body, logger: Logger, **kwargs): + logger.info(f"Newly created DevOpsProject resource and named {name} in namespace {namespace}, start to reconciling...") + + devops_proj = DevOpsProject(body) + + try: + devops_proj.parse_spec() + devops_proj.get_spec().validate(logger) + except SpecError as e: + devops_proj.update_status({ + 'devopsProject': { + 'status': DevOpsProjectDiagStatus.INVALID.value, + 'synced': False, + 'ready': False, + 'lastProbeTime': isotime(), + } + }) + devops_proj.error(action='CreateDevOpsProject', + reason='InvalidSpecArgument', msg=str(e)) + raise kopf.TemporaryError(f"Error found in DevOpsProject spec: {e}") + + # Create resource manager and handle the project + resource_manager = DevOpsProjectResourceManager(namespace, logger) + # ... implementation details +``` + +### **πŸ“Š Event Flow** + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ EVENT FLOW β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ User β”‚ β”‚ RabbitMQ β”‚ β”‚ Operator β”‚ β”‚ Kubernetes β”‚ β”‚ +β”‚ β”‚ Action β”‚ β”‚ Message β”‚ β”‚ Controllerβ”‚ β”‚ Resources β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ 1. Create β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ Project β”‚ β”‚ β”‚ β”‚ +β”‚ │───────────────▢│ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ 2. DevOpsInit β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ Event β”‚ β”‚ β”‚ +β”‚ β”‚ │───────────────▢│ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ 3. Create CR β”‚ β”‚ +β”‚ β”‚ β”‚ │───────────────▢│ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ 4. CR Created β”‚ +β”‚ β”‚ β”‚ β”‚ │◀───────────────│ +β”‚ β”‚ β”‚ β”‚ 5. Reconcile β”‚ β”‚ +β”‚ β”‚ β”‚ │◀───────────────│ β”‚ +β”‚ β”‚ β”‚ β”‚ 6. Create β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ Resources β”‚ β”‚ +β”‚ β”‚ β”‚ │───────────────▢│ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ 7. Resources β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ Ready β”‚ +β”‚ β”‚ β”‚ β”‚ │◀───────────────│ +β”‚ β”‚ β”‚ β”‚ 8. 
Update β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ Status β”‚ β”‚ +β”‚ β”‚ β”‚ │◀───────────────│ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +## 🎯 **Understanding the Relationship** + +### **πŸ“Š Reconciler vs DevOps Repo** + +| Component | Purpose | Location | Responsibility | +|-----------|---------|----------|----------------| +| **freeleaps-devops-reconciler** | **Kubernetes Operator** | `freeleaps-ops/freeleaps-devops-reconciler/` | Watches CRs, creates K8s resources | +| **freeleaps.com Platform** | **Business Logic** | `freeleaps-service-hub/` | User interface, project management | +| **RabbitMQ** | **Message Queue** | Infrastructure | Event communication | +| **ArgoCD** | **GitOps** | `freeleaps-ops/cluster/manifests/` | Application deployment | +| **Jenkins** | **CI/CD** | Infrastructure | Pipeline execution | + +### **πŸ”„ How They Work Together** + +```yaml +# 1. User creates project on freeleaps.com +# 2. Platform sends DevOpsInit event to RabbitMQ +# 3. Reconciler receives event and creates DevOpsProject CR +# 4. Reconciler watches CR and creates: +# - ArgoCD Application +# - Jenkins Pipeline +# - Harbor Repository +# - Namespace and RBAC +# 5. ArgoCD deploys the application +# 6. Jenkins runs the pipeline +``` + +--- + +## πŸ”§ **Practical Examples** + +### **1. Creating a DevOpsProject** + +```bash +# Create a DevOpsProject CR +kubectl apply -f - < -n + +# Check CR YAML +kubectl get devopsproject -n -o yaml +``` + +### **3. Check Created Resources** + +```bash +# Check what resources were created +kubectl get all -n + +# Check ArgoCD applications +kubectl get applications -n freeleaps-devops-system + +# Check Jenkins pipelines +kubectl get jenkinssettings --all-namespaces +``` + +--- + +## πŸ“š **Next Steps** + +### **1. Learn More About Operators** +- [Kubernetes Operators](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/) +- [Kopf Framework](https://kopf.readthedocs.io/) (what your reconciler uses) +- [Operator SDK](https://sdk.operatorframework.io/) + +### **2. Understand Your Architecture** +- Study your `freeleaps-devops-reconciler` code +- Understand the event flow from RabbitMQ +- Learn how CRs trigger resource creation + +### **3. Extend Your Operator** +- Add new custom resources +- Implement new controllers +- Add validation and error handling + +--- + +**Last Updated**: September 3, 2025 +**Version**: 1.0 +**Maintainer**: Infrastructure Team diff --git a/docs/Ingress_Setup_And_Redirects_Guide.md b/docs/Ingress_Setup_And_Redirects_Guide.md new file mode 100644 index 00000000..dfdb8a8b --- /dev/null +++ b/docs/Ingress_Setup_And_Redirects_Guide.md @@ -0,0 +1,558 @@ +# Ingress Setup & Redirects Guide + +## 🎯 **Overview** + +This guide covers advanced ingress configuration, redirects, and routing patterns. Building on your existing `Current_Ingress_Analysis.md`, this focuses on practical setup and common patterns. 
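+Before applying any of the patterns in this guide, it is worth confirming which ingress class the cluster actually exposes, since every example below assumes the `nginx` class. A quick pre-flight check (the controller namespace may differ depending on how it was installed):
+
+```bash
+# List available ingress classes (the examples below assume "nginx")
+kubectl get ingressclass
+
+# Locate the ingress controller pods regardless of install namespace
+kubectl get pods -A -l app.kubernetes.io/name=ingress-nginx
+
+# External IP that your DNS records should point at
+kubectl get svc -A -l app.kubernetes.io/name=ingress-nginx
+```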
+ +--- + +## πŸ“Š **Ingress Setup Process** + +### **Step 1: Install Ingress Controller** + +```bash +# Install nginx-ingress controller +kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/controller-v1.12.0/deploy/static/provider/cloud/deploy.yaml + +# Verify installation +kubectl get pods -n ingress-nginx +kubectl get service ingress-nginx-controller -n ingress-nginx +``` + +### **Step 2: Configure DNS** + +```bash +# Your actual Azure Load Balancer IP +# IP: 4.155.160.32 (prod-usw2-k8s-freeleaps-lb-fe-ip) + +# Add DNS records: +# argo.mathmast.com β†’ 4.155.160.32 +# gitea.freeleaps.mathmast.com β†’ 4.155.160.32 +# magicleaps.mathmast.com β†’ 4.155.160.32 +# alpha.magicleaps.mathmast.com β†’ 4.155.160.32 +``` + +### **Step 3: Setup Cert-Manager** + +```bash +# Install cert-manager +kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.0/cert-manager.yaml + +# Verify installation +kubectl get pods -n cert-manager +``` + +--- + +## πŸ”„ **Redirect Patterns** + +### **1. HTTP to HTTPS Redirect** + +```yaml +# βœ… BEST PRACTICE: Force HTTPS +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: secure-app-ingress + annotations: + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + cert-manager.io/cluster-issuer: "letsencrypt-prod" +spec: + tls: + - hosts: + - myapp.example.com + secretName: myapp-tls + rules: + - host: myapp.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: myapp-service + port: + number: 80 +``` + +### **2. Path-Based Redirects** + +```yaml +# Redirect /old to /new +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: redirect-ingress + annotations: + nginx.ingress.kubernetes.io/rewrite-target: /new + nginx.ingress.kubernetes.io/configuration-snippet: | + location /old { + return 301 /new; + } +spec: + rules: + - host: myapp.example.com + http: + paths: + - path: /old + pathType: Prefix + backend: + service: + name: myapp-service + port: + number: 80 +``` + +### **3. Domain Redirects** + +```yaml +# Redirect old domain to new domain +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: domain-redirect + annotations: + nginx.ingress.kubernetes.io/configuration-snippet: | + if ($host = "old.example.com") { + return 301 https://new.example.com$request_uri; + } +spec: + rules: + - host: old.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: myapp-service + port: + number: 80 +``` + +### **4. Subdomain Redirects** + +```yaml +# Redirect www to non-www +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: www-redirect + annotations: + nginx.ingress.kubernetes.io/configuration-snippet: | + if ($host = "www.example.com") { + return 301 https://example.com$request_uri; + } +spec: + rules: + - host: www.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: myapp-service + port: + number: 80 +``` + +--- + +## 🎯 **Advanced Routing Patterns** + +### **1. 
API Versioning** + +```yaml +# Route different API versions +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: api-versioning +spec: + rules: + - host: api.example.com + http: + paths: + - path: /v1 + pathType: Prefix + backend: + service: + name: api-v1-service + port: + number: 8080 + - path: /v2 + pathType: Prefix + backend: + service: + name: api-v2-service + port: + number: 8080 + - path: / + pathType: Prefix + backend: + service: + name: api-latest-service + port: + number: 8080 +``` + +### **2. Blue-Green Deployment** + +```yaml +# Route traffic between blue and green deployments +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: blue-green-routing + annotations: + nginx.ingress.kubernetes.io/configuration-snippet: | + # Route 90% to blue, 10% to green + set $upstream ""; + if ($request_uri ~ "^/blue") { + set $upstream "blue-service:8080"; + } + if ($request_uri ~ "^/green") { + set $upstream "green-service:8080"; + } + if ($upstream = "") { + # Default routing logic + set $random $remote_addr; + if ($random ~ "^[0-9]$") { + set $upstream "blue-service:8080"; + } + if ($random ~ "^[a-f]$") { + set $upstream "green-service:8080"; + } + } +spec: + rules: + - host: myapp.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: blue-service + port: + number: 8080 +``` + +### **3. Geographic Routing** + +```yaml +# Route based on user location +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: geo-routing + annotations: + nginx.ingress.kubernetes.io/configuration-snippet: | + # Route US users to US cluster, EU users to EU cluster + set $upstream ""; + if ($http_x_forwarded_for ~ "^.*\.(us|ca)") { + set $upstream "us-service:8080"; + } + if ($http_x_forwarded_for ~ "^.*\.(eu|uk|de|fr)") { + set $upstream "eu-service:8080"; + } + if ($upstream = "") { + set $upstream "default-service:8080"; + } +spec: + rules: + - host: myapp.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: default-service + port: + number: 8080 +``` + +--- + +## πŸ”§ **Performance Optimizations** + +### **1. Rate Limiting** + +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: rate-limited-ingress + annotations: + nginx.ingress.kubernetes.io/rate-limit: "100" + nginx.ingress.kubernetes.io/rate-limit-window: "1m" + nginx.ingress.kubernetes.io/rate-limit-burst: "200" +spec: + rules: + - host: api.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: api-service + port: + number: 8080 +``` + +### **2. Caching Headers** + +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: cached-ingress + annotations: + nginx.ingress.kubernetes.io/configuration-snippet: | + location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg)$ { + expires 1y; + add_header Cache-Control "public, immutable"; + } +spec: + rules: + - host: myapp.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: myapp-service + port: + number: 80 +``` + +### **3. 
Gzip Compression** + +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: compressed-ingress + annotations: + nginx.ingress.kubernetes.io/enable-cors: "true" + nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS" + nginx.ingress.kubernetes.io/cors-allow-origin: "*" + nginx.ingress.kubernetes.io/configuration-snippet: | + gzip on; + gzip_types text/plain text/css application/json application/javascript text/xml application/xml application/xml+rss text/javascript; +spec: + rules: + - host: myapp.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: myapp-service + port: + number: 80 +``` + +--- + +## πŸ›‘οΈ **Security Patterns** + +### **1. IP Whitelisting** + +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: whitelisted-ingress + annotations: + nginx.ingress.kubernetes.io/whitelist-source-range: "10.0.0.0/8,192.168.0.0/16" +spec: + rules: + - host: internal.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: internal-service + port: + number: 8080 +``` + +### **2. Basic Auth** + +```yaml +# Create secret for basic auth +apiVersion: v1 +kind: Secret +metadata: + name: basic-auth-secret +type: Opaque +data: + auth: +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: auth-ingress + annotations: + nginx.ingress.kubernetes.io/auth-type: basic + nginx.ingress.kubernetes.io/auth-secret: basic-auth-secret +spec: + rules: + - host: protected.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: protected-service + port: + number: 8080 +``` + +### **3. CORS Configuration** + +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: cors-ingress + annotations: + nginx.ingress.kubernetes.io/enable-cors: "true" + nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS" + nginx.ingress.kubernetes.io/cors-allow-origin: "https://myapp.example.com" + nginx.ingress.kubernetes.io/cors-allow-credentials: "true" +spec: + rules: + - host: api.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: api-service + port: + number: 8080 +``` + +--- + +## πŸ” **Monitoring & Debugging** + +### **1. Ingress Status Check** + +```bash +# Check ingress status +kubectl get ingress --all-namespaces +kubectl describe ingress -n + +# Check ingress controller logs +kubectl logs -n ingress-nginx deployment/ingress-nginx-controller + +# Check certificate status +kubectl get certificates --all-namespaces +kubectl describe certificate -n +``` + +### **2. Test Redirects** + +```bash +# Test HTTP to HTTPS redirect +curl -I http://myapp.example.com +# Should return: 301 Moved Permanently + +# Test domain redirect +curl -I http://old.example.com +# Should return: 301 Moved Permanently + +# Test path redirect +curl -I http://myapp.example.com/old +# Should return: 301 Moved Permanently +``` + +### **3. Performance Testing** + +```bash +# Test response times +curl -w "@curl-format.txt" -o /dev/null -s "https://myapp.example.com" + +# Load testing +ab -n 1000 -c 10 https://myapp.example.com/ + +# SSL certificate check +openssl s_client -connect myapp.example.com:443 -servername myapp.example.com +``` + +--- + +## πŸ“š **Best Practices** + +### **1. 
Always Use HTTPS** +```yaml +# βœ… DO: Force HTTPS redirect +nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + +# ❌ DON'T: Allow HTTP traffic +# nginx.ingress.kubernetes.io/ssl-redirect: "false" +``` + +### **2. Use Specific Path Types** +```yaml +# βœ… DO: Use specific path types +pathType: Prefix # For /api/v1/* +pathType: Exact # For exact matches +pathType: ImplementationSpecific # For complex patterns + +# ❌ DON'T: Use default path types without understanding +``` + +### **3. Implement Health Checks** +```yaml +# βœ… DO: Add health check endpoints +nginx.ingress.kubernetes.io/health-check-path: "/health" +nginx.ingress.kubernetes.io/health-check-interval: "30s" +``` + +### **4. Monitor Resource Usage** +```bash +# Monitor ingress controller resources +kubectl top pods -n ingress-nginx + +# Monitor ingress metrics +kubectl get ingress --all-namespaces -o wide +``` + +--- + +## 🎯 **Your Current Setup Analysis** + +Based on your `Current_Ingress_Analysis.md`, you have: + +### **βœ… What's Working Well:** +- **Nginx Ingress Controller**: Production-grade setup +- **Cert-Manager**: Automatic SSL certificates +- **DNS01 Challenge**: Reliable certificate validation +- **Custom Controller**: Automated ingress creation +- **Performance Optimizations**: Timeouts and body size limits + +### **πŸ”§ Potential Improvements:** +1. **Rate Limiting**: Add rate limiting for API endpoints +2. **Caching**: Implement caching for static assets +3. **Monitoring**: Add ingress metrics and alerts +4. **Backup Ingress**: Consider secondary ingress controller + +--- + +**Last Updated**: September 3, 2025 +**Version**: 1.0 +**Maintainer**: Infrastructure Team diff --git a/docs/Kubernetes_Bootstrap_Guide.md b/docs/Kubernetes_Bootstrap_Guide.md new file mode 100644 index 00000000..3df2dd56 --- /dev/null +++ b/docs/Kubernetes_Bootstrap_Guide.md @@ -0,0 +1,433 @@ +# Kubernetes Bootstrap Guide + +## 🎯 **Overview** + +This guide explains how to bootstrap a complete Kubernetes cluster from scratch using Azure VMs and the `freeleaps-ops` repository. **Kubernetes does NOT create automatically** - you need to manually bootstrap the entire infrastructure. + +## πŸ“‹ **Prerequisites** + +### **1. Azure Infrastructure** +- βœ… Azure VMs (already provisioned) +- βœ… Network connectivity between VMs +- βœ… Azure AD tenant configured +- βœ… Resource group: `k8s` + +### **2. Local Environment** +- βœ… `freeleaps-ops` repository cloned +- βœ… Ansible installed (`pip install ansible`) +- βœ… Azure CLI installed and configured +- βœ… SSH access to VMs + +### **3. 
VM Requirements** +- **Master Nodes**: 2+ VMs for control plane +- **Worker Nodes**: 2+ VMs for workloads +- **Network**: All VMs in same subnet +- **OS**: Ubuntu 20.04+ recommended + +--- + +## πŸš€ **Step-by-Step Bootstrap Process** + +### **Step 1: Verify Azure VMs** + +```bash +# Check VM status +az vm list --resource-group k8s --query "[].{name:name,powerState:powerState,privateIP:privateIps}" -o table + +# Ensure all VMs are running +az vm start --resource-group k8s --name +``` + +### **Step 2: Configure Inventory** + +Edit the Ansible inventory file: + +```bash +cd freeleaps-ops +vim cluster/ansible/manifests/inventory.ini +``` + +**Example inventory structure:** +```ini +[all:vars] +ansible_user=wwwadmin@mathmast.com +ansible_ssh_common_args='-o StrictHostKeyChecking=no' + +[kube_control_plane] +prod-usw2-k8s-freeleaps-master-01 ansible_host=10.10.0.4 etcd_member_name=freeleaps-etcd-01 host_name=prod-usw2-k8s-freeleaps-master-01 +prod-usw2-k8s-freeleaps-master-02 ansible_host=10.10.0.5 etcd_member_name=freeleaps-etcd-02 host_name=prod-usw2-k8s-freeleaps-master-02 + +[kube_node] +prod-usw2-k8s-freeleaps-worker-nodes-01 ansible_host=10.10.0.6 host_name=prod-usw2-k8s-freeleaps-worker-nodes-01 +prod-usw2-k8s-freeleaps-worker-nodes-02 ansible_host=10.10.0.7 host_name=prod-usw2-k8s-freeleaps-worker-nodes-02 + +[etcd] +prod-usw2-k8s-freeleaps-master-01 +prod-usw2-k8s-freeleaps-master-02 + +[k8s_cluster:children] +kube_control_plane +kube_node +``` + +### **Step 3: Test Connectivity** + +```bash +cd cluster/ansible/manifests +ansible -i inventory.ini all -m ping -kK +``` + +### **Step 4: Bootstrap Kubernetes Cluster** + +```bash +cd ../../3rd/kubespray +ansible-playbook -i ../../cluster/ansible/manifests/inventory.ini ./cluster.yml -kK -b +``` + +**What this does:** +- Installs Docker/containerd on all nodes +- Downloads Kubernetes binaries (v1.31.4) +- Generates certificates and keys +- Bootstraps etcd cluster +- Starts Kubernetes control plane +- Joins worker nodes +- Configures Calico networking +- Sets up OIDC authentication + +### **Step 5: Get Kubeconfig** + +```bash +# Get kubeconfig from master node +ssh wwwadmin@mathmast.com@10.10.0.4 "sudo cat /etc/kubernetes/admin.conf" > ~/.kube/config + +# Test cluster access +kubectl get nodes +kubectl get pods -n kube-system +``` + +### **Step 6: Deploy Infrastructure** + +```bash +cd ../../cluster/manifests + +# Deploy in order +kubectl apply -f freeleaps-controls-system/ +kubectl apply -f freeleaps-devops-system/ +kubectl apply -f freeleaps-monitoring-system/ +kubectl apply -f freeleaps-logging-system/ +kubectl apply -f freeleaps-data-platform/ +``` + +### **Step 7: Setup Authentication** + +```bash +cd ../../cluster/bin +./freeleaps-cluster-authenticator auth +``` + +--- + +## πŸ€– **Automated Bootstrap Script** + +Use the provided bootstrap script for automated deployment: + +```bash +cd freeleaps-ops/docs +./bootstrap-k8s-cluster.sh +``` + +**Script Features:** +- βœ… Prerequisites verification +- βœ… Azure VM status check +- βœ… Connectivity testing +- βœ… Automated cluster bootstrap +- βœ… Infrastructure deployment +- βœ… Authentication setup +- βœ… Status verification + +**Usage Options:** +```bash +# Full bootstrap +./bootstrap-k8s-cluster.sh + +# Only verify prerequisites +./bootstrap-k8s-cluster.sh --verify + +# Only bootstrap cluster (skip infrastructure) +./bootstrap-k8s-cluster.sh --bootstrap +``` + +--- + +## πŸ”§ **Manual Bootstrap Commands** + +If you prefer manual control, here are the detailed commands: + +### **1. 
Install Prerequisites** + +```bash +# Install Ansible +pip install ansible + +# Install Azure CLI +curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + +# Install kubectl +curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" +sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl +``` + +### **2. Configure Azure** + +```bash +# Login to Azure +az login + +# Set subscription +az account set --subscription +``` + +### **3. Bootstrap Cluster** + +```bash +# Navigate to kubespray +cd freeleaps-ops/3rd/kubespray + +# Run cluster installation +ansible-playbook -i ../../cluster/ansible/manifests/inventory.ini ./cluster.yml -kK -b +``` + +### **4. Verify Installation** + +```bash +# Get kubeconfig +ssh wwwadmin@mathmast.com@ "sudo cat /etc/kubernetes/admin.conf" > ~/.kube/config + +# Test cluster +kubectl get nodes +kubectl get pods -n kube-system +``` + +--- + +## πŸ” **Verification Steps** + +### **1. Cluster Health** + +```bash +# Check nodes +kubectl get nodes -o wide + +# Check system pods +kubectl get pods -n kube-system + +# Check cluster info +kubectl cluster-info +``` + +### **2. Network Verification** + +```bash +# Check Calico pods +kubectl get pods -n kube-system | grep calico + +# Check network policies +kubectl get networkpolicies --all-namespaces +``` + +### **3. Authentication Test** + +```bash +# Test OIDC authentication +kubectl auth whoami + +# Check permissions +kubectl auth can-i --list +``` + +--- + +## 🚨 **Troubleshooting** + +### **Common Issues** + +#### **1. Ansible Connection Failed** +```bash +# Check VM status +az vm show --resource-group k8s --name --query "powerState" + +# Test SSH manually +ssh wwwadmin@mathmast.com@ + +# Check network security groups +az network nsg rule list --resource-group k8s --nsg-name +``` + +#### **2. Cluster Bootstrap Failed** +```bash +# Check Ansible logs +ansible-playbook -i inventory.ini cluster.yml -kK -b -vvv + +# Check VM resources +kubectl describe node + +# Check system pods +kubectl get pods -n kube-system +kubectl describe pod -n kube-system +``` + +#### **3. Infrastructure Deployment Failed** +```bash +# Check CRDs +kubectl get crd + +# Check operator pods +kubectl get pods --all-namespaces | grep operator + +# Check events +kubectl get events --all-namespaces --sort-by='.lastTimestamp' +``` + +### **Recovery Procedures** + +#### **If Bootstrap Fails** +1. **Clean up failed installation** +```bash +# Reset VMs to clean state +az vm restart --resource-group k8s --name +``` + +2. **Retry bootstrap** +```bash +cd freeleaps-ops/3rd/kubespray +ansible-playbook -i ../../cluster/ansible/manifests/inventory.ini ./cluster.yml -kK -b +``` + +#### **If Infrastructure Deployment Fails** +1. **Check prerequisites** +```bash +kubectl get nodes +kubectl get pods -n kube-system +``` + +2. **Redeploy components** +```bash +kubectl delete -f / +kubectl apply -f / +``` + +--- + +## πŸ“Š **Post-Bootstrap Verification** + +### **1. Core Components** + +```bash +# ArgoCD +kubectl get pods -n freeleaps-devops-system | grep argocd + +# Cert-manager +kubectl get pods -n freeleaps-controls-system | grep cert-manager + +# Prometheus/Grafana +kubectl get pods -n freeleaps-monitoring-system | grep prometheus +kubectl get pods -n freeleaps-monitoring-system | grep grafana + +# Logging +kubectl get pods -n freeleaps-logging-system | grep loki +``` + +### **2. 
Access Points** + +```bash +# ArgoCD UI +kubectl port-forward svc/argocd-server -n freeleaps-devops-system 8080:80 + +# Grafana UI +kubectl port-forward svc/kube-prometheus-stack-grafana -n freeleaps-monitoring-system 3000:80 + +# Kubernetes Dashboard +kubectl port-forward svc/kubernetes-dashboard-kong-proxy -n freeleaps-infra-system 8443:443 +``` + +### **3. Authentication Setup** + +```bash +# Setup user authentication +cd freeleaps-ops/cluster/bin +./freeleaps-cluster-authenticator auth + +# Test authentication +kubectl auth whoami +kubectl get nodes +``` + +--- + +## πŸ”’ **Security Considerations** + +### **1. Network Security** +- Ensure VMs are in private subnets +- Configure network security groups properly +- Use VPN or bastion host for access + +### **2. Access Control** +- Use Azure AD OIDC for authentication +- Implement RBAC for authorization +- Regular access reviews + +### **3. Monitoring** +- Enable audit logging +- Monitor cluster health +- Set up alerts + +--- + +## πŸ“š **Next Steps** + +### **1. Application Deployment** +- Deploy applications via ArgoCD +- Configure CI/CD pipelines +- Set up monitoring and alerting + +### **2. Maintenance** +- Regular security updates +- Backup etcd data +- Monitor resource usage + +### **3. Scaling** +- Add more worker nodes +- Configure auto-scaling +- Optimize resource allocation + +--- + +## πŸ†˜ **Support** + +### **Emergency Contacts** +- **Infrastructure Team**: [Contact Information] +- **Azure Support**: [Contact Information] +- **Kubernetes Community**: [Contact Information] + +### **Useful Commands** +```bash +# Cluster status +kubectl get nodes +kubectl get pods --all-namespaces + +# Logs +kubectl logs -n kube-system + +# Events +kubectl get events --all-namespaces --sort-by='.lastTimestamp' + +# Resource usage +kubectl top nodes +kubectl top pods --all-namespaces +``` + +--- + +**Last Updated**: September 3, 2025 +**Version**: 1.0 +**Maintainer**: Infrastructure Team diff --git a/docs/Kubernetes_Core_Concepts_Guide.md b/docs/Kubernetes_Core_Concepts_Guide.md new file mode 100644 index 00000000..8fcca1a3 --- /dev/null +++ b/docs/Kubernetes_Core_Concepts_Guide.md @@ -0,0 +1,840 @@ +# Kubernetes Core Concepts Guide + +## 🎯 **Overview** + +This guide explains the fundamental Kubernetes concepts: **Pods**, **Namespaces**, and **Persistent Volume Claims (PVCs)**. These are the building blocks of your applications in Kubernetes. + +--- + +## 🏭 **Your Codebase Usage Patterns** + +Before diving into the concepts, here's what your codebase actually uses: + +### **πŸ“‹ Deployment Methods Used in Your Codebase** +| Method | Used In Your Codebase | Example Location | +|--------|----------------------|------------------| +| **Helm Charts** | βœ… **Primary method** | `freeleaps-ops/freeleaps/helm-pkg/` | +| **kubectl apply** | βœ… **Secondary method** | `freeleaps-devops-reconciler/scripts/deploy.sh` | +| **kubectl run** | ❌ **Not used** | - | +| **Direct YAML** | βœ… **For simple resources** | `freeleaps-ops/cluster/manifests/` | + +### **πŸ”§ Your Actual Commands** +```bash +# Your codebase uses these patterns: +helm install/upgrade --namespace -f +kubectl apply -f / +kubectl get pods -n -l app.kubernetes.io/name= +``` + +--- + +## πŸ“¦ **1. Pods (The Smallest Unit)** + +### **What is a Pod?** + +A **Pod** is the smallest deployable unit in Kubernetes. Think of it as a "wrapper" that contains one or more containers. 
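For example, because all containers in a Pod share one network namespace, a sidecar container can reach the main container over `localhost`. A minimal illustrative sketch (the container and image names here are placeholders, not taken from the freeleaps charts):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: shared-network-demo
spec:
  containers:
    - name: web
      image: nginx:latest
      ports:
        - containerPort: 80
    - name: sidecar
      image: busybox:latest
      # Both containers share the Pod's network namespace,
      # so the sidecar can reach nginx on localhost:80.
      command: ["sh", "-c", "while true; do wget -qO- http://localhost:80 >/dev/null && echo reachable; sleep 10; done"]
```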
+ +### **Pod Characteristics** + +- **Atomic Unit**: Pods are created, scheduled, and destroyed together +- **Shared Network**: Containers in a Pod share the same IP address +- **Shared Storage**: Containers can share volumes +- **Lifecycle**: Pods are ephemeral (temporary) + +### **Pod Structure** + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: my-app-pod + namespace: default + labels: + app: my-app + version: v1 +spec: + containers: + - name: app-container + image: nginx:latest + ports: + - containerPort: 80 + resources: + requests: + memory: "64Mi" + cpu: "250m" + limits: + memory: "128Mi" + cpu: "500m" +``` + +### **Creating Pods** + +#### **Method 1: Direct Pod Creation** ❌ **BAD PRACTICE - NOT USED IN YOUR CODEBASE** +```bash +# ❌ BAD PRACTICE: This method is NOT used in your codebase (and shouldn't be!) +# Create a simple nginx pod +kubectl run nginx-pod --image=nginx:latest --port=80 + +# Why this is BAD: +# - Creates standalone Pods (no self-healing) +# - No scaling capability +# - No rolling updates +# - No rollback capability +# - No resource limits +# - Not declarative + +# βœ… GOOD PRACTICE: This method IS used in your codebase +# Create from YAML file +kubectl apply -f pod.yaml +``` + +#### **Method 2: Using YAML File** βœ… **GOOD PRACTICE - USED IN YOUR CODEBASE** +```yaml +# πŸ“š EDUCATIONAL EXAMPLE (not from your codebase) +# pod.yaml +apiVersion: v1 +kind: Pod +metadata: + name: web-app + labels: + app: web +spec: + containers: + - name: web + image: nginx:latest + ports: + - containerPort: 80 + env: + - name: ENVIRONMENT + value: "production" +``` + +#### **Method 3: Helm Charts** βœ… **BEST PRACTICE - PRIMARY METHOD IN YOUR CODEBASE** +```yaml +# 🏭 ACTUAL EXAMPLE FROM YOUR CODEBASE +# freeleaps-ops/freeleaps/helm-pkg/freeleaps/templates/freeleaps/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} + app.kubernetes.io/name: "freeleaps" + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/instance: {{ .Release.Name }} + name: "freeleaps" + namespace: {{ .Release.Namespace | quote }} +spec: + selector: + matchLabels: + app.kubernetes.io/name: "freeleaps" + app.kubernetes.io/instance: {{ .Release.Name }} + replicas: {{ .Values.freeleaps.replicas }} + template: + metadata: + labels: + app.kubernetes.io/name: "freeleaps" + app.kubernetes.io/instance: {{ .Release.Name }} + spec: + containers: + - name: "freeleaps" + image: "{{ .Values.freeleaps.image.registry }}/{{ .Values.freeleaps.image.repository }}:{{ .Values.freeleaps.image.tag }}" +``` + +### **🎯 Best Practices for Pod Creation** + +#### **❌ What NOT to Do** +```bash +# ❌ NEVER use kubectl run for production applications +kubectl run my-app --image=my-app:latest --port=8080 + +# ❌ NEVER create standalone Pods for services +kubectl run database --image=postgres:13 --port=5432 + +# ❌ NEVER use imperative commands for production +kubectl run nginx --image=nginx:latest +``` + +#### **βœ… What TO Do** +```bash +# βœ… Use Deployments for applications +kubectl create deployment my-app --image=my-app:latest + +# βœ… Use Helm charts for complex applications +helm install my-app ./my-app-chart --namespace my-app + +# βœ… Use kubectl apply for declarative deployments +kubectl apply -f deployment.yaml + +# βœ… Use StatefulSets for databases +kubectl apply -f statefulset.yaml +``` + +#### **πŸ”§ When `kubectl run` is Acceptable** +```bash +# βœ… OK: One-time debugging pods +kubectl run debug-pod --image=busybox --rm -it 
--restart=Never -- nslookup my-service + +# βœ… OK: Temporary testing +kubectl run test-pod --image=nginx --rm -it --restart=Never -- curl http://my-service:80 + +# βœ… OK: Quick experiments (development only) +kubectl run temp-pod --image=nginx --port=80 +``` + +### **Managing Pods** + +```bash +# List pods +kubectl get pods +kubectl get pods -n + +# Get detailed info +kubectl describe pod + +# View logs +kubectl logs +kubectl logs -f # Follow logs + +# Execute commands in pod +kubectl exec -it -- /bin/bash + +# Delete pod +kubectl delete pod +``` + +### **Pod Lifecycle** + +```bash +# Check pod status +kubectl get pods -o wide + +# Common statuses: +# - Pending: Pod is being scheduled +# - Running: Pod is running +# - Succeeded: Pod completed successfully +# - Failed: Pod failed +# - Unknown: Pod status unclear +``` + +--- + +## 🏒 **2. Namespaces (Logical Isolation)** + +### **What is a Namespace?** + +A **Namespace** is a way to divide cluster resources among multiple users, teams, or applications. It provides a scope for names. + +### **Namespace Benefits** + +- **Resource Isolation**: Separate resources logically +- **Access Control**: Different permissions per namespace +- **Resource Quotas**: Limit resource usage +- **Network Policies**: Control network traffic + +### **Default Namespaces** + +```bash +# View all namespaces +kubectl get namespaces + +# Default namespaces: +# - default: User resources +# - kube-system: System components +# - kube-public: Public resources +# - kube-node-lease: Node lease objects +``` + +### **Creating Namespaces** + +#### **Method 1: Command Line** βœ… **USED IN YOUR CODEBASE** +```bash +# βœ… This method IS used in your codebase +# Create namespace +kubectl create namespace my-app + +# βœ… This pattern IS used in your codebase +# Create with labels +kubectl create namespace my-app --dry-run=client -o yaml | \ + kubectl label --local -f - environment=production | \ + kubectl apply -f - +``` + +#### **Method 2: YAML File** βœ… **USED IN YOUR CODEBASE** +```yaml +# πŸ“š EDUCATIONAL EXAMPLE (not from your codebase) +# namespace.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: my-app + labels: + environment: production + team: backend +``` + +#### **Method 3: Helm Charts** βœ… **PRIMARY METHOD IN YOUR CODEBASE** +```yaml +# 🏭 ACTUAL EXAMPLE FROM YOUR CODEBASE +# Your Helm charts automatically create namespaces +# freeleaps-devops-reconciler/scripts/deploy.sh +HELM_CMD+=(--namespace "$NAMESPACE") + +# Create namespace if requested +if [[ "$CREATE_NAMESPACE" == "true" && "$UPGRADE" != "true" ]]; then + HELM_CMD+=(--create-namespace) +fi +``` + +### **Working with Namespaces** + +```bash +# Set default namespace +kubectl config set-context --current --namespace=my-app + +# Run command in specific namespace +kubectl get pods -n my-app + +# Create resource in namespace +kubectl run nginx --image=nginx -n my-app + +# Delete namespace (deletes all resources) +kubectl delete namespace my-app +``` + +### **Namespace Best Practices** + +```yaml +# πŸ“š EDUCATIONAL EXAMPLE (not from your codebase) +# Example: Production namespace setup +apiVersion: v1 +kind: Namespace +metadata: + name: production + labels: + environment: production + team: platform +--- +apiVersion: v1 +kind: ResourceQuota +metadata: + name: production-quota + namespace: production +spec: + hard: + requests.cpu: "4" + requests.memory: 8Gi + limits.cpu: "8" + limits.memory: 16Gi + pods: "20" +``` + +### **Your Actual Namespace Structure** +```bash +# 🏭 YOUR ACTUAL NAMESPACES +kubectl get 
namespaces + +# Your codebase uses these namespaces: +# - freeleaps-controls-system (ingress, cert-manager) +# - freeleaps-devops-system (ArgoCD) +# - freeleaps-prod (Gitea) +# - magicleaps (main application) +# - freeleaps-alpha (testing) +``` + +--- + +## πŸ’Ύ **3. Persistent Volume Claims (PVCs)** + +### **What is a PVC?** + +A **Persistent Volume Claim (PVC)** is a request for storage by a user. It's like a "storage reservation" that provides persistent storage to Pods. + +### **Storage Concepts** + +- **Persistent Volume (PV)**: The actual storage resource +- **Persistent Volume Claim (PVC)**: A request for storage +- **Storage Class**: Defines the type of storage + +### **PVC Structure** + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: my-app-storage + namespace: my-app +spec: + accessModes: + - ReadWriteOnce # Single node read/write + resources: + requests: + storage: 10Gi + storageClassName: managed-premium # Azure Premium SSD +``` + +### **Creating PVCs** + +#### **Method 1: Command Line** βœ… **USED IN YOUR CODEBASE** +```bash +# βœ… This method IS used in your codebase +# Create PVC +kubectl create -f pvc.yaml + +# βœ… This pattern IS used in your codebase +# Create with kubectl +kubectl apply -f - < + +# Delete PVC +kubectl delete pvc + +# Check storage classes +kubectl get storageclass +``` + +--- + +## πŸ”§ **4. Practical Examples** + +### **Example 1: Web Application with Database** + +```yaml +# πŸ“š EDUCATIONAL EXAMPLE (not from your codebase) +# namespace.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: webapp +--- +# database-pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: database-storage + namespace: webapp +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + storageClassName: managed-premium +--- +# database-pod.yaml +apiVersion: v1 +kind: Pod +metadata: + name: database + namespace: webapp + labels: + app: database +spec: + containers: + - name: postgres + image: postgres:13 + env: + - name: POSTGRES_DB + value: "myapp" + - name: POSTGRES_PASSWORD + value: "secret" + ports: + - containerPort: 5432 + volumeMounts: + - name: db-storage + mountPath: /var/lib/postgresql/data + volumes: + - name: db-storage + persistentVolumeClaim: + claimName: database-storage +--- +# webapp-pod.yaml +apiVersion: v1 +kind: Pod +metadata: + name: webapp + namespace: webapp + labels: + app: webapp +spec: + containers: + - name: webapp + image: my-webapp:latest + ports: + - containerPort: 8080 + env: + - name: DATABASE_URL + value: "postgresql://postgres:secret@database:5432/myapp" +``` + +### **Example 2: Multi-Container Pod** + +```yaml +# πŸ“š EDUCATIONAL EXAMPLE (not from your codebase) +apiVersion: v1 +kind: Pod +metadata: + name: app-with-sidecar + namespace: my-app +spec: + containers: + - name: main-app + image: my-app:latest + ports: + - containerPort: 8080 + volumeMounts: + - name: shared-data + mountPath: /app/data + - name: log-collector + image: fluentd:latest + volumeMounts: + - name: shared-data + mountPath: /logs + - name: config-volume + mountPath: /etc/fluentd + volumes: + - name: shared-data + emptyDir: {} + - name: config-volume + configMap: + name: fluentd-config +``` + +--- + +## πŸ› οΈ **5. 
Management Commands** + +### **Pod Management** + +```bash +# Create and manage pods +kubectl run nginx --image=nginx:latest --port=80 +kubectl get pods +kubectl describe pod nginx +kubectl logs nginx +kubectl exec -it nginx -- /bin/bash +kubectl delete pod nginx + +# Port forwarding +kubectl port-forward nginx 8080:80 + +# Copy files +kubectl cp local-file.txt nginx:/tmp/ +``` + +### **Namespace Management** + +```bash +# Create and manage namespaces +kubectl create namespace my-app +kubectl get namespaces +kubectl get pods -n my-app +kubectl config set-context --current --namespace=my-app +kubectl delete namespace my-app +``` + +### **PVC Management** + +```bash +# Create and manage PVCs +kubectl apply -f pvc.yaml +kubectl get pvc +kubectl describe pvc my-pvc +kubectl delete pvc my-pvc + +# Check storage usage +kubectl get pv +kubectl get storageclass +``` + +--- + +## πŸ“Š **6. Monitoring and Debugging** + +### **Pod Health Checks** + +```yaml +# πŸ“š EDUCATIONAL EXAMPLE (not from your codebase) +apiVersion: v1 +kind: Pod +metadata: + name: healthy-app +spec: + containers: + - name: app + image: my-app:latest + livenessProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 5 +``` + +### **Resource Monitoring** + +```bash +# Check resource usage +kubectl top pods +kubectl top nodes + +# Check events +kubectl get events --sort-by='.lastTimestamp' + +# Check pod status +kubectl get pods -o wide +kubectl describe pod +``` + +--- + +## πŸ”’ **7. Security Best Practices** + +### **Pod Security** + +```yaml +# πŸ“š EDUCATIONAL EXAMPLE (not from your codebase) +apiVersion: v1 +kind: Pod +metadata: + name: secure-app +spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 2000 + containers: + - name: app + image: my-app:latest + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL +``` + +### **Network Policies** + +```yaml +# πŸ“š EDUCATIONAL EXAMPLE (not from your codebase) +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: default-deny + namespace: my-app +spec: + podSelector: {} + policyTypes: + - Ingress + - Egress +``` + +--- + +## πŸ“š **8. Next Steps** + +### **Advanced Concepts** + +- **Deployments**: Manage Pod replicas +- **Services**: Expose Pods internally/externally +- **ConfigMaps & Secrets**: Configuration management +- **Jobs & CronJobs**: Batch processing +- **StatefulSets**: Stateful applications + +### **Best Practices** + +1. **Don't create Pods directly** - Use Deployments +2. **Use namespaces** for organization +3. **Set resource limits** on all containers +4. **Use health checks** for reliability +5. **Implement security contexts** +6. 
**Monitor resource usage** + +--- + +## πŸ†˜ **Troubleshooting** + +### **Common Issues** + +```bash +# Pod stuck in Pending +kubectl describe pod +kubectl get events --sort-by='.lastTimestamp' + +# PVC not bound +kubectl describe pvc +kubectl get pv + +# Namespace issues +kubectl get namespaces +kubectl describe namespace +``` + +### **Useful Commands** + +```bash +# Debug pod +kubectl logs +kubectl exec -it -- /bin/bash +kubectl describe pod + +# Check resources +kubectl get all -n +kubectl get pvc,pv -n +kubectl get events -n +``` + +--- + +## 🏭 **Appendix: Your Codebase Patterns** + +### **Your Actual Deployment Commands** +```bash +# 🏭 REAL COMMANDS FROM YOUR CODEBASE +# From freeleaps-devops-reconciler/scripts/deploy.sh + +# Helm deployment (primary method) +helm install/upgrade "$RELEASE_NAME" . \ + --namespace "$NAMESPACE" \ + --create-namespace \ + -f "$VALUES_FILE" \ + --set "image.tag=$IMAGE_TAG" + +# kubectl apply (secondary method) +kubectl apply -f / + +# Status checking +kubectl get pods -n "$NAMESPACE" -l "app.kubernetes.io/name=freeleaps-devops-reconciler" +kubectl logs -n "$NAMESPACE" deployment/"$RELEASE_NAME" +``` + +### **Your Actual Namespace Structure** +```bash +# 🏭 YOUR REAL NAMESPACES +kubectl get namespaces + +# Production namespaces: +# - freeleaps-controls-system (ingress, cert-manager) +# - freeleaps-devops-system (ArgoCD) +# - freeleaps-prod (Gitea) +# - magicleaps (main application) +# - freeleaps-alpha (testing) +``` + +### **Your Actual Storage Classes** +```bash +# 🏭 YOUR REAL STORAGE CLASSES +kubectl get storageclass + +# Azure storage classes used: +# - managed-premium (SSD) +# - managed-standard (HDD) +# - azure-disk-std-lrs (standard disk) +``` + +### **Your Actual Resource Naming Conventions** +```yaml +# 🏭 YOUR REAL NAMING PATTERNS +# From freeleaps-service-hub deployment guidelines + +# Resource naming: {APP_NAME}-{RESOURCE_NAME} +# Examples: +# - payment-deployment +# - payment-service +# - payment-configmap + +# Namespace: same as repository name +# Examples: +# - freeleaps-service-hub +# - freeleaps-ops +# - magicleaps +``` + +--- + +**Last Updated**: September 3, 2025 +**Version**: 1.0 +**Maintainer**: Infrastructure Team diff --git a/docs/Kubernetes_Fundamentals_For_Junior_Engineers.md b/docs/Kubernetes_Fundamentals_For_Junior_Engineers.md new file mode 100644 index 00000000..c56441d8 --- /dev/null +++ b/docs/Kubernetes_Fundamentals_For_Junior_Engineers.md @@ -0,0 +1,832 @@ +# Kubernetes Fundamentals for Junior Engineers + +## 🎯 **Overview** + +This guide is designed for junior engineers starting their DevOps journey. It covers the essential Kubernetes concepts you'll encounter daily, with practical examples and real-world scenarios. + +--- + +## πŸ“‹ **Prerequisites** + +Before diving into these concepts, make sure you understand: +- βœ… **Pods**: Basic container units +- βœ… **Namespaces**: Resource organization +- βœ… **PVCs**: Persistent storage +- βœ… **Basic kubectl commands** + +--- + +## πŸš€ **1. 
Deployments (The Right Way to Run Apps)** + +### **Why Deployments?** + +**Never create Pods directly!** Deployments are the standard way to run applications because they provide: +- **Replicas**: Run multiple copies of your app +- **Rolling updates**: Zero-downtime deployments +- **Rollback**: Easy recovery from failed deployments +- **Self-healing**: Automatically restart failed pods + +### **Deployment Structure** + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: web-app + namespace: my-app + labels: + app: web-app + version: v1 +spec: + replicas: 3 # Run 3 copies + selector: + matchLabels: + app: web-app + template: + metadata: + labels: + app: web-app + version: v1 + spec: + containers: + - name: web-app + image: nginx:latest + ports: + - containerPort: 80 + resources: + requests: + memory: "64Mi" + cpu: "250m" + limits: + memory: "128Mi" + cpu: "500m" +``` + +### **Managing Deployments** + +```bash +# Create deployment +kubectl apply -f deployment.yaml + +# Check deployment status +kubectl get deployments +kubectl describe deployment web-app + +# Scale deployment +kubectl scale deployment web-app --replicas=5 + +# Update deployment (change image) +kubectl set image deployment/web-app web-app=nginx:1.21 + +# Rollback to previous version +kubectl rollout undo deployment/web-app + +# Check rollout status +kubectl rollout status deployment/web-app + +# View rollout history +kubectl rollout history deployment/web-app +``` + +--- + +## 🌐 **2. Services (Network Communication)** + +### **Why Services?** + +Pods are ephemeral (temporary). Services provide: +- **Stable IP addresses** for your applications +- **Load balancing** across multiple pods +- **Service discovery** within the cluster +- **External access** to your applications + +### **Service Types** + +#### **ClusterIP (Internal Access)** +```yaml +apiVersion: v1 +kind: Service +metadata: + name: web-app-service + namespace: my-app +spec: + type: ClusterIP + selector: + app: web-app + ports: + - port: 80 + targetPort: 80 + protocol: TCP +``` + +#### **NodePort (External Access via Node)** +```yaml +apiVersion: v1 +kind: Service +metadata: + name: web-app-nodeport + namespace: my-app +spec: + type: NodePort + selector: + app: web-app + ports: + - port: 80 + targetPort: 80 + nodePort: 30080 # Access via node IP:30080 + protocol: TCP +``` + +#### **LoadBalancer (Cloud Load Balancer)** +```yaml +apiVersion: v1 +kind: Service +metadata: + name: web-app-lb + namespace: my-app +spec: + type: LoadBalancer + selector: + app: web-app + ports: + - port: 80 + targetPort: 80 + protocol: TCP +``` + +### **Managing Services** + +```bash +# Create service +kubectl apply -f service.yaml + +# List services +kubectl get services +kubectl get svc + +# Get service details +kubectl describe service web-app-service + +# Test service connectivity +kubectl run test-pod --image=busybox --rm -it --restart=Never -- wget -O- web-app-service:80 + +# Port forward for testing +kubectl port-forward service/web-app-service 8080:80 +``` + +--- + +## πŸ”§ **3. ConfigMaps & Secrets (Configuration Management)** + +### **Why ConfigMaps & Secrets?** + +Applications need configuration. 
These provide: +- **Environment-specific settings** (dev, staging, prod) +- **Secure credential storage** +- **Configuration without rebuilding images** +- **Centralized configuration management** + +### **ConfigMaps** + +#### **Creating ConfigMaps** + +```bash +# From literal values +kubectl create configmap app-config \ + --from-literal=DB_HOST=postgres-service \ + --from-literal=DB_PORT=5432 \ + --from-literal=ENVIRONMENT=production + +# From file +kubectl create configmap app-config --from-file=config.properties + +# From YAML +kubectl apply -f configmap.yaml +``` + +#### **ConfigMap YAML** +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: app-config + namespace: my-app +data: + # Simple key-value pairs + DB_HOST: "postgres-service" + DB_PORT: "5432" + ENVIRONMENT: "production" + + # File-like content + config.properties: | + server.port=8080 + logging.level=INFO + cache.enabled=true +``` + +#### **Using ConfigMaps in Pods** +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: web-app +spec: + template: + spec: + containers: + - name: web-app + image: my-app:latest + env: + # Environment variables + - name: DB_HOST + valueFrom: + configMapKeyRef: + name: app-config + key: DB_HOST + - name: DB_PORT + valueFrom: + configMapKeyRef: + name: app-config + key: DB_PORT + volumeMounts: + # Mount as files + - name: config-volume + mountPath: /app/config + volumes: + - name: config-volume + configMap: + name: app-config +``` + +### **Secrets** + +#### **Creating Secrets** + +```bash +# From literal values +kubectl create secret generic db-secret \ + --from-literal=DB_USERNAME=admin \ + --from-literal=DB_PASSWORD=secret123 + +# From file +kubectl create secret generic tls-secret \ + --from-file=tls.crt=cert.pem \ + --from-file=tls.key=key.pem +``` + +#### **Secret YAML** +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: db-secret + namespace: my-app +type: Opaque +data: + # Base64 encoded values + DB_USERNAME: YWRtaW4= # admin + DB_PASSWORD: c2VjcmV0MTIz # secret123 +``` + +#### **Using Secrets in Pods** +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: web-app +spec: + template: + spec: + containers: + - name: web-app + image: my-app:latest + env: + - name: DB_USERNAME + valueFrom: + secretKeyRef: + name: db-secret + key: DB_USERNAME + - name: DB_PASSWORD + valueFrom: + secretKeyRef: + name: db-secret + key: DB_PASSWORD +``` + +--- + +## 🎯 **4. 
Ingress (External Access & Routing)** + +### **Why Ingress?** + +Ingress provides: +- **URL-based routing** (example.com/api, example.com/web) +- **SSL/TLS termination** +- **Load balancing** +- **Name-based virtual hosting** + +### **Ingress Controller** + +First, ensure you have an Ingress controller (like nginx-ingress): + +```bash +# Check if ingress controller exists +kubectl get pods -n ingress-nginx + +# If not, install nginx-ingress +kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/controller-v1.8.2/deploy/static/provider/cloud/deploy.yaml +``` + +### **Ingress Resource** + +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: web-app-ingress + namespace: my-app + annotations: + nginx.ingress.kubernetes.io/rewrite-target: / + cert-manager.io/cluster-issuer: "letsencrypt-prod" +spec: + tls: + - hosts: + - myapp.example.com + secretName: myapp-tls + rules: + - host: myapp.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: web-app-service + port: + number: 80 + - path: /api + pathType: Prefix + backend: + service: + name: api-service + port: + number: 8080 +``` + +### **Managing Ingress** + +```bash +# Apply ingress +kubectl apply -f ingress.yaml + +# Check ingress status +kubectl get ingress +kubectl describe ingress web-app-ingress + +# Test ingress +curl -H "Host: myapp.example.com" http://your-cluster-ip/ +``` + +--- + +## πŸ”„ **5. Jobs & CronJobs (Batch Processing)** + +### **Why Jobs & CronJobs?** + +For tasks that need to: +- **Run to completion** (not continuously) +- **Execute on schedule** (daily backups, reports) +- **Process data** (ETL jobs, batch processing) + +### **Jobs** + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: data-processing-job + namespace: my-app +spec: + completions: 3 # Run 3 times + parallelism: 2 # Run 2 in parallel + template: + spec: + containers: + - name: data-processor + image: data-processor:latest + command: ["python", "process_data.py"] + env: + - name: INPUT_FILE + value: "/data/input.csv" + - name: OUTPUT_FILE + value: "/data/output.csv" + volumeMounts: + - name: data-volume + mountPath: /data + volumes: + - name: data-volume + persistentVolumeClaim: + claimName: data-pvc + restartPolicy: Never +``` + +### **CronJobs** + +```yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: daily-backup + namespace: my-app +spec: + schedule: "0 2 * * *" # Daily at 2 AM + jobTemplate: + spec: + template: + spec: + containers: + - name: backup + image: backup-tool:latest + command: ["/bin/bash", "-c"] + args: + - | + echo "Starting backup at $(date)" + pg_dump -h postgres-service -U admin mydb > /backup/backup-$(date +%Y%m%d).sql + echo "Backup completed at $(date)" + env: + - name: PGPASSWORD + valueFrom: + secretKeyRef: + name: db-secret + key: DB_PASSWORD + volumeMounts: + - name: backup-volume + mountPath: /backup + volumes: + - name: backup-volume + persistentVolumeClaim: + claimName: backup-pvc + restartPolicy: OnFailure +``` + +### **Managing Jobs & CronJobs** + +```bash +# Create job +kubectl apply -f job.yaml + +# Check job status +kubectl get jobs +kubectl describe job data-processing-job + +# View job logs +kubectl logs job/data-processing-job + +# Create cronjob +kubectl apply -f cronjob.yaml + +# Check cronjob status +kubectl get cronjobs +kubectl describe cronjob daily-backup + +# Suspend cronjob +kubectl patch cronjob daily-backup -p '{"spec" : {"suspend" : true}}' + +# Resume cronjob +kubectl patch cronjob daily-backup -p '{"spec" : 
{"suspend" : false}}' +``` + +--- + +## πŸ“Š **6. Resource Management & Limits** + +### **Why Resource Management?** + +To prevent: +- **Resource starvation** (one app consuming all CPU/memory) +- **Node failures** (out of memory) +- **Poor performance** (over-subscription) + +### **Resource Requests & Limits** + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: resource-managed-app +spec: + template: + spec: + containers: + - name: app + image: my-app:latest + resources: + requests: + memory: "64Mi" # Minimum guaranteed + cpu: "250m" # 0.25 CPU cores + limits: + memory: "128Mi" # Maximum allowed + cpu: "500m" # 0.5 CPU cores +``` + +### **Resource Quotas** + +```yaml +apiVersion: v1 +kind: ResourceQuota +metadata: + name: namespace-quota + namespace: my-app +spec: + hard: + requests.cpu: "4" # 4 CPU cores total + requests.memory: 8Gi # 8GB memory total + limits.cpu: "8" # 8 CPU cores max + limits.memory: 16Gi # 16GB memory max + pods: "20" # 20 pods max + services: "10" # 10 services max + persistentvolumeclaims: "10" # 10 PVCs max +``` + +### **Managing Resources** + +```bash +# Check resource usage +kubectl top pods +kubectl top nodes + +# Check quotas +kubectl get resourcequota +kubectl describe resourcequota namespace-quota + +# Check resource requests/limits +kubectl describe pod | grep -A 10 "Limits" +``` + +--- + +## πŸ” **7. Monitoring & Debugging** + +### **Essential Commands** + +```bash +# Check cluster health +kubectl get nodes +kubectl get pods --all-namespaces + +# Check specific resources +kubectl get deployments,services,pods -n my-app + +# View logs +kubectl logs +kubectl logs -f # Follow logs +kubectl logs --previous # Previous container + +# Execute commands in pods +kubectl exec -it -- /bin/bash +kubectl exec -- ls /app + +# Port forwarding for debugging +kubectl port-forward 8080:80 +kubectl port-forward service/ 8080:80 + +# Check events +kubectl get events --sort-by='.lastTimestamp' +kubectl get events -n my-app + +# Check resource usage +kubectl top pods +kubectl top nodes +``` + +### **Common Debugging Scenarios** + +#### **Pod Stuck in Pending** +```bash +# Check why pod can't be scheduled +kubectl describe pod + +# Check node resources +kubectl describe node + +# Check events +kubectl get events --sort-by='.lastTimestamp' +``` + +#### **Pod Crashing** +```bash +# Check pod status +kubectl get pods +kubectl describe pod + +# Check logs +kubectl logs +kubectl logs --previous + +# Check resource usage +kubectl top pod +``` + +#### **Service Not Working** +```bash +# Check service endpoints +kubectl get endpoints + +# Check service configuration +kubectl describe service + +# Test service connectivity +kubectl run test-pod --image=busybox --rm -it --restart=Never -- wget -O- : +``` + +--- + +## πŸ”’ **8. 
Security Best Practices** + +### **Pod Security** + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: secure-app +spec: + template: + spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 2000 + containers: + - name: app + image: my-app:latest + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + volumeMounts: + - name: tmp-volume + mountPath: /tmp + volumes: + - name: tmp-volume + emptyDir: {} +``` + +### **Network Policies** + +```yaml +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: default-deny + namespace: my-app +spec: + podSelector: {} + policyTypes: + - Ingress + - Egress +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-web-traffic + namespace: my-app +spec: + podSelector: + matchLabels: + app: web-app + policyTypes: + - Ingress + ingress: + - from: + - namespaceSelector: + matchLabels: + name: frontend + ports: + - protocol: TCP + port: 80 +``` + +--- + +## πŸ“š **9. Best Practices for Junior Engineers** + +### **1. Always Use Deployments (Not Pods)** +```bash +# ❌ Don't do this +kubectl run nginx --image=nginx + +# βœ… Do this +kubectl create deployment nginx --image=nginx +``` + +### **2. Use Namespaces for Organization** +```bash +# Create namespaces for different environments +kubectl create namespace development +kubectl create namespace staging +kubectl create namespace production +``` + +### **3. Set Resource Limits** +```yaml +resources: + requests: + memory: "64Mi" + cpu: "250m" + limits: + memory: "128Mi" + cpu: "500m" +``` + +### **4. Use Health Checks** +```yaml +livenessProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 30 + periodSeconds: 10 +readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 5 +``` + +### **5. Use Labels and Selectors** +```yaml +metadata: + labels: + app: web-app + version: v1 + environment: production + team: backend +``` + +### **6. Use ConfigMaps and Secrets** +```bash +# Store configuration externally +kubectl create configmap app-config --from-literal=DB_HOST=postgres +kubectl create secret generic db-secret --from-literal=DB_PASSWORD=secret123 +``` + +--- + +## πŸš€ **10. Next Steps** + +### **Advanced Concepts to Learn** +- **StatefulSets**: For stateful applications (databases) +- **DaemonSets**: For node-level services (monitoring agents) +- **Horizontal Pod Autoscaler (HPA)**: Automatic scaling +- **Vertical Pod Autoscaler (VPA)**: Resource optimization +- **Pod Disruption Budgets**: Availability guarantees +- **Pod Security Standards**: Security policies + +### **Tools to Master** +- **Helm**: Package manager for Kubernetes +- **Kustomize**: Configuration management +- **ArgoCD**: GitOps deployment +- **Prometheus & Grafana**: Monitoring +- **Fluentd/Elasticsearch**: Logging + +### **Practice Projects** +1. **Simple Web App**: Deploy nginx with database +2. **API Service**: Deploy REST API with authentication +3. **Batch Job**: Create data processing pipeline +4. **Monitoring Stack**: Deploy Prometheus + Grafana +5. 
**CI/CD Pipeline**: Automate deployments + +--- + +## πŸ†˜ **Troubleshooting Quick Reference** + +### **Common Issues & Solutions** + +| Issue | Command | What to Check | +|-------|---------|---------------| +| Pod not starting | `kubectl describe pod ` | Events, resource limits | +| Service not working | `kubectl get endpoints ` | Pod labels, service selector | +| Deployment stuck | `kubectl rollout status deployment/` | Image pull, resource limits | +| Ingress not working | `kubectl describe ingress ` | Ingress controller, TLS | +| High resource usage | `kubectl top pods` | Resource limits, memory leaks | + +### **Useful Aliases** +```bash +# Add to your .bashrc or .zshrc +alias k='kubectl' +alias kg='kubectl get' +alias kd='kubectl describe' +alias kl='kubectl logs' +alias ke='kubectl exec -it' +alias kp='kubectl port-forward' +``` + +--- + +**Last Updated**: September 3, 2025 +**Version**: 1.0 +**Maintainer**: Infrastructure Team diff --git a/docs/PVC_Deep_Dive_Guide.md b/docs/PVC_Deep_Dive_Guide.md new file mode 100644 index 00000000..2b850b47 --- /dev/null +++ b/docs/PVC_Deep_Dive_Guide.md @@ -0,0 +1,608 @@ +# PVC Deep Dive Guide: Understanding Persistent Storage in Kubernetes + +## 🎯 **Overview** + +This guide explains **Persistent Volume Claims (PVCs)** in detail, why they're essential, and how your current Kubernetes setup uses them. PVCs are crucial for applications that need to store data that survives pod restarts, crashes, or migrations. + +--- + +## πŸ“Š **How PVCs Work: Visual Explanation** + +### **πŸ”„ PVC Lifecycle Flow** + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ PVC LIFECYCLE β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ DEVELOPER β”‚ β”‚ PVC β”‚ β”‚ PV β”‚ β”‚ STORAGE β”‚ β”‚ +β”‚ β”‚ Creates β”‚ β”‚ Requests β”‚ β”‚ Provides β”‚ β”‚ Backend β”‚ β”‚ +β”‚ β”‚ PVC β”‚ β”‚ Storage β”‚ β”‚ Storage β”‚ β”‚ (Azure) β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ 1. Create PVC β”‚ β”‚ β”‚ β”‚ +β”‚ │───────────────▢│ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ 2. Find PV β”‚ β”‚ β”‚ +β”‚ β”‚ │───────────────▢│ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ 3. Provision β”‚ β”‚ +β”‚ β”‚ β”‚ │───────────────▢│ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ 4. Create Disk β”‚ +β”‚ β”‚ β”‚ β”‚ │◀───────────────│ +β”‚ β”‚ β”‚ β”‚ 5. Bind PV β”‚ β”‚ +β”‚ β”‚ β”‚ │◀───────────────│ β”‚ +β”‚ β”‚ β”‚ 6. Bind PVC β”‚ β”‚ β”‚ +β”‚ β”‚ │◀───────────────│ β”‚ β”‚ +β”‚ β”‚ 7. 
Ready β”‚ β”‚ β”‚ β”‚ +β”‚ │◀───────────────│ β”‚ β”‚ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +### **πŸ—οΈ Storage Architecture** + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ STORAGE ARCHITECTURE β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ KUBERNETES CLUSTER β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ β”‚ POD 1 β”‚ β”‚ POD 2 β”‚ β”‚ POD 3 β”‚ β”‚ POD 4 β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ Volume β”‚ β”‚ β”‚ β”‚ Volume β”‚ β”‚ β”‚ β”‚ Volume β”‚ β”‚ β”‚ β”‚ Volume β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ Mount β”‚ β”‚ β”‚ β”‚ Mount β”‚ β”‚ β”‚ β”‚ Mount β”‚ β”‚ β”‚ β”‚ Mount β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ β”‚ PVCs β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ PVC: gitea β”‚ β”‚ PVC: mongo β”‚ β”‚ PVC: logs β”‚ β”‚ PVC: jenkinsβ”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ 15Gi β”‚ β”‚ 8Gi β”‚ β”‚ 1Gi β”‚ β”‚ 50Gi β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ RWO β”‚ β”‚ RWO β”‚ β”‚ RWO β”‚ β”‚ RWO β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ β”‚ PVs β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ PV: gitea β”‚ β”‚ PV: mongo β”‚ β”‚ PV: logs β”‚ β”‚ PV: jenkins β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ 15Gi β”‚ β”‚ 8Gi β”‚ β”‚ 1Gi β”‚ β”‚ 50Gi β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ azure-disk β”‚ β”‚ azure-disk β”‚ β”‚ azure-disk β”‚ β”‚ azure-disk β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ +β”‚ β–Ό β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ AZURE STORAGE BACKEND β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ β”‚ Managed Diskβ”‚ β”‚ Managed Diskβ”‚ β”‚ Managed Diskβ”‚ β”‚ Managed Diskβ”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ 15Gi SSD β”‚ β”‚ 8Gi SSD β”‚ β”‚ 1Gi SSD β”‚ β”‚ 50Gi SSD β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ Premium β”‚ β”‚ Premium β”‚ β”‚ Standard β”‚ β”‚ Standard β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +## πŸ€” **Why Each Pod Needs PVC: The Data Persistence Problem** + +### **❌ Without PVC: Data Loss Scenario** + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ WITHOUT PVC (BAD) β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ POD 1 β”‚ β”‚ POD 2 β”‚ β”‚ POD 3 β”‚ β”‚ POD 4 β”‚ β”‚ +β”‚ β”‚ nginx:latestβ”‚ β”‚ nginx:latestβ”‚ β”‚ 
nginx:latestβ”‚ β”‚ nginx:latestβ”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ β”‚ /tmp β”‚ β”‚ β”‚ β”‚ /tmp β”‚ β”‚ β”‚ β”‚ /tmp β”‚ β”‚ β”‚ β”‚ /tmp β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ (temp) β”‚ β”‚ β”‚ β”‚ (temp) β”‚ β”‚ β”‚ β”‚ (temp) β”‚ β”‚ β”‚ β”‚ (temp) β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ +β”‚ πŸ”„ Pod Restart/Delete β†’ ❌ ALL DATA LOST β”‚ +β”‚ β”‚ +β”‚ ❌ User uploads gone β”‚ +β”‚ ❌ Database files gone β”‚ +β”‚ ❌ Configuration gone β”‚ +β”‚ ❌ Logs gone β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +### **βœ… With PVC: Data Persistence** + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ WITH PVC (GOOD) β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ POD 1 β”‚ β”‚ POD 2 β”‚ β”‚ POD 3 β”‚ β”‚ POD 4 β”‚ β”‚ +β”‚ β”‚ nginx:latestβ”‚ β”‚ nginx:latestβ”‚ β”‚ nginx:latestβ”‚ β”‚ nginx:latestβ”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ β”‚ /data β”‚ β”‚ β”‚ β”‚ /data β”‚ β”‚ β”‚ β”‚ /data β”‚ β”‚ β”‚ β”‚ /data β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ (PVC) β”‚ β”‚ β”‚ β”‚ (PVC) β”‚ β”‚ β”‚ β”‚ (PVC) β”‚ β”‚ β”‚ β”‚ (PVC) β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ SHARED STORAGE β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ β”‚ πŸ“ /data β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ user-uploads/ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ database/ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ config/ β”‚ β”‚ 
β”‚ +β”‚ β”‚ β”‚ └── πŸ“„ logs/ β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ +β”‚ πŸ”„ Pod Restart/Delete β†’ βœ… DATA PERSISTS β”‚ +β”‚ β”‚ +β”‚ βœ… User uploads preserved β”‚ +β”‚ βœ… Database files preserved β”‚ +β”‚ βœ… Configuration preserved β”‚ +β”‚ βœ… Logs preserved β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +## 🏭 **Your Current Kubernetes Setup: PVC Analysis** + +### **πŸ“Š Your Actual PVC Usage** + +Based on your codebase analysis, here's how PVCs are currently used: + +#### **1. Gitea (Git Repository)** +```yaml +# 🏭 ACTUAL CONFIGURATION FROM YOUR CODEBASE +# freeleaps-ops/freeleaps/helm-pkg/3rd/gitea/values.prod.yaml +persistence: + enabled: true + create: true + mount: true + claimName: gitea-shared-storage + size: 15Gi + accessModes: + - ReadWriteOnce + storageClass: azure-disk-std-lrs + annotations: + helm.sh/resource-policy: keep +``` + +**What this means:** +- βœ… **Gitea uses PVC** for storing repositories, user data, and configuration +- βœ… **15GB storage** allocated for Git repositories and user data +- βœ… **Azure Standard Disk** (cost-effective for this use case) +- βœ… **ReadWriteOnce** - only one pod can access at a time +- βœ… **Data persists** when Gitea pod restarts + +#### **2. MongoDB (Database)** +```yaml +# 🏭 ACTUAL CONFIGURATION FROM YOUR CODEBASE +# freeleaps-ops/freeleaps/helm-pkg/3rd/mongo/values.yaml +persistence: + enabled: true + size: 8Gi + accessModes: + - ReadWriteOnce + storageClass: "" # Uses default Azure storage class +``` + +**What this means:** +- βœ… **MongoDB uses PVC** for database files +- βœ… **8GB storage** for database data +- βœ… **Data persists** when MongoDB pod restarts +- βœ… **Critical for data integrity** + +#### **3. Jenkins (CI/CD)** +```yaml +# 🏭 ACTUAL CONFIGURATION FROM YOUR CODEBASE +# freeleaps-ops/cluster/manifests/freeleaps-devops-system/jenkins/values.yaml +persistence: + enabled: true + storageClass: azure-blob-fuse-2-std-lrs + accessMode: "ReadWriteOnce" + size: "50Gi" +``` + +**What this means:** +- βœ… **Jenkins uses PVC** for build artifacts, workspace data +- βœ… **50GB storage** for build history and artifacts +- βœ… **Azure Blob Storage** (cost-effective for large files) +- βœ… **Build history preserved** across pod restarts + +#### **4. Central Storage (Logs)** +```yaml +# 🏭 ACTUAL CONFIGURATION FROM YOUR CODEBASE +# freeleaps-ops/freeleaps/helm-pkg/centralStorage/templates/central-storage/pvc.yaml +persistence: + enabled: true + size: 1Gi + accessModes: + - ReadWriteOnce +``` + +**What this means:** +- βœ… **Central storage uses PVC** for log ingestion +- βœ… **1GB storage** for log processing +- βœ… **Logs preserved** during processing + +### **πŸ“‹ PVC Usage Summary** + +| Application | PVC Name | Size | Storage Class | Purpose | Critical? 
| +|-------------|----------|------|---------------|---------|-----------| +| **Gitea** | `gitea-shared-storage` | 15Gi | `azure-disk-std-lrs` | Git repositories, user data | πŸ”΄ **Critical** | +| **MongoDB** | `mongodb-datadir` | 8Gi | Default | Database files | πŸ”΄ **Critical** | +| **Jenkins** | `jenkins-pvc` | 50Gi | `azure-blob-fuse-2-std-lrs` | Build artifacts, workspace | 🟑 **Important** | +| **Central Storage** | `central-storage-logs-pvc` | 1Gi | Default | Log processing | 🟒 **Nice to have** | + +--- + +## πŸ€·β€β™‚οΈ **Does Each Pod Need PVC? NO!** + +### **❌ Common Misconception** + +**"Every pod needs a PVC"** - This is **WRONG**! + +### **βœ… Reality: PVCs Are Optional** + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ PVC DECISION TREE β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ DOES YOUR APP NEED PERSISTENT DATA? β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ β”‚ YES β”‚ β”‚ NO β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ USE β”‚ β”‚ β”‚ β”‚ DON'T β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ PVC β”‚ β”‚ β”‚ β”‚ USE β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ PVC β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ Examples: β”‚ β”‚ +β”‚ β”‚ β€’ Databases (PostgreSQL, MongoDB) β”‚ β”‚ +β”‚ β”‚ β€’ File storage (Gitea, Jenkins) β”‚ β”‚ +β”‚ β”‚ β€’ Application data (user uploads) β”‚ β”‚ +β”‚ β”‚ β€’ Logs (if you want to keep them) β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ Examples: β”‚ β”‚ +β”‚ β”‚ β€’ Web servers (nginx, static content) β”‚ β”‚ +β”‚ β”‚ β€’ API servers (stateless applications) β”‚ β”‚ +β”‚ β”‚ β€’ Cache servers (Redis, Memcached) β”‚ β”‚ +β”‚ β”‚ β€’ Load balancers β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +### **πŸ“Š Your Current Setup Analysis** + +Looking at your applications: + +#### **βœ… Applications WITH PVCs (Need Persistent Data)** +- **Gitea**: Git repositories, user data, configuration +- **MongoDB**: Database files +- **Jenkins**: Build artifacts, workspace data +- **Central Storage**: Log processing + +#### **❌ Applications WITHOUT PVCs (Stateless)** +- **Nginx Ingress Controller**: Stateless routing +- **ArgoCD**: GitOps configuration (stored in Git) +- **Cert-manager**: Certificate management (stateless) +- **Prometheus/Grafana**: Metrics (can use PVC for data retention) + +--- + +## 🎯 **PVC Considerations: When to Use Them** + 
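+Before deciding for a new workload, it helps to check which of your existing pods actually mount PVCs today. One read-only way to do that (a sketch, assuming your current `kubectl` context points at the cluster) is:
+
+```bash
+# Print namespace, pod name, and any PVC claim names each pod mounts,
+# then keep only the pods that actually reference a claim
+kubectl get pods -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.volumes[*].persistentVolumeClaim.claimName}{"\n"}{end}' \
+  | awk -F'\t' '$3 != ""'
+```
+
+Pods that do not appear in this output run purely on ephemeral storage - exactly what you want for stateless workloads.
+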
+### **βœ… Use PVCs When:** + +#### **1. Database Applications** +```yaml +# Database needs persistent storage +apiVersion: apps/v1 +kind: Deployment +metadata: + name: postgres +spec: + template: + spec: + containers: + - name: postgres + image: postgres:13 + volumeMounts: + - name: db-storage + mountPath: /var/lib/postgresql/data + volumes: + - name: db-storage + persistentVolumeClaim: + claimName: postgres-pvc +``` + +#### **2. File Storage Applications** +```yaml +# File server needs persistent storage +apiVersion: apps/v1 +kind: Deployment +metadata: + name: file-server +spec: + template: + spec: + containers: + - name: file-server + image: nginx:latest + volumeMounts: + - name: file-storage + mountPath: /var/www/html + volumes: + - name: file-storage + persistentVolumeClaim: + claimName: file-storage-pvc +``` + +#### **3. Application Data** +```yaml +# Application needs to store user data +apiVersion: apps/v1 +kind: Deployment +metadata: + name: my-app +spec: + template: + spec: + containers: + - name: my-app + image: my-app:latest + volumeMounts: + - name: app-data + mountPath: /app/data + volumes: + - name: app-data + persistentVolumeClaim: + claimName: app-data-pvc +``` + +### **❌ Don't Use PVCs When:** + +#### **1. Stateless Web Servers** +```yaml +# Web server doesn't need persistent storage +apiVersion: apps/v1 +kind: Deployment +metadata: + name: web-server +spec: + template: + spec: + containers: + - name: web-server + image: nginx:latest + # No volumeMounts needed - stateless +``` + +#### **2. API Servers** +```yaml +# API server doesn't need persistent storage +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api-server +spec: + template: + spec: + containers: + - name: api-server + image: my-api:latest + # No volumeMounts needed - stateless +``` + +#### **3. Cache Servers** +```yaml +# Cache server doesn't need persistent storage +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redis-cache +spec: + template: + spec: + containers: + - name: redis + image: redis:latest + # No volumeMounts needed - cache is temporary +``` + +--- + +## πŸ”§ **PVC Configuration Options** + +### **1. Access Modes** + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: my-pvc +spec: + accessModes: + - ReadWriteOnce # Single node read/write (most common) + - ReadOnlyMany # Multiple nodes read-only + - ReadWriteMany # Multiple nodes read/write (rare) + resources: + requests: + storage: 10Gi +``` + +### **2. Storage Classes** + +```yaml +# Azure Storage Classes Available +storageClass: azure-disk-std-lrs # Standard HDD (cheapest) +storageClass: azure-disk-premium-lrs # Premium SSD (fastest) +storageClass: azure-blob-fuse-2-std-lrs # Blob storage (for large files) +``` + +### **3. 
Size Considerations** + +```yaml +# Size your PVCs appropriately +resources: + requests: + storage: 1Gi # Small: logs, config + storage: 10Gi # Medium: databases + storage: 100Gi # Large: file storage, backups +``` + +--- + +## 🚨 **Common PVC Mistakes** + +### **❌ Mistake 1: Using PVC for Everything** +```yaml +# ❌ DON'T DO THIS +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nginx +spec: + template: + spec: + containers: + - name: nginx + image: nginx:latest + volumeMounts: + - name: temp-storage # ❌ Unnecessary PVC + mountPath: /tmp + volumes: + - name: temp-storage + persistentVolumeClaim: + claimName: temp-pvc # ❌ Waste of resources +``` + +### **❌ Mistake 2: Not Setting Resource Limits** +```yaml +# ❌ DON'T DO THIS +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: unlimited-pvc +spec: + accessModes: + - ReadWriteOnce + # ❌ No size limit - could consume all storage +``` + +### **βœ… Correct Approach** +```yaml +# βœ… DO THIS +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: limited-pvc +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi # βœ… Set appropriate size +``` + +--- + +## πŸ“š **Best Practices** + +### **1. Size Appropriately** +- Start small and scale up +- Monitor actual usage +- Use storage quotas + +### **2. Choose Right Storage Class** +- **Standard HDD**: Cost-effective for backups, logs +- **Premium SSD**: Performance-critical databases +- **Blob Storage**: Large files, archives + +### **3. Use Labels and Annotations** +```yaml +metadata: + name: my-pvc + labels: + app: my-app + environment: production + storage-type: database + annotations: + helm.sh/resource-policy: keep # Don't delete on helm uninstall +``` + +### **4. Monitor Usage** +```bash +# Check PVC usage +kubectl get pvc +kubectl describe pvc + +# Check storage classes +kubectl get storageclass + +# Monitor disk usage in pods +kubectl exec -- df -h +``` + +--- + +## πŸ” **Your Setup Recommendations** + +### **Current State: Good!** +Your current setup uses PVCs appropriately: +- βœ… **Gitea**: 15Gi for repositories (appropriate) +- βœ… **MongoDB**: 8Gi for database (appropriate) +- βœ… **Jenkins**: 50Gi for builds (appropriate) +- βœ… **Central Storage**: 1Gi for logs (appropriate) + +### **Potential Improvements** +1. **Monitor usage**: Check actual disk usage in these PVCs +2. **Consider backups**: Implement PVC backup strategy +3. **Storage quotas**: Set namespace storage limits +4. **Performance tuning**: Use Premium SSD for databases if needed + +--- + +## πŸ“– **Next Steps** + +1. **Monitor your current PVCs**: + ```bash + kubectl get pvc --all-namespaces + kubectl describe pvc + ``` + +2. **Check storage usage**: + ```bash + kubectl exec -it -- df -h + ``` + +3. **Learn about backup strategies**: + - Azure Backup for PVCs + - Velero for Kubernetes backups + +4. **Consider storage optimization**: + - Right-size PVCs based on actual usage + - Use appropriate storage classes for cost optimization + +--- + +**Last Updated**: September 3, 2025 +**Version**: 1.0 +**Maintainer**: Infrastructure Team diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..534cda20 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,406 @@ +# πŸš€ FreeLeaps DevOps Learning Path for Junior Engineers + +> **Production-Ready Kubernetes & DevOps Documentation** +> *Your gateway to understanding our actual infrastructure and becoming a DevOps expert* + +--- + +## πŸ“‹ **Table of Contents** + +1. [🎯 **Quick Start Guide**](#-quick-start-guide) +2. 
[πŸ—οΈ **Your Production Infrastructure**](#️-your-production-infrastructure) +3. [πŸ“š **Core Learning Materials**](#-core-learning-materials) +4. [πŸ”§ **Practical Exercises**](#-practical-exercises) +5. [⚑ **Essential Commands**](#-essential-commands) +6. [πŸŽ“ **Learning Path**](#-learning-path) +7. [πŸ” **Production Troubleshooting**](#-production-troubleshooting) +8. [πŸ“– **Additional Resources**](#-additional-resources) + +--- + +## 🎯 **Quick Start Guide** + +### **πŸš€ First Day Checklist** +- [ ] **Access your production cluster**: `kubectl config use-context your-cluster` +- [ ] **Explore the management UI**: [RabbitMQ Management UI](#rabbitmq-management-ui) +- [ ] **Check ArgoCD**: Visit `https://argo.mathmast.com` +- [ ] **Review monitoring**: Access Grafana dashboards +- [ ] **Understand your apps**: Check `freeleaps-devops-reconciler` status + +### **πŸ”‘ Essential Access Points** +```bash +# Your production cluster access +kubectl config get-contexts +kubectl get nodes -o wide + +# Your actual services +kubectl get svc -A | grep -E "(rabbitmq|argocd|jenkins|gitea)" + +# Your actual namespaces +kubectl get namespaces | grep freeleaps +``` + +--- + +## πŸ—οΈ **Your Production Infrastructure** + +### **🌐 Production Domains & Services** + +| **Service** | **Production URL** | **Purpose** | **Access** | +|-------------|-------------------|-------------|------------| +| **ArgoCD** | `https://argo.mathmast.com` | GitOps deployment | Web UI | +| **Gitea** | `https://gitea.freeleaps.mathmast.com` | Git repository | Web UI | +| **Jenkins** | `http://jenkins.freeleaps.mathmast.com` | CI/CD pipelines | Web UI | +| **RabbitMQ** | `http://rabbitmq:15672` | Message broker | Management UI | +| **Grafana** | `https://grafana.mathmast.com` | Monitoring | Dashboards | + +### **πŸ”§ Production Architecture** + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ PRODUCTION INFRASTRUCTURE β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ Azure Load Balancer (4.155.160.32) β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Ingress-NGINX β”‚ β”‚ cert-manager β”‚ β”‚ ArgoCD β”‚ β”‚ +β”‚ β”‚ Controller β”‚ β”‚ (Let's Encrypt)β”‚ β”‚ (GitOps) β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ RabbitMQ β”‚ β”‚ Jenkins β”‚ β”‚ Gitea β”‚ β”‚ +β”‚ β”‚ (Message Q) β”‚ β”‚ (CI/CD) β”‚ β”‚ (Git Repo) β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ 
+β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ freeleaps- β”‚ β”‚ freeleaps- β”‚ β”‚ freeleaps- β”‚ β”‚ +β”‚ β”‚ devops- β”‚ β”‚ apps β”‚ β”‚ monitoring β”‚ β”‚ +β”‚ β”‚ reconciler β”‚ β”‚ (Your Apps) β”‚ β”‚ (Metrics) β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +### **πŸ“Š Production Namespaces** + +```bash +# Your actual namespaces +freeleaps-alpha # Alpha environment +freeleaps-prod # Production environment +freeleaps-devops-system # DevOps tools +freeleaps-controls-system # Control plane +freeleaps-monitoring-system # Monitoring stack +``` + +--- + +## πŸ“š **Core Learning Materials** + +### **πŸŽ“ Phase 1: Kubernetes Fundamentals** +- **[Kubernetes Core Concepts Guide](Kubernetes_Core_Concepts_Guide.md)** - *Start here!* + - **Production Connection**: Your actual pods, services, and deployments + - **Real Examples**: Based on your `freeleaps-devops-reconciler` deployment + - **Hands-on**: Practice with your actual cluster + +- **[PVC Deep Dive Guide](PVC_Deep_Dive_Guide.md)** - *Storage fundamentals* + - **Production Connection**: Your Azure disk storage classes + - **Real Examples**: How your apps use persistent storage + - **Troubleshooting**: Common storage issues in your environment + +### **πŸ”§ Phase 2: DevOps Infrastructure** +- **[Custom Resources & Operators Guide](Custom_Resources_And_Operators_Guide.md)** - *Advanced concepts* + - **Production Connection**: Your `freeleaps-devops-reconciler` operator + - **Real Examples**: How your CRDs work in production + - **Architecture**: Understanding your operator pattern + +- **[Reconciler Architecture Deep Dive](Reconciler_Architecture_Deep_Dive.md)** - *Your core system* + - **Production Connection**: Your actual reconciler deployment + - **Real Examples**: How your DevOps automation works + - **Troubleshooting**: Common reconciler issues + +- **[Reconciler Framework Analysis](Reconciler_Framework_Analysis.md)** - *Technical deep dive* + - **Production Connection**: Your Python/Kopf operator framework + - **Real Examples**: Code analysis from your actual implementation + - **Best Practices**: How to improve your reconciler + +### **🌐 Phase 3: Networking & Ingress** +- **[Ingress Setup & Redirects Guide](Ingress_Setup_And_Redirects_Guide.md)** - *Web traffic management* + - **Production Connection**: Your actual ingress controllers + - **Real Examples**: How your domains are configured + - **Troubleshooting**: Common ingress issues + +- **[Current Ingress Analysis](Current_Ingress_Analysis.md)** - *Your actual setup* + - **Production Connection**: Your real ingress configurations + - **Real Examples**: Your actual domain routing + - **Monitoring**: How to check ingress health + +### **πŸ“¨ Phase 4: Messaging & Communication** +- **[RabbitMQ Management Analysis](RabbitMQ_Management_Analysis.md)** - *Message broker* + - **Production Connection**: Your actual RabbitMQ deployment + - 
**Real Examples**: Your message queues and exchanges + - **Management UI**: How to use the built-in management interface + +### **πŸš€ Phase 5: Operations & Deployment** +- **[Kubernetes Bootstrap Guide](Kubernetes_Bootstrap_Guide.md)** - *Cluster setup* + - **Production Connection**: How your cluster was built + - **Real Examples**: Your actual bootstrap process + - **Maintenance**: How to maintain your cluster + +- **[Azure K8s Node Addition Runbook](Azure_K8s_Node_Addition_Runbook.md)** - *Scaling* + - **Production Connection**: How to add nodes to your cluster + - **Real Examples**: Your actual node addition process + - **Automation**: Scripts for node management + +--- + +## πŸ”§ **Practical Exercises** + +### **🎯 Exercise 1: Explore Your Production Cluster** +```bash +# 1. Connect to your cluster +kubectl config use-context your-production-cluster + +# 2. Explore your namespaces +kubectl get namespaces | grep freeleaps + +# 3. Check your actual deployments +kubectl get deployments -A | grep freeleaps + +# 4. Monitor your reconciler +kubectl logs -f deployment/freeleaps-devops-reconciler -n freeleaps-devops-system +``` + +### **🎯 Exercise 2: RabbitMQ Management UI** +```bash +# 1. Port forward to RabbitMQ management UI +kubectl port-forward svc/rabbitmq-headless -n freeleaps-alpha 15672:15672 + +# 2. Access the UI: http://localhost:15672 +# Username: user +# Password: NjlhHFvnDuC7K0ir + +# 3. Explore your queues: +# - freeleaps.devops.reconciler.queue +# - freeleaps.devops.reconciler.input +``` + +### **🎯 Exercise 3: ArgoCD GitOps** +```bash +# 1. Access ArgoCD: https://argo.mathmast.com + +# 2. Explore your applications: +# - freeleaps-devops-reconciler +# - freeleaps-apps +# - monitoring stack + +# 3. Check deployment status +kubectl get applications -n argocd +``` + +### **🎯 Exercise 4: Monitor Your Infrastructure** +```bash +# 1. Check cluster health +kubectl get nodes -o wide + +# 2. Monitor resource usage +kubectl top nodes +kubectl top pods -A + +# 3. 
Check ingress status +kubectl get ingress -A +``` + +--- + +## ⚑ **Essential Commands** + +### **πŸ” Production Monitoring** +```bash +# Your cluster health +kubectl get nodes -o wide +kubectl get pods -A --field-selector=status.phase!=Running + +# Your services +kubectl get svc -A | grep -E "(rabbitmq|argocd|jenkins|gitea)" + +# Your reconciler status +kubectl get deployment freeleaps-devops-reconciler -n freeleaps-devops-system +kubectl logs -f deployment/freeleaps-devops-reconciler -n freeleaps-devops-system +``` + +### **πŸ”§ Troubleshooting** +```bash +# Check reconciler health +kubectl describe deployment freeleaps-devops-reconciler -n freeleaps-devops-system + +# Check RabbitMQ status +kubectl get pods -n freeleaps-alpha | grep rabbitmq +kubectl logs -f deployment/rabbitmq -n freeleaps-alpha + +# Check ingress issues +kubectl describe ingress -A +kubectl get events -A --sort-by='.lastTimestamp' +``` + +### **πŸ“Š Resource Management** +```bash +# Monitor resource usage +kubectl top nodes +kubectl top pods -A + +# Check storage +kubectl get pvc -A +kubectl get pv + +# Check networking +kubectl get svc -A +kubectl get endpoints -A +``` + +--- + +## πŸŽ“ **Learning Path** + +### **πŸ“… Week 1: Foundations** +- **Day 1-2**: [Kubernetes Core Concepts](Kubernetes_Core_Concepts_Guide.md) +- **Day 3-4**: [PVC Deep Dive](PVC_Deep_Dive_Guide.md) +- **Day 5**: Practice exercises with your actual cluster + +### **πŸ“… Week 2: DevOps Infrastructure** +- **Day 1-2**: [Custom Resources & Operators](Custom_Resources_And_Operators_Guide.md) +- **Day 3-4**: [Reconciler Architecture](Reconciler_Architecture_Deep_Dive.md) +- **Day 5**: [Reconciler Framework Analysis](Reconciler_Framework_Analysis.md) + +### **πŸ“… Week 3: Networking & Communication** +- **Day 1-2**: [Ingress Setup & Redirects](Ingress_Setup_And_Redirects_Guide.md) +- **Day 3**: [Current Ingress Analysis](Current_Ingress_Analysis.md) +- **Day 4-5**: [RabbitMQ Management](RabbitMQ_Management_Analysis.md) + +### **πŸ“… Week 4: Operations & Production** +- **Day 1-2**: [Kubernetes Bootstrap](Kubernetes_Bootstrap_Guide.md) +- **Day 3-4**: [Azure Node Addition](Azure_K8s_Node_Addition_Runbook.md) +- **Day 5**: Production troubleshooting and monitoring + +--- + +## πŸ” **Production Troubleshooting** + +### **🚨 Common Issues & Solutions** + +#### **1. Reconciler Not Working** +```bash +# Check reconciler status +kubectl get deployment freeleaps-devops-reconciler -n freeleaps-devops-system +kubectl logs -f deployment/freeleaps-devops-reconciler -n freeleaps-devops-system + +# Check RabbitMQ connection +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_connections +``` + +#### **2. Ingress Issues** +```bash +# Check ingress controller +kubectl get pods -n ingress-nginx +kubectl logs -f deployment/ingress-nginx-controller -n ingress-nginx + +# Check certificates +kubectl get certificates -A +kubectl describe certificate -n your-namespace +``` + +#### **3. 
Storage Problems** +```bash +# Check PVC status +kubectl get pvc -A +kubectl describe pvc your-pvc-name -n your-namespace + +# Check storage classes +kubectl get storageclass +``` + +### **πŸ“Š Monitoring & Alerts** + +#### **Key Metrics to Watch** +- **Cluster health**: Node status, pod restarts +- **Resource usage**: CPU, memory, disk +- **Network**: Ingress traffic, service connectivity +- **Applications**: Reconciler health, RabbitMQ queues + +#### **Alerting Setup** +```bash +# Check Prometheus targets +kubectl get targets -n freeleaps-monitoring-system + +# Check Grafana dashboards +# Access: https://grafana.mathmast.com +``` + +--- + +## πŸ“– **Additional Resources** + +### **πŸ”— Official Documentation** +- **[Kubernetes Documentation](https://kubernetes.io/docs/)** - Official K8s docs +- **[ArgoCD Documentation](https://argo-cd.readthedocs.io/)** - GitOps platform +- **[RabbitMQ Documentation](https://www.rabbitmq.com/documentation.html)** - Message broker +- **[Helm Documentation](https://helm.sh/docs/)** - Package manager + +### **πŸŽ₯ Video Resources** +- **Kubernetes Crash Course**: [TechWorld with Nana](https://www.youtube.com/watch?v=s_o8dwzRlu4) +- **ArgoCD Tutorial**: [ArgoCD Official](https://www.youtube.com/watch?v=MeU5_k9ssOY) +- **RabbitMQ Basics**: [RabbitMQ Official](https://www.youtube.com/watch?v=deG25y_r6OI) + +### **πŸ“š Books** +- **"Kubernetes in Action"** by Marko LukΕ‘a +- **"GitOps and Kubernetes"** by Billy Yuen +- **"RabbitMQ in Depth"** by Gavin M. Roy + +### **πŸ› οΈ Tools & Utilities** +- **[k9s](https://k9scli.io/)** - Terminal UI for K8s +- **[Lens](https://k8slens.dev/)** - Desktop IDE for K8s +- **[kubectx](https://github.com/ahmetb/kubectx)** - Context switching + +--- + +## 🎯 **Next Steps** + +### **πŸš€ Immediate Actions** +1. **Set up your development environment** with kubectl and helm +2. **Access your production cluster** and explore the resources +3. **Complete the practical exercises** in this guide +4. **Join the monitoring dashboards** and understand the metrics + +### **πŸ“ˆ Career Development** +1. **Get certified**: [CKA (Certified Kubernetes Administrator)](https://www.cncf.io/certification/cka/) +2. **Contribute**: Help improve the reconciler and infrastructure +3. **Learn**: Stay updated with latest K8s and DevOps practices +4. **Share**: Document your learnings and share with the team + +### **🀝 Team Collaboration** +- **Code reviews**: Review reconciler changes +- **Documentation**: Improve this guide based on your experience +- **Mentoring**: Help other junior engineers +- **Innovation**: Suggest improvements to the infrastructure + +--- + +## πŸ“ž **Support & Contact** + +### **πŸ†˜ Getting Help** +- **Team Slack**: #devops-support channel +- **Documentation**: This guide and linked resources +- **Code Reviews**: GitHub pull requests +- **Pair Programming**: Schedule sessions with senior engineers + +### **πŸ“ Feedback** +- **Documentation**: Create issues for improvements +- **Process**: Suggest workflow optimizations +- **Tools**: Recommend new tools or improvements + +--- + +**πŸŽ‰ Welcome to the FreeLeaps DevOps team! You're now part of a production infrastructure that serves real users. 
Take ownership, learn continuously, and help us build amazing things!** + +--- + +*Last updated: $(date)* +*Maintained by: FreeLeaps DevOps Team* + diff --git a/docs/RabbitMQ_Management_Analysis.md b/docs/RabbitMQ_Management_Analysis.md new file mode 100644 index 00000000..3c4ab2f6 --- /dev/null +++ b/docs/RabbitMQ_Management_Analysis.md @@ -0,0 +1,1015 @@ +# 🐰 RabbitMQ Management Analysis & Production Guide + +> **Complete Guide to Managing RabbitMQ in Your FreeLeaps Production Environment** +> *From configuration to monitoring to troubleshooting* + +--- + +## πŸ“‹ **Table of Contents** + +1. [🎯 **Quick Start**](#-quick-start) +2. [πŸ—οΈ **Your Production Setup**](#️-your-production-setup) +3. [πŸ”§ **Current Configuration Analysis**](#-current-configuration-analysis) +4. [πŸ“Š **Management UI Guide**](#-management-ui-guide) +5. [πŸ” **Production Monitoring**](#-production-monitoring) +6. [🚨 **Troubleshooting Guide**](#-troubleshooting-guide) +7. [⚑ **Performance Optimization**](#-performance-optimization) +8. [πŸ”’ **Security Best Practices**](#-security-best-practices) +9. [πŸ“ˆ **Scaling & High Availability**](#-scaling--high-availability) +10. [πŸ› οΈ **Maintenance Procedures**](#️-maintenance-procedures) + +--- + +## 🎯 **Quick Start** + +### **πŸš€ First Day Checklist** +- [ ] **Access RabbitMQ Management UI**: Port forward to `http://localhost:15672` +- [ ] **Check your queues**: Verify `freeleaps.devops.reconciler.*` queues exist +- [ ] **Monitor connections**: Check if reconciler is connected +- [ ] **Review metrics**: Check message rates and queue depths +- [ ] **Test connectivity**: Verify RabbitMQ is accessible from your apps + +### **πŸ”‘ Essential Commands** +```bash +# Access your RabbitMQ cluster +kubectl get pods -n freeleaps-alpha | grep rabbitmq + +# Port forward to management UI +kubectl port-forward svc/rabbitmq-headless -n freeleaps-alpha 15672:15672 + +# Check RabbitMQ logs +kubectl logs -f deployment/rabbitmq -n freeleaps-alpha + +# Access RabbitMQ CLI +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_queues +``` + +--- + +## πŸ—οΈ **Your Production Setup** + +### **🌐 Production Architecture** + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ RABBITMQ PRODUCTION SETUP β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ freeleaps- β”‚ β”‚ freeleaps- β”‚ β”‚ freeleaps- β”‚ β”‚ +β”‚ β”‚ devops- β”‚ β”‚ apps β”‚ β”‚ monitoring β”‚ β”‚ +β”‚ β”‚ reconciler β”‚ β”‚ (Your Apps) β”‚ β”‚ (Metrics) β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ AMQP 5672 β”‚ AMQP 5672 β”‚ β”‚ +β”‚ β”‚ HTTP 15672 β”‚ HTTP 15672 β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ +β”‚ 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ RABBITMQ CLUSTER β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ β”‚ Node 1 β”‚ β”‚ Node 2 β”‚ β”‚ Node 3 β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ (Primary) β”‚ β”‚ (Replica) β”‚ β”‚ (Replica) β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ Port: 5672 β”‚ β”‚ Port: 5672 β”‚ β”‚ Port: 5672 β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ UI: 15672 β”‚ β”‚ UI: 15672 β”‚ β”‚ UI: 15672 β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +### **πŸ“Š Production Namespaces** + +| **Environment** | **Namespace** | **Purpose** | **Status** | +|-----------------|---------------|-------------|------------| +| **Alpha** | `freeleaps-alpha` | Development & Testing | βœ… Active | +| **Production** | `freeleaps-prod` | Live Production | βœ… Active | + +### **πŸ”§ Production Services** + +```bash +# Your actual RabbitMQ services +kubectl get svc -n freeleaps-alpha | grep rabbitmq +kubectl get svc -n freeleaps-prod | grep rabbitmq + +# Service details: +# - rabbitmq-headless: Internal cluster communication +# - rabbitmq: External access (if needed) +# - rabbitmq-management: Management UI access +``` + +--- + +## πŸ”§ **Current Configuration Analysis** + +### **πŸ“‹ Configuration Sources** + +#### **1. Helm Chart Configuration** +```yaml +# Location: freeleaps-ops/freeleaps/helm-pkg/3rd/rabbitmq/ +# Primary configuration files: +# - values.yaml (base configuration) +# - values.alpha.yaml (alpha environment overrides) +# - values.prod.yaml (production environment overrides) +``` + +#### **2. Reconciler Configuration** +```yaml +# Location: freeleaps-devops-reconciler/helm/freeleaps-devops-reconciler/values.yaml +rabbitmq: + host: "rabbitmq-headless.freeleaps-alpha.svc.cluster.local" + port: 5672 + username: "user" + password: "NjlhHFvnDuC7K0ir" + vhost: "/" +``` + +#### **3. Python Configuration** +```python +# Location: freeleaps-devops-reconciler/reconciler/config/config.py +RABBITMQ_HOST = os.getenv('RABBITMQ_HOST', 'localhost') +RABBITMQ_PORT = int(os.getenv('RABBITMQ_PORT', '5672')) +RABBITMQ_USERNAME = os.getenv('RABBITMQ_USERNAME', 'guest') +RABBITMQ_PASSWORD = os.getenv('RABBITMQ_PASSWORD', 'guest') +``` + +### **πŸ” Configuration Analysis** + +#### **βœ… What's Working Well** +1. **Helm-based deployment** - Consistent and repeatable +2. **Environment separation** - Alpha vs Production +3. **Clustering enabled** - High availability +4. **Management plugin** - Web UI available +5. **Resource limits** - Proper resource management + +#### **⚠️ Issues Identified** + +##### **1. 
Configuration Mismatch** +```yaml +# ❌ PROBLEM: Different image versions +# Helm chart: bitnami/rabbitmq:4.0.6-debian-12-r0 +# Reconciler: rabbitmq:3.12-management-alpine + +# ❌ PROBLEM: Different credentials +# Alpha: username: "user", password: "NjlhHFvnDuC7K0ir" +# Production: Different credentials (not shown in config) +``` + +##### **2. Security Concerns** +```yaml +# ❌ PROBLEM: Hardcoded passwords in values files +auth: + username: user + password: "NjlhHFvnDuC7K0ir" # Should be in Kubernetes secrets +``` + +##### **3. Network Configuration** +```yaml +# ❌ PROBLEM: Inconsistent hostnames +# Reconciler uses: rabbitmq-headless.freeleaps-alpha.svc.cluster.local +# But should use service discovery +``` + +### **🎯 Recommended Improvements** + +#### **1. Centralized Configuration** +```yaml +# Create a centralized RabbitMQ configuration +# Location: freeleaps-ops/config/rabbitmq/ +rabbitmq-config: + image: + repository: bitnami/rabbitmq + tag: "4.0.6-debian-12-r0" + auth: + username: ${RABBITMQ_USERNAME} + password: ${RABBITMQ_PASSWORD} + clustering: + enabled: true + name: "freeleaps-${ENVIRONMENT}" +``` + +#### **2. Secret Management** +```yaml +# Use Kubernetes secrets instead of hardcoded values +apiVersion: v1 +kind: Secret +metadata: + name: rabbitmq-credentials + namespace: freeleaps-alpha +type: Opaque +data: + username: dXNlcg== # base64 encoded + password: TmphbEhGdm5EdUM3SzBpcg== # base64 encoded +``` + +#### **3. Service Discovery** +```yaml +# Use consistent service discovery +# Instead of hardcoded hostnames, use: +RABBITMQ_HOST: "rabbitmq-headless.${NAMESPACE}.svc.cluster.local" +``` + +--- + +## πŸ“Š **Management UI Guide** + +### **🌐 Accessing the Management UI** + +#### **Method 1: Port Forward (Recommended)** +```bash +# Port forward to RabbitMQ management UI +kubectl port-forward svc/rabbitmq-headless -n freeleaps-alpha 15672:15672 + +# Access: http://localhost:15672 +# Username: user +# Password: NjlhHFvnDuC7K0ir +``` + +#### **Method 2: Ingress (If configured)** +```bash +# If you have ingress configured for RabbitMQ +# Access: https://rabbitmq.freeleaps.mathmast.com +``` + +### **πŸ“‹ Management UI Features** + +#### **1. Overview Dashboard** +- **Cluster status** and health indicators +- **Node information** and resource usage +- **Connection counts** and message rates +- **Queue depths** and performance metrics + +#### **2. Queues Management** +```bash +# Your actual queues to monitor: +# - freeleaps.devops.reconciler.queue (heartbeat) +# - freeleaps.devops.reconciler.input (input messages) +# - freeleaps.devops.reconciler.output (output messages) + +# Queue operations: +# - View queue details and metrics +# - Purge queues (remove all messages) +# - Delete queues (with safety confirmations) +# - Monitor message rates and consumer counts +``` + +#### **3. Exchanges Management** +```bash +# Your actual exchanges: +# - amq.default (default direct exchange) +# - amq.topic (topic exchange) +# - amq.fanout (fanout exchange) + +# Exchange operations: +# - View exchange properties and bindings +# - Create new exchanges with custom types +# - Monitor message routing and performance +``` + +#### **4. Connections & Channels** +```bash +# Monitor your reconciler connections: +# - Check if reconciler is connected +# - Monitor connection health and performance +# - View channel details and limits +# - Force disconnect if needed +``` + +#### **5. 
Users & Permissions** +```bash +# Current user setup: +# - Username: user +# - Permissions: Full access to vhost "/" +# - Tags: management + +# User management: +# - Create new users for different applications +# - Set up proper permissions and access control +# - Monitor user activity and connections +``` + +### **πŸ”§ Practical UI Operations** + +#### **Monitoring Your Reconciler** +```bash +# 1. Check if reconciler is connected +# Go to: Connections tab +# Look for: freeleaps-devops-reconciler connections + +# 2. Monitor message flow +# Go to: Queues tab +# Check: freeleaps.devops.reconciler.* queues +# Monitor: Message rates and queue depths + +# 3. Check cluster health +# Go to: Overview tab +# Monitor: Node status and resource usage +``` + +#### **Troubleshooting via UI** +```bash +# 1. Check for stuck messages +# Go to: Queues > freeleaps.devops.reconciler.input +# Look for: High message count or no consumers + +# 2. Check connection issues +# Go to: Connections tab +# Look for: Disconnected or error states + +# 3. Monitor resource usage +# Go to: Overview tab +# Check: Memory usage and disk space +``` + +--- + +## πŸ” **Production Monitoring** + +### **πŸ“Š Key Metrics to Monitor** + +#### **1. Cluster Health** +```bash +# Check cluster status +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl cluster_status + +# Monitor node health +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_nodes +``` + +#### **2. Queue Metrics** +```bash +# Check queue depths +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_queues name messages consumers + +# Monitor message rates +# Use Management UI: Queues tab > Queue details > Message rates +``` + +#### **3. Connection Metrics** +```bash +# Check active connections +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_connections + +# Monitor connection health +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_channels +``` + +#### **4. Resource Usage** +```bash +# Check memory usage +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl status + +# Monitor disk usage +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- df -h +``` + +### **🚨 Alerting Setup** + +#### **1. Queue Depth Alerts** +```yaml +# Alert when queue depth exceeds threshold +# Queue: freeleaps.devops.reconciler.input +# Threshold: > 100 messages +# Action: Send Slack notification +``` + +#### **2. Connection Loss Alerts** +```yaml +# Alert when reconciler disconnects +# Monitor: freeleaps-devops-reconciler connections +# Threshold: Connection count = 0 +# Action: Page on-call engineer +``` + +#### **3. Resource Usage Alerts** +```yaml +# Alert when memory usage is high +# Threshold: Memory usage > 80% +# Action: Scale up or investigate +``` + +### **πŸ“ˆ Monitoring Dashboard** + +#### **Grafana Dashboard** +```yaml +# Your existing RabbitMQ dashboard +# Location: freeleaps-ops/cluster/manifests/freeleaps-monitoring-system/kube-prometheus-stack/dashboards/rabbitmq.yaml +# Access: https://grafana.mathmast.com +# Dashboard: RabbitMQ Management Overview +``` + +#### **Key Dashboard Panels** +1. **Queue Depth** - Monitor message accumulation +2. **Message Rates** - Track throughput +3. **Connection Count** - Monitor client connections +4. **Memory Usage** - Track resource consumption +5. **Error Rates** - Monitor failures + +--- + +## 🚨 **Troubleshooting Guide** + +### **πŸ” Common Issues & Solutions** + +#### **1. 
Reconciler Connection Issues** + +##### **Problem**: Reconciler can't connect to RabbitMQ +```bash +# Symptoms: +# - Reconciler logs show connection errors +# - No connections in RabbitMQ UI +# - Pods restarting due to connection failures + +# Diagnosis: +kubectl logs -f deployment/freeleaps-devops-reconciler -n freeleaps-devops-system +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_connections + +# Solutions: +# 1. Check network connectivity +kubectl exec -it deployment/freeleaps-devops-reconciler -n freeleaps-devops-system -- ping rabbitmq-headless.freeleaps-alpha.svc.cluster.local + +# 2. Verify credentials +kubectl get secret rabbitmq-credentials -n freeleaps-alpha -o yaml + +# 3. Check RabbitMQ status +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl status +``` + +#### **2. Queue Message Accumulation** + +##### **Problem**: Messages stuck in queues +```bash +# Symptoms: +# - High message count in queues +# - No consumers processing messages +# - Increasing queue depth + +# Diagnosis: +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_queues name messages consumers + +# Solutions: +# 1. Check consumer health +kubectl logs -f deployment/freeleaps-devops-reconciler -n freeleaps-devops-system + +# 2. Restart consumers +kubectl rollout restart deployment/freeleaps-devops-reconciler -n freeleaps-devops-system + +# 3. Purge stuck messages (if safe) +# Via Management UI: Queues > Queue > Purge +``` + +#### **3. Memory Pressure** + +##### **Problem**: RabbitMQ running out of memory +```bash +# Symptoms: +# - High memory usage +# - Slow performance +# - Connection drops + +# Diagnosis: +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl status +kubectl top pods -n freeleaps-alpha | grep rabbitmq + +# Solutions: +# 1. Increase memory limits +kubectl patch deployment rabbitmq -n freeleaps-alpha -p '{"spec":{"template":{"spec":{"containers":[{"name":"rabbitmq","resources":{"limits":{"memory":"2Gi"}}}]}}}}' + +# 2. Restart RabbitMQ +kubectl rollout restart deployment/rabbitmq -n freeleaps-alpha + +# 3. Check for memory leaks +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_queues name memory +``` + +#### **4. Cluster Issues** + +##### **Problem**: RabbitMQ cluster not healthy +```bash +# Symptoms: +# - Nodes not in sync +# - Replication lag +# - Split-brain scenarios + +# Diagnosis: +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl cluster_status +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_nodes + +# Solutions: +# 1. Check node connectivity +kubectl get pods -n freeleaps-alpha | grep rabbitmq + +# 2. Restart problematic nodes +kubectl delete pod rabbitmq-0 -n freeleaps-alpha + +# 3. 
Rejoin cluster if needed +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl join_cluster rabbit@rabbitmq-0 +``` + +### **πŸ› οΈ Debugging Commands** + +#### **Essential Debugging Commands** +```bash +# Check RabbitMQ status +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl status + +# List all queues +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_queues + +# List all exchanges +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_exchanges + +# List all bindings +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_bindings + +# List all connections +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_connections + +# List all channels +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_channels + +# Check user permissions +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_users +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_user_permissions user +``` + +#### **Advanced Debugging** +```bash +# Check RabbitMQ logs +kubectl logs -f deployment/rabbitmq -n freeleaps-alpha + +# Check system logs +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- journalctl -u rabbitmq-server + +# Check network connectivity +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- netstat -tlnp + +# Check disk usage +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- df -h + +# Check memory usage +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- free -h +``` + +--- + +## ⚑ **Performance Optimization** + +### **🎯 Performance Tuning** + +#### **1. Memory Optimization** +```yaml +# Optimize memory settings +# Location: values.alpha.yaml +configuration: |- + # Memory management + vm_memory_high_watermark.relative = 0.6 + vm_memory_high_watermark_paging_ratio = 0.5 + + # Message store + msg_store_file_size_limit = 16777216 + msg_store_credit_disc_bound = 4000 +``` + +#### **2. Disk Optimization** +```yaml +# Optimize disk settings +configuration: |- + # Disk free space + disk_free_limit.relative = 2.0 + + # Queue master location + queue_master_locator = min-masters + + # Message persistence + queue.default_consumer_prefetch = 50 +``` + +#### **3. Network Optimization** +```yaml +# Optimize network settings +configuration: |- + # TCP settings + tcp_listen_options.backlog = 128 + tcp_listen_options.nodelay = true + + # Heartbeat + heartbeat = 60 + + # Connection limits + max_connections = 1000 + max_connections_per_user = 100 +``` + +### **πŸ“Š Performance Monitoring** + +#### **Key Performance Indicators** +1. **Message Throughput** - Messages per second +2. **Latency** - Message processing time +3. **Queue Depth** - Messages waiting to be processed +4. **Memory Usage** - Heap and process memory +5. **Disk I/O** - Write and read operations + +#### **Performance Benchmarks** +```bash +# Your expected performance: +# - Message rate: 1000+ messages/second +# - Latency: < 10ms for local messages +# - Queue depth: < 100 messages (normal operation) +# - Memory usage: < 80% of allocated memory +# - Disk usage: < 70% of allocated storage +``` + +--- + +## πŸ”’ **Security Best Practices** + +### **πŸ›‘οΈ Current Security Analysis** + +#### **βœ… Security Strengths** +1. **Network isolation** - RabbitMQ runs in Kubernetes namespace +2. **Resource limits** - Memory and CPU limits set +3. **Non-root user** - Runs as non-root in container +4. 
**TLS support** - SSL/TLS configuration available + +#### **⚠️ Security Weaknesses** +1. **Hardcoded passwords** - Passwords in YAML files +2. **Default permissions** - Overly permissive user access +3. **No audit logging** - Limited security event tracking +4. **No network policies** - No ingress/egress restrictions + +### **πŸ”§ Security Improvements** + +#### **1. Secret Management** +```yaml +# Use Kubernetes secrets +apiVersion: v1 +kind: Secret +metadata: + name: rabbitmq-credentials + namespace: freeleaps-alpha +type: Opaque +data: + username: dXNlcg== # base64 encoded + password: +--- +# Reference in Helm values +auth: + existingSecret: rabbitmq-credentials + existingSecretPasswordKey: password + existingSecretUsernameKey: username +``` + +#### **2. User Access Control** +```yaml +# Create application-specific users +# Instead of one user with full access: +# - freeleaps-reconciler (reconciler access only) +# - freeleaps-monitoring (read-only access) +# - freeleaps-admin (full access, limited to admins) +``` + +#### **3. Network Policies** +```yaml +# Restrict network access +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: rabbitmq-network-policy + namespace: freeleaps-alpha +spec: + podSelector: + matchLabels: + app: rabbitmq + policyTypes: + - Ingress + - Egress + ingress: + - from: + - namespaceSelector: + matchLabels: + name: freeleaps-devops-system + ports: + - protocol: TCP + port: 5672 + - protocol: TCP + port: 15672 +``` + +#### **4. Audit Logging** +```yaml +# Enable audit logging +configuration: |- + # Audit logging + log.file.level = info + log.file.rotation.date = $D0 + log.file.rotation.size = 10485760 + + # Security events + log.security = true +``` + +--- + +## πŸ“ˆ **Scaling & High Availability** + +### **πŸ—οΈ Current HA Setup** + +#### **Cluster Configuration** +```yaml +# Your current clustering setup +clustering: + enabled: true + name: "freeleaps-alpha" + addressType: hostname + rebalance: false + forceBoot: false + partitionHandling: autoheal +``` + +#### **Replication Strategy** +```yaml +# Queue replication +# - Queues are replicated across cluster nodes +# - Automatic failover if primary node fails +# - Data consistency maintained across cluster +``` + +### **πŸš€ Scaling Strategies** + +#### **1. Horizontal Scaling** +```bash +# Scale RabbitMQ cluster +kubectl scale statefulset rabbitmq -n freeleaps-alpha --replicas=5 + +# Verify scaling +kubectl get pods -n freeleaps-alpha | grep rabbitmq +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl cluster_status +``` + +#### **2. Vertical Scaling** +```yaml +# Increase resource limits +resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: 2000m + memory: 4Gi +``` + +#### **3. Queue Partitioning** +```yaml +# Partition large queues across nodes +# Strategy: Hash-based partitioning +# Benefits: Better performance and fault tolerance +``` + +### **πŸ”§ High Availability Best Practices** + +#### **1. Node Distribution** +```yaml +# Ensure nodes are distributed across availability zones +# Use pod anti-affinity to prevent single points of failure +affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: app + operator: In + values: + - rabbitmq + topologyKey: kubernetes.io/hostname +``` + +#### **2. Data Replication** +```yaml +# Configure proper replication +# - All queues should have at least 2 replicas +# - Use quorum queues for critical data +# - Monitor replication lag +``` + +#### **3. 
Backup Strategy** +```bash +# Backup RabbitMQ data +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl export_definitions /tmp/rabbitmq-definitions.json + +# Restore from backup +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl import_definitions /tmp/rabbitmq-definitions.json +``` + +--- + +## πŸ› οΈ **Maintenance Procedures** + +### **πŸ“… Regular Maintenance Tasks** + +#### **Daily Tasks** +```bash +# 1. Check cluster health +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl cluster_status + +# 2. Monitor queue depths +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_queues name messages + +# 3. Check connection count +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_connections | wc -l + +# 4. Review error logs +kubectl logs --tail=100 deployment/rabbitmq -n freeleaps-alpha | grep ERROR +``` + +#### **Weekly Tasks** +```bash +# 1. Review performance metrics +# Access Grafana dashboard: RabbitMQ Management Overview + +# 2. Check disk usage +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- df -h + +# 3. Review user permissions +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_users +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_user_permissions user + +# 4. Backup configurations +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl export_definitions /tmp/weekly-backup-$(date +%Y%m%d).json +``` + +#### **Monthly Tasks** +```bash +# 1. Security audit +# Review user access and permissions +# Check for unused queues and exchanges +# Verify network policies + +# 2. Performance review +# Analyze message rates and latency +# Review resource usage trends +# Optimize configurations + +# 3. Capacity planning +# Project growth based on usage trends +# Plan for scaling if needed +# Review backup and disaster recovery procedures +``` + +### **πŸ”§ Maintenance Scripts** + +#### **Health Check Script** +```bash +#!/bin/bash +# scripts/rabbitmq-health-check.sh + +NAMESPACE="freeleaps-alpha" +POD_NAME=$(kubectl get pods -n $NAMESPACE -l app=rabbitmq -o jsonpath='{.items[0].metadata.name}') + +echo "🐰 RabbitMQ Health Check - $(date)" +echo "==================================" + +# Check cluster status +echo "πŸ“Š Cluster Status:" +kubectl exec -it $POD_NAME -n $NAMESPACE -- rabbitmqctl cluster_status + +# Check queue depths +echo "πŸ“‹ Queue Depths:" +kubectl exec -it $POD_NAME -n $NAMESPACE -- rabbitmqctl list_queues name messages consumers + +# Check connections +echo "πŸ”— Active Connections:" +kubectl exec -it $POD_NAME -n $NAMESPACE -- rabbitmqctl list_connections | wc -l + +# Check resource usage +echo "πŸ’Ύ Resource Usage:" +kubectl top pods -n $NAMESPACE | grep rabbitmq +``` + +#### **Backup Script** +```bash +#!/bin/bash +# scripts/rabbitmq-backup.sh + +NAMESPACE="freeleaps-alpha" +BACKUP_DIR="/tmp/rabbitmq-backups" +DATE=$(date +%Y%m%d_%H%M%S) + +mkdir -p $BACKUP_DIR + +echo "πŸ“¦ Creating RabbitMQ backup..." + +# Export definitions +kubectl exec -it deployment/rabbitmq -n $NAMESPACE -- rabbitmqctl export_definitions /tmp/rabbitmq-definitions-$DATE.json + +# Copy backup file +kubectl cp $NAMESPACE/deployment/rabbitmq:/tmp/rabbitmq-definitions-$DATE.json $BACKUP_DIR/ + +echo "βœ… Backup created: $BACKUP_DIR/rabbitmq-definitions-$DATE.json" +``` + +### **🚨 Emergency Procedures** + +#### **1. RabbitMQ Node Failure** +```bash +# If a RabbitMQ node fails: +# 1. 
Check node status +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl list_nodes + +# 2. Restart failed node +kubectl delete pod rabbitmq-1 -n freeleaps-alpha + +# 3. Verify cluster health +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl cluster_status +``` + +#### **2. Data Loss Recovery** +```bash +# If data is lost: +# 1. Stop all consumers +kubectl scale deployment freeleaps-devops-reconciler -n freeleaps-devops-system --replicas=0 + +# 2. Restore from backup +kubectl cp backup-file.json freeleaps-alpha/deployment/rabbitmq:/tmp/ +kubectl exec -it deployment/rabbitmq -n freeleaps-alpha -- rabbitmqctl import_definitions /tmp/backup-file.json + +# 3. Restart consumers +kubectl scale deployment freeleaps-devops-reconciler -n freeleaps-devops-system --replicas=1 +``` + +#### **3. Performance Emergency** +```bash +# If performance is severely degraded: +# 1. Check resource usage +kubectl top pods -n freeleaps-alpha | grep rabbitmq + +# 2. Scale up resources +kubectl patch deployment rabbitmq -n freeleaps-alpha -p '{"spec":{"template":{"spec":{"containers":[{"name":"rabbitmq","resources":{"limits":{"memory":"4Gi","cpu":"2000m"}}}]}}}}' + +# 3. Restart RabbitMQ +kubectl rollout restart deployment/rabbitmq -n freeleaps-alpha +``` + +--- + +## 🎯 **Summary & Next Steps** + +### **πŸ“Š Current State Assessment** + +#### **βœ… Strengths** +1. **Production-ready setup** - Clustering, monitoring, management UI +2. **Helm-based deployment** - Consistent and repeatable +3. **Environment separation** - Alpha vs Production +4. **Integration working** - Reconciler successfully using RabbitMQ +5. **Monitoring available** - Grafana dashboards and metrics + +#### **⚠️ Areas for Improvement** +1. **Security hardening** - Remove hardcoded passwords, implement secrets +2. **Configuration standardization** - Centralize configuration management +3. **Performance optimization** - Tune settings for your workload +4. **Documentation** - Create runbooks for common operations +5. **Automation** - Implement automated health checks and alerts + +### **πŸš€ Recommended Actions** + +#### **Immediate (This Week)** +1. **Implement secret management** - Move passwords to Kubernetes secrets +2. **Standardize configuration** - Create centralized RabbitMQ config +3. **Set up monitoring alerts** - Configure alerts for critical metrics +4. **Document procedures** - Create runbooks for common operations + +#### **Short Term (Next Month)** +1. **Security audit** - Review and improve security posture +2. **Performance tuning** - Optimize settings based on usage patterns +3. **Automation** - Implement automated health checks and backups +4. **Training** - Train team on RabbitMQ management and troubleshooting + +#### **Long Term (Next Quarter)** +1. **High availability** - Implement multi-zone deployment +2. **Disaster recovery** - Set up automated backup and recovery procedures +3. **Advanced monitoring** - Implement predictive analytics and alerting +4. 
**Capacity planning** - Plan for growth and scaling + +### **πŸ“š Additional Resources** + +#### **Official Documentation** +- **[RabbitMQ Documentation](https://www.rabbitmq.com/documentation.html)** - Official guides +- **[RabbitMQ Management UI](https://www.rabbitmq.com/management.html)** - UI documentation +- **[RabbitMQ Clustering](https://www.rabbitmq.com/clustering.html)** - Cluster setup + +#### **Community Resources** +- **[RabbitMQ Slack](https://rabbitmq-slack.herokuapp.com/)** - Community support +- **[RabbitMQ GitHub](https://github.com/rabbitmq/rabbitmq-server)** - Source code +- **[RabbitMQ Blog](https://blog.rabbitmq.com/)** - Latest updates and tips + +#### **Books & Courses** +- **"RabbitMQ in Depth"** by Gavin M. Roy +- **"RabbitMQ Essentials"** by Lovisa Johansson +- **RabbitMQ Tutorials** - Official tutorial series + +--- + +**πŸŽ‰ You now have a comprehensive understanding of your RabbitMQ production environment! Use this guide to maintain, monitor, and optimize your message broker infrastructure.** + +--- + +*Last updated: $(date)* +*Maintained by: FreeLeaps DevOps Team* diff --git a/docs/Reconciler_Architecture_Deep_Dive.md b/docs/Reconciler_Architecture_Deep_Dive.md new file mode 100644 index 00000000..861398a4 --- /dev/null +++ b/docs/Reconciler_Architecture_Deep_Dive.md @@ -0,0 +1,440 @@ +# Reconciler Architecture Deep Dive + +## 🎯 **Overview** + +Your `freeleaps-devops-reconciler` is a **sophisticated Kubernetes Operator** that orchestrates your entire DevOps infrastructure. It's not just a simple CRD controller - it's a **full-stack DevOps automation platform** that bridges your Git repositories, container registries, Jenkins pipelines, ArgoCD applications, and Kubernetes deployments. + +--- + +## πŸ—οΈ **Architecture Overview** + +### **πŸ”„ The Big Picture: How Your Reconciler Works** + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ FRELEAPS DEVOPS RECONCILER ARCHITECTURE β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ KUBERNETES OPERATOR (KOPF) β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ β”‚ DevOpsProject β”‚ β”‚ ArgoSetting β”‚ β”‚ JenkinsSettingβ”‚ β”‚ IngressResourceβ”‚ β”‚ +β”‚ β”‚ β”‚ Controller β”‚ β”‚ Controller β”‚ β”‚ Controller β”‚ β”‚ Controller β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ EXTERNAL SERVICE INTEGRATION β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ β”‚ ArgoCD β”‚ β”‚ Jenkins β”‚ β”‚ Docker Hub β”‚ β”‚ GoDaddy β”‚ β”‚ +β”‚ β”‚ β”‚ Client β”‚ β”‚ Client β”‚ β”‚ Client β”‚ β”‚ Client β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ MESSAGING & EVENT SYSTEM β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ β”‚ RabbitMQ β”‚ β”‚ Heartbeat β”‚ β”‚ Deployment β”‚ β”‚ TTL Monitor β”‚ β”‚ +β”‚ β”‚ β”‚ Listener β”‚ β”‚ Sender β”‚ β”‚ Monitor β”‚ β”‚ Manager β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +## πŸ”§ **Core Components Deep Dive** + +### **1. DevOpsProject Controller** πŸ—οΈ + +**What it does:** The **orchestrator-in-chief** that creates your entire DevOps ecosystem. + +**Real Code Example:** +```python +@kopf.on.create(group=consts.GROUP, version=consts.VERSION, kind=consts.DEVOPS_PROJECT_KIND) +def on_devops_proj_created(name: str, namespace: Optional[str], body: Body, logger: Logger, **kwargs): + # When you create a DevOpsProject CR, this triggers: + # 1. Validates your Git repo and container registry config + # 2. Creates ArgoSetting CR (for ArgoCD management) + # 3. Creates JenkinsSetting CR (for CI/CD pipelines) + # 4. Creates ContainerRegistry CR (for image management) + # 5. Creates GitCredentials CR (for authentication) +``` + +**Your Actual Flow:** +``` +User creates DevOpsProject CR + ↓ +Reconciler validates Git repo + container registry + ↓ +Creates ArgoSetting CR (manages ArgoCD projects/apps) + ↓ +Creates JenkinsSetting CR (manages CI/CD pipelines) + ↓ +Creates ContainerRegistry CR (manages Docker images) + ↓ +Creates GitCredentials CR (manages authentication) + ↓ +Your DevOps ecosystem is ready! πŸŽ‰ +``` + +### **2. ArgoSetting Controller** πŸš€ + +**What it does:** Manages your **ArgoCD infrastructure** - projects, repositories, and applications. 
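+
+**Illustrative CR sketch (not from your codebase):** Before looking at the handler code, it helps to see the kind of input an ArgoSetting CR carries. The exact schema, API group, and version come from your CRD definitions (`consts.GROUP` / `consts.VERSION`), so the `apiVersion`, key casing, repo URL, and paths below are assumptions for illustration only, based on the fields the controller reads (`repositories`, `projects`, `applications`, `source.repo_url`):
+
+```yaml
+# Hypothetical ArgoSetting CR - apiVersion and field names are illustrative assumptions
+apiVersion: devops.freeleaps.com/v1alpha1   # assumption: real value comes from consts.GROUP / consts.VERSION
+kind: ArgoSetting
+metadata:
+  name: my-app-argo-settings
+  namespace: freeleaps-devops-system
+spec:
+  repositories:
+    - url: https://github.com/your-org/my-app.git      # hypothetical repo URL
+  projects:
+    - name: my-app
+      desc: "ArgoCD project for my-app"
+  applications:
+    - name: my-app-alpha
+      project: my-app
+      source:
+        repoUrl: https://github.com/your-org/my-app.git  # maps to app.source.repo_url in the controller
+        path: deploy/helm                                # hypothetical manifest path
+        revision: main
+```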
+ +**Real Code Example:** +```python +# When ArgoSetting CR is created: +for project in as_spec.projects: + # Creates ArgoCD Project + desired_resources.append(ManagedResource( + resource_type="project", + resource_id=project.name, + description=project.desc, + metadata={ + "source_repos": [repo.url for repo in as_spec.repositories], + "destinations": [{"server": dest.server, "namespace": dest.namespace}] + } + )) + +for app in as_spec.applications: + # Creates ArgoCD Application + desired_resources.append(ManagedResource( + resource_type="application", + resource_id=app.name, + metadata={ + "project": app.project, + "repo_url": app.source.repo_url, + "path": app.source.path, + "target_revision": app.source.revision + } + )) +``` + +**Your Actual ArgoCD Management:** +``` +ArgoSetting CR created + ↓ +Reconciler connects to ArgoCD API (argo.mathmast.com) + ↓ +Creates ArgoCD Project (defines permissions, repos, destinations) + ↓ +Creates ArgoCD Repository (connects to your Git repo) + ↓ +Creates ArgoCD Application (deploys your app) + ↓ +ArgoCD starts syncing your application! πŸ”„ +``` + +### **3. JenkinsSetting Controller** βš™οΈ + +**What it does:** Manages your **Jenkins CI/CD pipelines** - creates folders, pipelines, and credentials. + +**Real Code Example:** +```python +@kopf.timer(group=consts.GROUP, version=consts.VERSION, kind=consts.JENKINS_SETTINGS_KIND, interval=300) +def poll_project_config(name: str, namespace: str, body: Body, logger: logging.Logger, **kwargs): + # Every 5 minutes, the reconciler: + # 1. Fetches your project's YAML config from Git + # 2. Generates Jenkins Pipeline DSL + # 3. Creates/updates Jenkins pipelines + # 4. Manages pipeline credentials +``` + +**Your Actual Jenkins Management:** +``` +JenkinsSetting CR created + ↓ +Reconciler clones your Git repo + ↓ +Reads your project's YAML configuration + ↓ +Generates Jenkins Pipeline DSL (Groovy script) + ↓ +Creates Jenkins folder structure (project/environment) + ↓ +Creates Jenkins pipeline with your DSL + ↓ +Your CI/CD pipeline is ready! πŸš€ +``` + +### **4. DeploymentRecord Controller** 🎯 + +**What it does:** Orchestrates **actual deployments** - triggers Jenkins builds, monitors ArgoCD sync, manages TTL. + +**Real Code Example:** +```python +@kopf.on.create(group=consts.GROUP, version=consts.VERSION, plural=consts.DEPLOYMENT_RECORD_PLURAL) +async def create_deployment_record(spec: Dict[str, Any], name: str, namespace: str, uid: str, logger: Logger, **kwargs): + # When you trigger a deployment: + # 1. Validates deployment request + # 2. Triggers Jenkins build + # 3. Monitors build progress + # 4. Triggers ArgoCD sync + # 5. Monitors deployment status + # 6. Manages TTL (Time To Live) +``` + +**Your Actual Deployment Flow:** +``` +User clicks "Deploy" button + ↓ +DeploymentRecord CR created + ↓ +Reconciler triggers Jenkins build + ↓ +Monitors build phases (building, testing, packaging) + ↓ +Triggers ArgoCD sync when build completes + ↓ +Monitors ArgoCD sync status + ↓ +Creates IngressResource for external access + ↓ +Deployment is live! 🌐 +``` + +### **5. IngressResource Controller** 🌐 + +**What it does:** Manages **external access** - DNS records, SSL certificates, and ingress rules. + +**Real Code Example:** +```python +async def create_ingress_resource(self, body: Body, name: str, namespace: str, **kwargs): + # When IngressResource CR is created: + # 1. Creates DNS record via GoDaddy API + # 2. Requests SSL certificate via cert-manager + # 3. Creates Kubernetes Ingress + # 4. Updates deployment URL + # 5. 
Sends heartbeat with live URL +``` + +**Your Actual Ingress Management:** +``` +IngressResource CR created + ↓ +Reconciler calls GoDaddy API + ↓ +Creates DNS record (app.mathmast.com β†’ 4.155.160.32) + ↓ +Requests SSL certificate from Let's Encrypt + ↓ +Creates Kubernetes Ingress rule + ↓ +Your app is accessible at https://app.mathmast.com! πŸ”’ +``` + +--- + +## πŸ“‘ **Messaging & Event System** + +### **RabbitMQ Integration** 🐰 + +Your reconciler uses **RabbitMQ** for asynchronous communication and event-driven architecture. + +**Event Types:** +```python +class EventType(Enum): + DEVOPS_INITIALIZE = "DevOpsInitialize" # New project setup + DEVOPS_RECONCILE = "DevOpsReconcile" # Deployment trigger + DEVOPS_RECONCILE_HEARTBEAT = "DevOpsReconcileJobHeartbeat" # Progress updates +``` + +**Real Event Flow:** +``` +User triggers deployment + ↓ +DevOpsReconcileEvent sent to RabbitMQ + ↓ +Reconciler picks up event + ↓ +Creates DeploymentRecord CR + ↓ +Sends heartbeat every 30 seconds + ↓ +User sees real-time progress! πŸ“Š +``` + +### **Heartbeat System** πŸ’“ + +**What it does:** Provides **real-time deployment status** to your users. + +**Real Code Example:** +```python +@dataclass +class DevOpsReconcileJobHeartbeatEvent: + operation: str = "heartbeat" + id: str = "" # deployment ID + status: str = "running" # running/success/failed/terminated + phase: str = "initializing" # current deployment phase + phase_message: str = "" # human-readable message + url: Optional[str] = None # live URL when deployment completes +``` + +**Your Actual Heartbeat Flow:** +``` +Deployment starts + ↓ +Heartbeat every 30 seconds: + - Phase: "initializing" β†’ "building" β†’ "deploying" β†’ "verifying" + - Status: "running" β†’ "success" + - URL: None β†’ "https://app.mathmast.com" + ↓ +User sees live progress in UI! πŸ“ˆ +``` + +--- + +## πŸ•’ **TTL (Time To Live) Management** + +### **Automatic Cleanup** 🧹 + +Your reconciler includes **sophisticated TTL management** for temporary deployments. + +**Real Code Example:** +```python +@dataclass +class TTLMonitoringState: + deployment_record_name: str + enabled: bool + ttl_seconds: int # Default: 3 hours (10800 seconds) + start_time: datetime + expiration_time: datetime + phase: TTLMonitoringPhase # monitoring/cleanup/completed +``` + +**Your Actual TTL Flow:** +``` +Deployment completes + ↓ +TTL monitoring starts (3 hours by default) + ↓ +Every minute: Check if TTL expired + ↓ +When expired: Trigger cleanup + ↓ +Delete ArgoCD applications + ↓ +Delete Kubernetes resources + ↓ +Delete DNS records + ↓ +Delete SSL certificates + ↓ +Environment cleaned up! 
🧹 +``` + +--- + +## πŸ”— **Relationship with Your DevOps Infrastructure** + +### **How the Reconciler Extends Kubernetes** πŸ”§ + +Your reconciler doesn't just manage Kubernetes resources - it **extends Kubernetes** with custom DevOps capabilities: + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ KUBERNETES API EXTENSION β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ CUSTOM RESOURCES (CRs) β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ DevOpsProject CR β†’ ArgoSetting CR β†’ JenkinsSetting CR β”‚ β”‚ +β”‚ β”‚ ↓ ↓ ↓ β”‚ β”‚ +β”‚ β”‚ ContainerRegistry CR β†’ GitCredentials CR β†’ IngressResource CR β”‚ β”‚ +β”‚ β”‚ ↓ ↓ ↓ β”‚ β”‚ +β”‚ β”‚ DeploymentRecord CR β†’ TTL Management β†’ Heartbeat System β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ EXTERNAL SERVICE ORCHESTRATION β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ ArgoCD API β†’ Jenkins API β†’ Docker Hub API β†’ GoDaddy API β”‚ β”‚ +β”‚ β”‚ ↓ ↓ ↓ ↓ β”‚ β”‚ +β”‚ β”‚ Git Repos β†’ CI/CD Pipelines β†’ Container Images β†’ DNS Records β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +### **Your Actual DevOps Workflow** πŸ”„ + +**Complete Flow Example:** +``` +1. Developer pushes code to Git + ↓ +2. User creates DevOpsProject CR + ↓ +3. Reconciler creates ArgoSetting CR + ↓ +4. Reconciler creates JenkinsSetting CR + ↓ +5. Jenkins pipeline created and triggered + ↓ +6. Build completes, image pushed to registry + ↓ +7. ArgoCD syncs new image + ↓ +8. IngressResource creates external access + ↓ +9. App is live at https://app.mathmast.com + ↓ +10. TTL monitoring starts (3 hours) + ↓ +11. After 3 hours: Automatic cleanup +``` + +--- + +## 🎯 **Key Benefits of Your Reconciler Architecture** + +### **1. Declarative DevOps** πŸ“ +- **Define once, deploy everywhere**: Your DevOpsProject CR defines your entire infrastructure +- **GitOps workflow**: Everything is version-controlled and declarative +- **Consistency**: Same process for alpha and production environments + +### **2. Automation at Scale** πŸ€– +- **Zero manual intervention**: From Git push to live deployment +- **Multi-environment support**: Alpha and production with same configuration +- **Automatic cleanup**: TTL management prevents resource waste + +### **3. 
Real-time Visibility** πŸ‘οΈ +- **Live progress tracking**: Heartbeat system shows real-time deployment status +- **Comprehensive monitoring**: Every phase is tracked and reported +- **Error handling**: Detailed error messages and recovery mechanisms + +### **4. Enterprise Integration** 🏒 +- **Multi-service orchestration**: ArgoCD, Jenkins, Docker Hub, GoDaddy +- **Security**: Credential management and SSL certificate automation +- **Scalability**: Kubernetes-native architecture scales with your cluster + +--- + +## πŸ” **Your Reconciler vs. Traditional DevOps** + +### **Traditional DevOps** πŸ—οΈ +``` +Manual Jenkins setup β†’ Manual ArgoCD config β†’ Manual DNS setup β†’ Manual SSL setup +``` + +### **Your Reconciler** πŸš€ +``` +DevOpsProject CR β†’ Automatic Jenkins + ArgoCD + DNS + SSL setup +``` + +**The difference:** Your reconciler transforms **manual DevOps tasks** into **declarative, automated, and scalable** operations that run on Kubernetes. + +--- + +## πŸŽ‰ **Conclusion** + +Your `freeleaps-devops-reconciler` is not just a Kubernetes operator - it's a **complete DevOps automation platform** that: + +1. **Extends Kubernetes** with custom DevOps capabilities +2. **Orchestrates multiple external services** (ArgoCD, Jenkins, Docker Hub, GoDaddy) +3. **Provides real-time visibility** into deployment progress +4. **Automates complex workflows** from Git push to live deployment +5. **Manages the complete lifecycle** including cleanup and TTL + +It's the **brain** of your DevOps infrastructure, making complex multi-service orchestration as simple as creating a Kubernetes Custom Resource! 🧠✨ diff --git a/docs/Reconciler_Framework_Analysis.md b/docs/Reconciler_Framework_Analysis.md new file mode 100644 index 00000000..898c66b6 --- /dev/null +++ b/docs/Reconciler_Framework_Analysis.md @@ -0,0 +1,521 @@ +# Reconciler Framework Analysis & Robustness Assessment + +## 🎯 **Framework Overview** + +Your `freeleaps-devops-reconciler` is built on **Kopf** (Kubernetes Operator Pythonic Framework), not FastAPI. 
Here's the detailed breakdown: + +### **πŸ—οΈ Framework Stack** + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ FRELEAPS RECONCILER FRAMEWORK STACK β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ KOPF (Kubernetes Operator Framework) β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β€’ Event-driven Kubernetes resource watching β”‚ β”‚ +β”‚ β”‚ β€’ Custom Resource Definition (CRD) management β”‚ β”‚ +β”‚ β”‚ β€’ Reconciliation loop with retry mechanisms β”‚ β”‚ +β”‚ β”‚ β€’ Kubernetes API integration β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ ASYNCIO + THREADING HYBRID β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β€’ Asynchronous operations for I/O-bound tasks β”‚ β”‚ +β”‚ β”‚ β€’ Threading for CPU-bound operations β”‚ β”‚ +β”‚ β”‚ β€’ Event loop management for concurrent operations β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ RABBITMQ MESSAGING LAYER β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β€’ Asynchronous message processing β”‚ β”‚ +β”‚ β”‚ β€’ Event-driven architecture β”‚ β”‚ +β”‚ β”‚ β€’ Heartbeat system for real-time updates β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ EXTERNAL SERVICE INTEGRATION β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β€’ ArgoCD API client (synchronous) β”‚ β”‚ +β”‚ β”‚ β€’ Jenkins API client (synchronous) β”‚ β”‚ +β”‚ β”‚ β€’ Docker Hub API client (synchronous) β”‚ β”‚ +β”‚ β”‚ β€’ GoDaddy DNS API client (asynchronous) β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +## πŸ”§ **Framework Architecture Deep Dive** 
+ +### **1. Kopf Framework** 🎯 + +**What it is:** A Python framework for building Kubernetes operators using decorators and event handlers. + +**Your Implementation:** +```python +# Main operator setup +kopf.configure( + verbose=config.RECONCILER_DEBUG, +) + +# Event handlers using decorators +@kopf.on.create(group=consts.GROUP, version=consts.VERSION, kind=consts.DEVOPS_PROJECT_KIND) +def on_devops_proj_created(name: str, namespace: Optional[str], body: Body, logger: Logger, **kwargs): + # Your reconciliation logic here + +@kopf.timer(group=consts.GROUP, version=consts.VERSION, kind=consts.JENKINS_SETTINGS_KIND, interval=300) +def poll_project_config(name: str, namespace: str, body: Body, logger: logging.Logger, **kwargs): + # Periodic reconciliation every 5 minutes +``` + +**Key Features:** +- **Event-driven**: Watches Kubernetes API for resource changes +- **Retry mechanisms**: `kopf.TemporaryError` for transient failures +- **Resource management**: Automatic cleanup and state management +- **Logging integration**: Built-in logging with Kubernetes events + +### **2. Asyncio + Threading Hybrid** πŸ”„ + +**Your Architecture Pattern:** +```python +# Main event loop (asyncio) +loop = asyncio.get_event_loop() +loop.run_until_complete( + kopf.operator( + clusterwide=False, + priority=int(time.time() * 1000000), + peering_name="freeleaps-devops-reconciler", + namespaces=["freeleaps-devops-system"], + ) +) + +# Threading for TTL recovery +def delayed_ttl_recovery(): + import threading + ttl_thread = threading.Thread(target=delayed_ttl_recovery, daemon=True) + ttl_thread.start() +``` + +**Why This Pattern:** +- **Asyncio**: For I/O-bound operations (API calls, network requests) +- **Threading**: For CPU-bound operations and blocking calls +- **Event Loop**: Manages concurrent operations efficiently + +### **3. RabbitMQ Integration** 🐰 + +**Your Messaging Architecture:** +```python +# Event types +class EventType(Enum): + DEVOPS_INITIALIZE = "DevOpsInitialize" # New project setup + DEVOPS_RECONCILE = "DevOpsReconcile" # Deployment trigger + DEVOPS_RECONCILE_HEARTBEAT = "DevOpsReconcileJobHeartbeat" # Progress updates + +# Async message processing +async def handle_rabbitmq_message(ch, method, properties, body): + # Process messages asynchronously +``` + +--- + +## ⚠️ **Current Issues & Reliability Problems** + +### **1. Error Handling Inconsistencies** 🚨 + +**Problem:** Mixed error handling patterns throughout the codebase. + +**Evidence:** +```python +# Inconsistent error handling patterns found: +# Pattern 1: Generic Exception catching +except Exception as e: + logger.error(f"Failed to setup HeartbeatSender: {e}") + logger.warning("DeploymentRecord controller will continue without heartbeat functionality") + +# Pattern 2: Specific error handling +except kopf.TemporaryError: + raise # Re-raise kopf.TemporaryError for retry + +# Pattern 3: Custom error classes +except SecretNotFoundError as e: + # Handle specific error +``` + +**Issues:** +- **Silent failures**: Some exceptions are caught and logged but not properly handled +- **Inconsistent retry logic**: Some errors retry, others don't +- **Resource leaks**: Failed operations may leave resources in inconsistent state + +### **2. Threading and Asyncio Complexity** πŸ”„ + +**Problem:** Complex interaction between threading and asyncio can lead to race conditions. 
+ +**Evidence:** +```python +# Complex threading setup in operator.py +def delayed_ttl_recovery(): + import threading + import asyncio + + def run_async_callback(): + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + if loop.is_running(): + asyncio.run_coroutine_threadsafe(run_ttl_recovery(), loop) + else: + loop.run_until_complete(run_ttl_recovery()) + + ttl_thread = threading.Thread(target=delayed_ttl_recovery, daemon=True) + ttl_thread.start() +``` + +**Issues:** +- **Race conditions**: Multiple threads accessing shared resources +- **Event loop conflicts**: Complex event loop management +- **Resource cleanup**: Daemon threads may not clean up properly + +### **3. Configuration Management** βš™οΈ + +**Problem:** Complex configuration with many environment variables and potential for misconfiguration. + +**Evidence:** +```python +# 50+ environment variables in config.py +env_mappings = { + "RECONCILER_DEBUG": (bool, lambda x: x.lower() == "true"), + "RABBITMQ_HOST": str, + "RABBITMQ_PORT": int, + "JENKINS_ENDPOINT": str, + "ARGOCD_ENDPOINT": str, + # ... 40+ more variables +} +``` + +**Issues:** +- **Configuration drift**: Easy to have mismatched configurations +- **Validation gaps**: Limited validation of configuration values +- **Default handling**: Some configurations have defaults, others don't + +### **4. External Service Dependencies** πŸ”— + +**Problem:** Heavy dependency on external services that can fail independently. + +**Evidence:** +```python +# Multiple external service dependencies +try: + init_argo_client(host=config.ARGOCD_ENDPOINT, ...) + remote_argo_ver = get_argo_client().get_version() +except Exception as e: + logger.error(f"Failed to connect to ArgoCD server: {e}") + logger.warning("Continuing operator startup without ArgoCD connection") + +try: + message_listener = MessageListener(...) + if message_listener.start(): + logger.info("RabbitMQ message listener started successfully") + else: + logger.warning("Failed to start RabbitMQ message listener") +except Exception as e: + logger.error(f"Error starting RabbitMQ message listener: {e}") +``` + +**Issues:** +- **Cascade failures**: One service failure can affect others +- **Partial functionality**: System continues with degraded capabilities +- **Error propagation**: Errors from external services may not be properly handled + +### **5. Resource Management** πŸ’Ύ + +**Problem:** Complex resource lifecycle management with potential for leaks. + +**Evidence:** +```python +# Complex resource cleanup in TTL management +async def cleanup_application_resources(self, applications: List[ArgoApplicationInfo], + skip_resource_types: List[str] = None, + cleanup_timeout: int = 300) -> Dict[str, Any]: + # Complex cleanup logic with multiple failure points +``` + +**Issues:** +- **Resource leaks**: Failed cleanup operations may leave resources +- **Timeout handling**: Complex timeout management across multiple operations +- **State inconsistency**: Resources may be in inconsistent states after failures + +--- + +## πŸš€ **Robustness Improvement Recommendations** + +### **1. Standardized Error Handling** πŸ›‘οΈ + +**Recommendation:** Implement consistent error handling patterns. 
+ +```python +# Proposed error handling pattern +class ReconcilerErrorHandler: + @staticmethod + def handle_operation(operation_name: str, operation: Callable, logger: Logger): + try: + return operation() + except kopf.TemporaryError: + # Re-raise for retry + raise + except ExternalServiceError as e: + # Handle external service failures + logger.error(f"External service error in {operation_name}: {e}") + raise kopf.TemporaryError(f"External service unavailable: {e}", delay=30) + except ValidationError as e: + # Handle validation errors + logger.error(f"Validation error in {operation_name}: {e}") + raise kopf.PermanentError(f"Invalid configuration: {e}") + except Exception as e: + # Handle unexpected errors + logger.error(f"Unexpected error in {operation_name}: {e}") + raise kopf.TemporaryError(f"Internal error: {e}", delay=60) +``` + +### **2. Simplified Asyncio Architecture** πŸ”„ + +**Recommendation:** Reduce threading complexity and use pure asyncio where possible. + +```python +# Proposed simplified architecture +class ReconcilerManager: + def __init__(self): + self.event_loop = asyncio.get_event_loop() + self.tasks = [] + + async def start(self): + # Start all async tasks + self.tasks.extend([ + asyncio.create_task(self.ttl_monitor()), + asyncio.create_task(self.heartbeat_sender()), + asyncio.create_task(self.message_listener()), + ]) + + async def stop(self): + # Clean shutdown of all tasks + for task in self.tasks: + task.cancel() + await asyncio.gather(*self.tasks, return_exceptions=True) +``` + +### **3. Configuration Validation** βœ… + +**Recommendation:** Add comprehensive configuration validation. + +```python +# Proposed configuration validation +class ConfigurationValidator: + @staticmethod + def validate_config(config: Config) -> List[str]: + errors = [] + + # Required fields + required_fields = [ + "RABBITMQ_HOST", "RABBITMQ_PORT", "JENKINS_ENDPOINT", + "ARGOCD_ENDPOINT", "DEFAULT_GIT_USERNAME" + ] + + for field in required_fields: + if not getattr(config, field, None): + errors.append(f"Missing required configuration: {field}") + + # URL validation + if not is_valid_url(config.JENKINS_ENDPOINT): + errors.append(f"Invalid Jenkins endpoint: {config.JENKINS_ENDPOINT}") + + # Port validation + if not (1 <= config.RABBITMQ_PORT <= 65535): + errors.append(f"Invalid RabbitMQ port: {config.RABBITMQ_PORT}") + + return errors +``` + +### **4. Circuit Breaker Pattern** ⚑ + +**Recommendation:** Implement circuit breakers for external service calls. + +```python +# Proposed circuit breaker implementation +class CircuitBreaker: + def __init__(self, failure_threshold: int = 5, timeout: int = 60): + self.failure_threshold = failure_threshold + self.timeout = timeout + self.failure_count = 0 + self.last_failure_time = None + self.state = "CLOSED" # CLOSED, OPEN, HALF_OPEN + + async def call(self, operation: Callable): + if self.state == "OPEN": + if time.time() - self.last_failure_time > self.timeout: + self.state = "HALF_OPEN" + else: + raise ExternalServiceError("Circuit breaker is OPEN") + + try: + result = await operation() + if self.state == "HALF_OPEN": + self.state = "CLOSED" + self.failure_count = 0 + return result + except Exception as e: + self.failure_count += 1 + self.last_failure_time = time.time() + + if self.failure_count >= self.failure_threshold: + self.state = "OPEN" + + raise e +``` + +### **5. Health Checks and Monitoring** πŸ“Š + +**Recommendation:** Add comprehensive health checks and monitoring. 
+ +```python +# Proposed health check system +class HealthChecker: + def __init__(self): + self.checks = { + "kopf_operator": self.check_kopf_operator, + "rabbitmq_connection": self.check_rabbitmq_connection, + "argocd_connection": self.check_argocd_connection, + "jenkins_connection": self.check_jenkins_connection, + "kubernetes_api": self.check_kubernetes_api, + } + + async def run_health_checks(self) -> Dict[str, bool]: + results = {} + for name, check in self.checks.items(): + try: + results[name] = await check() + except Exception as e: + results[name] = False + logger.error(f"Health check failed for {name}: {e}") + return results + + async def check_kopf_operator(self) -> bool: + # Check if Kopf operator is running + return True + + async def check_rabbitmq_connection(self) -> bool: + # Check RabbitMQ connectivity + return True +``` + +### **6. Resource Lifecycle Management** πŸ”„ + +**Recommendation:** Implement proper resource lifecycle management. + +```python +# Proposed resource lifecycle manager +class ResourceLifecycleManager: + def __init__(self): + self.resources = {} + + async def create_resource(self, resource_type: str, resource_id: str, + create_func: Callable, cleanup_func: Callable): + try: + result = await create_func() + self.resources[resource_id] = { + "type": resource_type, + "created_at": time.time(), + "cleanup_func": cleanup_func, + "status": "active" + } + return result + except Exception as e: + # Cleanup on creation failure + await self.cleanup_resource(resource_id) + raise e + + async def cleanup_resource(self, resource_id: str): + if resource_id in self.resources: + resource = self.resources[resource_id] + try: + await resource["cleanup_func"]() + resource["status"] = "cleaned" + except Exception as e: + logger.error(f"Failed to cleanup resource {resource_id}: {e}") + resource["status"] = "cleanup_failed" +``` + +--- + +## 🎯 **Feature Enhancement Recommendations** + +### **1. Observability Improvements** πŸ‘οΈ + +**Current State:** Basic logging with some structured logging. + +**Recommendations:** +- **Distributed tracing**: Add OpenTelemetry integration +- **Metrics collection**: Prometheus metrics for all operations +- **Structured logging**: Consistent log format across all components +- **Alerting**: Proactive alerts for failures and degraded states + +### **2. Testing Improvements** πŸ§ͺ + +**Current State:** Limited test coverage with some unit tests. + +**Recommendations:** +- **Integration tests**: Test full reconciliation flows +- **Chaos engineering**: Test failure scenarios +- **Performance tests**: Test under load +- **End-to-end tests**: Test complete user workflows + +### **3. Security Enhancements** πŸ”’ + +**Current State:** Basic authentication and authorization. + +**Recommendations:** +- **RBAC improvements**: Fine-grained permissions +- **Secret management**: Better secret rotation and management +- **Audit logging**: Comprehensive audit trails +- **Network policies**: Restrict network access + +### **4. Performance Optimizations** ⚑ + +**Current State:** Basic performance with some optimization. 
+ +**Recommendations:** +- **Connection pooling**: Reuse connections to external services +- **Caching**: Cache frequently accessed data +- **Batch operations**: Batch API calls where possible +- **Resource limits**: Proper resource limits and requests + +--- + +## πŸŽ‰ **Conclusion** + +Your `freeleaps-devops-reconciler` is a **sophisticated DevOps automation platform** built on solid foundations, but it has several areas for improvement: + +### **Strengths** βœ… +- **Comprehensive functionality**: Handles complex multi-service orchestration +- **Event-driven architecture**: Good use of RabbitMQ for messaging +- **Kubernetes-native**: Proper use of Kopf framework +- **Real-time visibility**: Heartbeat system provides good user experience + +### **Areas for Improvement** πŸ”§ +- **Error handling**: Standardize error handling patterns +- **Architecture complexity**: Simplify threading/asyncio interactions +- **Configuration management**: Add validation and defaults +- **External dependencies**: Implement circuit breakers and fallbacks +- **Resource management**: Improve lifecycle management +- **Observability**: Add comprehensive monitoring and tracing + +### **Priority Recommendations** 🎯 +1. **High Priority**: Standardize error handling and add circuit breakers +2. **Medium Priority**: Simplify architecture and add configuration validation +3. **Low Priority**: Add comprehensive monitoring and testing + +The reconciler is **production-ready** but would benefit significantly from these robustness improvements to handle edge cases and failures more gracefully! πŸš€ diff --git a/docs/add_k8s_node.sh b/docs/add_k8s_node.sh new file mode 100755 index 00000000..a4e26bdb --- /dev/null +++ b/docs/add_k8s_node.sh @@ -0,0 +1,397 @@ +#!/bin/bash + +# Azure Kubernetes Node Addition Script +# This script automates the process of adding new Azure VMs to an existing Kubernetes cluster + +set -e # Exit on any error + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Configuration +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +INVENTORY_FILE="freeleaps-ops/cluster/ansible/manifests/inventory.ini" +KUBESPRAY_DIR="freeleaps-ops/3rd/kubespray" +ANSIBLE_USER="wwwadmin@mathmast.com" + +# Function to print colored output +print_status() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Function to validate input +validate_input() { + if [[ -z "$1" ]]; then + print_error "Input cannot be empty" + return 1 + fi + return 0 +} + +# Function to check prerequisites +check_prerequisites() { + print_status "Checking prerequisites..." + + # Check if kubectl is installed + if ! command -v kubectl &> /dev/null; then + print_error "kubectl is not installed" + exit 1 + fi + + # Check if ansible is installed + if ! command -v ansible &> /dev/null; then + print_error "ansible is not installed" + exit 1 + fi + + # Check if az CLI is installed + if ! command -v az &> /dev/null; then + print_error "Azure CLI is not installed" + exit 1 + fi + + # Check if inventory file exists + if [[ ! -f "$INVENTORY_FILE" ]]; then + print_error "Inventory file not found: $INVENTORY_FILE" + exit 1 + fi + + # Check if kubespray directory exists + if [[ ! 
-d "$KUBESPRAY_DIR" ]]; then + print_error "Kubespray directory not found: $KUBESPRAY_DIR" + exit 1 + fi + + print_success "All prerequisites are met" +} + +# Function to get VM details from Azure +get_vm_details() { + local vm_name="$1" + local resource_group="$2" + + print_status "Getting VM details from Azure..." + + # Get VM private IP + local private_ip=$(az vm show --resource-group "$resource_group" --name "$vm_name" --query "privateIps" -o tsv 2>/dev/null) + if [[ -z "$private_ip" ]]; then + print_error "Failed to get private IP for VM: $vm_name" + return 1 + fi + + # Get VM power state + local power_state=$(az vm show --resource-group "$resource_group" --name "$vm_name" --query "powerState" -o tsv 2>/dev/null) + if [[ "$power_state" != "VM running" ]]; then + print_warning "VM is not running. Current state: $power_state" + read -p "Do you want to start the VM? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + az vm start --resource-group "$resource_group" --name "$vm_name" + print_status "Waiting for VM to start..." + sleep 30 + else + print_error "VM must be running to proceed" + return 1 + fi + fi + + echo "$private_ip" +} + +# Function to test SSH connectivity +test_ssh_connectivity() { + local ip_address="$1" + + print_status "Testing SSH connectivity to $ip_address..." + + # Test SSH connection + if timeout 10 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$ANSIBLE_USER@$ip_address" "echo 'SSH connection successful'" 2>/dev/null; then + print_success "SSH connectivity verified" + return 0 + else + print_error "SSH connection failed to $ip_address" + print_warning "Please ensure:" + print_warning "1. VM is running" + print_warning "2. Network security group allows SSH (port 22)" + print_warning "3. SSH service is running on the VM" + return 1 + fi +} + +# Function to update inventory file +update_inventory() { + local vm_name="$1" + local ip_address="$2" + local node_type="$3" + + print_status "Updating inventory file..." + + # Create backup of inventory file + cp "$INVENTORY_FILE" "${INVENTORY_FILE}.backup.$(date +%Y%m%d_%H%M%S)" + + # Add node to inventory based on type + if [[ "$node_type" == "worker" ]]; then + echo "$vm_name ansible_host=$ip_address ansible_user=$ANSIBLE_USER host_name=$vm_name" >> "$INVENTORY_FILE" + print_success "Added worker node to inventory" + elif [[ "$node_type" == "master" ]]; then + echo "$vm_name ansible_host=$ip_address ansible_user=$ANSIBLE_USER etcd_member_name=${vm_name}-etcd host_name=$vm_name" >> "$INVENTORY_FILE" + print_success "Added master node to inventory" + else + print_error "Invalid node type: $node_type" + return 1 + fi +} + +# Function to verify inventory +verify_inventory() { + print_status "Verifying inventory configuration..." + + # Test inventory syntax + if ansible-inventory -i "$INVENTORY_FILE" --list > /dev/null 2>&1; then + print_success "Inventory syntax is valid" + else + print_error "Inventory syntax is invalid" + return 1 + fi + + # Test connectivity to all nodes + print_status "Testing connectivity to all nodes..." + if ansible -i "$INVENTORY_FILE" all -m ping -kK; then + print_success "Connectivity to all nodes verified" + else + print_error "Connectivity test failed" + return 1 + fi +} + +# Function to run kubespray scale playbook +run_scale_playbook() { + print_status "Running Kubespray scale playbook..." 
+ + cd "$(dirname "$INVENTORY_FILE")" + + # Run the scale playbook + if ansible-playbook -i inventory.ini "$KUBESPRAY_DIR/scale.yml" -kK -b; then + print_success "Scale playbook completed successfully" + else + print_error "Scale playbook failed" + return 1 + fi +} + +# Function to verify node addition +verify_node_addition() { + local vm_name="$1" + + print_status "Verifying node addition..." + + # Wait for node to appear + local max_attempts=30 + local attempt=1 + + while [[ $attempt -le $max_attempts ]]; do + if kubectl get nodes | grep -q "$vm_name"; then + print_success "Node $vm_name found in cluster" + break + fi + + print_status "Waiting for node to appear... (attempt $attempt/$max_attempts)" + sleep 10 + ((attempt++)) + done + + if [[ $attempt -gt $max_attempts ]]; then + print_error "Node $vm_name did not appear in cluster" + return 1 + fi + + # Wait for node to be ready + attempt=1 + while [[ $attempt -le $max_attempts ]]; do + local node_status=$(kubectl get nodes "$vm_name" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null) + if [[ "$node_status" == "True" ]]; then + print_success "Node $vm_name is ready" + break + fi + + print_status "Waiting for node to be ready... (attempt $attempt/$max_attempts)" + sleep 10 + ((attempt++)) + done + + if [[ $attempt -gt $max_attempts ]]; then + print_error "Node $vm_name is not ready" + kubectl describe node "$vm_name" + return 1 + fi +} + +# Function to test pod scheduling +test_pod_scheduling() { + local vm_name="$1" + + print_status "Testing pod scheduling on new node..." + + # Create a test pod + local test_pod_name="test-pod-$(date +%s)" + kubectl run "$test_pod_name" --image=nginx --restart=Never --overrides="{\"spec\":{\"nodeSelector\":{\"kubernetes.io/hostname\":\"$vm_name\"}}}" + + # Wait for pod to be scheduled + local max_attempts=30 + local attempt=1 + + while [[ $attempt -le $max_attempts ]]; do + local pod_status=$(kubectl get pod "$test_pod_name" -o jsonpath='{.status.phase}' 2>/dev/null) + if [[ "$pod_status" == "Running" ]]; then + print_success "Test pod is running on node $vm_name" + break + fi + + print_status "Waiting for test pod to be ready... (attempt $attempt/$max_attempts)" + sleep 10 + ((attempt++)) + done + + # Clean up test pod + kubectl delete pod "$test_pod_name" + + if [[ $attempt -gt $max_attempts ]]; then + print_error "Test pod failed to run on node $vm_name" + kubectl describe pod "$test_pod_name" + kubectl delete pod "$test_pod_name" + return 1 + fi +} + +# Function to display final status +display_final_status() { + local vm_name="$1" + + print_success "Node addition completed successfully!" + echo + echo "=== Final Status ===" + echo "Node Name: $vm_name" + echo "Node Status: $(kubectl get nodes "$vm_name" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')" + echo "Node IP: $(kubectl get nodes "$vm_name" -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}')" + echo "Node Capacity: $(kubectl get nodes "$vm_name" -o jsonpath='{.status.capacity.cpu}') CPU, $(kubectl get nodes "$vm_name" -o jsonpath='{.status.capacity.memory}') Memory" + echo + echo "=== Next Steps ===" + echo "1. Monitor the node for any issues" + echo "2. Update monitoring and alerting if needed" + echo "3. Update documentation" + echo "4. 
Consider running node maintenance tasks" +} + +# Main function +main() { + echo "==========================================" + echo "Azure Kubernetes Node Addition Script" + echo "==========================================" + echo + + # Check prerequisites + check_prerequisites + + # Get user input + echo "Please provide the following information:" + echo + + read -p "VM Name: " vm_name + validate_input "$vm_name" || exit 1 + + read -p "Resource Group: " resource_group + validate_input "$resource_group" || exit 1 + + read -p "Node Type (worker/master): " node_type + if [[ "$node_type" != "worker" && "$node_type" != "master" ]]; then + print_error "Node type must be 'worker' or 'master'" + exit 1 + fi + + echo + print_status "Summary:" + echo " VM Name: $vm_name" + echo " Resource Group: $resource_group" + echo " Node Type: $node_type" + echo + + read -p "Proceed with node addition? (y/N): " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + print_status "Operation cancelled" + exit 0 + fi + + # Get VM details + ip_address=$(get_vm_details "$vm_name" "$resource_group") + if [[ $? -ne 0 ]]; then + exit 1 + fi + + print_success "VM IP Address: $ip_address" + + # Test SSH connectivity + test_ssh_connectivity "$ip_address" || exit 1 + + # Update inventory + update_inventory "$vm_name" "$ip_address" "$node_type" || exit 1 + + # Verify inventory + verify_inventory || exit 1 + + # Run scale playbook + run_scale_playbook || exit 1 + + # Verify node addition + verify_node_addition "$vm_name" || exit 1 + + # Test pod scheduling + test_pod_scheduling "$vm_name" || exit 1 + + # Display final status + display_final_status "$vm_name" +} + +# Handle script arguments +if [[ $# -eq 0 ]]; then + main +else + case "$1" in + --help|-h) + echo "Usage: $0 [OPTIONS]" + echo + echo "Options:" + echo " --help, -h Show this help message" + echo " --version, -v Show version information" + echo + echo "This script automates the process of adding new Azure VMs to an existing Kubernetes cluster." + echo "It will prompt for necessary information and guide you through the process." + exit 0 + ;; + --version|-v) + echo "Azure Kubernetes Node Addition Script v1.0" + exit 0 + ;; + *) + print_error "Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + esac +fi diff --git a/docs/bootstrap-k8s-cluster.sh b/docs/bootstrap-k8s-cluster.sh new file mode 100755 index 00000000..8e79f46a --- /dev/null +++ b/docs/bootstrap-k8s-cluster.sh @@ -0,0 +1,394 @@ +#!/bin/bash + +# Freeleaps Kubernetes Cluster Bootstrap Script +# This script bootstraps a complete Kubernetes cluster from Azure VMs + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Configuration +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +FREELEAPS_OPS_DIR="$(dirname "$SCRIPT_DIR")" +INVENTORY_FILE="$FREELEAPS_OPS_DIR/cluster/ansible/manifests/inventory.ini" +KUBESPRAY_DIR="$FREELEAPS_OPS_DIR/3rd/kubespray" +MANIFESTS_DIR="$FREELEAPS_OPS_DIR/cluster/manifests" +BIN_DIR="$FREELEAPS_OPS_DIR/cluster/bin" + +# Function to print colored output +print_status() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Function to check prerequisites +check_prerequisites() { + print_status "Checking prerequisites..." + + # Check if we're in the right directory + if [[ ! 
-f "$INVENTORY_FILE" ]]; then + print_error "Inventory file not found: $INVENTORY_FILE" + print_error "Please run this script from the freeleaps-ops/docs directory" + exit 1 + fi + + # Check if kubespray exists + if [[ ! -d "$KUBESPRAY_DIR" ]]; then + print_error "Kubespray directory not found: $KUBESPRAY_DIR" + exit 1 + fi + + # Check required tools + local missing_tools=() + + if ! command -v ansible &> /dev/null; then + missing_tools+=("ansible") + fi + + if ! command -v az &> /dev/null; then + missing_tools+=("azure-cli") + fi + + if ! command -v kubectl &> /dev/null; then + missing_tools+=("kubectl") + fi + + if [[ ${#missing_tools[@]} -gt 0 ]]; then + print_error "Missing required tools: ${missing_tools[*]}" + print_warning "Please install missing tools before proceeding" + exit 1 + fi + + print_success "All prerequisites are met" +} + +# Function to verify Azure VMs +verify_azure_vms() { + print_status "Verifying Azure VMs..." + + # Get VMs from inventory + local vms=() + while IFS= read -r line; do + if [[ $line =~ ^[a-zA-Z0-9-]+ ]]; then + vm_name=$(echo "$line" | awk '{print $1}') + vms+=("$vm_name") + fi + done < "$INVENTORY_FILE" + + print_status "Found VMs in inventory: ${vms[*]}" + + # Check VM status in Azure + for vm in "${vms[@]}"; do + local power_state=$(az vm show --resource-group k8s --name "$vm" --query "powerState" -o tsv 2>/dev/null) + if [[ "$power_state" != "VM running" ]]; then + print_warning "VM $vm is not running (state: $power_state)" + read -p "Do you want to start VM $vm? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + az vm start --resource-group k8s --name "$vm" + print_status "Starting VM $vm..." + sleep 30 + fi + else + print_success "VM $vm is running" + fi + done +} + +# Function to test connectivity +test_connectivity() { + print_status "Testing connectivity to all VMs..." + + cd "$(dirname "$INVENTORY_FILE")" + + if ansible -i inventory.ini all -m ping -kK; then + print_success "Connectivity to all VMs verified" + else + print_error "Connectivity test failed" + print_warning "Please check:" + print_warning "1. VMs are running" + print_warning "2. Network security groups allow SSH (port 22)" + print_warning "3. SSH credentials are correct" + exit 1 + fi +} + +# Function to bootstrap Kubernetes cluster +bootstrap_cluster() { + print_status "Bootstrapping Kubernetes cluster..." + + cd "$KUBESPRAY_DIR" + + print_status "Running Kubespray cluster installation..." + print_warning "This process may take 15-30 minutes..." + + if ansible-playbook -i ../../cluster/ansible/manifests/inventory.ini ./cluster.yml -kK -b; then + print_success "Kubernetes cluster bootstrapped successfully" + else + print_error "Cluster bootstrap failed" + print_warning "Check the Ansible output for errors" + exit 1 + fi +} + +# Function to get kubeconfig +get_kubeconfig() { + print_status "Retrieving kubeconfig..." + + # Get the first master node IP + local master_ip=$(grep -A 10 "\[kube_control_plane\]" "$INVENTORY_FILE" | grep ansible_host | head -1 | awk '{print $2}' | cut -d'=' -f2) + + if [[ -z "$master_ip" ]]; then + print_error "Could not find master node IP in inventory" + exit 1 + fi + + print_status "Getting kubeconfig from master node: $master_ip" + + # Create .kube directory if it doesn't exist + mkdir -p ~/.kube + + # Get kubeconfig from master node + ssh wwwadmin@mathmast.com@"$master_ip" "sudo cat /etc/kubernetes/admin.conf" > ~/.kube/config + + if [[ $? 
-eq 0 ]]; then + print_success "Kubeconfig retrieved successfully" + else + print_error "Failed to retrieve kubeconfig" + exit 1 + fi +} + +# Function to verify cluster +verify_cluster() { + print_status "Verifying cluster installation..." + + # Wait for cluster to be ready + local max_attempts=30 + local attempt=1 + + while [[ $attempt -le $max_attempts ]]; do + if kubectl get nodes &> /dev/null; then + print_success "Cluster is accessible" + break + fi + + print_status "Waiting for cluster to be ready... (attempt $attempt/$max_attempts)" + sleep 30 + ((attempt++)) + done + + if [[ $attempt -gt $max_attempts ]]; then + print_error "Cluster verification failed" + print_warning "Troubleshooting steps:" + print_warning "1. Check VM resources (CPU, memory)" + print_warning "2. Check network connectivity between nodes" + print_warning "3. Check Ansible logs for errors" + print_warning "4. Verify inventory file configuration" + exit 1 + fi + + # Check node status + print_status "Checking node status..." + kubectl get nodes + + # Wait for all nodes to be ready + local ready_nodes=$(kubectl get nodes --no-headers | grep -c "Ready") + local total_nodes=$(kubectl get nodes --no-headers | wc -l) + + if [[ $ready_nodes -eq $total_nodes ]]; then + print_success "All $total_nodes nodes are ready" + else + print_warning "Only $ready_nodes/$total_nodes nodes are ready" + kubectl get nodes + fi + + # Check system pods + print_status "Checking system pods..." + kubectl get pods -n kube-system + + # Wait for critical system pods + print_status "Waiting for critical system pods..." + local critical_pods=("kube-apiserver" "kube-controller-manager" "kube-scheduler" "etcd") + + for pod_prefix in "${critical_pods[@]}"; do + local max_pod_attempts=20 + local pod_attempt=1 + + while [[ $pod_attempt -le $max_pod_attempts ]]; do + if kubectl get pods -n kube-system | grep -q "$pod_prefix.*Running"; then + print_success "$pod_prefix is running" + break + fi + + if [[ $pod_attempt -eq $max_pod_attempts ]]; then + print_warning "$pod_prefix is not running" + kubectl get pods -n kube-system | grep "$pod_prefix" + fi + + sleep 10 + ((pod_attempt++)) + done + done + + # Check cluster info + print_status "Checking cluster info..." + kubectl cluster-info +} + +# Function to deploy infrastructure +deploy_infrastructure() { + print_status "Deploying infrastructure components..." + + cd "$MANIFESTS_DIR" + + # Deploy in order + local components=( + "freeleaps-controls-system" + "freeleaps-devops-system" + "freeleaps-monitoring-system" + "freeleaps-logging-system" + "freeleaps-data-platform" + ) + + for component in "${components[@]}"; do + if [[ -d "$component" ]]; then + print_status "Deploying $component..." + kubectl apply -f "$component/" + + # Wait for deployment to stabilize + print_status "Waiting for $component to stabilize..." + sleep 30 + else + print_warning "Component directory not found: $component" + fi + done + + print_success "Infrastructure deployment completed" +} + +# Function to setup authentication +setup_authentication() { + print_status "Setting up authentication..." + + cd "$BIN_DIR" + + if [[ -f "freeleaps-cluster-authenticator" ]]; then + print_status "Running authentication setup..." + ./freeleaps-cluster-authenticator auth + else + print_warning "Authentication script not found" + print_warning "Please run authentication setup manually" + fi +} + +# Function to display final status +display_final_status() { + print_success "Kubernetes cluster bootstrap completed!" 
+ echo + echo "=== Cluster Status ===" + kubectl get nodes + echo + echo "=== System Pods ===" + kubectl get pods -n kube-system + echo + echo "=== Infrastructure Status ===" + kubectl get pods --all-namespaces | grep -E "(argocd|cert-manager|prometheus|grafana)" + echo + echo "=== Next Steps ===" + echo "1. Verify all components are running: kubectl get pods --all-namespaces" + echo "2. Access ArgoCD: kubectl port-forward svc/argocd-server -n freeleaps-devops-system 8080:80" + echo "3. Access Grafana: kubectl port-forward svc/kube-prometheus-stack-grafana -n freeleaps-monitoring-system 3000:80" + echo "4. Setup authentication: cd $BIN_DIR && ./freeleaps-cluster-authenticator auth" + echo "5. Deploy applications via ArgoCD" +} + +# Main function +main() { + echo "==========================================" + echo "Freeleaps Kubernetes Cluster Bootstrap" + echo "==========================================" + echo + + # Check prerequisites + check_prerequisites + + # Verify Azure VMs + verify_azure_vms + + # Test connectivity + test_connectivity + + # Bootstrap cluster + bootstrap_cluster + + # Get kubeconfig + get_kubeconfig + + # Verify cluster + verify_cluster + + # Deploy infrastructure + deploy_infrastructure + + # Setup authentication + setup_authentication + + # Display final status + display_final_status +} + +# Handle script arguments +if [[ $# -eq 0 ]]; then + main +else + case "$1" in + --help|-h) + echo "Usage: $0 [OPTIONS]" + echo + echo "Options:" + echo " --help, -h Show this help message" + echo " --verify Only verify prerequisites and connectivity" + echo " --bootstrap Only bootstrap the cluster (skip infrastructure)" + echo + echo "This script bootstraps a complete Kubernetes cluster from Azure VMs." + exit 0 + ;; + --verify) + check_prerequisites + verify_azure_vms + test_connectivity + print_success "Verification completed successfully" + ;; + --bootstrap) + check_prerequisites + verify_azure_vms + test_connectivity + bootstrap_cluster + get_kubeconfig + verify_cluster + print_success "Cluster bootstrap completed" + ;; + *) + print_error "Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + esac +fi diff --git a/docs/examples/basic-pod.yaml b/docs/examples/basic-pod.yaml new file mode 100644 index 00000000..14492e67 --- /dev/null +++ b/docs/examples/basic-pod.yaml @@ -0,0 +1,141 @@ +# Basic Pod Example with Detailed Comments +# This example shows a simple nginx pod with health checks and resource limits +# +# 🎯 What this does: Creates a single nginx web server pod that: +# - Runs nginx web server on port 80 +# - Has health checks to ensure it's working +# - Has resource limits to prevent it from consuming too much CPU/memory +# - Includes security best practices +# +# πŸ“š EDUCATIONAL EXAMPLE (not from your codebase) +# This is a learning example. Your codebase uses Helm charts and Deployments instead of direct Pods. +# +# ⚠️ IMPORTANT: Direct Pod creation is NOT good practice for production! +# This example is for learning purposes only. 
In production, you should use: +# - Deployments (for applications) +# - StatefulSets (for databases) +# - Helm charts (for complex applications) +# - kubectl apply (for declarative deployments) + +apiVersion: v1 # ← Kubernetes API version for Pod resources +kind: Pod # ← Resource type: Pod (smallest deployable unit) +metadata: # ← Metadata section: describes the pod + name: nginx-pod # ← Unique name for this pod in the namespace + namespace: default # ← Namespace where pod will be created (default if not specified) + labels: # ← Labels for organizing and selecting pods + app: nginx # ← Label: identifies this as an nginx application + version: v1 # ← Label: version of the application + environment: development # ← Label: environment this pod runs in +spec: # ← Specification: defines what the pod should do + containers: # ← List of containers in this pod + - name: nginx # ← Container name (used for logs, exec, etc.) + image: nginx:latest # ← Docker image to run (nginx with latest tag) + ports: # ← Ports the container exposes + - containerPort: 80 # ← Port 80 inside the container (nginx default) + name: http # ← Name for this port (useful for service references) + protocol: TCP # ← Protocol (TCP is default) + + # πŸ”§ Resource Management + # These limits prevent the pod from consuming too many resources + # Think of it like setting a budget for CPU and memory usage + resources: + requests: # ← Minimum resources guaranteed to the pod + memory: "64Mi" # ← 64 megabytes of RAM (minimum guaranteed) + cpu: "250m" # ← 0.25 CPU cores (250 millicores = 25% of 1 CPU) + limits: # ← Maximum resources the pod can use + memory: "128Mi" # ← 128 megabytes of RAM (maximum allowed) + cpu: "500m" # ← 0.5 CPU cores (500 millicores = 50% of 1 CPU) + + # πŸ₯ Health Checks + # These tell Kubernetes how to check if the pod is healthy + # Like a doctor checking your vital signs! + livenessProbe: # ← Checks if the pod is alive (restarts if failed) + httpGet: # ← Use HTTP GET request to check health + path: / # ← Check the root path of nginx + port: 80 # ← Check on port 80 + initialDelaySeconds: 30 # ← Wait 30 seconds before first check (nginx startup time) + periodSeconds: 10 # ← Check every 10 seconds + timeoutSeconds: 5 # ← Fail if response takes longer than 5 seconds + failureThreshold: 3 # ← Restart pod after 3 consecutive failures + + readinessProbe: # ← Checks if the pod is ready to receive traffic + httpGet: # ← Use HTTP GET request to check readiness + path: / # ← Check the root path + port: 80 # ← Check on port 80 + initialDelaySeconds: 5 # ← Wait 5 seconds before first check + periodSeconds: 5 # ← Check every 5 seconds + timeoutSeconds: 3 # ← Fail if response takes longer than 3 seconds + failureThreshold: 3 # ← Mark as not ready after 3 consecutive failures + + # πŸ”’ Security Context + # These settings make the pod more secure + # Like locking your doors and windows! 
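+  # ⚠️ Caveat: with readOnlyRootFilesystem: true and ALL capabilities dropped, the
+  # stock nginx image may not start as-is - it usually needs writable paths such as
+  # /var/cache/nginx and /var/run (extra emptyDir mounts), and a non-root user
+  # typically cannot bind port 80 without CAP_NET_BIND_SERVICE. The
+  # nginxinc/nginx-unprivileged image (listening on 8080) is a common workaround.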
+ securityContext: + allowPrivilegeEscalation: false # ← Prevent the container from gaining root privileges + readOnlyRootFilesystem: true # ← Make the root filesystem read-only (more secure) + capabilities: # ← Remove unnecessary Linux capabilities + drop: # ← Drop these capabilities + - ALL # ← Drop ALL capabilities (most restrictive) + runAsNonRoot: true # ← Don't run as root user + runAsUser: 101 # ← Run as user ID 101 (nginx user) + + # πŸ“ Volume Mounts + # These allow the container to access files from the pod + volumeMounts: + - name: tmp-volume # ← Name of the volume to mount + mountPath: /tmp # ← Where to mount it inside the container + readOnly: false # ← Allow read/write access + + # πŸ’Ύ Volumes + # These define storage that can be mounted into containers + volumes: + - name: tmp-volume # ← Volume name (matches volumeMounts above) + emptyDir: {} # ← Empty directory volume (temporary, deleted when pod dies) + # emptyDir creates a temporary directory that exists as long as the pod exists + # Perfect for temporary files, caches, etc. + +# πŸš€ How to use this (FOR LEARNING ONLY): +# kubectl apply -f basic-pod.yaml +# kubectl get pods # Check if pod is running +# kubectl logs nginx-pod # View nginx logs +# kubectl port-forward nginx-pod 8080:80 # Access nginx at http://localhost:8080 +# kubectl exec -it nginx-pod -- /bin/bash # Get a shell inside the pod + +# 🏭 YOUR CODEBASE COMPARISON: +# +# ❌ Your codebase does NOT create Pods directly like this +# βœ… Your codebase uses Helm charts and Deployments instead +# +# Example from your codebase: +# - Helm charts in: freeleaps-ops/freeleaps/helm-pkg/ +# - Deployments with replicas, rolling updates, etc. +# - Automatic pod creation via Deployment controllers +# +# Commands your codebase actually uses: +# helm install/upgrade --namespace -f +# kubectl get pods -n -l app.kubernetes.io/name= + +# 🎯 PRODUCTION BEST PRACTICES: +# +# ❌ DON'T DO THIS (bad practices): +# kubectl run nginx --image=nginx:latest # Creates standalone Pod +# kubectl run my-app --image=my-app:latest --port=8080 # No self-healing +# kubectl run database --image=postgres:13 --port=5432 # No scaling +# +# βœ… DO THIS INSTEAD (good practices): +# kubectl create deployment nginx --image=nginx:latest # Creates Deployment +# helm install my-app ./my-app-chart --namespace my-app # Use Helm charts +# kubectl apply -f deployment.yaml # Declarative deployment +# kubectl apply -f statefulset.yaml # For databases +# +# πŸ”§ When kubectl run is OK (limited use cases): +# kubectl run debug-pod --image=busybox --rm -it --restart=Never -- nslookup my-service +# kubectl run test-pod --image=nginx --rm -it --restart=Never -- curl http://my-service:80 + +# πŸ“š Learn more: +# - Pods: https://kubernetes.io/docs/concepts/workloads/pods/ +# - Deployments: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/ +# - Helm: https://helm.sh/docs/ +# - Health Checks: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/ +# - Security Context: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ +# - Resource Management: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ diff --git a/docs/examples/complete-application-example.yaml b/docs/examples/complete-application-example.yaml new file mode 100644 index 00000000..6ecc15eb --- /dev/null +++ b/docs/examples/complete-application-example.yaml @@ -0,0 +1,468 @@ +# Complete Application Example +# This demonstrates a full web application 
with database, API, and monitoring + +# 1. Namespace +apiVersion: v1 +kind: Namespace +metadata: + name: complete-app + labels: + environment: production + team: backend + app: complete-app +--- +# 2. ConfigMap for application configuration +apiVersion: v1 +kind: ConfigMap +metadata: + name: app-config + namespace: complete-app +data: + DB_HOST: "postgres-service" + DB_PORT: "5432" + DB_NAME: "myapp" + REDIS_HOST: "redis-service" + REDIS_PORT: "6379" + ENVIRONMENT: "production" + LOG_LEVEL: "INFO" + + application.properties: | + server.port=8080 + logging.level=INFO + cache.enabled=true + session.timeout=3600 + cors.allowed-origins=* +--- +# 3. Secret for sensitive data +apiVersion: v1 +kind: Secret +metadata: + name: app-secrets + namespace: complete-app +type: Opaque +data: + DB_USERNAME: YWRtaW4= # admin + DB_PASSWORD: c2VjcmV0MTIz # secret123 + API_KEY: bXktYXBpLWtleQ== # my-api-key + JWT_SECRET: bXktand0LXNlY3JldA== # my-jwt-secret +--- +# 4. PVC for database +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: postgres-pvc + namespace: complete-app +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + storageClassName: managed-premium +--- +# 5. PVC for application data +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: app-data-pvc + namespace: complete-app +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi + storageClassName: managed-premium +--- +# 6. PostgreSQL Database Deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: postgres + namespace: complete-app + labels: + app: postgres + component: database +spec: + replicas: 1 + selector: + matchLabels: + app: postgres + template: + metadata: + labels: + app: postgres + component: database + spec: + securityContext: + runAsNonRoot: true + runAsUser: 999 + fsGroup: 999 + containers: + - name: postgres + image: postgres:13 + ports: + - containerPort: 5432 + env: + - name: POSTGRES_DB + value: "myapp" + - name: POSTGRES_USER + valueFrom: + secretKeyRef: + name: app-secrets + key: DB_USERNAME + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: app-secrets + key: DB_PASSWORD + volumeMounts: + - name: postgres-data + mountPath: /var/lib/postgresql/data + resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "500m" + livenessProbe: + exec: + command: + - pg_isready + - -U + - admin + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + exec: + command: + - pg_isready + - -U + - admin + initialDelaySeconds: 5 + periodSeconds: 5 + volumes: + - name: postgres-data + persistentVolumeClaim: + claimName: postgres-pvc +--- +# 7. Redis Cache Deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redis + namespace: complete-app + labels: + app: redis + component: cache +spec: + replicas: 1 + selector: + matchLabels: + app: redis + template: + metadata: + labels: + app: redis + component: cache + spec: + securityContext: + runAsNonRoot: true + runAsUser: 999 + fsGroup: 999 + containers: + - name: redis + image: redis:6-alpine + ports: + - containerPort: 6379 + resources: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "256Mi" + cpu: "200m" + livenessProbe: + exec: + command: + - redis-cli + - ping + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + exec: + command: + - redis-cli + - ping + initialDelaySeconds: 5 + periodSeconds: 5 +--- +# 8. 
Web Application Deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: web-app + namespace: complete-app + labels: + app: web-app + component: frontend +spec: + replicas: 3 + selector: + matchLabels: + app: web-app + template: + metadata: + labels: + app: web-app + component: frontend + spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 2000 + containers: + - name: web-app + image: nginx:latest + ports: + - containerPort: 80 + resources: + requests: + memory: "64Mi" + cpu: "100m" + limits: + memory: "128Mi" + cpu: "200m" + livenessProbe: + httpGet: + path: / + port: 80 + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: / + port: 80 + initialDelaySeconds: 5 + periodSeconds: 5 + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + volumeMounts: + - name: tmp-volume + mountPath: /tmp + volumes: + - name: tmp-volume + emptyDir: {} +--- +# 9. API Application Deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api-app + namespace: complete-app + labels: + app: api-app + component: backend +spec: + replicas: 2 + selector: + matchLabels: + app: api-app + template: + metadata: + labels: + app: api-app + component: backend + spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 2000 + containers: + - name: api-app + image: python:3.9-slim + ports: + - containerPort: 8080 + env: + - name: DB_HOST + valueFrom: + configMapKeyRef: + name: app-config + key: DB_HOST + - name: DB_PORT + valueFrom: + configMapKeyRef: + name: app-config + key: DB_PORT + - name: DB_NAME + valueFrom: + configMapKeyRef: + name: app-config + key: DB_NAME + - name: DB_USERNAME + valueFrom: + secretKeyRef: + name: app-secrets + key: DB_USERNAME + - name: DB_PASSWORD + valueFrom: + secretKeyRef: + name: app-secrets + key: DB_PASSWORD + - name: REDIS_HOST + valueFrom: + configMapKeyRef: + name: app-config + key: REDIS_HOST + - name: REDIS_PORT + valueFrom: + configMapKeyRef: + name: app-config + key: REDIS_PORT + - name: API_KEY + valueFrom: + secretKeyRef: + name: app-secrets + key: API_KEY + - name: JWT_SECRET + valueFrom: + secretKeyRef: + name: app-secrets + key: JWT_SECRET + volumeMounts: + - name: app-data + mountPath: /app/data + - name: config-volume + mountPath: /app/config + resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "500m" + livenessProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 5 + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + volumes: + - name: app-data + persistentVolumeClaim: + claimName: app-data-pvc + - name: config-volume + configMap: + name: app-config +--- +# 10. 
Services +apiVersion: v1 +kind: Service +metadata: + name: postgres-service + namespace: complete-app +spec: + type: ClusterIP + selector: + app: postgres + ports: + - port: 5432 + targetPort: 5432 + protocol: TCP +--- +apiVersion: v1 +kind: Service +metadata: + name: redis-service + namespace: complete-app +spec: + type: ClusterIP + selector: + app: redis + ports: + - port: 6379 + targetPort: 6379 + protocol: TCP +--- +apiVersion: v1 +kind: Service +metadata: + name: web-app-service + namespace: complete-app +spec: + type: ClusterIP + selector: + app: web-app + ports: + - port: 80 + targetPort: 80 + protocol: TCP +--- +apiVersion: v1 +kind: Service +metadata: + name: api-app-service + namespace: complete-app +spec: + type: ClusterIP + selector: + app: api-app + ports: + - port: 8080 + targetPort: 8080 + protocol: TCP +--- +# 11. Ingress +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: complete-app-ingress + namespace: complete-app + annotations: + nginx.ingress.kubernetes.io/rewrite-target: / + cert-manager.io/cluster-issuer: "letsencrypt-prod" + nginx.ingress.kubernetes.io/cors-allow-origin: "*" +spec: + tls: + - hosts: + - myapp.example.com + - api.myapp.example.com + secretName: myapp-tls + rules: + - host: myapp.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: web-app-service + port: + number: 80 + - host: api.myapp.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: api-app-service + port: + number: 8080 diff --git a/docs/examples/configmap-secret-example.yaml b/docs/examples/configmap-secret-example.yaml new file mode 100644 index 00000000..afa09ba1 --- /dev/null +++ b/docs/examples/configmap-secret-example.yaml @@ -0,0 +1,100 @@ +# ConfigMap for application configuration +apiVersion: v1 +kind: ConfigMap +metadata: + name: app-config + namespace: my-app + labels: + app: web-app +data: + # Environment variables + DB_HOST: "postgres-service" + DB_PORT: "5432" + ENVIRONMENT: "production" + LOG_LEVEL: "INFO" + + # File-like content + application.properties: | + server.port=8080 + logging.level=INFO + cache.enabled=true + session.timeout=3600 +--- +# Secret for sensitive data +apiVersion: v1 +kind: Secret +metadata: + name: db-secret + namespace: my-app + labels: + app: web-app +type: Opaque +data: + # Base64 encoded values + DB_USERNAME: YWRtaW4= # admin + DB_PASSWORD: c2VjcmV0MTIz # secret123 + API_KEY: bXktYXBpLWtleQ== # my-api-key +--- +# Deployment using ConfigMap and Secret +apiVersion: apps/v1 +kind: Deployment +metadata: + name: web-app-with-config + namespace: my-app +spec: + replicas: 2 + selector: + matchLabels: + app: web-app + template: + metadata: + labels: + app: web-app + spec: + containers: + - name: web-app + image: nginx:latest + ports: + - containerPort: 80 + env: + # Environment variables from ConfigMap + - name: DB_HOST + valueFrom: + configMapKeyRef: + name: app-config + key: DB_HOST + - name: DB_PORT + valueFrom: + configMapKeyRef: + name: app-config + key: DB_PORT + - name: ENVIRONMENT + valueFrom: + configMapKeyRef: + name: app-config + key: ENVIRONMENT + # Environment variables from Secret + - name: DB_USERNAME + valueFrom: + secretKeyRef: + name: db-secret + key: DB_USERNAME + - name: DB_PASSWORD + valueFrom: + secretKeyRef: + name: db-secret + key: DB_PASSWORD + volumeMounts: + # Mount ConfigMap as files + - name: config-volume + mountPath: /app/config + - name: secret-volume + mountPath: /app/secrets + readOnly: true + volumes: + - name: config-volume + 
configMap: + name: app-config + - name: secret-volume + secret: + secretName: db-secret diff --git a/docs/examples/deployment-example.yaml b/docs/examples/deployment-example.yaml new file mode 100644 index 00000000..17809c04 --- /dev/null +++ b/docs/examples/deployment-example.yaml @@ -0,0 +1,158 @@ +# Production-Ready Deployment Example with Detailed Comments +# This example shows a deployment that creates and manages multiple nginx pods +# +# 🎯 What this does: Creates a deployment that: +# - Runs 3 copies of nginx web server (replicas) +# - Automatically restarts failed pods +# - Supports rolling updates (zero downtime) +# - Includes security, health checks, and resource management +# - Can be easily scaled up or down + +# πŸ“Š ASCII Diagram: How Deployments Work +# +# β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +# β”‚ DEPLOYMENT β”‚ +# β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +# β”‚ β”‚ name: web-app β”‚ β”‚ +# β”‚ β”‚ replicas: 3 β”‚ β”‚ +# β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +# β”‚ β”‚ β”‚ +# β”‚ β–Ό β”‚ +# β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +# β”‚ β”‚ POD TEMPLATE β”‚ β”‚ +# β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +# β”‚ β”‚ β”‚ POD 1 β”‚ β”‚ POD 2 β”‚ β”‚ POD 3 β”‚ β”‚ β”‚ +# β”‚ β”‚ β”‚ nginx:latestβ”‚ β”‚ nginx:latestβ”‚ β”‚ nginx:latestβ”‚ β”‚ β”‚ +# β”‚ β”‚ β”‚ port: 80 β”‚ β”‚ port: 80 β”‚ β”‚ port: 80 β”‚ β”‚ β”‚ +# β”‚ β”‚ β”‚ IP: 10.0.1.1β”‚ β”‚ IP: 10.0.1.2β”‚ β”‚ IP: 10.0.1.3β”‚ β”‚ β”‚ +# β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +# β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +# β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +# +# πŸ”„ Rolling Update Process: +# β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +# β”‚ OLD POD β”‚ β”‚ NEW POD β”‚ β”‚ OLD POD β”‚ +# β”‚ nginx:v1.0 β”‚ β”‚ nginx:v1.1 β”‚ β”‚ nginx:v1.0 β”‚ +# β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +# β”‚ β”‚ β”‚ +# β–Ό β–Ό β–Ό +# β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +# β”‚ NEW POD β”‚ β”‚ NEW POD β”‚ β”‚ NEW POD β”‚ +# β”‚ nginx:v1.1 β”‚ β”‚ nginx:v1.1 β”‚ β”‚ nginx:v1.1 β”‚ +# β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + +apiVersion: apps/v1 # ← Kubernetes API version for Deployment resources 
+kind: Deployment # ← Resource type: Deployment (manages multiple pods) +metadata: # ← Metadata section: describes the deployment + name: web-app # ← Unique name for this deployment + namespace: my-app # ← Namespace where deployment will be created + labels: # ← Labels for organizing and selecting deployments + app: web-app # ← Label: identifies this as a web application + version: v1 # ← Label: version of the application + environment: production # ← Label: environment this runs in + team: backend # ← Label: team responsible for this app +spec: # ← Specification: defines what the deployment should do + replicas: 3 # ← Number of pod copies to run (3 nginx instances) + # Think of replicas like having 3 backup singers - if one gets sick, + # the show goes on with the other 2! + + selector: # ← How to find the pods this deployment manages + matchLabels: # ← Match pods with these labels + app: web-app # ← Only manage pods with label app=web-app + + template: # ← Template for creating new pods + metadata: # ← Metadata for pods created from this template + labels: # ← Labels applied to all pods created by this deployment + app: web-app # ← Must match selector above + version: v1 # ← Version label for tracking + environment: production # ← Environment label + team: backend # ← Team label + + spec: # ← Pod specification (same as basic-pod.yaml) + # πŸ”’ Pod-Level Security Context + # These settings apply to the entire pod + securityContext: + runAsNonRoot: true # ← Don't run any container as root + runAsUser: 1000 # ← Run as user ID 1000 + fsGroup: 2000 # ← Set group ID for mounted volumes + + containers: # ← List of containers in each pod + - name: web-app # ← Container name + image: nginx:latest # ← Docker image to run + ports: # ← Ports the container exposes + - containerPort: 80 # ← Port 80 inside the container + name: http # ← Name for this port + # πŸ”§ Resource Management + # These limits prevent pods from consuming too many resources + # Like setting a budget for each pod + resources: + requests: # ← Minimum resources guaranteed to each pod + memory: "64Mi" # ← 64 megabytes of RAM (minimum guaranteed) + cpu: "250m" # ← 0.25 CPU cores (250 millicores = 25% of 1 CPU) + limits: # ← Maximum resources each pod can use + memory: "128Mi" # ← 128 megabytes of RAM (maximum allowed) + cpu: "500m" # ← 0.5 CPU cores (500 millicores = 50% of 1 CPU) + + # πŸ₯ Health Checks + # These tell Kubernetes how to check if each pod is healthy + # Like having a health monitor for each pod + livenessProbe: # ← Checks if the pod is alive (restarts if failed) + httpGet: # ← Use HTTP GET request to check health + path: / # ← Check the root path of nginx + port: 80 # ← Check on port 80 + initialDelaySeconds: 30 # ← Wait 30 seconds before first check + periodSeconds: 10 # ← Check every 10 seconds + timeoutSeconds: 5 # ← Fail if response takes longer than 5 seconds + failureThreshold: 3 # ← Restart pod after 3 consecutive failures + + readinessProbe: # ← Checks if the pod is ready to receive traffic + httpGet: # ← Use HTTP GET request to check readiness + path: / # ← Check the root path + port: 80 # ← Check on port 80 + initialDelaySeconds: 5 # ← Wait 5 seconds before first check + periodSeconds: 5 # ← Check every 5 seconds + timeoutSeconds: 3 # ← Fail if response takes longer than 3 seconds + failureThreshold: 3 # ← Mark as not ready after 3 consecutive failures + + # πŸ”’ Container-Level Security Context + # These settings make each container more secure + securityContext: + allowPrivilegeEscalation: false # ← 
Prevent gaining root privileges + readOnlyRootFilesystem: true # ← Make root filesystem read-only + capabilities: # ← Remove unnecessary Linux capabilities + drop: # ← Drop these capabilities + - ALL # ← Drop ALL capabilities (most restrictive) + + # πŸ“ Volume Mounts + # These allow the container to access files from the pod + volumeMounts: + - name: tmp-volume # ← Name of the volume to mount + mountPath: /tmp # ← Where to mount it inside the container + + # πŸ’Ύ Volumes + # These define storage that can be mounted into containers + volumes: + - name: tmp-volume # ← Volume name (matches volumeMounts above) + emptyDir: {} # ← Empty directory volume (temporary) + +# πŸš€ How to use this: +# kubectl apply -f deployment-example.yaml +# kubectl get deployments # Check deployment status +# kubectl get pods -l app=web-app # See all pods created by this deployment +# kubectl scale deployment web-app --replicas=5 # Scale up to 5 replicas +# kubectl set image deployment/web-app web-app=nginx:1.21 # Update to new version +# kubectl rollout status deployment/web-app # Check rollout progress +# kubectl rollout undo deployment/web-app # Rollback to previous version + +# πŸ“Š What happens when you apply this: +# 1. Kubernetes creates 3 nginx pods +# 2. Each pod runs nginx on port 80 +# 3. Health checks ensure pods are working +# 4. If a pod fails, deployment automatically creates a new one +# 5. Load balancer can send traffic to any of the 3 pods + +# πŸ“š Learn more: +# - Deployments: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/ +# - Rolling Updates: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#rolling-update-deployment +# - Scaling: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#scaling-a-deployment +# - Rollbacks: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#rolling-back-a-deployment diff --git a/docs/examples/ingress-example.yaml b/docs/examples/ingress-example.yaml new file mode 100644 index 00000000..9333dd2f --- /dev/null +++ b/docs/examples/ingress-example.yaml @@ -0,0 +1,265 @@ +# Enhanced Ingress Example with Detailed Comments +# This example shows how to expose your applications externally with SSL/TLS +# +# 🎯 What this does: Creates an Ingress that: +# - Routes traffic from external domains to your services +# - Automatically handles SSL/TLS certificates +# - Provides load balancing across multiple pods +# - Supports path-based routing (different URLs to different services) +# - Includes security features like rate limiting and CORS + +# πŸ“Š ASCII Diagram: How Ingress Works in Your Cluster +# +# β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +# β”‚ INTERNET β”‚ +# β”‚ β”‚ +# β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +# β”‚ β”‚ Browser β”‚ β”‚ Mobile β”‚ β”‚ API β”‚ β”‚ Other β”‚ β”‚ +# β”‚ β”‚ β”‚ β”‚ App β”‚ β”‚ Client β”‚ β”‚ Clients β”‚ β”‚ +# β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +# β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +# β”‚ 
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +# β”‚ β”‚ β”‚ β”‚ +# β”‚ β–Ό β–Ό β”‚ +# β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +# β”‚ β”‚ AZURE LOAD BALANCER β”‚ β”‚ +# β”‚ β”‚ IP: 4.155.160.32 (prod-usw2-k8s-freeleaps-lb-fe-ip) β”‚ β”‚ +# β”‚ β”‚ Port: 80/443 β”‚ β”‚ +# β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +# β”‚ β”‚ β”‚ +# β”‚ β–Ό β”‚ +# β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +# β”‚ β”‚ NGINX INGRESS CONTROLLER β”‚ β”‚ +# β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +# β”‚ β”‚ β”‚ Pod: ingress-nginx-controller-abc123 β”‚ β”‚ β”‚ +# β”‚ β”‚ β”‚ IP: 10.0.1.100 Port: 80/443 β”‚ β”‚ β”‚ +# β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +# β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +# β”‚ β”‚ β”‚ +# β”‚ β–Ό β”‚ +# β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +# β”‚ β”‚ INGRESS RULES β”‚ β”‚ +# β”‚ β”‚ β”‚ β”‚ +# β”‚ β”‚ argo.mathmast.com β†’ argo-cd-server:80 β”‚ β”‚ +# β”‚ β”‚ gitea.freeleaps.mathmast.com β†’ gitea-http:3000 β”‚ β”‚ +# β”‚ β”‚ magicleaps.mathmast.com β†’ magicleaps-frontend-service:80 β”‚ β”‚ +# β”‚ β”‚ alpha.magicleaps.mathmast.com β†’ magicleaps-frontend-service:80 β”‚ β”‚ +# β”‚ β”‚ β”‚ β”‚ +# β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +# β”‚ β”‚ β”‚ +# β”‚ β–Ό β”‚ +# β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +# β”‚ β”‚ KUBERNETES SERVICES β”‚ β”‚ +# β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +# β”‚ β”‚ β”‚argo-cd-svc β”‚ β”‚gitea-http β”‚ β”‚magic-front β”‚ β”‚magic-api β”‚ β”‚ β”‚ +# β”‚ β”‚ β”‚ClusterIP β”‚ β”‚ClusterIP β”‚ β”‚ClusterIP β”‚ β”‚ClusterIP β”‚ β”‚ β”‚ +# β”‚ β”‚ β”‚10.0.1.10 β”‚ β”‚10.0.1.11 β”‚ β”‚10.0.1.12 β”‚ β”‚10.0.1.13 β”‚ β”‚ β”‚ +# β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ 
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +# β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +# β”‚ β”‚ β”‚ +# β”‚ β–Ό β”‚ +# β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +# β”‚ β”‚ APPLICATION PODS β”‚ β”‚ +# β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +# β”‚ β”‚ β”‚argo-cd-pod β”‚ β”‚gitea-pod β”‚ β”‚magic-front β”‚ β”‚magic-api β”‚ β”‚ β”‚ +# β”‚ β”‚ β”‚10.0.1.101 β”‚ β”‚10.0.1.102 β”‚ β”‚10.0.1.103 β”‚ β”‚10.0.1.104 β”‚ β”‚ β”‚ +# β”‚ β”‚ β”‚argo-cd:v2.8 β”‚ β”‚gitea:1.20 β”‚ β”‚nginx:latest β”‚ β”‚api:v1.2 β”‚ β”‚ β”‚ +# β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +# β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +# β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +# +# πŸ”„ Request Flow Example: +# 1. User visits: https://magicleaps.mathmast.com/ +# 2. DNS resolves to Azure Load Balancer IP (4.155.160.32) +# 3. Load Balancer forwards to nginx-ingress-controller +# 4. Ingress controller checks rules: +# - Host: magicleaps.mathmast.com βœ“ +# - Path: / matches /* prefix βœ“ +# 5. Routes to magicleaps-frontend-service:80 +# 6. Service load balances to magicleaps-frontend pods +# 7. Pod returns response through same path + +# πŸ” SSL/TLS Certificate Flow: +# β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +# β”‚ Browser β”‚ β”‚ Ingress β”‚ β”‚cert-manager β”‚ β”‚Let's Encryptβ”‚ +# β”‚ β”‚ β”‚ Controller β”‚ β”‚ β”‚ β”‚ β”‚ +# β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +# β”‚ β”‚ β”‚ β”‚ +# β”‚ HTTPS Request β”‚ β”‚ β”‚ +# │───────────────▢│ β”‚ β”‚ +# β”‚ β”‚ Check Cert β”‚ β”‚ +# β”‚ │───────────────▢│ β”‚ +# β”‚ β”‚ β”‚ Request Cert β”‚ +# β”‚ β”‚ │───────────────▢│ +# β”‚ β”‚ β”‚ β”‚ DNS Challenge +# β”‚ β”‚ β”‚ │◀───────────────│ +# β”‚ β”‚ β”‚ Cert Ready β”‚ +# β”‚ β”‚ │◀───────────────│ +# β”‚ β”‚ Cert Ready β”‚ β”‚ +# β”‚ │◀───────────────│ β”‚ +# β”‚ HTTPS Response β”‚ β”‚ β”‚ +# │◀───────────────│ β”‚ β”‚ + +# 🏭 YOUR ACTUAL PRODUCTION SETUP +# +# Based on your codebase, here are your actual production values: +# +# Load Balancer IP: 4.155.160.32 (prod-usw2-k8s-freeleaps-lb-fe-ip) +# +# Current Applications: +# 1. ArgoCD: argo.mathmast.com β†’ argo-cd-server:80 (freeleaps-devops-system) +# 2. 
Gitea: gitea.freeleaps.mathmast.com β†’ gitea-http:3000 (freeleaps-prod) +# 3. Magicleaps Prod: magicleaps.mathmast.com β†’ magicleaps-frontend-service:80 (magicleaps) +# 4. Magicleaps Alpha: alpha.magicleaps.mathmast.com β†’ magicleaps-frontend-service:80 (magicleaps) +# +# Certificate Issuer: mathmast-dot-com (Let's Encrypt + GoDaddy DNS01) +# Ingress Controller: nginx-ingress-controller:v1.12.0 (freeleaps-controls-system) +# +# Commands to check your actual setup: +# kubectl get ingress --all-namespaces +# kubectl get certificates --all-namespaces +# kubectl get pods -n freeleaps-controls-system -l app.kubernetes.io/name=ingress-nginx +# curl -I http://4.155.160.32 +# nslookup argo.mathmast.com +# nslookup gitea.freeleaps.mathmast.com +# nslookup magicleaps.mathmast.com + +# πŸ“š EDUCATIONAL EXAMPLE BELOW +# This is a generic example for learning purposes. Your actual setup is above. + +# Ingress for external access and routing +apiVersion: networking.k8s.io/v1 # ← Kubernetes API version for Ingress resources +kind: Ingress # ← Resource type: Ingress (external access layer) +metadata: # ← Metadata section: describes the ingress + name: web-app-ingress # ← Unique name for this ingress + namespace: my-app # ← Namespace where ingress will be created + labels: # ← Labels for organizing and selecting ingresses + app: web-app # ← Label: identifies this as a web application ingress + environment: production # ← Label: environment this runs in + annotations: # ← Annotations: configuration for ingress controller + # πŸ”§ Nginx Ingress Controller Annotations + # These tell the nginx-ingress-controller how to behave + nginx.ingress.kubernetes.io/rewrite-target: / # ← Rewrite URL paths (remove /api prefix) + nginx.ingress.kubernetes.io/ssl-redirect: "true" # ← Redirect HTTP to HTTPS + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" # ← Force HTTPS redirect + + # πŸ” Cert-Manager Integration + # This tells cert-manager to automatically get SSL certificates + cert-manager.io/cluster-issuer: "letsencrypt-prod" # ← Use Let's Encrypt for certificates + + # πŸ›‘οΈ Rate Limiting + # Prevent abuse by limiting requests per time window + nginx.ingress.kubernetes.io/rate-limit: "100" # ← 100 requests per window + nginx.ingress.kubernetes.io/rate-limit-window: "1m" # ← 1 minute window + + # 🌐 CORS (Cross-Origin Resource Sharing) + # Allow web browsers to make requests from different domains + nginx.ingress.kubernetes.io/enable-cors: "true" # ← Enable CORS + nginx.ingress.kubernetes.io/cors-allow-origin: "*" # ← Allow all origins (customize for production) + nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS" # ← Allowed HTTP methods + nginx.ingress.kubernetes.io/cors-allow-headers: "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization" # ← Allowed headers + + # ⚑ Performance Optimizations + # These improve performance for your applications + nginx.ingress.kubernetes.io/proxy-body-size: "0" # ← No limit on request body size + nginx.ingress.kubernetes.io/proxy-read-timeout: "600" # ← 10 minute read timeout + nginx.ingress.kubernetes.io/proxy-send-timeout: "600" # ← 10 minute send timeout + +spec: # ← Specification: defines routing rules + # πŸ” TLS Configuration + # This defines SSL/TLS certificates for secure HTTPS connections + tls: # ← TLS (Transport Layer Security) configuration + - hosts: # ← List of hostnames this certificate covers + - myapp.example.com # ← Main application domain + - api.myapp.example.com # ← API 
subdomain + secretName: myapp-tls # ← Name of the secret containing the certificate + # cert-manager will automatically create this secret with the SSL certificate + + # πŸ›£οΈ Routing Rules + # These define how traffic is routed to different services + rules: # ← List of routing rules + # Rule 1: Main application (myapp.example.com) + - host: myapp.example.com # ← Hostname to match (like a domain name) + http: # ← HTTP traffic configuration + paths: # ← List of URL paths and their destinations + # Path 1: Root path (/) β†’ web application + - path: / # ← URL path to match (root path) + pathType: Prefix # ← How to match the path (Prefix = starts with) + backend: # ← Where to send the traffic + service: # ← Backend service configuration + name: web-app-service # ← Service name to route to + port: # ← Service port + number: 80 # ← Port number (80 = HTTP) + + # Path 2: Static files (/static) β†’ static file server + - path: /static # ← URL path to match (static files) + pathType: Prefix # ← Match paths starting with /static + backend: # ← Where to send the traffic + service: # ← Backend service configuration + name: static-service # ← Service name for static files + port: # ← Service port + number: 80 # ← Port number + + # Rule 2: API subdomain (api.myapp.example.com) + - host: api.myapp.example.com # ← Different hostname for API + http: # ← HTTP traffic configuration + paths: # ← List of URL paths and their destinations + # Path 1: Root path (/) β†’ main API service + - path: / # ← URL path to match (root path) + pathType: Prefix # ← How to match the path + backend: # ← Where to send the traffic + service: # ← Backend service configuration + name: api-service # ← Service name for main API + port: # ← Service port + number: 8080 # ← Port number (8080 = common API port) + + # Path 2: Version 1 API (/v1) β†’ v1 API service + - path: /v1 # ← URL path to match (v1 API) + pathType: Prefix # ← Match paths starting with /v1 + backend: # ← Where to send the traffic + service: # ← Backend service configuration + name: api-v1-service # ← Service name for v1 API + port: # ← Service port + number: 8080 # ← Port number + + # Path 3: Version 2 API (/v2) β†’ v2 API service + - path: /v2 # ← URL path to match (v2 API) + pathType: Prefix # ← Match paths starting with /v2 + backend: # ← Where to send the traffic + service: # ← Backend service configuration + name: api-v2-service # ← Service name for v2 API + port: # ← Service port + number: 8080 # ← Port number + +# πŸš€ How to use this: +# kubectl apply -f ingress-example.yaml +# kubectl get ingress # Check ingress status +# kubectl describe ingress web-app-ingress # See detailed ingress info +# kubectl get certificates # Check SSL certificate status +# curl -H "Host: myapp.example.com" http://your-cluster-ip/ # Test routing + +# πŸ“Š What happens when you apply this: +# 1. Kubernetes creates the Ingress resource +# 2. nginx-ingress-controller reads the Ingress and configures nginx +# 3. cert-manager sees the cert-manager.io/cluster-issuer annotation +# 4. cert-manager requests SSL certificate from Let's Encrypt +# 5. Let's Encrypt validates domain ownership via DNS challenge +# 6. Certificate is stored in Kubernetes secret +# 7. nginx-ingress-controller uses the certificate for HTTPS +# 8. 
Traffic flows: Internet β†’ Load Balancer β†’ nginx β†’ Services β†’ Pods + +# πŸ” Your Current Setup Analysis: +# Based on your codebase, you're using: +# - nginx-ingress-controller in freeleaps-controls-system namespace +# - cert-manager with Let's Encrypt for SSL certificates +# - Custom ingress manager that automatically creates ingresses +# - Annotations for SSL redirect, rate limiting, and CORS +# - DNS-based certificate validation (DNS01 challenge) + +# πŸ“š Learn more: +# - Ingress: https://kubernetes.io/docs/concepts/services-networking/ingress/ +# - nginx-ingress: https://kubernetes.github.io/ingress-nginx/ +# - cert-manager: https://cert-manager.io/docs/ +# - SSL/TLS: https://kubernetes.io/docs/concepts/services-networking/ingress/#tls +# - Rate Limiting: https://kubernetes.github.io/ingress-nginx/user-guide/nginx-configuration/annotations/#rate-limiting +# - CORS: https://kubernetes.github.io/ingress-nginx/user-guide/nginx-configuration/annotations/#enable-cors diff --git a/docs/examples/job-cronjob-example.yaml b/docs/examples/job-cronjob-example.yaml new file mode 100644 index 00000000..674ab025 --- /dev/null +++ b/docs/examples/job-cronjob-example.yaml @@ -0,0 +1,162 @@ +# Job for one-time data processing +apiVersion: batch/v1 +kind: Job +metadata: + name: data-processing-job + namespace: my-app + labels: + app: data-processor + job-type: batch +spec: + completions: 3 # Run 3 times + parallelism: 2 # Run 2 in parallel + backoffLimit: 3 # Retry 3 times on failure + template: + metadata: + labels: + app: data-processor + job-type: batch + spec: + restartPolicy: Never + containers: + - name: data-processor + image: python:3.9-slim + command: ["python", "process_data.py"] + env: + - name: INPUT_FILE + value: "/data/input.csv" + - name: OUTPUT_FILE + value: "/data/output.csv" + - name: DB_HOST + valueFrom: + configMapKeyRef: + name: app-config + key: DB_HOST + - name: DB_PASSWORD + valueFrom: + secretKeyRef: + name: db-secret + key: DB_PASSWORD + volumeMounts: + - name: data-volume + mountPath: /data + - name: script-volume + mountPath: /app + resources: + requests: + memory: "128Mi" + cpu: "250m" + limits: + memory: "256Mi" + cpu: "500m" + volumes: + - name: data-volume + persistentVolumeClaim: + claimName: data-pvc + - name: script-volume + configMap: + name: app-config +--- +# CronJob for scheduled tasks +apiVersion: batch/v1 +kind: CronJob +metadata: + name: daily-backup + namespace: my-app + labels: + app: backup + job-type: scheduled +spec: + schedule: "0 2 * * *" # Daily at 2 AM + concurrencyPolicy: Forbid # Don't run if previous job is still running + successfulJobsHistoryLimit: 3 # Keep 3 successful job histories + failedJobsHistoryLimit: 1 # Keep 1 failed job history + jobTemplate: + spec: + template: + metadata: + labels: + app: backup + job-type: scheduled + spec: + restartPolicy: OnFailure + containers: + - name: backup + image: postgres:13 + command: ["/bin/bash", "-c"] + args: + - | + echo "Starting backup at $(date)" + pg_dump -h $DB_HOST -U $DB_USER -d $DB_NAME > /backup/backup-$(date +%Y%m%d).sql + echo "Backup completed at $(date)" + echo "Backup file size: $(ls -lh /backup/backup-$(date +%Y%m%d).sql)" + env: + - name: PGHOST + valueFrom: + configMapKeyRef: + name: app-config + key: DB_HOST + - name: PGUSER + valueFrom: + secretKeyRef: + name: db-secret + key: DB_USERNAME + - name: PGPASSWORD + valueFrom: + secretKeyRef: + name: db-secret + key: DB_PASSWORD + - name: PGDATABASE + value: "myapp" + volumeMounts: + - name: backup-volume + mountPath: /backup + 
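+            # Note: the pg_dump command in args above references $DB_HOST/$DB_USER/$DB_NAME,
+            # which are not defined in this container; because PGHOST/PGUSER/PGPASSWORD/PGDATABASE
+            # are set in env above, a plain `pg_dump > /backup/backup-$(date +%Y%m%d).sql`
+            # works without the -h/-U/-d flags.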
resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "500m" + volumes: + - name: backup-volume + persistentVolumeClaim: + claimName: backup-pvc +--- +# CronJob for cleanup tasks +apiVersion: batch/v1 +kind: CronJob +metadata: + name: cleanup-logs + namespace: my-app + labels: + app: cleanup + job-type: maintenance +spec: + schedule: "0 3 * * 0" # Weekly on Sunday at 3 AM + concurrencyPolicy: Allow + jobTemplate: + spec: + template: + metadata: + labels: + app: cleanup + job-type: maintenance + spec: + restartPolicy: OnFailure + containers: + - name: cleanup + image: alpine:latest + command: ["/bin/sh", "-c"] + args: + - | + echo "Starting log cleanup at $(date)" + find /logs -name "*.log" -mtime +7 -delete + echo "Cleanup completed at $(date)" + volumeMounts: + - name: logs-volume + mountPath: /logs + volumes: + - name: logs-volume + persistentVolumeClaim: + claimName: logs-pvc diff --git a/docs/examples/kubectl-quick-reference.md b/docs/examples/kubectl-quick-reference.md new file mode 100644 index 00000000..2a97f405 --- /dev/null +++ b/docs/examples/kubectl-quick-reference.md @@ -0,0 +1,381 @@ +# kubectl Quick Reference Guide + +## πŸš€ **Essential Commands for Junior Engineers** + +### **Basic Resource Management** + +```bash +# Get resources +kubectl get pods +kubectl get deployments +kubectl get services +kubectl get namespaces +kubectl get configmaps +kubectl get secrets +kubectl get pvc +kubectl get ingress + +# Get all resources in namespace +kubectl get all -n + +# Get resources with labels +kubectl get pods -l app=web-app +kubectl get pods -l environment=production + +# Get resources in wide format +kubectl get pods -o wide +kubectl get nodes -o wide +``` + +### **Resource Creation** + +```bash +# Create from YAML file +kubectl apply -f + +# Create from directory +kubectl apply -f / + +# Create from URL +kubectl apply -f https://raw.githubusercontent.com/... 
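+
+# Preview a change before applying it (no cluster changes are made)
+kubectl apply -f deployment.yaml --dry-run=client -o yaml
+kubectl diff -f deployment.yaml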
+ +# Create resources directly +kubectl create namespace my-app +kubectl create deployment nginx --image=nginx:latest +kubectl create service clusterip nginx --tcp=80:80 +kubectl create configmap app-config --from-literal=DB_HOST=postgres +kubectl create secret generic db-secret --from-literal=DB_PASSWORD=secret123 +``` + +### **Resource Inspection** + +```bash +# Describe resources +kubectl describe pod +kubectl describe deployment +kubectl describe service +kubectl describe namespace + +# Get resource YAML +kubectl get pod -o yaml +kubectl get deployment -o yaml + +# Get resource in specific format +kubectl get pod -o json +kubectl get pod -o jsonpath='{.spec.containers[0].image}' +``` + +### **Logs and Debugging** + +```bash +# View logs +kubectl logs +kubectl logs -f # Follow logs +kubectl logs --previous # Previous container +kubectl logs --tail=100 # Last 100 lines + +# Execute commands in pods +kubectl exec -it -- /bin/bash +kubectl exec -- ls /app +kubectl exec -- cat /etc/passwd + +# Port forwarding +kubectl port-forward 8080:80 +kubectl port-forward service/ 8080:80 +kubectl port-forward deployment/ 8080:80 +``` + +### **Scaling and Updates** + +```bash +# Scale deployments +kubectl scale deployment --replicas=5 +kubectl scale deployment --replicas=0 # Scale to zero + +# Update deployments +kubectl set image deployment/ = +kubectl set image deployment/nginx nginx=nginx:1.21 + +# Rollout management +kubectl rollout status deployment/ +kubectl rollout history deployment/ +kubectl rollout undo deployment/ +kubectl rollout pause deployment/ +kubectl rollout resume deployment/ +``` + +### **Resource Deletion** + +```bash +# Delete resources +kubectl delete pod +kubectl delete deployment +kubectl delete service +kubectl delete namespace + +# Delete from YAML file +kubectl delete -f + +# Delete all resources in namespace +kubectl delete all --all -n + +# Force delete (use with caution) +kubectl delete pod --force --grace-period=0 +``` + +### **Context and Namespace Management** + +```bash +# View current context +kubectl config current-context + +# List contexts +kubectl config get-contexts + +# Switch context +kubectl config use-context + +# Set default namespace +kubectl config set-context --current --namespace= + +# View cluster info +kubectl cluster-info +kubectl cluster-info dump +``` + +### **Resource Monitoring** + +```bash +# Check resource usage +kubectl top pods +kubectl top nodes +kubectl top pods --containers + +# Check events +kubectl get events +kubectl get events -n +kubectl get events --sort-by='.lastTimestamp' + +# Check resource quotas +kubectl get resourcequota +kubectl describe resourcequota +``` + +### **Troubleshooting Commands** + +```bash +# Check node status +kubectl get nodes +kubectl describe node + +# Check service endpoints +kubectl get endpoints +kubectl describe endpoints + +# Check persistent volumes +kubectl get pv +kubectl get pvc +kubectl describe pv + +# Check ingress +kubectl get ingress +kubectl describe ingress + +# Check jobs and cronjobs +kubectl get jobs +kubectl get cronjobs +kubectl describe job +kubectl describe cronjob +``` + +### **Useful Aliases** + +```bash +# Add to your .bashrc or .zshrc +alias k='kubectl' +alias kg='kubectl get' +alias kd='kubectl describe' +alias kl='kubectl logs' +alias ke='kubectl exec -it' +alias kp='kubectl port-forward' +alias ka='kubectl apply -f' +alias kdel='kubectl delete' +alias kctx='kubectl config use-context' +alias kns='kubectl config set-context --current --namespace' +``` + +### **Common Patterns** + 
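+A couple of extra patterns that come up often (a sketch; substitute your own resource and namespace names):
+
+```bash
+# Wait for a deployment rollout to finish (handy in CI scripts)
+kubectl wait --for=condition=Available deployment/<deployment-name> -n <namespace> --timeout=120s
+
+# Watch pods change state in real time
+kubectl get pods -n <namespace> -w
+
+# Pull a single field out with jsonpath (here: the first container's image)
+kubectl get deployment <deployment-name> -o jsonpath='{.spec.template.spec.containers[0].image}'
+```
+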
+```bash +# Get all pods with their IPs +kubectl get pods -o wide + +# Get all services with their endpoints +kubectl get services -o wide + +# Get all resources in a namespace +kubectl get all -n + +# Get resources by label +kubectl get pods -l app=web-app,environment=production + +# Get resources sorted by creation time +kubectl get pods --sort-by=.metadata.creationTimestamp + +# Get resources in custom columns +kubectl get pods -o custom-columns=NAME:.metadata.name,STATUS:.status.phase,AGE:.metadata.creationTimestamp +``` + +### **Advanced Commands** + +```bash +# Patch resources +kubectl patch deployment -p '{"spec":{"replicas":5}}' + +# Edit resources +kubectl edit deployment +kubectl edit configmap + +# Copy files +kubectl cp :/path/in/pod +kubectl cp :/path/in/pod + +# Run temporary pods +kubectl run test-pod --image=busybox --rm -it --restart=Never -- wget -O- : + +# Check API resources +kubectl api-resources +kubectl explain +``` + +### **Context-Specific Commands** + +```bash +# For debugging network issues +kubectl run test-pod --image=busybox --rm -it --restart=Never -- wget -O- : + +# For checking storage +kubectl run test-pod --image=busybox --rm -it --restart=Never -- ls /data + +# For testing DNS +kubectl run test-pod --image=busybox --rm -it --restart=Never -- nslookup + +# For checking secrets +kubectl run test-pod --rm -it --restart=Never --image=busybox -- env | grep DB_ +``` + +## ⚠️ **Bad Practices to Avoid** + +### **❌ DON'T DO THIS** + +```bash +# ❌ NEVER use kubectl run for production applications +kubectl run my-app --image=my-app:latest --port=8080 + +# ❌ NEVER create standalone Pods for services +kubectl run database --image=postgres:13 --port=5432 + +# ❌ NEVER use imperative commands for production +kubectl run nginx --image=nginx:latest + +# ❌ NEVER delete Pods directly (they'll be recreated by Deployment) +kubectl delete pod + +# ❌ NEVER use --force without understanding the consequences +kubectl delete pod --force --grace-period=0 +``` + +### **βœ… DO THIS INSTEAD** + +```bash +# βœ… Use Deployments for applications +kubectl create deployment my-app --image=my-app:latest + +# βœ… Use Helm charts for complex applications +helm install my-app ./my-app-chart --namespace my-app + +# βœ… Use kubectl apply for declarative deployments +kubectl apply -f deployment.yaml + +# βœ… Use StatefulSets for databases +kubectl apply -f statefulset.yaml + +# βœ… Delete Deployments, not Pods +kubectl delete deployment + +# βœ… Use proper resource management +kubectl scale deployment --replicas=0 +``` + +### **πŸ”§ When `kubectl run` is Acceptable** + +```bash +# βœ… OK: One-time debugging pods +kubectl run debug-pod --image=busybox --rm -it --restart=Never -- nslookup my-service + +# βœ… OK: Temporary testing +kubectl run test-pod --image=nginx --rm -it --restart=Never -- curl http://my-service:80 + +# βœ… OK: Quick experiments (development only) +kubectl run temp-pod --image=nginx --port=80 + +# βœ… OK: Troubleshooting network issues +kubectl run test-pod --image=busybox --rm -it --restart=Never -- wget -O- my-service:80 +``` + +## 🏭 **Your Codebase Best Practices** + +### **Your Actual Commands** +```bash +# 🏭 REAL COMMANDS FROM YOUR CODEBASE +# From freeleaps-devops-reconciler/scripts/deploy.sh + +# Helm deployment (primary method) +helm install/upgrade "$RELEASE_NAME" . 
\ + --namespace "$NAMESPACE" \ + --create-namespace \ + -f "$VALUES_FILE" \ + --set "image.tag=$IMAGE_TAG" + +# kubectl apply (secondary method) +kubectl apply -f / + +# Status checking +kubectl get pods -n "$NAMESPACE" -l "app.kubernetes.io/name=freeleaps-devops-reconciler" +kubectl logs -n "$NAMESPACE" deployment/"$RELEASE_NAME" +``` + +### **Best Practices** + +1. **Always use namespaces** to organize resources +2. **Use labels** for better resource management +3. **Set resource limits** on all containers +4. **Use health checks** for reliability +5. **Use ConfigMaps and Secrets** for configuration +6. **Test changes** in a staging environment first +7. **Keep kubectl updated** to match your cluster version +8. **Use Deployments, not standalone Pods** +9. **Use Helm charts for complex applications** +10. **Use declarative YAML files** + +### **Common Mistakes to Avoid** + +```bash +# ❌ Don't do this +kubectl run nginx --image=nginx # Creates a pod, not a deployment + +# βœ… Do this instead +kubectl create deployment nginx --image=nginx + +# ❌ Don't do this +kubectl delete pod # Pod will be recreated by deployment + +# βœ… Do this instead +kubectl delete deployment + +# ❌ Don't do this +kubectl exec -- rm -rf / # Dangerous command + +# βœ… Do this instead +kubectl exec -- ls / # Safe inspection command +``` diff --git a/docs/examples/namespace-with-pvc.yaml b/docs/examples/namespace-with-pvc.yaml new file mode 100644 index 00000000..2e672ff2 --- /dev/null +++ b/docs/examples/namespace-with-pvc.yaml @@ -0,0 +1,44 @@ +# Create namespace +apiVersion: v1 +kind: Namespace +metadata: + name: my-app + labels: + environment: development + team: backend +--- +# Create PVC +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: app-storage + namespace: my-app +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi + storageClassName: managed-premium +--- +# Create pod with PVC +apiVersion: v1 +kind: Pod +metadata: + name: app-with-storage + namespace: my-app + labels: + app: my-app +spec: + containers: + - name: app + image: nginx:latest + ports: + - containerPort: 80 + volumeMounts: + - name: app-storage + mountPath: /app/data + volumes: + - name: app-storage + persistentVolumeClaim: + claimName: app-storage diff --git a/docs/examples/resource-management-example.yaml b/docs/examples/resource-management-example.yaml new file mode 100644 index 00000000..f24d8712 --- /dev/null +++ b/docs/examples/resource-management-example.yaml @@ -0,0 +1,150 @@ +# Namespace with Resource Quota +apiVersion: v1 +kind: Namespace +metadata: + name: production + labels: + environment: production + team: platform +--- +# Resource Quota for the namespace +apiVersion: v1 +kind: ResourceQuota +metadata: + name: production-quota + namespace: production +spec: + hard: + # CPU and Memory limits + requests.cpu: "8" # 8 CPU cores total + requests.memory: 16Gi # 16GB memory total + limits.cpu: "16" # 16 CPU cores max + limits.memory: 32Gi # 32GB memory max + + # Resource counts + pods: "50" # 50 pods max + services: "20" # 20 services max + persistentvolumeclaims: "20" # 20 PVCs max + configmaps: "50" # 50 ConfigMaps max + secrets: "50" # 50 Secrets max + + # Storage + requests.storage: 100Gi # 100GB storage total +--- +# Limit Range for default limits +apiVersion: v1 +kind: LimitRange +metadata: + name: production-limits + namespace: production +spec: + limits: + # Default limits for containers + - default: + memory: 512Mi + cpu: 500m + defaultRequest: + memory: 256Mi + cpu: 250m + type: 
Container + # Default limits for pods + - default: + memory: 1Gi + cpu: 1000m + type: Pod +--- +# Deployment with proper resource management +apiVersion: apps/v1 +kind: Deployment +metadata: + name: resource-managed-app + namespace: production + labels: + app: resource-managed-app + environment: production +spec: + replicas: 3 + selector: + matchLabels: + app: resource-managed-app + template: + metadata: + labels: + app: resource-managed-app + environment: production + spec: + # Pod-level security context + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 2000 + containers: + - name: app + image: nginx:latest + ports: + - containerPort: 80 + # Resource requests and limits + resources: + requests: + memory: "256Mi" # Minimum guaranteed + cpu: "250m" # 0.25 CPU cores + limits: + memory: "512Mi" # Maximum allowed + cpu: "500m" # 0.5 CPU cores + # Health checks + livenessProbe: + httpGet: + path: / + port: 80 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + readinessProbe: + httpGet: + path: / + port: 80 + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + # Container-level security context + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + volumeMounts: + - name: tmp-volume + mountPath: /tmp + volumes: + - name: tmp-volume + emptyDir: {} +--- +# Horizontal Pod Autoscaler (HPA) +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: app-hpa + namespace: production +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: resource-managed-app + minReplicas: 3 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 diff --git a/docs/examples/service-example.yaml b/docs/examples/service-example.yaml new file mode 100644 index 00000000..8d3e0eb8 --- /dev/null +++ b/docs/examples/service-example.yaml @@ -0,0 +1,54 @@ +# ClusterIP Service (Internal Access) +apiVersion: v1 +kind: Service +metadata: + name: web-app-service + namespace: my-app + labels: + app: web-app +spec: + type: ClusterIP + selector: + app: web-app + ports: + - name: http + port: 80 + targetPort: 80 + protocol: TCP +--- +# NodePort Service (External Access via Node) +apiVersion: v1 +kind: Service +metadata: + name: web-app-nodeport + namespace: my-app + labels: + app: web-app +spec: + type: NodePort + selector: + app: web-app + ports: + - name: http + port: 80 + targetPort: 80 + nodePort: 30080 + protocol: TCP +--- +# LoadBalancer Service (Cloud Load Balancer) +apiVersion: v1 +kind: Service +metadata: + name: web-app-lb + namespace: my-app + labels: + app: web-app +spec: + type: LoadBalancer + selector: + app: web-app + ports: + - name: http + port: 80 + targetPort: 80 + protocol: TCP diff --git a/docs/node_config.env.template b/docs/node_config.env.template new file mode 100644 index 00000000..d3facdac --- /dev/null +++ b/docs/node_config.env.template @@ -0,0 +1,190 @@ +# Azure Kubernetes Node Addition Configuration Template +# Copy this file to node_config.env and update the values + +# ============================================================================= +# VM Configuration +# ============================================================================= + +# VM Name (as it appears in Azure) +VM_NAME="prod-usw2-k8s-freeleaps-worker-nodes-06" + +# Azure 
Resource Group containing the VM +RESOURCE_GROUP="k8s" + +# Node Type: worker or master +NODE_TYPE="worker" + +# ============================================================================= +# Authentication Configuration +# ============================================================================= + +# Ansible user for SSH connections +# Note: This should be wwwadmin@mathmast.com for your environment +ANSIBLE_USER="wwwadmin@mathmast.com" + +# SSH Password (will be prompted during execution) +# Leave empty to be prompted during script execution +SSH_PASSWORD="" + +# Sudo Password (will be prompted during execution) +# Leave empty to be prompted during script execution +SUDO_PASSWORD="" + +# ============================================================================= +# Network Configuration +# ============================================================================= + +# VM Private IP Address (will be auto-detected if left empty) +# Leave empty to auto-detect from Azure +VM_PRIVATE_IP="" + +# Network Security Group name (for troubleshooting) +NSG_NAME="k8s-nsg" + +# Subnet name (for troubleshooting) +SUBNET_NAME="k8s-subnet" + +# ============================================================================= +# Kubernetes Configuration +# ============================================================================= + +# Kubernetes cluster name +CLUSTER_NAME="freeleaps" + +# Kubernetes version (should match existing cluster) +KUBERNETES_VERSION="1.31.4" + +# Container runtime (should match existing cluster) +CONTAINER_RUNTIME="docker" + +# ============================================================================= +# Paths and Directories +# ============================================================================= + +# Path to inventory file +INVENTORY_FILE="freeleaps-ops/cluster/ansible/manifests/inventory.ini" + +# Path to kubespray directory +KUBESPRAY_DIR="freeleaps-ops/3rd/kubespray" + +# Path to group_vars directory +GROUP_VARS_DIR="freeleaps-ops/cluster/ansible/manifests/group_vars" + +# ============================================================================= +# Script Behavior Configuration +# ============================================================================= + +# Enable verbose output (true/false) +VERBOSE="false" + +# Enable dry run mode (true/false) +# When enabled, script will show what it would do without making changes +DRY_RUN="false" + +# Maximum wait time for node to appear (seconds) +MAX_WAIT_TIME="300" + +# Maximum wait time for node to be ready (seconds) +MAX_READY_WAIT_TIME="600" + +# ============================================================================= +# Backup and Recovery Configuration +# ============================================================================= + +# Enable automatic backup of inventory file (true/false) +ENABLE_BACKUP="true" + +# Number of backup files to keep +BACKUP_RETENTION="5" + +# Backup directory +BACKUP_DIR="./backups" + +# ============================================================================= +# Monitoring and Alerting Configuration +# ============================================================================= + +# Enable post-addition health checks (true/false) +ENABLE_HEALTH_CHECKS="true" + +# Enable pod scheduling test (true/false) +ENABLE_POD_TEST="true" + +# Test pod image +TEST_POD_IMAGE="nginx:latest" + +# ============================================================================= +# Troubleshooting Configuration +# 
============================================================================= + +# Enable detailed logging (true/false) +ENABLE_LOGGING="true" + +# Log file path +LOG_FILE="./node_addition.log" + +# Enable SSH connection testing (true/false) +ENABLE_SSH_TEST="true" + +# SSH timeout (seconds) +SSH_TIMEOUT="10" + +# ============================================================================= +# Advanced Configuration +# ============================================================================= + +# Ansible playbook timeout (seconds) +ANSIBLE_TIMEOUT="3600" + +# Kubectl timeout (seconds) +KUBECTL_TIMEOUT="300" + +# Azure CLI timeout (seconds) +AZURE_TIMEOUT="300" + +# ============================================================================= +# Validation Rules +# ============================================================================= + +# Allowed node types +ALLOWED_NODE_TYPES="worker,master" + +# Required VM name pattern +VM_NAME_PATTERN="^[a-zA-Z0-9-]+$" + +# Required resource group pattern +RESOURCE_GROUP_PATTERN="^[a-zA-Z0-9-]+$" + +# ============================================================================= +# Notes and Instructions +# ============================================================================= + +# IMPORTANT NOTES: +# 1. This template should be copied to node_config.env before use +# 2. Update the values according to your environment +# 3. Passwords will be prompted during execution for security +# 4. The script will create backups automatically +# 5. All paths are relative to the script execution directory + +# USAGE: +# 1. Copy this template: cp node_config.env.template node_config.env +# 2. Edit the configuration: vim node_config.env +# 3. Run the script: ./add_k8s_node.sh + +# SECURITY NOTES: +# - Never commit passwords to version control +# - Use SSH keys when possible +# - Regularly rotate passwords +# - Monitor access logs + +# TROUBLESHOOTING: +# - Check VM power state in Azure +# - Verify network security group rules +# - Ensure SSH service is running on VM +# - Check firewall rules if applicable +# - Verify DNS resolution + +# SUPPORT: +# - Infrastructure Team: [Contact Information] +# - Kubernetes Administrators: [Contact Information] +# - Azure Support: [Contact Information]
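+
+# =============================================================================
+# Example: Sourcing and Validating This File (illustrative sketch only)
+# =============================================================================
+
+# The commented snippet below is a hedged illustration of how a wrapper shell
+# script might load and sanity-check this configuration before invoking
+# ./add_k8s_node.sh (see USAGE above). It is not part of add_k8s_node.sh
+# itself, and the script's actual flags and validation may differ.
+#
+#   set -a; source ./node_config.env; set +a    # export every variable above
+#
+#   # Reject unknown node types (ALLOWED_NODE_TYPES is defined above)
+#   [[ ",${ALLOWED_NODE_TYPES}," == *",${NODE_TYPE},"* ]] \
+#     || { echo "NODE_TYPE must be one of: ${ALLOWED_NODE_TYPES}" >&2; exit 1; }
+#
+#   # Enforce the VM name pattern defined above
+#   [[ "${VM_NAME}" =~ ${VM_NAME_PATTERN} ]] \
+#     || { echo "VM_NAME does not match ${VM_NAME_PATTERN}" >&2; exit 1; }
+#
+#   ./add_k8s_node.sh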