diff --git a/docs/Service Monitor and Error Alter Integration Guideline.md b/docs/Service Monitor and Error Alter Integration Guideline.md index a45c2353..d4e0faf2 100644 --- a/docs/Service Monitor and Error Alter Integration Guideline.md +++ b/docs/Service Monitor and Error Alter Integration Guideline.md @@ -1,10 +1,16 @@ -# 1. Prometheus Alert Rule Configuration +# 1. Prerequisites -## 1.1. Add `prometheusrule.yaml` to `/templates`. +Before proceeding with the steps in this document, ensure your service has integrated Prometheus metrics collection. For details, refer to +[prometheus-metrics-intergration-guideline.md](prometheus-metrics-intergration-guideline.md) + +# 2. Prometheus Alert Rule Configuration + + +## 2.1. Add `prometheusrule.yaml` to `/templates`. Example: -> Update metrics to your service name, see freeleaps-ops/freeleaps/helm-pkg/metrics +> Update the metrics configuration to your service name. See `freeleaps-ops/freeleaps/helm-pkg/metrics`. ```yaml {{- /* Copyright Broadcom, Inc. All Rights Reserved. @@ -45,40 +51,41 @@ spec: {{- end }} ``` -## 1.2. Add prometheusrule configuration to values.{alpha/prod}.yaml +## 2.2. Add prometheusrule configuration to `values.{alpha/prod}.yaml` Example: -> See freeleaps-ops/freeleaps/helm-pkg/metrics +> See `freeleaps-ops/freeleaps/helm-pkg/metrics`. ```yaml prometheusRule: - name: freepeals-metrics - enabled: true - namespace: "freeleaps-monitoring-system" + name: freepeals-prod-metrics + enabled: true # disable in alpha environment + namespace: freeleaps-monitoring-system labels: release: kube-prometheus-stack rules: - - alert: FreeleapsMetricsServiceDown - expr: up{job="metrics-service"} == 0 - for: 1m - labels: - severity: critical - service: metrics-service - annotations: - summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})" - description: "Freeleaps Metrics service has been down for more than 1 minutes." 
- runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7" - - - alert: FreeleapsMetricsServiceHighErrorRate - expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1 - for: 5m - labels: - severity: warning - service: metrics-service - annotations: - summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})" - description: "Freeleaps Metrics service error rate is {{ $value }} errors per second." - runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7" + - alert: FreeleapsMetricsServiceDown # Service down alert + expr: up{job="metrics-service"} == 0 + for: 1m + labels: + severity: critical # severity: warning/info/critical + service: metrics-service # service name + namespace: freeleaps-prod # namespace of the service + annotations: + summary: Freeleaps Metrics service is down (instance {{ $labels.instance }}) # summary + description: Freeleaps Metrics service has been down for more than 1 minute. # description + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 # Runbook url + - alert: FreeleapsMetricsServiceHighErrorRate + expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: metrics-service + namespace: freeleaps-prod + annotations: + summary: High error rate in freeleaps metrics service (instance {{ $labels.instance }}) + description: Freeleaps Metrics service error rate is {{ $value }} errors per second. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 ``` ## 1.3. 
Verify Alert Rule Configuration is Effective @@ -88,4 +95,84 @@ prometheusRule: > You can see the newly added rules indicating they are effective -![alt text](asserts/image5.png) \ No newline at end of file +![alt text](asserts/image5.png) + + +# 3. Add AlertmanagerConfig (Email Notifications) +## 3.1. Add AlertmanagerConfig +> If there is no `AlertmanagerConfig` in the namespace, create one. If it already exists, no action is required. + +To create a new `AlertmanagerConfig`, refer to `freeleaps-ops/altermanager/altermanager-config.yaml`. +```yaml +apiVersion: v1 +kind: Secret +type: Opaque +metadata: + name: altermanager-email-credentials + namespace: freeleaps-prod # The namespace whose service alerts you want to configure +data: + password: cHducGNya3d0aXp5Z2RoZQ== # base64-encoded SMTP password; replace with your own value +--- +apiVersion: monitoring.coreos.com/v1alpha1 +kind: AlertmanagerConfig +metadata: + name: alertmanager-config + namespace: freeleaps-prod # The namespace whose service alerts you want to configure +spec: + receivers: + # - msteamsConfigs: + # - sendResolved: true + # text: '{{ template "msteams.default.text" . }}' + # title: >- + # {{ if eq .Status "firing" }}🚨 [FIRING] 🔥{{- else -}}🙌 [RESOLVED] + # 🍻{{- end -}} + # webhookUrl: + # key: webhook-url + # name: freeleaps-teams-webhook + # name: ms-teams + - emailConfigs: + - to: "icecheng@mathmast.com" # email recipient + from: "support@freeleaps.com" # email sender + smarthost: "smtp.freeleaps.com:465" + authUsername: "support@freeleaps.com" + authPassword: + name: "altermanager-email-credentials" + key: "password" + authIdentity: "support@freeleaps.com" + requireTLS: false + sendResolved: true + headers: # email Subject configuration + - key: Subject + value: '{{ if eq .Status "firing" }}🚨 Freeleaps Alert: {{ .CommonAnnotations.summary }}{{ else }}✅ Freeleaps Resolved: {{ .CommonAnnotations.summary }}{{ end }}' + html: |- # email content configuration +

{{ if eq .Status "firing" }}🚨 Alert: {{ .CommonAnnotations.summary }}{{ else }}✅ Resolved: {{ .CommonAnnotations.summary }}{{ end }}

+

📝 AlertName: {{ .CommonLabels.alertname }}

+

🔧 Service: {{ .CommonLabels.service }}

+

🔧 Pod: {{ .CommonLabels.pod }}({{ .CommonLabels.instance }})

+

🏷️ Severity: {{ .CommonLabels.severity }}

+

{{ if eq .Status "firing" }}🔴 Status:{{ else }}🟢 Status:{{ end }} {{ .Status | toUpper }}

+

📝 Description: {{ .CommonAnnotations.description }}

+

📖 Runbook: {{ .CommonAnnotations.runbook_url }}

+ name: email + route: + groupBy: + - severity + groupInterval: 5m + receiver: email + groupWait: 5m + repeatInterval: 6h +``` +## 3.2. Verify Configuration Success + +> Trigger an alert and check the pages below for alert data. If present, the configuration is successful. + +![alt text](asserts/image6.png) +![alt text](asserts/image7.png) + +## 3.3. Verify Email Notification Success +![alt text](asserts/img_v3_02qe_37dcdd6a-09f3-4ac0-982d-b91869959a7g.jpg) +![alt text](asserts/img_v3_02qe_526363a6-d136-4c89-9422-12209062748g.jpg) + +# 4. Teams Alert Integration + +TODO \ No newline at end of file diff --git a/docs/asserts/image6.png b/docs/asserts/image6.png new file mode 100644 index 00000000..fc783fdf Binary files /dev/null and b/docs/asserts/image6.png differ diff --git a/docs/asserts/image7.png b/docs/asserts/image7.png new file mode 100644 index 00000000..7a777e0a Binary files /dev/null and b/docs/asserts/image7.png differ diff --git a/docs/asserts/img_v3_02qe_37dcdd6a-09f3-4ac0-982d-b91869959a7g.jpg b/docs/asserts/img_v3_02qe_37dcdd6a-09f3-4ac0-982d-b91869959a7g.jpg new file mode 100644 index 00000000..f0d49f17 Binary files /dev/null and b/docs/asserts/img_v3_02qe_37dcdd6a-09f3-4ac0-982d-b91869959a7g.jpg differ diff --git a/docs/asserts/img_v3_02qe_526363a6-d136-4c89-9422-12209062748g.jpg b/docs/asserts/img_v3_02qe_526363a6-d136-4c89-9422-12209062748g.jpg new file mode 100644 index 00000000..f8eeb404 Binary files /dev/null and b/docs/asserts/img_v3_02qe_526363a6-d136-4c89-9422-12209062748g.jpg differ