KISS server monitoring tool with email alerts
go
monitoring
1package main
2
3import (
4 "context"
5 "fmt"
6 "log"
7 "net/http"
8 "time"
9
10 "github.com/shirou/gopsutil/v4/cpu"
11 "github.com/shirou/gopsutil/v4/disk"
12 "github.com/shirou/gopsutil/v4/mem"
13)
14
15func monitorCPU(cfg *Config) {
16 log.Printf("Monitoring CPU usage with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.CPU.Threshold, cfg.AlertThresholds.CPU.Cooldown)
17
18 alertCooldown := time.NewTimer(cfg.AlertThresholds.CPU.Cooldown)
19 for {
20 percent, err := cpu.Percent(cfg.AlertThresholds.CPU.Duration, false)
21 if err != nil {
22 log.Printf("Error getting CPU usage: %v", err)
23 time.Sleep(1 * time.Second)
24 continue
25 }
26
27 // Average CPU usage across all cores
28 var total float64
29 for _, p := range percent {
30 total += p
31 }
32 avg := total / float64(len(percent))
33
34 if avg > cfg.AlertThresholds.CPU.Threshold {
35 // Check if we're within the cooldown period
36 select {
37 case <-alertCooldown.C:
38 // Cooldown expired, check again
39 alertCooldown.Reset(cfg.AlertThresholds.CPU.Cooldown)
40 default:
41 // Within cooldown, skip alert
42 time.Sleep(1 * time.Second)
43 continue
44 }
45
46 err := sendEmail(fmt.Sprintf("CPU Usage Alert: %.2f%%", avg),
47 fmt.Sprintf("CPU usage of %.2f%% has exceeded the threshold of %.2f%%", avg, cfg.AlertThresholds.CPU.Threshold), cfg)
48 if err != nil {
49 log.Printf("Error sending email: %v", err)
50 }
51 }
52
53 time.Sleep(time.Duration(1) * time.Second)
54 }
55}
56
57func monitorMemory(cfg *Config) {
58 log.Printf("Monitoring memory usage with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.Memory.Threshold, cfg.AlertThresholds.Memory.Cooldown)
59
60 alertCooldown := time.NewTimer(cfg.AlertThresholds.Memory.Cooldown)
61 for {
62 vm, err := mem.VirtualMemory()
63 if err != nil {
64 log.Printf("Error getting memory usage: %v", err)
65 time.Sleep(1 * time.Second)
66 continue
67 }
68
69 usedPercent := vm.UsedPercent
70
71 if usedPercent > cfg.AlertThresholds.Memory.Threshold {
72 // Check if we're within the cooldown period
73 select {
74 case <-alertCooldown.C:
75 // Cooldown expired, check again
76 alertCooldown.Reset(cfg.AlertThresholds.Memory.Cooldown)
77 default:
78 // Within cooldown, skip alert
79 time.Sleep(1 * time.Second)
80 continue
81 }
82
83 err := sendEmail(fmt.Sprintf("Memory Usage Alert: %.2f%%", usedPercent),
84 fmt.Sprintf("Memory usage of %.2f%% has exceeded the threshold of %.2f%%", usedPercent, cfg.AlertThresholds.Memory.Threshold), cfg)
85 if err != nil {
86 log.Printf("Error sending email: %v", err)
87 }
88 }
89
90 time.Sleep(time.Duration(1) * time.Second)
91 }
92}
93
94func monitorDisk(cfg *Config) {
95 log.Printf("Monitoring disk usage with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.Disk.Threshold, cfg.AlertThresholds.Disk.Cooldown)
96
97 alertCooldown := time.NewTimer(cfg.AlertThresholds.Disk.Cooldown)
98 for {
99 usage, err := disk.Usage("/")
100 if err != nil {
101 log.Printf("Error getting disk usage: %v", err)
102 time.Sleep(1 * time.Second)
103 continue
104 }
105
106 usedPercent := usage.UsedPercent
107 if usedPercent > cfg.AlertThresholds.Disk.Threshold {
108 // Check if we're within the cooldown period
109 select {
110 case <-alertCooldown.C:
111 // Cooldown expired, check again
112 alertCooldown.Reset(cfg.AlertThresholds.Disk.Cooldown)
113 default:
114 // Within cooldown, skip alert
115 time.Sleep(1 * time.Second)
116 continue
117 }
118
119 err := sendEmail(fmt.Sprintf("Disk Usage Alert: %.2f%%", usedPercent),
120 fmt.Sprintf("Disk usage of %.2f%% has exceeded the threshold of %.2f%%", usedPercent, cfg.AlertThresholds.Disk.Threshold), cfg)
121 if err != nil {
122 log.Printf("Error sending email: %v", err)
123 }
124 }
125
126 time.Sleep(time.Duration(1) * time.Second)
127 }
128}
129
130func monitorHTTP(cfg *Config) {
131 log.Printf("Monitoring HTTP checks (%s) with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.HTTP.URL, cfg.AlertThresholds.HTTP.FailureThreshold, cfg.AlertThresholds.HTTP.Cooldown)
132
133 alertCooldown := time.NewTimer(cfg.AlertThresholds.HTTP.Cooldown)
134 client := &http.Client{
135 Timeout: cfg.AlertThresholds.HTTP.Timeout,
136 }
137
138 for {
139 // Wait for check interval
140 time.Sleep(cfg.AlertThresholds.HTTP.CheckInterval)
141
142 // Perform HTTP checks
143 failureCount := 0
144 for i := 0; i < cfg.AlertThresholds.HTTP.SampleRate; i++ {
145 req, err := http.NewRequest("GET", cfg.AlertThresholds.HTTP.URL, nil)
146 if err != nil {
147 failureCount++
148 continue
149 }
150
151 ctx, cancel := context.WithTimeout(context.Background(), cfg.AlertThresholds.HTTP.Timeout)
152
153 resp, err := client.Do(req.WithContext(ctx))
154 if err != nil || resp.StatusCode >= 400 {
155 failureCount++
156 }
157
158 cancel()
159 }
160
161 // Calculate failure rate
162 failureRate := (float64(failureCount) / float64(cfg.AlertThresholds.HTTP.SampleRate)) * 100
163 if failureRate > cfg.AlertThresholds.HTTP.FailureThreshold {
164 // Check if we're within the cooldown period
165 select {
166 case <-alertCooldown.C:
167 // Cooldown expired, check again
168 alertCooldown.Reset(cfg.AlertThresholds.HTTP.Cooldown)
169 default:
170 // Within cooldown, skip alert
171 continue
172 }
173
174 err := sendEmail(fmt.Sprintf("HTTP Failure Alert: %.2f%%", failureRate),
175 fmt.Sprintf("HTTP failure rate of %.2f%% has exceeded the threshold of %.2f%%", failureRate, cfg.AlertThresholds.HTTP.FailureThreshold), cfg)
176 if err != nil {
177 log.Printf("Error sending email: %v", err)
178 }
179 }
180 }
181}