kiss server monitoring tool with email alerts
go monitoring
at v0.1.0 5.3 kB view raw
1package main 2 3import ( 4 "context" 5 "fmt" 6 "log" 7 "net/http" 8 "time" 9 10 "github.com/shirou/gopsutil/v4/cpu" 11 "github.com/shirou/gopsutil/v4/disk" 12 "github.com/shirou/gopsutil/v4/mem" 13) 14 15func monitorCPU(cfg *Config) { 16 log.Printf("Monitoring CPU usage with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.CPU.Threshold, cfg.AlertThresholds.CPU.Cooldown) 17 18 alertCooldown := time.NewTimer(cfg.AlertThresholds.CPU.Cooldown) 19 for { 20 percent, err := cpu.Percent(cfg.AlertThresholds.CPU.Duration, false) 21 if err != nil { 22 log.Printf("Error getting CPU usage: %v", err) 23 time.Sleep(1 * time.Second) 24 continue 25 } 26 27 // Average CPU usage across all cores 28 var total float64 29 for _, p := range percent { 30 total += p 31 } 32 avg := total / float64(len(percent)) 33 34 if avg > cfg.AlertThresholds.CPU.Threshold { 35 // Check if we're within the cooldown period 36 select { 37 case <-alertCooldown.C: 38 // Cooldown expired, check again 39 alertCooldown.Reset(cfg.AlertThresholds.CPU.Cooldown) 40 default: 41 // Within cooldown, skip alert 42 time.Sleep(1 * time.Second) 43 continue 44 } 45 46 err := sendEmail(fmt.Sprintf("CPU Usage Alert: %.2f%%", avg), 47 fmt.Sprintf("CPU usage of %.2f%% has exceeded the threshold of %.2f%%", avg, cfg.AlertThresholds.CPU.Threshold), cfg) 48 if err != nil { 49 log.Printf("Error sending email: %v", err) 50 } 51 } 52 53 time.Sleep(time.Duration(1) * time.Second) 54 } 55} 56 57func monitorMemory(cfg *Config) { 58 log.Printf("Monitoring memory usage with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.Memory.Threshold, cfg.AlertThresholds.Memory.Cooldown) 59 60 alertCooldown := time.NewTimer(cfg.AlertThresholds.Memory.Cooldown) 61 for { 62 vm, err := mem.VirtualMemory() 63 if err != nil { 64 log.Printf("Error getting memory usage: %v", err) 65 time.Sleep(1 * time.Second) 66 continue 67 } 68 69 usedPercent := vm.UsedPercent 70 71 if usedPercent > cfg.AlertThresholds.Memory.Threshold { 72 // Check if we're within the cooldown period 73 select { 74 case <-alertCooldown.C: 75 // Cooldown expired, check again 76 alertCooldown.Reset(cfg.AlertThresholds.Memory.Cooldown) 77 default: 78 // Within cooldown, skip alert 79 time.Sleep(1 * time.Second) 80 continue 81 } 82 83 err := sendEmail(fmt.Sprintf("Memory Usage Alert: %.2f%%", usedPercent), 84 fmt.Sprintf("Memory usage of %.2f%% has exceeded the threshold of %.2f%%", usedPercent, cfg.AlertThresholds.Memory.Threshold), cfg) 85 if err != nil { 86 log.Printf("Error sending email: %v", err) 87 } 88 } 89 90 time.Sleep(time.Duration(1) * time.Second) 91 } 92} 93 94func monitorDisk(cfg *Config) { 95 log.Printf("Monitoring disk usage with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.Disk.Threshold, cfg.AlertThresholds.Disk.Cooldown) 96 97 alertCooldown := time.NewTimer(cfg.AlertThresholds.Disk.Cooldown) 98 for { 99 usage, err := disk.Usage("/") 100 if err != nil { 101 log.Printf("Error getting disk usage: %v", err) 102 time.Sleep(1 * time.Second) 103 continue 104 } 105 106 usedPercent := usage.UsedPercent 107 if usedPercent > cfg.AlertThresholds.Disk.Threshold { 108 // Check if we're within the cooldown period 109 select { 110 case <-alertCooldown.C: 111 // Cooldown expired, check again 112 alertCooldown.Reset(cfg.AlertThresholds.Disk.Cooldown) 113 default: 114 // Within cooldown, skip alert 115 time.Sleep(1 * time.Second) 116 continue 117 } 118 119 err := sendEmail(fmt.Sprintf("Disk Usage Alert: %.2f%%", usedPercent), 120 fmt.Sprintf("Disk usage of %.2f%% has exceeded the threshold of %.2f%%", usedPercent, cfg.AlertThresholds.Disk.Threshold), cfg) 121 if err != nil { 122 log.Printf("Error sending email: %v", err) 123 } 124 } 125 126 time.Sleep(time.Duration(1) * time.Second) 127 } 128} 129 130func monitorHTTP(cfg *Config) { 131 log.Printf("Monitoring HTTP checks (%s) with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.HTTP.URL, cfg.AlertThresholds.HTTP.FailureThreshold, cfg.AlertThresholds.HTTP.Cooldown) 132 133 alertCooldown := time.NewTimer(cfg.AlertThresholds.HTTP.Cooldown) 134 client := &http.Client{ 135 Timeout: cfg.AlertThresholds.HTTP.Timeout, 136 } 137 138 for { 139 // Wait for check interval 140 time.Sleep(cfg.AlertThresholds.HTTP.CheckInterval) 141 142 // Perform HTTP checks 143 failureCount := 0 144 for i := 0; i < cfg.AlertThresholds.HTTP.SampleRate; i++ { 145 req, err := http.NewRequest("GET", cfg.AlertThresholds.HTTP.URL, nil) 146 if err != nil { 147 failureCount++ 148 continue 149 } 150 151 ctx, cancel := context.WithTimeout(context.Background(), cfg.AlertThresholds.HTTP.Timeout) 152 153 resp, err := client.Do(req.WithContext(ctx)) 154 if err != nil || resp.StatusCode >= 400 { 155 failureCount++ 156 } 157 158 cancel() 159 } 160 161 // Calculate failure rate 162 failureRate := (float64(failureCount) / float64(cfg.AlertThresholds.HTTP.SampleRate)) * 100 163 if failureRate > cfg.AlertThresholds.HTTP.FailureThreshold { 164 // Check if we're within the cooldown period 165 select { 166 case <-alertCooldown.C: 167 // Cooldown expired, check again 168 alertCooldown.Reset(cfg.AlertThresholds.HTTP.Cooldown) 169 default: 170 // Within cooldown, skip alert 171 continue 172 } 173 174 err := sendEmail(fmt.Sprintf("HTTP Failure Alert: %.2f%%", failureRate), 175 fmt.Sprintf("HTTP failure rate of %.2f%% has exceeded the threshold of %.2f%%", failureRate, cfg.AlertThresholds.HTTP.FailureThreshold), cfg) 176 if err != nil { 177 log.Printf("Error sending email: %v", err) 178 } 179 } 180 } 181}