KISS server monitoring tool with email alerts
go monitoring

feat: add disk usage

+4 -2
README.md
···
-
# Servmond
+
# Servmon
KISS server monitoring tool with email alerts.
+
For those who want to keep it simple instead of using Prometheus, Grafana, and Alertmanager.
Monitors:
- [x] CPU
- [x] Memory
- [x] HTTP Health check
-
- [ ] Disk
+
- [x] Disk Usage
+
- [ ] Disk Write/Read
- [ ] Docker
## Installation
+12 -12
config.go
···
}
type Thresholds struct {
-
CPU CPU `yaml:"cpu"`
-
Memory Memory `yaml:"memory"`
-
HTTP HTTP `yaml:"http"`
-
}
-
-
type CPU struct {
-
Threshold float64 `yaml:"threshold"`
-
Duration time.Duration `yaml:"duration"`
-
Cooldown time.Duration `yaml:"cooldown"`
+
CPU ThresholdConfig `yaml:"cpu"`
+
Memory ThresholdConfig `yaml:"memory"`
+
Disk ThresholdConfig `yaml:"disk"`
+
HTTP HTTP `yaml:"http"`
}
-
type Memory struct {
+
// ThresholdConfig holds the alert settings for one monitored resource; the
// Thresholds struct uses it for the CPU, memory and disk entries.
//
// Threshold is the usage percentage above which the monitors send an alert.
// Duration is handed to cpu.Percent as the sampling interval by the CPU
// monitor; the default memory and disk settings leave it unset.
// Cooldown is the interval the monitors use to space out repeated alerts.
type ThresholdConfig struct {
Threshold float64 `yaml:"threshold"`
+
Duration time.Duration `yaml:"duration,omitempty"`
Cooldown time.Duration `yaml:"cooldown"`
}
···
func defaultConfig() *Config {
return &Config{
AlertThresholds: Thresholds{
-
CPU: CPU{
+
CPU: ThresholdConfig{
Threshold: 90,
Duration: 5 * time.Minute,
Cooldown: 30 * time.Minute,
},
-
Memory: Memory{
+
Memory: ThresholdConfig{
Threshold: 80,
Cooldown: 30 * time.Minute,
+
},
+
Disk: ThresholdConfig{
+
Threshold: 90,
+
Cooldown: 4 * time.Hour,
},
HTTP: HTTP{
URL: "http://localhost:8080/health",
+1
main.go
···
go monitorCPU(cfg)
go monitorMemory(cfg)
+
go monitorDisk(cfg)
go monitorHTTP(cfg)
select {} // keep alive
+40 -3
monitor.go
···
"time"
"github.com/shirou/gopsutil/v4/cpu"
+
"github.com/shirou/gopsutil/v4/disk"
"github.com/shirou/gopsutil/v4/mem"
)
···
alertCooldown := time.NewTimer(cfg.AlertThresholds.CPU.Cooldown)
for {
-
percent, err := cpu.Percent(time.Duration(1)*time.Second, false)
+
percent, err := cpu.Percent(cfg.AlertThresholds.CPU.Duration, false)
if err != nil {
log.Printf("Error getting CPU usage: %v", err)
time.Sleep(1 * time.Second)
···
for _, p := range percent {
total += p
}
-
avg := total / float64(len(percent))
if avg > cfg.AlertThresholds.CPU.Threshold {
···
}
}
+
func monitorDisk(cfg *Config) {
+
log.Printf("Monitoring disk usage with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.Disk.Threshold, cfg.AlertThresholds.Disk.Cooldown)
+
+
alertCooldown := time.NewTimer(cfg.AlertThresholds.Disk.Cooldown)
+
for {
+
usage, err := disk.Usage("/")
+
if err != nil {
+
log.Printf("Error getting disk usage: %v", err)
+
time.Sleep(1 * time.Second)
+
continue
+
}
+
+
usedPercent := usage.UsedPercent
+
if usedPercent > cfg.AlertThresholds.Disk.Threshold {
+
// Check if we're within the cooldown period
+
select {
+
case <-alertCooldown.C:
+
// Cooldown expired, check again
+
alertCooldown.Reset(cfg.AlertThresholds.Disk.Cooldown)
+
default:
+
// Within cooldown, skip alert
+
time.Sleep(1 * time.Second)
+
continue
+
}
+
+
err := sendEmail(fmt.Sprintf("Disk Usage Alert: %.2f%%", usedPercent),
+
fmt.Sprintf("Disk usage of %.2f%% has exceeded the threshold of %.2f%%", usedPercent, cfg.AlertThresholds.Disk.Threshold), cfg)
+
if err != nil {
+
log.Printf("Error sending email: %v", err)
+
}
+
}
+
+
time.Sleep(time.Duration(1) * time.Second)
+
}
+
}
+
func monitorHTTP(cfg *Config) {
log.Printf("Monitoring HTTP checks (%s) with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.HTTP.URL, cfg.AlertThresholds.HTTP.FailureThreshold, cfg.AlertThresholds.HTTP.Cooldown)
···
}
ctx, cancel := context.WithTimeout(context.Background(), cfg.AlertThresholds.HTTP.Timeout)
-
defer cancel()
resp, err := client.Do(req.WithContext(ctx))
if err != nil || resp.StatusCode >= 400 {
failureCount++
}
+
+
cancel()
}
// Calculate failure rate