commit 9ac2a626ae839f2ad0f0418d41c0d29455ac3201 · julien.rbrt.fr/servmon

+4 -2

README.md

···

       1
       1
       -
       # Servmond

     

       1
       1
       +
       # Servmon

     

       2
       2
        
       

     

       3
       3
        
       KISS server monitoring tool with email alerts.

     

       4
       4
       +
       For those who want to keep it simple instead of using Prometheus, Grafana, and Alertmanager.

     

       4
       5
        
       

     

       5
       6
        
       Monitors:

     

       6
       7
        
       

     

       7
       8
        
       - [x] CPU

     

       8
       9
        
       - [x] Memory

     

       9
       10
        
       - [x] HTTP Health check

     

       10
       10
       -
       - [ ] Disk

     

       11
       11
       +
       - [x] Disk Usage

     

       12
       12
       +
       - [ ] Disk Write/Read

     

       11
       13
        
       - [ ] Docker

     

       12
       14
        
       

     

       13
       15
        
       ## Installation

+12 -12

config.go

···

       15
       15
        
       }

     

       16
       16
        
       

     

       17
       17
        
       type Thresholds struct {

     

       18
       18
       -
       	CPU    CPU    `yaml:"cpu"`

     

       19
       19
       -
       	Memory Memory `yaml:"memory"`

     

       20
       20
       -
       	HTTP   HTTP   `yaml:"http"`

     

       21
       21
       -
       }

     

       22
       22
       -
       

     

       23
       23
       -
       type CPU struct {

     

       24
       24
       -
       	Threshold float64       `yaml:"threshold"`

     

       25
       25
       -
       	Duration  time.Duration `yaml:"duration"`

     

       26
       26
       -
       	Cooldown  time.Duration `yaml:"cooldown"`

     

       18
       18
       +
       	CPU    ThresholdConfig `yaml:"cpu"`

     

       19
       19
       +
       	Memory ThresholdConfig `yaml:"memory"`

     

       20
       20
       +
       	Disk   ThresholdConfig `yaml:"disk"`

     

       21
       21
       +
       	HTTP   HTTP            `yaml:"http"`

     

       27
       22
        
       }

     

       28
       23
        
       

     

       29
       29
       -
       type Memory struct {

     

       24
       24
       +
       type ThresholdConfig struct {

     

       30
       25
        
       	Threshold float64       `yaml:"threshold"`

     

       26
       26
       +
       	Duration  time.Duration `yaml:"duration,omitempty"`

     

       31
       27
        
       	Cooldown  time.Duration `yaml:"cooldown"`

     

       32
       28
        
       }

     

       33
       29
        
       

     
···

       65
       61
        
       func defaultConfig() *Config {

     

       66
       62
        
       	return &Config{

     

       67
       63
        
       		AlertThresholds: Thresholds{

     

       68
       68
       -
       			CPU: CPU{

     

       64
       64
       +
       			CPU: ThresholdConfig{

     

       69
       65
        
       				Threshold: 90,

     

       70
       66
        
       				Duration:  5 * time.Minute,

     

       71
       67
        
       				Cooldown:  30 * time.Minute,

     

       72
       68
        
       			},

     

       73
       73
       -
       			Memory: Memory{

     

       69
       69
       +
       			Memory: ThresholdConfig{

     

       74
       70
        
       				Threshold: 80,

     

       75
       71
        
       				Cooldown:  30 * time.Minute,

     

       72
       72
       +
       			},

     

       73
       73
       +
       			Disk: ThresholdConfig{

     

       74
       74
       +
       				Threshold: 90,

     

       75
       75
       +
       				Cooldown:  4 * time.Hour,

     

       76
       76
        
       			},

     

       77
       77
        
       			HTTP: HTTP{

     

       78
       78
        
       				URL:              "http://localhost:8080/health",

main.go

···

       50
       50
        
       

     

       51
       51
        
       			go monitorCPU(cfg)

     

       52
       52
        
       			go monitorMemory(cfg)

     

       53
       53
       +
       			go monitorDisk(cfg)

     

       53
       54
        
       			go monitorHTTP(cfg)

     

       54
       55
        
       

     

       55
       56
        
       			select {} // keep alive

+40 -3

monitor.go

···

       8
       8
        
       	"time"

     

       9
       9
        
       

     

       10
       10
        
       	"github.com/shirou/gopsutil/v4/cpu"

     

       11
       11
       +
       	"github.com/shirou/gopsutil/v4/disk"

     

       11
       12
        
       	"github.com/shirou/gopsutil/v4/mem"

     

       12
       13
        
       )

     

       13
       14
        
       

     
···

       16
       17
        
       

     

       17
       18
        
       	alertCooldown := time.NewTimer(cfg.AlertThresholds.CPU.Cooldown)

     

       18
       19
        
       	for {

     

       19
       19
       -
       		percent, err := cpu.Percent(time.Duration(1)*time.Second, false)

     

       20
       20
       +
       		percent, err := cpu.Percent(cfg.AlertThresholds.CPU.Duration, false)

     

       20
       21
        
       		if err != nil {

     

       21
       22
        
       			log.Printf("Error getting CPU usage: %v", err)

     

       22
       23
        
       			time.Sleep(1 * time.Second)

     
···

       28
       29
        
       		for _, p := range percent {

     

       29
       30
        
       			total += p

     

       30
       31
        
       		}

     

       31
       31
       -
       

     

       32
       32
        
       		avg := total / float64(len(percent))

     

       33
       33
        
       

     

       34
       34
        
       		if avg > cfg.AlertThresholds.CPU.Threshold {

     
···

       91
       91
        
       	}

     

       92
       92
        
       }

     

       93
       93
        
       

     

       94
       94
       +
       func monitorDisk(cfg *Config) {

     

       95
       95
       +
       	log.Printf("Monitoring disk usage with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.Disk.Threshold, cfg.AlertThresholds.Disk.Cooldown)

     

       96
       96
       +
       

     

       97
       97
       +
       	alertCooldown := time.NewTimer(cfg.AlertThresholds.Disk.Cooldown)

     

       98
       98
       +
       	for {

     

       99
       99
       +
       		usage, err := disk.Usage("/")

     

       100
       100
       +
       		if err != nil {

     

       101
       101
       +
       			log.Printf("Error getting disk usage: %v", err)

     

       102
       102
       +
       			time.Sleep(1 * time.Second)

     

       103
       103
       +
       			continue

     

       104
       104
       +
       		}

     

       105
       105
       +
       

     

       106
       106
       +
       		usedPercent := usage.UsedPercent

     

       107
       107
       +
       		if usedPercent > cfg.AlertThresholds.Disk.Threshold {

     

       108
       108
       +
       			// Check if we're within the cooldown period

     

       109
       109
       +
       			select {

     

       110
       110
       +
       			case <-alertCooldown.C:

     

       111
       111
       +
       				// Cooldown expired, check again

     

       112
       112
       +
       				alertCooldown.Reset(cfg.AlertThresholds.Disk.Cooldown)

     

       113
       113
       +
       			default:

     

       114
       114
       +
       				// Within cooldown, skip alert

     

       115
       115
       +
       				time.Sleep(1 * time.Second)

     

       116
       116
       +
       				continue

     

       117
       117
       +
       			}

     

       118
       118
       +
       

     

       119
       119
       +
       			err := sendEmail(fmt.Sprintf("Disk Usage Alert: %.2f%%", usedPercent),

     

       120
       120
       +
       				fmt.Sprintf("Disk usage of %.2f%% has exceeded the threshold of %.2f%%", usedPercent, cfg.AlertThresholds.Disk.Threshold), cfg)

     

       121
       121
       +
       			if err != nil {

     

       122
       122
       +
       				log.Printf("Error sending email: %v", err)

     

       123
       123
       +
       			}

     

       124
       124
       +
       		}

     

       125
       125
       +
       

     

       126
       126
       +
       		time.Sleep(time.Duration(1) * time.Second)

     

       127
       127
       +
       	}

     

       128
       128
       +
       }

     

       129
       129
       +
       

     

       94
       130
        
       func monitorHTTP(cfg *Config) {

     

       95
       131
        
       	log.Printf("Monitoring HTTP checks (%s) with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.HTTP.URL, cfg.AlertThresholds.HTTP.FailureThreshold, cfg.AlertThresholds.HTTP.Cooldown)

     

       96
       132
        
       

     
···

       113
       149
        
       			}

     

       114
       150
        
       

     

       115
       151
        
       			ctx, cancel := context.WithTimeout(context.Background(), cfg.AlertThresholds.HTTP.Timeout)

     

       116
       116
       -
       			defer cancel()

     

       117
       152
        
       

     

       118
       153
        
       			resp, err := client.Do(req.WithContext(ctx))

     

       119
       154
        
       			if err != nil || resp.StatusCode >= 400 {

     

       120
       155
        
       				failureCount++

     

       121
       156
        
       			}

     

       157
       157
       +
       

     

       158
       158
       +
       			cancel()

     

       122
       159
        
       		}

     

       123
       160
        
       

     

       124
       161
        
       		// Calculate failure rate