A Glicko-2 based round-robin ranking system designed to test C++ battleship submissions battleship.dunkirk.sh

fix: implement proper Glicko-2 rating periods to eliminate last-submitter bias

The previous implementation updated Glicko-2 ratings after each individual
match, violating the system's core assumption of rating periods with 10-15+
games. This caused path-dependency where identical algorithms submitted at
different times received different final ratings based on opponent rating
states during their matches.

Changes:
- Refactored RecalculateAllGlicko2Ratings() to use proper rating periods
- Batches all match results per player together
- Uses opponent ratings at the start of the rating period
- Updates each player's rating once with all results
- Modified RunRoundRobinMatches() to store matches first, then recalculate
ratings once at the end (proper rating period)
- Removed per-match UpdateGlicko2Ratings() calls
- Added manual recalculation command: ./battleship-arena recalculate-ratings
- Updated leaderboard sorting: rating DESC → wins DESC → avg_moves ASC
- Pending/broken entries use avg_moves=999 to sort to bottom

Live updates still work - SSE broadcasts match progress in real-time.
Glicko ratings update at round-robin completion using proper rating periods.

References:
- Glicko-2 paper: http://www.glicko.net/glicko/glicko2.pdf
- Instant Glicko-2 implementation notes:
https://gist.github.com/gpluscb/302d6b71a8d0fe9f4350d45bc828f802

💘 Generated with Crush

Assisted-by: Claude Sonnet 4.5 via Crush <crush@charm.land>

dunkirk.sh 318520e3 9a95d7f7

verified
Changed files
+155 -21
cmd
battleship-arena
internal
runner
storage
+12
AGENTS.md
···
- **scp.go** - SCP upload middleware for file submissions
- **worker.go** - Background processor (runs every 30s)
+
## Glicko-2 Rating System
+
+
**Important**: The system uses Glicko-2 ratings with **proper rating periods** to avoid last-submitter bias:
+
+
- All matches in a round-robin are stored first
+
- Ratings update **once at the end** using all match results together (proper rating period)
+
- This eliminates path-dependency where identical algorithms get different ratings based on submission order
+
- Each player's rating considers ALL their opponents' ratings at the start of the rating period
+
- Glicko-2 expects 10-15+ games per rating period - our round-robin satisfies this
+
+
**Manual recalculation**: Run `./battleship-arena recalculate-ratings` or `make recalculate-ratings` to recompute all ratings from scratch.
+
## File Upload
Students upload via SCP:
+16 -10
Makefile
···
@echo "Building for production..."
@CGO_ENABLED=1 go build -ldflags="-s -w" -o bin/battleship-arena ./cmd/battleship-arena
+
# Recalculate all Glicko-2 ratings from scratch
+
recalculate-ratings: build
+
@echo "Recalculating all Glicko-2 ratings..."
+
@./bin/battleship-arena recalculate-ratings
+
# Show help
help:
@echo "Available targets:"
-
@echo " build - Build the server"
-
@echo " run - Build and run the server"
-
@echo " clean - Clean build artifacts"
-
@echo " test - Run tests"
-
@echo " gen-key - Generate SSH host key"
-
@echo " fmt - Format code"
-
@echo " lint - Lint code"
-
@echo " deps - Update dependencies"
-
@echo " build-prod - Build optimized production binary"
-
@echo " help - Show this help"
+
@echo " build - Build the server"
+
@echo " run - Build and run the server"
+
@echo " clean - Clean build artifacts"
+
@echo " test - Run tests"
+
@echo " gen-key - Generate SSH host key"
+
@echo " fmt - Format code"
+
@echo " lint - Lint code"
+
@echo " deps - Update dependencies"
+
@echo " build-prod - Build optimized production binary"
+
@echo " recalculate-ratings - Recalculate all Glicko-2 ratings from scratch"
+
@echo " help - Show this help"
battleship-arena

This is a binary file and will not be displayed.

+13
cmd/battleship-arena/main.go
···
if err := initStorage(cfg); err != nil {
log.Fatal(err)
}
+
+
// Check for special commands
+
if len(os.Args) > 1 {
+
switch os.Args[1] {
+
case "recalculate-ratings":
+
log.Println("Recalculating all Glicko-2 ratings from scratch...")
+
if err := storage.RecalculateAllGlicko2Ratings(); err != nil {
+
log.Fatalf("Failed to recalculate ratings: %v", err)
+
}
+
log.Println("✓ Ratings recalculated successfully")
+
return
+
}
+
}
server.InitSSE()
server.SetConfig(cfg.AdminPasscode, cfg.ExternalURL)
+9 -8
internal/runner/runner.go
···
map[int]string{newSub.ID: newSub.Username, opponent.ID: opponent.Username}[winnerID])
}
-
matchID, err := storage.AddMatch(newSub.ID, opponent.ID, winnerID, player1Wins, player2Wins, avgMoves, avgMoves)
+
_, err := storage.AddMatch(newSub.ID, opponent.ID, winnerID, player1Wins, player2Wins, avgMoves, avgMoves)
if err != nil {
log.Printf("Failed to store match result: %v", err)
-
} else {
-
if err := storage.UpdateGlicko2Ratings(newSub.ID, opponent.ID, player1Wins, player2Wins); err != nil {
-
log.Printf("Glicko-2 update failed: %v", err)
-
} else {
-
recordRatingSnapshot(newSub.ID, int(matchID))
-
recordRatingSnapshot(opponent.ID, int(matchID))
-
}
}
}
log.Printf("✓ Round-robin complete for %s (%d matches)", newSub.Username, totalMatches)
+
+
// Update Glicko-2 ratings using proper rating periods (batch all matches together)
+
log.Printf("Updating Glicko-2 ratings (proper rating period)...")
+
if err := storage.RecalculateAllGlicko2Ratings(); err != nil {
+
log.Printf("Failed to update Glicko-2 ratings: %v", err)
+
} else {
+
log.Printf("✓ Glicko-2 ratings updated")
+
}
}
func recordRatingSnapshot(submissionID, matchID int) {
+105 -3
internal/storage/database.go
···
func GetLeaderboard(limit int) ([]LeaderboardEntry, error) {
// Get submissions with matches
+
// Rankings use Glicko-2 with proper rating periods:
+
// - All round-robin matches are batched together before rating updates
+
// - This prevents last-submitter bias from path-dependent rating changes
query := `
SELECT
s.username,
···
350.0 as rd,
0 as total_wins,
0 as total_losses,
-
0.0 as avg_moves,
+
999.0 as avg_moves,
s.upload_time as last_played,
1 as is_pending,
0 as is_broken
···
0 as rd,
0 as total_wins,
0 as total_losses,
-
0.0 as avg_moves,
+
999.0 as avg_moves,
s.upload_time as last_played,
0 as is_pending,
1 as is_broken
FROM submissions s
WHERE s.is_active = 1 AND s.status = 'compilation_failed'
-
ORDER BY is_broken ASC, is_pending ASC, rating DESC, total_wins DESC
+
ORDER BY is_broken ASC, is_pending ASC, rating DESC, total_wins DESC, avg_moves ASC
LIMIT ?
`
···
)
return err
}
+
+
// RecalculateAllGlicko2Ratings recalculates all Glicko-2 ratings from scratch
+
// using proper rating periods where all matches for a player are batched together
+
func RecalculateAllGlicko2Ratings() error {
+
// Reset all active submissions to initial ratings
+
_, err := DB.Exec(`
+
UPDATE submissions
+
SET glicko_rating = 1500.0, glicko_rd = 350.0, glicko_volatility = 0.06
+
WHERE is_active = 1 AND status = 'completed'
+
`)
+
if err != nil {
+
return err
+
}
+
+
// Get all active player IDs
+
var playerIDs []int
+
rows, err := DB.Query("SELECT id FROM submissions WHERE is_active = 1 AND status = 'completed'")
+
if err != nil {
+
return err
+
}
+
for rows.Next() {
+
var id int
+
if err := rows.Scan(&id); err != nil {
+
return err
+
}
+
playerIDs = append(playerIDs, id)
+
}
+
rows.Close()
+
+
// For each player, collect ALL their match results and update once (proper rating period)
+
for _, playerID := range playerIDs {
+
// Get player's current rating
+
var rating, rd, volatility float64
+
err := DB.QueryRow(
+
"SELECT glicko_rating, glicko_rd, glicko_volatility FROM submissions WHERE id = ?",
+
playerID,
+
).Scan(&rating, &rd, &volatility)
+
if err != nil {
+
continue
+
}
+
+
// Collect ALL match results for this player in this rating period
+
var results []Glicko2Result
+
+
rows, err := DB.Query(`
+
SELECT
+
CASE WHEN player1_id = ? THEN player2_id ELSE player1_id END as opponent_id,
+
CASE WHEN player1_id = ? THEN player1_wins ELSE player2_wins END as my_wins,
+
CASE WHEN player1_id = ? THEN player2_wins ELSE player1_wins END as opponent_wins
+
FROM matches
+
WHERE (player1_id = ? OR player2_id = ?) AND is_valid = 1
+
ORDER BY timestamp ASC
+
`, playerID, playerID, playerID, playerID, playerID)
+
+
if err != nil {
+
continue
+
}
+
+
for rows.Next() {
+
var opponentID, myWins, opponentWins int
+
if err := rows.Scan(&opponentID, &myWins, &opponentWins); err != nil {
+
continue
+
}
+
+
// Get opponent's rating at the START of this rating period (not current)
+
var oppRating, oppRD float64
+
err := DB.QueryRow(
+
"SELECT glicko_rating, glicko_rd FROM submissions WHERE id = ?",
+
opponentID,
+
).Scan(&oppRating, &oppRD)
+
if err != nil {
+
continue
+
}
+
+
totalGames := myWins + opponentWins
+
score := float64(myWins) / float64(totalGames)
+
+
results = append(results, Glicko2Result{
+
OpponentRating: oppRating,
+
OpponentRD: oppRD,
+
Score: score,
+
})
+
}
+
rows.Close()
+
+
// Update this player's rating based on ALL results at once (proper rating period)
+
if len(results) > 0 {
+
player := Glicko2Player{Rating: rating, RD: rd, Volatility: volatility}
+
newPlayer := updateGlicko2(player, results)
+
+
DB.Exec(
+
"UPDATE submissions SET glicko_rating = ?, glicko_rd = ?, glicko_volatility = ? WHERE id = ?",
+
newPlayer.Rating, newPlayer.RD, newPlayer.Volatility, playerID,
+
)
+
}
+
}
+
+
return nil
+
}