
cleanup

Changed files (+1 -449)

cmd/bodega/main.go (-420)
···
-package main
-
-import (
-    "bytes"
-    "context"
-    "encoding/json"
-    "fmt"
-    "io"
-    "log/slog"
-    "net/http"
-    "os"
-    "strings"
-    "sync"
-    "time"
-
-    "github.com/ClickHouse/clickhouse-go/v2"
-    atproto_repo "github.com/bluesky-social/indigo/atproto/repo"
-    "github.com/bluesky-social/indigo/atproto/syntax"
-    "github.com/bluesky-social/indigo/repo"
-    "github.com/bluesky-social/indigo/util"
-    "github.com/haileyok/photocopy/clickhouse_inserter"
-    "github.com/haileyok/photocopy/models"
-    "github.com/ipfs/go-cid"
-    "github.com/ipld/go-car"
-    _ "github.com/joho/godotenv/autoload"
-    "github.com/urfave/cli/v2"
-    "go.uber.org/ratelimit"
-)
-
-func main() {
-    app := cli.App{
-        Name:   "bodega",
-        Action: run,
-        Flags: []cli.Flag{
-            &cli.StringFlag{
-                Name:     "clickhouse-addr",
-                EnvVars:  []string{"PHOTOCOPY_CLICKHOUSE_ADDR"},
-                Required: true,
-            },
-            &cli.StringFlag{
-                Name:     "clickhouse-database",
-                EnvVars:  []string{"PHOTOCOPY_CLICKHOUSE_DATABASE"},
-                Required: true,
-            },
-            &cli.StringFlag{
-                Name:    "clickhouse-user",
-                EnvVars: []string{"PHOTOCOPY_CLICKHOUSE_USER"},
-                Value:   "default",
-            },
-            &cli.StringFlag{
-                Name:     "clickhouse-pass",
-                EnvVars:  []string{"PHOTOCOPY_CLICKHOUSE_PASS"},
-                Required: true,
-            },
-            &cli.BoolFlag{
-                Name:  "debug",
-                Value: false,
-            },
-        },
-    }
-
-    app.Run(os.Args)
-}
-
-type RepoDownloader struct {
-    clients    map[string]*http.Client
-    rateLimits map[string]ratelimit.Limiter
-    mu         sync.RWMutex
-}
-
-func NewRepoDownloader() *RepoDownloader {
-    return &RepoDownloader{
-        clients:    make(map[string]*http.Client),
-        rateLimits: make(map[string]ratelimit.Limiter),
-    }
-}
-
-func (rd *RepoDownloader) getClient(service string) *http.Client {
-    rd.mu.RLock()
-    client, exists := rd.clients[service]
-    rd.mu.RUnlock()
-
-    if exists {
-        return client
-    }
-
-    rd.mu.Lock()
-    defer rd.mu.Unlock()
-
-    if client, exists := rd.clients[service]; exists {
-        return client
-    }
-
-    client = util.RobustHTTPClient()
-    client.Timeout = 30 * time.Minute
-    rd.clients[service] = client
-    return client
-}
-
-func (rd *RepoDownloader) getRateLimiter(service string) ratelimit.Limiter {
-    rd.mu.RLock()
-    limiter, exists := rd.rateLimits[service]
-    rd.mu.RUnlock()
-
-    if exists {
-        return limiter
-    }
-
-    rd.mu.Lock()
-    defer rd.mu.Unlock()
-
-    if limiter, exists := rd.rateLimits[service]; exists {
-        return limiter
-    }
-
-    // 3000 per five minutes
-    limiter = ratelimit.New(10)
-    rd.rateLimits[service] = limiter
-    return limiter
-}
-
-func (rd *RepoDownloader) downloadRepo(service, did string) ([]byte, error) {
-    dlurl := fmt.Sprintf("%s/xrpc/com.atproto.sync.getRepo?did=%s", service, did)
-
-    req, err := http.NewRequestWithContext(context.TODO(), "GET", dlurl, nil)
-    if err != nil {
-        return nil, fmt.Errorf("failed to create request: %w", err)
-    }
-
-    client := rd.getClient(service)
-
-    resp, err := client.Do(req)
-    if err != nil {
-        return nil, fmt.Errorf("failed to download repo: %w", err)
-    }
-    defer resp.Body.Close()
-
-    if resp.StatusCode != http.StatusOK {
-        return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
-    }
-
-    b, err := io.ReadAll(resp.Body)
-    if err != nil {
-        return nil, fmt.Errorf("could not read bytes from response: %w", err)
-    }
-
-    return b, nil
-}
-
-func processRepo(b []byte, did string, inserter *clickhouse_inserter.Inserter) error {
-    bs := atproto_repo.NewTinyBlockstore()
-    cs, err := car.NewCarReader(bytes.NewReader(b))
-    if err != nil {
-        return fmt.Errorf("error opening car: %v\n", err)
-    }
-
-    currBlock, _ := cs.Next()
-    for currBlock != nil {
-        bs.Put(context.TODO(), currBlock)
-        next, _ := cs.Next()
-        currBlock = next
-    }
-
-    r, err := repo.OpenRepo(context.TODO(), bs, cs.Header.Roots[0])
-    if err != nil || r == nil {
-        fmt.Printf("could not open repo: %v", err)
-        return nil
-    }
-
-    if err := r.ForEach(context.TODO(), "", func(key string, cid cid.Cid) error {
-        pts := strings.Split(key, "/")
-        nsid := pts[0]
-        rkey := pts[1]
-        cidStr := cid.String()
-        b, err := bs.Get(context.TODO(), cid)
-        if err != nil {
-            return nil
-        }
-
-        var cat time.Time
-        tid, err := syntax.ParseTID(rkey)
-        if err != nil {
-            cat = time.Now()
-        } else {
-            cat = tid.Time()
-        }
-
-        rec := models.Record{
-            Did:        did,
-            Rkey:       rkey,
-            Collection: nsid,
-            Cid:        cidStr,
-            Seq:        "",
-            Raw:        string(b.RawData()),
-            CreatedAt:  cat,
-        }
-
-        inserter.Insert(context.TODO(), rec)
-
-        return nil
-    }); err != nil {
-        return fmt.Errorf("erorr traversing records: %v", err)
-    }
-
-    return nil
-}
-
-type ListReposResponse struct {
-    Cursor string          `json:"cursor"`
-    Repos  []ListReposRepo `json:"repos"`
-}
-
-type ListReposRepo struct {
-    Did    string  `json:"did"`
-    Head   string  `json:"head"`
-    Rev    string  `json:"rev"`
-    Active bool    `json:"active"`
-    Status *string `json:"status,omitempty"`
-}
-
-func (rd *RepoDownloader) getDidsFromService(ctx context.Context, service string) ([]ListReposRepo, error) {
-    var cursor string
-    var repos []ListReposRepo
-    for {
-        req, err := http.NewRequestWithContext(ctx, "GET", fmt.Sprintf("%s/xrpc/com.atproto.sync.listRepos?limit=1000&cursor=%s", service, cursor), nil)
-        if err != nil {
-            return nil, err
-        }
-
-        rl := rd.getRateLimiter(service)
-        rl.Take()
-
-        cli := rd.getClient(service)
-        resp, err := cli.Do(req)
-        if err != nil {
-            return nil, err
-        }
-        defer resp.Body.Close()
-
-        if resp.StatusCode != http.StatusOK {
-            return nil, fmt.Errorf("received non-200 response code: %d", resp.StatusCode)
-        }
-
-        var reposResp ListReposResponse
-        if err := json.NewDecoder(resp.Body).Decode(&reposResp); err != nil {
-            return nil, fmt.Errorf("error decoding repos response: %w", err)
-        }
-
-        repos = append(repos, reposResp.Repos...)
-
-        if len(reposResp.Repos) != 1000 {
-            break
-        }
-    }
-
-    return repos, nil
-}
-
-var run = func(cmd *cli.Context) error {
-    startTime := time.Now()
-
-    conn, err := clickhouse.Open(&clickhouse.Options{
-        Addr: []string{cmd.String("clickhouse-addr")},
-        Auth: clickhouse.Auth{
-            Database: cmd.String("clickhouse-database"),
-            Username: cmd.String("clickhouse-user"),
-            Password: cmd.String("clickhouse-pass"),
-        },
-    })
-    if err != nil {
-        return err
-    }
-    defer conn.Close()
-
-    fmt.Println("querying clickhouse for dids and services...")
-
-    type servicesQueryRow struct {
-        PlcOpServices []string `ch:"plc_op_services"`
-    }
-    var servicesQueryRows []servicesQueryRow
-    if err := conn.Select(cmd.Context, &servicesQueryRows, `
-        SELECT DISTINCT(plc_op_services) FROM default.plc WHERE arrayExists(x -> x LIKE '%.bsky.network', plc_op_services)
-    `); err != nil {
-        return err
-    }
-
-    servicesDids := map[string][]string{}
-    for _, svcs := range servicesQueryRows {
-        for _, s := range svcs.PlcOpServices {
-            servicesDids[s] = []string{}
-        }
-    }
-
-    fmt.Printf("found %d services\n", len(servicesDids))
-
-    fmt.Printf("getting most recent record for each did...")
-    var records []models.Record
-    if err := conn.Select(cmd.Context, &records, `
-        SELECT did, created_at
-        FROM default.record
-        QUALIFY row_number() OVER (PARTITION BY did ORDER BY created_at ASC) = 1
-    `); err != nil {
-        return err
-    }
-
-    fmt.Printf("collecting dids...\n")
-
-    didCreatedAt := map[string]time.Time{}
-    for _, r := range records {
-        didCreatedAt[r.Did] = r.CreatedAt
-    }
-
-    inserter, err := clickhouse_inserter.New(context.TODO(), &clickhouse_inserter.Args{
-        BatchSize: 100000,
-        Logger:    slog.Default(),
-        Conn:      conn,
-        Query:     "INSERT INTO record (did, rkey, collection, cid, seq, raw, created_at)",
-        RateLimit: 2, // two inserts per second in the event of massive repos
-    })
-    if err != nil {
-        return err
-    }
-
-    fmt.Printf("building download buckets...")
-
-    skipped := 0
-    total := 0
-    needOlderThan, _ := time.Parse(time.DateTime, "2025-06-28 04:18:22")
-    downloader := NewRepoDownloader()
-    serviceDids := map[string][]string{}
-
-    wg := sync.WaitGroup{}
-    for s := range servicesDids {
-        wg.Add(1)
-        go func() {
-            defer wg.Done()
-            repos, err := downloader.getDidsFromService(context.TODO(), s)
-            if err != nil {
-                fmt.Printf("error getting dids for services %s: %v", s, err)
-                return
-            }
-            dids := []string{}
-            for _, r := range repos {
-                lastRecord, exists := didCreatedAt[r.Did]
-                if exists && lastRecord.Before(needOlderThan) {
-                    skipped++
-                    continue
-                }
-
-                dids = append(dids, r.Did)
-            }
-            serviceDids[s] = dids
-        }()
-    }
-
-    fmt.Println("getting all the repos...")
-    wg.Wait()
-
-    fmt.Printf("Total jobs: %d across %d services \n", total, len(serviceDids))
-    fmt.Printf("was able to skip %d repos\n", skipped)
-
-    for service, dids := range serviceDids {
-        if len(dids) < 100 {
-            continue
-        }
-        fmt.Printf("%s: %d jobs\n", service, len(dids))
-    }
-
-    processed := 0
-    errored := 0
-
-    for service, dids := range serviceDids {
-        go func() {
-            for _, did := range dids {
-                ratelimiter := downloader.getRateLimiter(service)
-                ratelimiter.Take()
-
-                b, err := downloader.downloadRepo(service, did)
-                if err != nil {
-                    errored++
-                    processed++
-                    continue
-                }
-
-                go func(b []byte, did string, inserter *clickhouse_inserter.Inserter) {
-                    processRepo(b, did, inserter)
-                }(b, did, inserter)
-
-                processed++
-            }
-        }()
-    }
-
-    ticker := time.NewTicker(1 * time.Second)
-    defer ticker.Stop()
-
-    for range ticker.C {
-        elapsed := time.Since(startTime)
-        rate := float64(processed) / elapsed.Seconds()
-        remaining := total - processed
-
-        var eta string
-        if rate > 0 {
-            etaSeconds := float64(remaining) / rate
-            etaDuration := time.Duration(etaSeconds * float64(time.Second))
-            eta = fmt.Sprintf(", ETA: %v", etaDuration.Round(time.Second))
-        } else {
-            eta = ", ETA: calculating..."
-        }
-
-        fmt.Printf("\rProgress: %d/%d processed (%.1f%%), %d skipped, %d errors, %.1f jobs/sec%s",
-            processed, total, float64(processed)/float64(total)*100, skipped, errored, rate, eta)
-    }
-
-    fmt.Printf("\nCompleted: %d processed, %d errors\n", processed, errored)
-
-    inserter.Close(context.TODO())
-
-    return nil
-}

cmd/bodega/plc_models.go (-28)
···
-package main
-
-import (
-    "time"
-)
-
-type ClickhousePLCEntry struct {
-    Did                 string    `ch:"did"`
-    Cid                 string    `ch:"cid"`
-    Nullified           bool      `ch:"nullified"`
-    CreatedAt           time.Time `ch:"created_at"`
-    PlcOpSig            string    `ch:"plc_op_sig"`
-    PlcOpPrev           string    `ch:"plc_op_prev"`
-    PlcOpType           string    `ch:"plc_op_type"`
-    PlcOpServices       []string  `ch:"plc_op_services"`
-    PlcOpAlsoKnownAs    []string  `ch:"plc_op_also_known_as"`
-    PlcOpRotationKeys   []string  `ch:"plc_op_rotation_keys"`
-    PlcTombSig          string    `ch:"plc_tomb_sig"`
-    PlcTombPrev         string    `ch:"plc_tomb_prev"`
-    PlcTombType         string    `ch:"plc_tomb_type"`
-    LegacyOpSig         string    `ch:"legacy_op_sig"`
-    LegacyOpPrev        string    `ch:"legacy_op_prev"`
-    LegacyOpType        string    `ch:"legacy_op_type"`
-    LegacyOpHandle      string    `ch:"legacy_op_handle"`
-    LegacyOpService     string    `ch:"legacy_op_service"`
-    LegacyOpSigningKey  string    `ch:"legacy_op_signing_key"`
-    LegacyOpRecoveryKey string    `ch:"legacy_op_recovery_key"`
-}

go.mod (+1 -1)
···
github.com/prometheus/client_golang v1.22.0
github.com/urfave/cli/v2 v2.25.7
go.uber.org/ratelimit v0.3.1
-golang.org/x/sync v0.15.0
)
require (
···
golang.org/x/crypto v0.39.0 // indirect
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect
golang.org/x/net v0.41.0 // indirect
+golang.org/x/sync v0.15.0 // indirect
golang.org/x/sys v0.33.0 // indirect
golang.org/x/text v0.26.0 // indirect
golang.org/x/time v0.11.0 // indirect