this repo has no description

feat: backfilling

Changed files
+852 -41
clickhouse_inserter
cmd
bodega
photocopy
models
+1
Makefile
···
.PHONY: build
build: ## Build all executables
go build -ldflags "-X main.Version=$(VERSION)" -o photocopy ./cmd/photocopy
+
go build -o bodega ./cmd/bodega
.PHONY: run
run:
+408
backfiller.go
···
+
package photocopy
+
+
import (
+
"bytes"
+
"context"
+
"encoding/json"
+
"fmt"
+
"io"
+
"net/http"
+
"strings"
+
"sync"
+
"time"
+
+
atproto_repo "github.com/bluesky-social/indigo/atproto/repo"
+
"github.com/bluesky-social/indigo/repo"
+
"github.com/bluesky-social/indigo/util"
+
"github.com/ipfs/go-cid"
+
"github.com/ipld/go-car"
+
_ "github.com/joho/godotenv/autoload"
+
"go.uber.org/ratelimit"
+
)
+
+
type RepoDownloader struct {
+
clients map[string]*http.Client
+
rateLimits map[string]ratelimit.Limiter
+
mu sync.RWMutex
+
p *Photocopy
+
}
+
+
func NewRepoDownloader(p *Photocopy) *RepoDownloader {
+
return &RepoDownloader{
+
clients: make(map[string]*http.Client),
+
rateLimits: make(map[string]ratelimit.Limiter),
+
p: p,
+
}
+
}
+
+
func (rd *RepoDownloader) getClient(service string) *http.Client {
+
rd.mu.RLock()
+
client, exists := rd.clients[service]
+
rd.mu.RUnlock()
+
+
if exists {
+
return client
+
}
+
+
rd.mu.Lock()
+
defer rd.mu.Unlock()
+
+
if client, exists := rd.clients[service]; exists {
+
return client
+
}
+
+
client = util.RobustHTTPClient()
+
client.Timeout = 45 * time.Second
+
rd.clients[service] = client
+
return client
+
}
+
+
func (rd *RepoDownloader) getRateLimiter(service string) ratelimit.Limiter {
+
if !strings.HasSuffix(service, ".bsky.network") {
+
service = "third-party"
+
}
+
+
rd.mu.RLock()
+
limiter, exists := rd.rateLimits[service]
+
rd.mu.RUnlock()
+
+
if exists {
+
return limiter
+
}
+
+
rd.mu.Lock()
+
defer rd.mu.Unlock()
+
+
if limiter, exists := rd.rateLimits[service]; exists {
+
return limiter
+
}
+
+
// 3000 per five minutes
+
limiter = ratelimit.New(10)
+
rd.rateLimits[service] = limiter
+
return limiter
+
}
+
+
func (rd *RepoDownloader) downloadRepo(service, did string) ([]byte, error) {
+
dlurl := fmt.Sprintf("%s/xrpc/com.atproto.sync.getRepo?did=%s", service, did)
+
+
req, err := http.NewRequestWithContext(context.TODO(), "GET", dlurl, nil)
+
if err != nil {
+
return nil, fmt.Errorf("failed to create request: %w", err)
+
}
+
+
if rd.p.ratelimitBypassKey != "" && strings.HasSuffix(service, ".bsky.network") {
+
req.Header.Set("x-ratelimit-bypass", rd.p.ratelimitBypassKey)
+
}
+
+
client := rd.getClient(service)
+
+
resp, err := client.Do(req)
+
if err != nil {
+
return nil, fmt.Errorf("failed to download repo: %w", err)
+
}
+
defer resp.Body.Close()
+
+
if resp.StatusCode != http.StatusOK {
+
if resp.StatusCode == 400 {
+
return nil, nil
+
}
+
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
+
}
+
+
b, err := io.ReadAll(resp.Body)
+
if err != nil {
+
return nil, fmt.Errorf("could not read bytes from response: %w", err)
+
}
+
+
return b, nil
+
}
+
+
func (p *Photocopy) processRepo(ctx context.Context, b []byte, did string) error {
+
bs := atproto_repo.NewTinyBlockstore()
+
cs, err := car.NewCarReader(bytes.NewReader(b))
+
if err != nil {
+
return fmt.Errorf("error opening car: %v\n", err)
+
}
+
+
currBlock, _ := cs.Next()
+
for currBlock != nil {
+
bs.Put(context.TODO(), currBlock)
+
next, _ := cs.Next()
+
currBlock = next
+
}
+
+
r, err := repo.OpenRepo(context.TODO(), bs, cs.Header.Roots[0])
+
if err != nil || r == nil {
+
fmt.Printf("could not open repo: %v", err)
+
return nil
+
}
+
+
if err := r.ForEach(context.TODO(), "", func(key string, cid cid.Cid) error {
+
pts := strings.Split(key, "/")
+
nsid := pts[0]
+
rkey := pts[1]
+
cidStr := cid.String()
+
b, err := bs.Get(context.TODO(), cid)
+
if err != nil {
+
return nil
+
}
+
if err := p.handleCreate(ctx, b.RawData(), time.Now().Format(time.RFC3339Nano), "unk", did, nsid, rkey, cidStr, "unk"); err != nil {
+
return err
+
}
+
return nil
+
}); err != nil {
+
return fmt.Errorf("erorr traversing records: %v", err)
+
}
+
+
return nil
+
}
+
+
type ListReposResponse struct {
+
Cursor string `json:"cursor"`
+
Repos []ListReposRepo `json:"repos"`
+
}
+
+
type ListReposRepo struct {
+
Did string `json:"did"`
+
Head string `json:"head"`
+
Rev string `json:"rev"`
+
Active bool `json:"active"`
+
Status *string `json:"status,omitempty"`
+
}
+
+
func (rd *RepoDownloader) getDidsFromService(ctx context.Context, service string) ([]ListReposRepo, error) {
+
var cursor string
+
var repos []ListReposRepo
+
if service == "https://atproto.brid.gy" {
+
return nil, nil
+
}
+
for {
+
req, err := http.NewRequestWithContext(ctx, "GET", fmt.Sprintf("%s/xrpc/com.atproto.sync.listRepos?limit=1000&cursor=%s", service, cursor), nil)
+
if err != nil {
+
return nil, err
+
}
+
+
if rd.p.ratelimitBypassKey != "" && strings.HasSuffix(service, ".bsky.network") {
+
req.Header.Set("x-ratelimit-bypass", rd.p.ratelimitBypassKey)
+
}
+
+
rl := rd.getRateLimiter(service)
+
rl.Take()
+
+
cli := rd.getClient(service)
+
resp, err := cli.Do(req)
+
if err != nil {
+
return nil, err
+
}
+
defer resp.Body.Close()
+
+
if resp.StatusCode != http.StatusOK {
+
return nil, fmt.Errorf("received non-200 response code: %d", resp.StatusCode)
+
}
+
+
var reposResp ListReposResponse
+
if err := json.NewDecoder(resp.Body).Decode(&reposResp); err != nil {
+
return nil, fmt.Errorf("error decoding repos response: %w", err)
+
}
+
+
for _, repo := range reposResp.Repos {
+
if repo.Status != nil {
+
if *repo.Status == "deleted" || *repo.Status == "takendown" || *repo.Status == "deactivated" {
+
continue
+
}
+
}
+
+
repos = append(repos, repo)
+
}
+
+
if len(reposResp.Repos) != 1000 || reposResp.Cursor == "" {
+
break
+
}
+
+
fmt.Printf("cursor %s service %s\n", reposResp.Cursor, service)
+
+
cursor = reposResp.Cursor
+
}
+
+
return repos, nil
+
}
+
+
type ListServicesResponse struct {
+
Cursor string `json:"cursor"`
+
Hosts []ListServicesResponseItem `json:"hosts"`
+
}
+
+
type ListServicesResponseItem struct {
+
Hostname string `json:"hostname"`
+
Status string `json:"status"`
+
}
+
+
func (p *Photocopy) runBackfiller(ctx context.Context) error {
+
startTime := time.Now()
+
+
fmt.Println("querying clickhouse for dids and services...")
+
+
var hostsCursor string
+
var sevs []ListServicesResponseItem
+
for {
+
if hostsCursor != "" {
+
hostsCursor = "&cursor=" + hostsCursor
+
}
+
req, err := http.NewRequestWithContext(ctx, "GET", fmt.Sprintf("https://relay1.us-east.bsky.network/xrpc/com.atproto.sync.listHosts?limit=1000%s", hostsCursor), nil)
+
if err != nil {
+
return err
+
}
+
+
resp, err := http.DefaultClient.Do(req)
+
if err != nil {
+
return err
+
}
+
defer resp.Body.Close()
+
+
if resp.StatusCode != http.StatusOK {
+
return fmt.Errorf("received non-200 response code: %d", resp.StatusCode)
+
}
+
+
var sevsResp ListServicesResponse
+
if err := json.NewDecoder(resp.Body).Decode(&sevsResp); err != nil {
+
return fmt.Errorf("error decoding sevs response: %w", err)
+
}
+
+
for _, sev := range sevsResp.Hosts {
+
if sev.Status != "active" {
+
continue
+
}
+
+
sevs = append(sevs, sev)
+
}
+
+
if len(sevsResp.Hosts) != 1000 || sevsResp.Cursor == "" {
+
break
+
}
+
+
hostsCursor = sevsResp.Cursor
+
}
+
+
servicesDids := map[string][]string{}
+
for _, sev := range sevs {
+
servicesDids["https://"+sev.Hostname] = []string{}
+
}
+
+
fmt.Printf("found %d services\n", len(servicesDids))
+
+
fmt.Printf("collecting dids...\n")
+
+
fmt.Printf("building download buckets...")
+
+
skipped := 0
+
downloader := NewRepoDownloader(p)
+
serviceDids := map[string][]string{}
+
+
wg := sync.WaitGroup{}
+
mplk := sync.Mutex{}
+
for s := range servicesDids {
+
wg.Add(1)
+
go func() {
+
defer wg.Done()
+
repos, err := downloader.getDidsFromService(context.TODO(), s)
+
if err != nil {
+
fmt.Printf("error getting dids for services %s: %v", s, err)
+
return
+
}
+
dids := []string{}
+
for _, r := range repos {
+
dids = append(dids, r.Did)
+
}
+
mplk.Lock()
+
defer mplk.Unlock()
+
serviceDids[s] = dids
+
}()
+
}
+
+
fmt.Println("getting all the repos...")
+
wg.Wait()
+
+
fmt.Printf("was able to skip %d repos\n", skipped)
+
+
total := 0
+
+
for service, dids := range serviceDids {
+
if len(dids) < 100 {
+
continue
+
}
+
fmt.Printf("%s: %d jobs\n", service, len(dids))
+
total += len(dids)
+
}
+
+
fmt.Printf("Total jobs: %d across %d services \n", total, len(serviceDids))
+
+
for _, c := range downloader.clients {
+
c.Timeout = 10 * time.Minute
+
}
+
+
for s := range downloader.rateLimits {
+
if p.ratelimitBypassKey != "" && strings.HasSuffix(s, ".bsky.network") {
+
downloader.rateLimits[s] = ratelimit.New(25)
+
}
+
}
+
+
processed := 0
+
errored := 0
+
var errors []error
+
for service, dids := range serviceDids {
+
go func() {
+
for _, did := range dids {
+
ratelimiter := downloader.getRateLimiter(service)
+
ratelimiter.Take()
+
+
b, err := downloader.downloadRepo(service, did)
+
if err != nil {
+
errored++
+
processed++
+
errors = append(errors, err)
+
continue
+
}
+
+
go func(b []byte, did string) {
+
if err := p.processRepo(ctx, b, did); err != nil {
+
fmt.Printf("error processing backfill record: %v\n", err)
+
}
+
}(b, did)
+
+
processed++
+
}
+
}()
+
}
+
+
ticker := time.NewTicker(1 * time.Second)
+
defer ticker.Stop()
+
+
for range ticker.C {
+
elapsed := time.Since(startTime)
+
rate := float64(processed) / elapsed.Seconds()
+
remaining := total - processed
+
+
var eta string
+
if rate > 0 {
+
etaSeconds := float64(remaining) / rate
+
etaDuration := time.Duration(etaSeconds * float64(time.Second))
+
eta = fmt.Sprintf(", ETA: %v", etaDuration.Round(time.Second))
+
} else {
+
eta = ", ETA: calculating..."
+
}
+
+
for _, err := range errors {
+
fmt.Printf("%v\n", err)
+
}
+
+
errors = nil
+
+
fmt.Printf("\rProgress: %d/%d processed (%.1f%%), %d skipped, %d errors, %.1f jobs/sec%s",
+
processed, total, float64(processed)/float64(total)*100, skipped, errored, rate, eta)
+
}
+
+
fmt.Printf("\nCompleted: %d processed, %d errors\n", processed, errored)
+
+
return nil
+
}
+21 -5
clickhouse_inserter/inserter.go
···
"github.com/ClickHouse/clickhouse-go/v2/lib/driver"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
+
"go.uber.org/ratelimit"
)
type Inserter struct {
···
histogram *prometheus.HistogramVec
logger *slog.Logger
prefix string
+
rateLimit ratelimit.Limiter
}
type Args struct {
···
PrometheusCounterPrefix string
Logger *slog.Logger
Histogram *prometheus.HistogramVec
+
RateLimit int
}
func New(ctx context.Context, args *Args) (*Inserter, error) {
···
histogram: args.Histogram,
logger: args.Logger,
prefix: args.PrometheusCounterPrefix,
+
}
+
+
if args.RateLimit != 0 {
+
rateLimit := ratelimit.New(args.RateLimit)
+
inserter.rateLimit = rateLimit
}
if args.PrometheusCounterPrefix != "" {
···
}
func (i *Inserter) sendStream(ctx context.Context, toInsert []any) {
-
i.pendingSends.Inc()
-
defer i.pendingSends.Dec()
+
if i.pendingSends != nil {
+
i.pendingSends.Inc()
+
defer i.pendingSends.Dec()
+
}
if i.histogram != nil {
start := time.Now()
···
}
status := "ok"
-
defer func() {
-
i.insertsCounter.WithLabelValues(status).Add(float64(len(toInsert)))
-
}()
+
if i.insertsCounter != nil {
+
defer func() {
+
i.insertsCounter.WithLabelValues(status).Add(float64(len(toInsert)))
+
}()
+
}
batch, err := i.conn.PrepareBatch(ctx, i.query)
if err != nil {
···
if err := batch.AppendStruct(structPtr); err != nil {
i.logger.Error("error appending to batch", "prefix", i.prefix, "error", err)
}
+
}
+
+
if i.rateLimit != nil {
+
i.rateLimit.Take()
}
if err := batch.Send(); err != nil {
+361 -2
cmd/bodega/main.go
···
package main
import (
+
"bytes"
+
"context"
+
"encoding/json"
+
"fmt"
+
"io"
+
"log/slog"
+
"net/http"
"os"
+
"strings"
+
"sync"
+
"time"
"github.com/ClickHouse/clickhouse-go/v2"
+
atproto_repo "github.com/bluesky-social/indigo/atproto/repo"
+
"github.com/bluesky-social/indigo/atproto/syntax"
+
"github.com/bluesky-social/indigo/repo"
+
"github.com/bluesky-social/indigo/util"
+
"github.com/haileyok/photocopy/clickhouse_inserter"
+
"github.com/haileyok/photocopy/models"
+
"github.com/ipfs/go-cid"
+
"github.com/ipld/go-car"
+
_ "github.com/joho/godotenv/autoload"
"github.com/urfave/cli/v2"
+
"go.uber.org/ratelimit"
)
func main() {
···
EnvVars: []string{"PHOTOCOPY_CLICKHOUSE_PASS"},
Required: true,
},
+
&cli.BoolFlag{
+
Name: "debug",
+
Value: false,
+
},
},
}
app.Run(os.Args)
}
+
type RepoDownloader struct {
+
clients map[string]*http.Client
+
rateLimits map[string]ratelimit.Limiter
+
mu sync.RWMutex
+
}
+
+
func NewRepoDownloader() *RepoDownloader {
+
return &RepoDownloader{
+
clients: make(map[string]*http.Client),
+
rateLimits: make(map[string]ratelimit.Limiter),
+
}
+
}
+
+
func (rd *RepoDownloader) getClient(service string) *http.Client {
+
rd.mu.RLock()
+
client, exists := rd.clients[service]
+
rd.mu.RUnlock()
+
+
if exists {
+
return client
+
}
+
+
rd.mu.Lock()
+
defer rd.mu.Unlock()
+
+
if client, exists := rd.clients[service]; exists {
+
return client
+
}
+
+
client = util.RobustHTTPClient()
+
client.Timeout = 30 * time.Minute
+
rd.clients[service] = client
+
return client
+
}
+
+
func (rd *RepoDownloader) getRateLimiter(service string) ratelimit.Limiter {
+
rd.mu.RLock()
+
limiter, exists := rd.rateLimits[service]
+
rd.mu.RUnlock()
+
+
if exists {
+
return limiter
+
}
+
+
rd.mu.Lock()
+
defer rd.mu.Unlock()
+
+
if limiter, exists := rd.rateLimits[service]; exists {
+
return limiter
+
}
+
+
// 3000 per five minutes
+
limiter = ratelimit.New(10)
+
rd.rateLimits[service] = limiter
+
return limiter
+
}
+
+
func (rd *RepoDownloader) downloadRepo(service, did string) ([]byte, error) {
+
dlurl := fmt.Sprintf("%s/xrpc/com.atproto.sync.getRepo?did=%s", service, did)
+
+
req, err := http.NewRequestWithContext(context.TODO(), "GET", dlurl, nil)
+
if err != nil {
+
return nil, fmt.Errorf("failed to create request: %w", err)
+
}
+
+
client := rd.getClient(service)
+
+
resp, err := client.Do(req)
+
if err != nil {
+
return nil, fmt.Errorf("failed to download repo: %w", err)
+
}
+
defer resp.Body.Close()
+
+
if resp.StatusCode != http.StatusOK {
+
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
+
}
+
+
b, err := io.ReadAll(resp.Body)
+
if err != nil {
+
return nil, fmt.Errorf("could not read bytes from response: %w", err)
+
}
+
+
return b, nil
+
}
+
+
func processRepo(b []byte, did string, inserter *clickhouse_inserter.Inserter) error {
+
bs := atproto_repo.NewTinyBlockstore()
+
cs, err := car.NewCarReader(bytes.NewReader(b))
+
if err != nil {
+
return fmt.Errorf("error opening car: %v\n", err)
+
}
+
+
currBlock, _ := cs.Next()
+
for currBlock != nil {
+
bs.Put(context.TODO(), currBlock)
+
next, _ := cs.Next()
+
currBlock = next
+
}
+
+
r, err := repo.OpenRepo(context.TODO(), bs, cs.Header.Roots[0])
+
if err != nil || r == nil {
+
fmt.Printf("could not open repo: %v", err)
+
return nil
+
}
+
+
if err := r.ForEach(context.TODO(), "", func(key string, cid cid.Cid) error {
+
pts := strings.Split(key, "/")
+
nsid := pts[0]
+
rkey := pts[1]
+
cidStr := cid.String()
+
b, err := bs.Get(context.TODO(), cid)
+
if err != nil {
+
return nil
+
}
+
+
var cat time.Time
+
tid, err := syntax.ParseTID(rkey)
+
if err != nil {
+
cat = time.Now()
+
} else {
+
cat = tid.Time()
+
}
+
+
rec := models.Record{
+
Did: did,
+
Rkey: rkey,
+
Collection: nsid,
+
Cid: cidStr,
+
Seq: "",
+
Raw: string(b.RawData()),
+
CreatedAt: cat,
+
}
+
+
inserter.Insert(context.TODO(), rec)
+
+
return nil
+
}); err != nil {
+
return fmt.Errorf("erorr traversing records: %v", err)
+
}
+
+
return nil
+
}
+
+
type ListReposResponse struct {
+
Cursor string `json:"cursor"`
+
Repos []ListReposRepo `json:"repos"`
+
}
+
+
type ListReposRepo struct {
+
Did string `json:"did"`
+
Head string `json:"head"`
+
Rev string `json:"rev"`
+
Active bool `json:"active"`
+
Status *string `json:"status,omitempty"`
+
}
+
+
func (rd *RepoDownloader) getDidsFromService(ctx context.Context, service string) ([]ListReposRepo, error) {
+
var cursor string
+
var repos []ListReposRepo
+
for {
+
req, err := http.NewRequestWithContext(ctx, "GET", fmt.Sprintf("%s/xrpc/com.atproto.sync.listRepos?limit=1000&cursor=%s", service, cursor), nil)
+
if err != nil {
+
return nil, err
+
}
+
+
rl := rd.getRateLimiter(service)
+
rl.Take()
+
+
cli := rd.getClient(service)
+
resp, err := cli.Do(req)
+
if err != nil {
+
return nil, err
+
}
+
defer resp.Body.Close()
+
+
if resp.StatusCode != http.StatusOK {
+
return nil, fmt.Errorf("received non-200 response code: %d", resp.StatusCode)
+
}
+
+
var reposResp ListReposResponse
+
if err := json.NewDecoder(resp.Body).Decode(&reposResp); err != nil {
+
return nil, fmt.Errorf("error decoding repos response: %w", err)
+
}
+
+
repos = append(repos, reposResp.Repos...)
+
+
if len(reposResp.Repos) != 1000 {
+
break
+
}
+
}
+
+
return repos, nil
+
}
+
var run = func(cmd *cli.Context) error {
+
startTime := time.Now()
+
conn, err := clickhouse.Open(&clickhouse.Options{
Addr: []string{cmd.String("clickhouse-addr")},
Auth: clickhouse.Auth{
···
}
defer conn.Close()
-
var entries []ClickhousePLCEntry
-
if err := conn.Select(cmd.Context, &entries, "SELECT..."); err != nil {
+
fmt.Println("querying clickhouse for dids and services...")
+
+
type servicesQueryRow struct {
+
PlcOpServices []string `ch:"plc_op_services"`
+
}
+
var servicesQueryRows []servicesQueryRow
+
if err := conn.Select(cmd.Context, &servicesQueryRows, `
+
SELECT DISTINCT(plc_op_services) FROM default.plc WHERE arrayExists(x -> x LIKE '%.bsky.network', plc_op_services)
+
`); err != nil {
+
return err
+
}
+
+
servicesDids := map[string][]string{}
+
for _, svcs := range servicesQueryRows {
+
for _, s := range svcs.PlcOpServices {
+
servicesDids[s] = []string{}
+
}
+
}
+
+
fmt.Printf("found %d services\n", len(servicesDids))
+
+
fmt.Printf("getting most recent record for each did...")
+
var records []models.Record
+
if err := conn.Select(cmd.Context, &records, `
+
SELECT did, created_at
+
FROM default.record
+
QUALIFY row_number() OVER (PARTITION BY did ORDER BY created_at ASC) = 1
+
`); err != nil {
+
return err
+
}
+
+
fmt.Printf("collecting dids...\n")
+
+
didCreatedAt := map[string]time.Time{}
+
for _, r := range records {
+
didCreatedAt[r.Did] = r.CreatedAt
+
}
+
+
inserter, err := clickhouse_inserter.New(context.TODO(), &clickhouse_inserter.Args{
+
BatchSize: 100000,
+
Logger: slog.Default(),
+
Conn: conn,
+
Query: "INSERT INTO record (did, rkey, collection, cid, seq, raw, created_at)",
+
RateLimit: 2, // two inserts per second in the event of massive repos
+
})
+
if err != nil {
return err
}
+
+
fmt.Printf("building download buckets...")
+
+
skipped := 0
+
total := 0
+
needOlderThan, _ := time.Parse(time.DateTime, "2025-06-28 04:18:22")
+
downloader := NewRepoDownloader()
+
serviceDids := map[string][]string{}
+
+
wg := sync.WaitGroup{}
+
for s := range servicesDids {
+
wg.Add(1)
+
go func() {
+
defer wg.Done()
+
repos, err := downloader.getDidsFromService(context.TODO(), s)
+
if err != nil {
+
fmt.Printf("error getting dids for services %s: %v", s, err)
+
return
+
}
+
dids := []string{}
+
for _, r := range repos {
+
lastRecord, exists := didCreatedAt[r.Did]
+
if exists && lastRecord.Before(needOlderThan) {
+
skipped++
+
continue
+
}
+
+
dids = append(dids, r.Did)
+
}
+
serviceDids[s] = dids
+
}()
+
}
+
+
fmt.Println("getting all the repos...")
+
wg.Wait()
+
+
fmt.Printf("Total jobs: %d across %d services \n", total, len(serviceDids))
+
fmt.Printf("was able to skip %d repos\n", skipped)
+
+
for service, dids := range serviceDids {
+
if len(dids) < 100 {
+
continue
+
}
+
fmt.Printf("%s: %d jobs\n", service, len(dids))
+
}
+
+
processed := 0
+
errored := 0
+
+
for service, dids := range serviceDids {
+
go func() {
+
for _, did := range dids {
+
ratelimiter := downloader.getRateLimiter(service)
+
ratelimiter.Take()
+
+
b, err := downloader.downloadRepo(service, did)
+
if err != nil {
+
errored++
+
processed++
+
continue
+
}
+
+
go func(b []byte, did string, inserter *clickhouse_inserter.Inserter) {
+
processRepo(b, did, inserter)
+
}(b, did, inserter)
+
+
processed++
+
}
+
}()
+
}
+
+
ticker := time.NewTicker(1 * time.Second)
+
defer ticker.Stop()
+
+
for range ticker.C {
+
elapsed := time.Since(startTime)
+
rate := float64(processed) / elapsed.Seconds()
+
remaining := total - processed
+
+
var eta string
+
if rate > 0 {
+
etaSeconds := float64(remaining) / rate
+
etaDuration := time.Duration(etaSeconds * float64(time.Second))
+
eta = fmt.Sprintf(", ETA: %v", etaDuration.Round(time.Second))
+
} else {
+
eta = ", ETA: calculating..."
+
}
+
+
fmt.Printf("\rProgress: %d/%d processed (%.1f%%), %d skipped, %d errors, %.1f jobs/sec%s",
+
processed, total, float64(processed)/float64(total)*100, skipped, errored, rate, eta)
+
}
+
+
fmt.Printf("\nCompleted: %d processed, %d errors\n", processed, errored)
+
+
inserter.Close(context.TODO())
return nil
}
+10 -1
cmd/photocopy/main.go
···
EnvVars: []string{"PHOTOCOPY_CLICKHOUSE_PASS"},
Required: true,
},
+
&cli.StringFlag{
+
Name: "ratelimit-bypass-key",
+
EnvVars: []string{"PHOTOCOPY_RATELIMIT_BYPASS_KEY"},
+
Required: false,
+
},
+
&cli.BoolFlag{
+
Name: "with-backfill",
+
},
},
Commands: cli.Commands{
&cli.Command{
···
ClickhouseDatabase: cmd.String("clickhouse-database"),
ClickhouseUser: cmd.String("clickhouse-user"),
ClickhousePass: cmd.String("clickhouse-pass"),
+
RatelimitBypassKey: cmd.String("ratelimit-bypass-key"),
})
if err != nil {
panic(err)
···
cancel()
}()
-
if err := p.Run(ctx); err != nil {
+
if err := p.Run(ctx, cmd.Bool("with-backfill")); err != nil {
panic(err)
}
+4 -4
go.mod
···
go 1.24.4
require (
-
github.com/ClickHouse/clickhouse-go v1.5.4
github.com/ClickHouse/clickhouse-go/v2 v2.37.2
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de
github.com/bluesky-social/indigo v0.0.0-20250626183556-5641d3c27325
github.com/gorilla/websocket v1.5.1
github.com/ipfs/go-cid v0.5.0
+
github.com/ipld/go-car v0.6.1-0.20230509095817-92d28eb23ba4
github.com/joho/godotenv v1.5.1
github.com/prometheus/client_golang v1.22.0
github.com/urfave/cli/v2 v2.25.7
+
go.uber.org/ratelimit v0.3.1
+
golang.org/x/sync v0.15.0
)
require (
github.com/ClickHouse/ch-go v0.66.1 // indirect
github.com/RussellLuo/slidingwindow v0.0.0-20200528002341-535bb99d338b // indirect
github.com/andybalholm/brotli v1.1.1 // indirect
+
github.com/benbjohnson/clock v1.3.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/carlmjohnson/versioninfo v0.22.5 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
-
github.com/cloudflare/golz4 v0.0.0-20150217214814-ef862a3cdc58 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.3 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/go-faster/city v1.0.1 // indirect
···
github.com/ipfs/go-merkledag v0.11.0 // indirect
github.com/ipfs/go-metrics-interface v0.0.1 // indirect
github.com/ipfs/go-verifcid v0.0.3 // indirect
-
github.com/ipld/go-car v0.6.1-0.20230509095817-92d28eb23ba4 // indirect
github.com/ipld/go-codec-dagpb v1.6.0 // indirect
github.com/ipld/go-ipld-prime v0.21.0 // indirect
github.com/jackc/pgpassfile v1.0.0 // indirect
···
golang.org/x/crypto v0.39.0 // indirect
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect
golang.org/x/net v0.41.0 // indirect
-
golang.org/x/sync v0.15.0 // indirect
golang.org/x/sys v0.33.0 // indirect
golang.org/x/text v0.26.0 // indirect
golang.org/x/time v0.11.0 // indirect
+2 -12
go.sum
···
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/ClickHouse/ch-go v0.66.1 h1:LQHFslfVYZsISOY0dnOYOXGkOUvpv376CCm8g7W74A4=
github.com/ClickHouse/ch-go v0.66.1/go.mod h1:NEYcg3aOFv2EmTJfo4m2WF7sHB/YFbLUuIWv9iq76xY=
-
github.com/ClickHouse/clickhouse-go v1.5.4 h1:cKjXeYLNWVJIx2J1K6H2CqyRmfwVJVY1OV1coaaFcI0=
-
github.com/ClickHouse/clickhouse-go v1.5.4/go.mod h1:EaI/sW7Azgz9UATzd5ZdZHRUhHgv5+JMS9NSr2smCJI=
github.com/ClickHouse/clickhouse-go/v2 v2.37.2 h1:wRLNKoynvHQEN4znnVHNLaYnrqVc9sGJmGYg+GGCfto=
github.com/ClickHouse/clickhouse-go/v2 v2.37.2/go.mod h1:pH2zrBGp5Y438DMwAxXMm1neSXPPjSI7tD4MURVULw8=
github.com/RussellLuo/slidingwindow v0.0.0-20200528002341-535bb99d338b h1:5/++qT1/z812ZqBvqQt6ToRswSuPZ/B33m6xVHRzADU=
···
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/bitly/go-hostpool v0.0.0-20171023180738-a3a6125de932 h1:mXoPYz/Ul5HYEDvkta6I8/rnYM5gSdSV2tJ6XbZuEtY=
github.com/bitly/go-hostpool v0.0.0-20171023180738-a3a6125de932/go.mod h1:NOuUCSz6Q9T7+igc/hlvDOUdtWKryOrtFyIVABv/p7k=
-
github.com/bkaradzic/go-lz4 v1.0.0 h1:RXc4wYsyz985CkXXeX04y4VnZFGG8Rd43pRaHsOXAKk=
-
github.com/bkaradzic/go-lz4 v1.0.0/go.mod h1:0YdlkowM3VswSROI7qDxhRvJ3sLhlFrRRwjwegp5jy4=
github.com/bluesky-social/indigo v0.0.0-20250626183556-5641d3c27325 h1:Bftt2EcoLZK2Z2m12Ih5QqbReX8j29hbf4zJU/FKzaY=
github.com/bluesky-social/indigo v0.0.0-20250626183556-5641d3c27325/go.mod h1:8FlFpF5cIq3DQG0kEHqyTkPV/5MDQoaWLcVwza5ZPJU=
github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869 h1:DDGfHa7BWjL4YnC6+E63dPcxHo2sUxDIu8g3QgEJdRY=
···
github.com/carlmjohnson/versioninfo v0.22.5/go.mod h1:QT9mph3wcVfISUKd0i9sZfVrPviHuSF+cUtLjm2WSf8=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
-
github.com/cloudflare/golz4 v0.0.0-20150217214814-ef862a3cdc58 h1:F1EaeKL/ta07PY/k9Os/UFtwERei2/XzGemhpGnBKNg=
-
github.com/cloudflare/golz4 v0.0.0-20150217214814-ef862a3cdc58/go.mod h1:EOBUe0h4xcZ5GoxqC5SDxFQ8gwyZPKQoEzownBlhI80=
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
github.com/cpuguy83/go-md2man/v2 v2.0.3 h1:qMCsGGgs+MAzDFyp9LpAe1Lqy/fY/qCovCm0qnXZOBM=
github.com/cpuguy83/go-md2man/v2 v2.0.3/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
···
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
github.com/go-redis/redis v6.15.9+incompatible h1:K0pv1D7EQUjfyoMql+r/jZqCLizCGKFlFgcHWWmHQjg=
github.com/go-redis/redis v6.15.9+incompatible/go.mod h1:NAIEuMOZ/fxfXJIrKDQDz8wamY7mA7PouImQ2Jvg6kA=
-
github.com/go-sql-driver/mysql v1.4.0/go.mod h1:zAC/RDZ24gD3HViQzih4MyKcchzm+sOG5ZlKdlhCg5w=
github.com/go-yaml/yaml v2.1.0+incompatible/go.mod h1:w2MrLa16VYP0jy6N7M5kHaCkaLENm+P+Tv+MfurjSw0=
github.com/gocql/gocql v1.7.0 h1:O+7U7/1gSN7QTEAaMEsJc1Oq2QHXvCWoF3DFK9HDHus=
github.com/gocql/gocql v1.7.0/go.mod h1:vnlvXyFZeLBF0Wy+RS8hrOdbn0UWsWtdg07XJnFxZ+4=
···
github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc=
github.com/jinzhu/now v1.1.5 h1:/o9tlHleP7gOFmsnYNz3RGnqzefHA47wQpKrrdTIwXQ=
github.com/jinzhu/now v1.1.5/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8=
-
github.com/jmoiron/sqlx v1.2.0/go.mod h1:1FEQNm3xlJgrMD+FBdI9+xvCksHtbpVBBw5dYhBSsks=
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
···
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
-
github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/libp2p/go-buffer-pool v0.1.0 h1:oK4mSFcQz7cTQIfqbe4MIj9gLW+mnanjyFtc6cdF0Y8=
github.com/libp2p/go-buffer-pool v0.1.0/go.mod h1:N+vh8gMqimBzdKkSMVuydVDq+UV5QTWy5HSiZacSbPg=
github.com/libp2p/go-cidranger v1.1.0 h1:ewPN8EZ0dd1LSnrtuwd4709PXVcITVeuwbag38yPW7c=
···
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
-
github.com/mattn/go-sqlite3 v1.9.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc=
github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU=
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/miekg/dns v1.1.50 h1:DQUfb9uc6smULcREF09Uc+/Gd46YWqJd5DbpPE9xkcA=
···
github.com/paulmach/protoscan v0.2.1/go.mod h1:SpcSwydNLrxUGSDvXvO0P7g7AuhJ7lcKfDlhJCDw2gY=
github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9 h1:1/WtZae0yGtPq+TI6+Tv1WTxkukpXeMlviSxvL7SRgk=
github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9/go.mod h1:x3N5drFsm2uilKKuuYo6LdyD8vZAW55sH/9w+pbo1sw=
-
github.com/pierrec/lz4 v2.0.5+incompatible h1:2xWsjqPFWcplujydGg4WmhC/6fZqK42wMM8aXeqhl0I=
-
github.com/pierrec/lz4 v2.0.5+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY=
github.com/pierrec/lz4/v4 v4.1.22 h1:cKFw6uJDK+/gfw5BcDL0JL5aBsAFdsIT18eRtLj7VIU=
github.com/pierrec/lz4/v4 v4.1.22/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
···
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
+
go.uber.org/ratelimit v0.3.1 h1:K4qVE+byfv/B3tC+4nYWP7v/6SimcO7HzHekoMNBma0=
+
go.uber.org/ratelimit v0.3.1/go.mod h1:6euWsTB6U/Nb3X++xEUXA8ciPJvr19Q/0h1+oDcJhRk=
go.uber.org/tools v0.0.0-20190618225709-2cfd321de3ee/go.mod h1:vJERXedbb3MVM5f9Ejo0C68/HhF8uaILCdgjnY+goOA=
go.uber.org/zap v1.16.0/go.mod h1:MA8QOfq0BHJwdXa996Y4dYkAqRKB8/1K1QMMZVaNZjQ=
go.uber.org/zap v1.19.1/go.mod h1:j3DNczoxDZroyBnOT1L/Q79cfUMGZxlv/9dzN7SM1rI=
+7 -1
handle_create.go
···
Collection: collection,
Cid: cid,
Seq: seq,
-
Raw: raw,
+
Raw: string(raw),
CreatedAt: cat,
}
···
return err
}
+
lang := ""
+
if len(rec.Langs) != 0 {
+
lang = rec.Langs[0]
+
}
+
post := models.Post{
Uri: uri,
Rkey: rkey,
CreatedAt: *cat,
IndexedAt: indexedAt,
Did: did,
+
Lang: lang,
}
if rec.Reply != nil {
+1
models/post.go
···
ParentDid string `ch:"parent_did"`
QuoteUri string `ch:"quote_uri"`
QuoteDid string `ch:"quote_did"`
+
Lang string `ch:"lang"`
}
+1 -1
models/record.go
···
Collection string `ch:"collection"`
Cid string `ch:"cid"`
Seq string `ch:"seq"`
-
Raw []byte `ch:"raw"`
+
Raw string `ch:"raw"`
CreatedAt time.Time `ch:"created_at"`
}
+36 -15
photocopy.go
···
"time"
"github.com/ClickHouse/clickhouse-go/v2"
+
"github.com/ClickHouse/clickhouse-go/v2/lib/driver"
"github.com/haileyok/photocopy/clickhouse_inserter"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
···
inserters *Inserters
plcScraper *PLCScraper
+
+
ratelimitBypassKey string
+
+
conn driver.Conn
}
type Inserters struct {
···
ClickhouseDatabase string
ClickhouseUser string
ClickhousePass string
+
RatelimitBypassKey string
}
func New(ctx context.Context, args *Args) (*Photocopy, error) {
-
p := &Photocopy{
-
logger: args.Logger,
-
metricsAddr: args.MetricsAddr,
-
relayHost: args.RelayHost,
-
wg: sync.WaitGroup{},
-
cursorFile: args.CursorFile,
-
}
-
conn, err := clickhouse.Open(&clickhouse.Options{
Addr: []string{args.ClickhouseAddr},
Auth: clickhouse.Auth{
···
return nil, err
}
+
p := &Photocopy{
+
logger: args.Logger,
+
metricsAddr: args.MetricsAddr,
+
relayHost: args.RelayHost,
+
wg: sync.WaitGroup{},
+
cursorFile: args.CursorFile,
+
ratelimitBypassKey: args.RatelimitBypassKey,
+
conn: conn,
+
}
+
insertionsHist := promauto.NewHistogramVec(prometheus.HistogramOpts{
Name: "photocopy_inserts_time",
Help: "histogram of photocopy inserts",
···
fi, err := clickhouse_inserter.New(ctx, &clickhouse_inserter.Args{
PrometheusCounterPrefix: "photocopy_follows",
Histogram: insertionsHist,
-
BatchSize: 1000,
+
BatchSize: 250_000,
Logger: p.logger,
Conn: conn,
Query: "INSERT INTO follow (uri, did, rkey, created_at, indexed_at, subject)",
+
RateLimit: 3,
})
if err != nil {
return nil, err
···
pi, err := clickhouse_inserter.New(ctx, &clickhouse_inserter.Args{
PrometheusCounterPrefix: "photocopy_posts",
Histogram: insertionsHist,
-
BatchSize: 100,
+
BatchSize: 250_000,
Logger: p.logger,
Conn: conn,
-
Query: "INSERT INTO post (uri, did, rkey, created_at, indexed_at, root_uri, root_did, parent_uri, parent_did, quote_uri, quote_did)",
+
Query: "INSERT INTO post (uri, did, rkey, created_at, indexed_at, root_uri, root_did, parent_uri, parent_did, quote_uri, quote_did, lang)",
+
RateLimit: 3,
})
if err != nil {
return nil, err
···
ii, err := clickhouse_inserter.New(ctx, &clickhouse_inserter.Args{
PrometheusCounterPrefix: "photocopy_interactions",
Histogram: insertionsHist,
-
BatchSize: 1000,
+
BatchSize: 250_000,
Logger: p.logger,
Conn: conn,
Query: "INSERT INTO interaction (uri, did, rkey, kind, created_at, indexed_at, subject_uri, subject_did)",
+
RateLimit: 3,
})
if err != nil {
return nil, err
···
ri, err := clickhouse_inserter.New(ctx, &clickhouse_inserter.Args{
PrometheusCounterPrefix: "photocopy_records",
Histogram: insertionsHist,
-
BatchSize: 1000,
+
BatchSize: 250_000,
Logger: p.logger,
Conn: conn,
Query: "INSERT INTO record (did, rkey, collection, cid, seq, raw, created_at)",
+
RateLimit: 3,
})
if err != nil {
return nil, err
···
di, err := clickhouse_inserter.New(ctx, &clickhouse_inserter.Args{
PrometheusCounterPrefix: "photocopy_deletes",
Histogram: insertionsHist,
-
BatchSize: 100,
+
BatchSize: 250_000,
Logger: p.logger,
Conn: conn,
Query: "INSERT INTO delete (did, rkey, created_at)",
+
RateLimit: 3,
})
if err != nil {
return nil, err
···
return p, nil
}
-
func (p *Photocopy) Run(baseCtx context.Context) error {
+
func (p *Photocopy) Run(baseCtx context.Context, withBackfill bool) error {
ctx, cancel := context.WithCancel(baseCtx)
metricsServer := http.NewServeMux()
···
panic(fmt.Errorf("failed to start plc scraper: %w", err))
}
}(ctx)
+
+
if withBackfill {
+
go func(ctx context.Context) {
+
if err := p.runBackfiller(ctx); err != nil {
+
panic(fmt.Errorf("error starting backfiller: %w", err))
+
}
+
}(ctx)
+
}
<-ctx.Done()