···
-
"github.com/ClickHouse/clickhouse-go/v2"
-
atproto_repo "github.com/bluesky-social/indigo/atproto/repo"
-
"github.com/bluesky-social/indigo/atproto/syntax"
-
"github.com/bluesky-social/indigo/repo"
-
"github.com/bluesky-social/indigo/util"
-
"github.com/haileyok/photocopy/clickhouse_inserter"
-
"github.com/haileyok/photocopy/models"
-
"github.com/ipfs/go-cid"
-
"github.com/ipld/go-car"
-
_ "github.com/joho/godotenv/autoload"
-
"github.com/urfave/cli/v2"
-
"go.uber.org/ratelimit"
-
Name: "clickhouse-addr",
-
EnvVars: []string{"PHOTOCOPY_CLICKHOUSE_ADDR"},
-
Name: "clickhouse-database",
-
EnvVars: []string{"PHOTOCOPY_CLICKHOUSE_DATABASE"},
-
Name: "clickhouse-user",
-
EnvVars: []string{"PHOTOCOPY_CLICKHOUSE_USER"},
-
Name: "clickhouse-pass",
-
EnvVars: []string{"PHOTOCOPY_CLICKHOUSE_PASS"},
-
// RepoDownloader keeps one HTTP client and one rate limiter per service,
// each stored in a map keyed by the service string.
type RepoDownloader struct {
-
// clients maps a service to the HTTP client used for requests to it.
clients map[string]*http.Client
-
// rateLimits maps a service to the limiter used for requests to it.
rateLimits map[string]ratelimit.Limiter
-
// NewRepoDownloader returns a RepoDownloader whose clients and rateLimits
// maps are allocated and empty, so entries can be added per service later.
func NewRepoDownloader() *RepoDownloader {
-
return &RepoDownloader{
-
clients: make(map[string]*http.Client),
-
rateLimits: make(map[string]ratelimit.Limiter),
-
// getClient returns the HTTP client associated with service, looking it up
// in rd.clients. The visible creation path builds a util.RobustHTTPClient,
// sets a 30-minute timeout on it, and stores it in the map under service.
//
// NOTE(review): two forms of the lookup appear in this excerpt (a plain
// assignment and an if-with-initializer); the branch structure between the
// lookup and the creation path is not visible here — confirm which applies.
// NOTE(review): no locking is visible around the map access — confirm this is
// only called from one goroutine.
func (rd *RepoDownloader) getClient(service string) *http.Client {
-
client, exists := rd.clients[service]
-
if client, exists := rd.clients[service]; exists {
-
client = util.RobustHTTPClient()
-
// Generous timeout; presumably because whole-repo downloads can be slow — TODO confirm.
client.Timeout = 30 * time.Minute
-
rd.clients[service] = client
-
// getRateLimiter returns the rate limiter associated with service, looking it
// up in rd.rateLimits. The visible creation path builds a limiter with
// ratelimit.New(10) and stores it in the map under service.
//
// NOTE(review): two forms of the lookup appear in this excerpt; the branch
// structure between lookup and creation is not visible here.
// NOTE(review): no locking is visible around the map access — confirm this is
// only called from one goroutine.
func (rd *RepoDownloader) getRateLimiter(service string) ratelimit.Limiter {
-
limiter, exists := rd.rateLimits[service]
-
if limiter, exists := rd.rateLimits[service]; exists {
-
// 3000 per five minutes, i.e. 10 per second (ratelimit.New takes a
// per-second rate unless configured otherwise).
-
limiter = ratelimit.New(10)
-
rd.rateLimits[service] = limiter
-
// downloadRepo issues a GET to service's com.atproto.sync.getRepo endpoint for
// did, using the HTTP client kept for that service, and reads the response
// body into memory.
//
// It returns a wrapped error when the request cannot be created, when the
// request fails, when the status code is not 200, or when reading the body
// fails.
func (rd *RepoDownloader) downloadRepo(service, did string) ([]byte, error) {
-
// NOTE(review): did is interpolated into the query string without escaping —
// confirm callers only pass well-formed DIDs.
dlurl := fmt.Sprintf("%s/xrpc/com.atproto.sync.getRepo?did=%s", service, did)
-
// context.TODO() means the caller cannot cancel this request; only the
// client's own timeout bounds it.
req, err := http.NewRequestWithContext(context.TODO(), "GET", dlurl, nil)
-
return nil, fmt.Errorf("failed to create request: %w", err)
-
client := rd.getClient(service)
-
resp, err := client.Do(req)
-
return nil, fmt.Errorf("failed to download repo: %w", err)
-
defer resp.Body.Close()
-
if resp.StatusCode != http.StatusOK {
-
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
-
// The whole body is buffered in memory before being returned.
b, err := io.ReadAll(resp.Body)
-
return nil, fmt.Errorf("could not read bytes from response: %w", err)
-
// processRepo reads b as a CAR file, puts blocks read from it into an
// in-memory blockstore, opens the repo at the CAR header's first root, and
// walks its records with ForEach, handing records to inserter.
//
// Only part of the body is visible in this excerpt; the notes below cover the
// visible lines only.
func processRepo(b []byte, did string, inserter *clickhouse_inserter.Inserter) error {
-
bs := atproto_repo.NewTinyBlockstore()
-
cs, err := car.NewCarReader(bytes.NewReader(b))
-
// NOTE(review): uses %v (not %w) and a trailing newline, unlike the other
// wrapped errors in this file.
return fmt.Errorf("error opening car: %v\n", err)
-
// NOTE(review): the error from cs.Next is discarded on this line — confirm
// end-of-stream and read failures are detected some other way.
currBlock, _ := cs.Next()
-
bs.Put(context.TODO(), currBlock)
-
// NOTE(review): indexes Roots[0] with no visible length check.
r, err := repo.OpenRepo(context.TODO(), bs, cs.Header.Roots[0])
-
if err != nil || r == nil {
-
fmt.Printf("could not open repo: %v", err)
-
// The callback's cid parameter shadows the imported cid package inside the
// closure.
if err := r.ForEach(context.TODO(), "", func(key string, cid cid.Cid) error {
-
// key is split on "/"; presumably into collection and rkey — TODO confirm.
pts := strings.Split(key, "/")
-
// This b shadows the function's b parameter inside the closure.
b, err := bs.Get(context.TODO(), cid)
-
tid, err := syntax.ParseTID(rkey)
-
Raw: string(b.RawData()),
-
inserter.Insert(context.TODO(), rec)
-
// NOTE(review): "erorr" is a typo in the runtime message; also %v rather than %w.
return fmt.Errorf("erorr traversing records: %v", err)
-
// ListReposResponse is the JSON shape decoded from a
// com.atproto.sync.listRepos response.
type ListReposResponse struct {
-
// Cursor is decoded from the "cursor" field.
Cursor string `json:"cursor"`
-
// Repos is decoded from the "repos" field.
Repos []ListReposRepo `json:"repos"`
-
// ListReposRepo is one entry of ListReposResponse.Repos.
type ListReposRepo struct {
-
// Did is decoded from the "did" field.
Did string `json:"did"`
-
// Head is decoded from the "head" field.
Head string `json:"head"`
-
// Rev is decoded from the "rev" field.
Rev string `json:"rev"`
-
// Active is decoded from the "active" field.
Active bool `json:"active"`
-
// Status is optional; it stays nil when the "status" field is absent.
Status *string `json:"status,omitempty"`
-
// getDidsFromService requests com.atproto.sync.listRepos from service with
// limit=1000 and a cursor, decodes each JSON response, and accumulates the
// returned repos into a single slice.
//
// It returns an error when a response has a non-200 status or cannot be
// decoded. Only part of the body is visible in this excerpt.
func (rd *RepoDownloader) getDidsFromService(ctx context.Context, service string) ([]ListReposRepo, error) {
-
var repos []ListReposRepo
-
// NOTE(review): cursor is interpolated without URL escaping — confirm cursors
// never contain characters that need it.
req, err := http.NewRequestWithContext(ctx, "GET", fmt.Sprintf("%s/xrpc/com.atproto.sync.listRepos?limit=1000&cursor=%s", service, cursor), nil)
-
rl := rd.getRateLimiter(service)
-
cli := rd.getClient(service)
-
resp, err := cli.Do(req)
-
// NOTE(review): if this sits inside the paging loop, the deferred Close only
// runs when the function returns, keeping every page's body open until then —
// confirm.
defer resp.Body.Close()
-
if resp.StatusCode != http.StatusOK {
-
return nil, fmt.Errorf("received non-200 response code: %d", resp.StatusCode)
-
var reposResp ListReposResponse
-
if err := json.NewDecoder(resp.Body).Decode(&reposResp); err != nil {
-
return nil, fmt.Errorf("error decoding repos response: %w", err)
-
repos = append(repos, reposResp.Repos...)
-
// A page with anything other than the requested 1000 entries is treated
// specially; presumably it marks the last page — TODO confirm.
if len(reposResp.Repos) != 1000 {
-
// run is the CLI action. From the visible lines it: opens a ClickHouse
// connection from the clickhouse-* flags, queries the plc table for services
// matching '%.bsky.network', selects one record row per DID, creates a record
// inserter, lists repos from each service, downloads repos and processes them
// in goroutines, and prints progress and a completion summary.
//
// NOTE(review): the progress message says "most recent record for each did",
// but the visible query keeps row_number() = 1 under ORDER BY created_at ASC,
// which selects the earliest created_at per DID — confirm which is intended.
// Only part of the body is visible in this excerpt.
var run = func(cmd *cli.Context) error {
-
startTime := time.Now()
-
// Connection settings come from the clickhouse-* CLI flags.
conn, err := clickhouse.Open(&clickhouse.Options{
-
Addr: []string{cmd.String("clickhouse-addr")},
-
Database: cmd.String("clickhouse-database"),
-
Username: cmd.String("clickhouse-user"),
-
Password: cmd.String("clickhouse-pass"),
-
fmt.Println("querying clickhouse for dids and services...")
-
// Row shape for the services query: one array of service strings per row.
type servicesQueryRow struct {
-
PlcOpServices []string `ch:"plc_op_services"`
-
var servicesQueryRows []servicesQueryRow
-
if err := conn.Select(cmd.Context, &servicesQueryRows, `
-
SELECT DISTINCT(plc_op_services) FROM default.plc WHERE arrayExists(x -> x LIKE '%.bsky.network', plc_op_services)
-
servicesDids := map[string][]string{}
-
for _, svcs := range servicesQueryRows {
-
for _, s := range svcs.PlcOpServices {
-
servicesDids[s] = []string{}
-
fmt.Printf("found %d services\n", len(servicesDids))
-
fmt.Printf("getting most recent record for each did...")
-
var records []models.Record
-
if err := conn.Select(cmd.Context, &records, `
-
QUALIFY row_number() OVER (PARTITION BY did ORDER BY created_at ASC) = 1
-
fmt.Printf("collecting dids...\n")
-
// didCreatedAt maps each DID to the created_at of the row selected for it.
didCreatedAt := map[string]time.Time{}
-
for _, r := range records {
-
didCreatedAt[r.Did] = r.CreatedAt
-
// The inserter is created with context.TODO(), so it is not tied to
// cmd.Context.
inserter, err := clickhouse_inserter.New(context.TODO(), &clickhouse_inserter.Args{
-
Logger: slog.Default(),
-
Query: "INSERT INTO record (did, rkey, collection, cid, seq, raw, created_at)",
-
RateLimit: 2, // two inserts per second in the event of massive repos
-
fmt.Printf("building download buckets...")
-
// Hard-coded cutoff compared against each DID's created_at below.
// NOTE(review): the parse error is discarded; the literal matches the
// time.DateTime layout.
needOlderThan, _ := time.Parse(time.DateTime, "2025-06-28 04:18:22")
-
downloader := NewRepoDownloader()
-
// NOTE(review): serviceDids (here) and servicesDids (built from the services
// query above) differ by a single letter and are easy to confuse.
serviceDids := map[string][]string{}
-
// Ask every service found above for the repos it lists.
for s := range servicesDids {
-
repos, err := downloader.getDidsFromService(context.TODO(), s)
-
fmt.Printf("error getting dids for services %s: %v", s, err)
-
for _, r := range repos {
-
lastRecord, exists := didCreatedAt[r.Did]
-
// The outcome of this test is not visible in this excerpt; presumably it
// decides whether the DID is skipped or queued — TODO confirm.
if exists && lastRecord.Before(needOlderThan) {
-
dids = append(dids, r.Did)
-
fmt.Println("getting all the repos...")
-
fmt.Printf("Total jobs: %d across %d services \n", total, len(serviceDids))
-
fmt.Printf("was able to skip %d repos\n", skipped)
-
for service, dids := range serviceDids {
-
fmt.Printf("%s: %d jobs\n", service, len(dids))
-
for service, dids := range serviceDids {
-
for _, did := range dids {
-
ratelimiter := downloader.getRateLimiter(service)
-
b, err := downloader.downloadRepo(service, did)
-
// Each downloaded repo is processed in its own goroutine.
// NOTE(review): no wait or concurrency bound is visible here, and
// processRepo's returned error is not used on the visible line — confirm.
go func(b []byte, did string, inserter *clickhouse_inserter.Inserter) {
-
processRepo(b, did, inserter)
-
// Progress reporting is driven by a one-second ticker.
ticker := time.NewTicker(1 * time.Second)
-
elapsed := time.Since(startTime)
-
rate := float64(processed) / elapsed.Seconds()
-
remaining := total - processed
-
// ETA is the remaining job count divided by the observed jobs/sec rate.
etaSeconds := float64(remaining) / rate
-
etaDuration := time.Duration(etaSeconds * float64(time.Second))
-
eta = fmt.Sprintf(", ETA: %v", etaDuration.Round(time.Second))
-
eta = ", ETA: calculating..."
-
// The leading \r redraws the progress line in place.
fmt.Printf("\rProgress: %d/%d processed (%.1f%%), %d skipped, %d errors, %.1f jobs/sec%s",
-
processed, total, float64(processed)/float64(total)*100, skipped, errored, rate, eta)
-
fmt.Printf("\nCompleted: %d processed, %d errors\n", processed, errored)
-
inserter.Close(context.TODO())