an app.bsky.* indexer

dynamic host/repo updates

Changed files
+146 -15
cmd
+134 -11
cmd/monarch/census.go
···
import (
"context"
"log/slog"
"github.com/bluesky-social/indigo/api/atproto"
"github.com/bluesky-social/indigo/backfill"
"github.com/bluesky-social/indigo/xrpc"
"github.com/urfave/cli/v2"
)
type CensusService struct {
-
cursor *CursorService
backfill *backfill.Backfiller
}
type jobMaker interface {
GetOrCreateJob(context.Context, string, string) (backfill.Job, error)
}
-
func NewCensusService(cursorSvc *CursorService, backfillSvc *backfill.Backfiller) *CensusService {
return &CensusService{
-
cursor: cursorSvc,
backfill: backfillSvc,
}
}
-
func (cs *CensusService) Start(ctx context.Context, cctx *cli.Context) {
xrpcc := &xrpc.Client{
-
Host: "https://" + cctx.String("relay-host"),
}
jmstore, ok := cs.backfill.Store.(jobMaker)
···
return
}
-
curs, _ := cs.cursor.Get("repos")
for {
select {
case <-ctx.Done():
-
slog.Info("stopping repo census")
return
default:
}
res, err := atproto.SyncListRepos(ctx, xrpcc, curs, 1000)
if err != nil {
-
slog.Error("error listing repos", "err", err)
-
return
}
for _, repo := range res.Repos {
_, err := jmstore.GetOrCreateJob(ctx, repo.Did, backfill.StateEnqueued)
if err != nil {
-
slog.Error("error adding listed repo to backfiller", "err", err)
}
}
if res.Cursor != nil && *res.Cursor != "" {
curs = *res.Cursor
-
cs.cursor.SetReposCursor(curs)
} else {
break
}
}
}
···
import (
"context"
"log/slog"
+
"sync"
+
"time"
"github.com/bluesky-social/indigo/api/atproto"
"github.com/bluesky-social/indigo/backfill"
"github.com/bluesky-social/indigo/xrpc"
"github.com/urfave/cli/v2"
+
"golang.org/x/sync/semaphore"
+
"gorm.io/gorm"
)
type CensusService struct {
+
store *gorm.DB
backfill *backfill.Backfiller
+
+
storeLk sync.Mutex
}
type jobMaker interface {
GetOrCreateJob(context.Context, string, string) (backfill.Job, error)
}
+
func NewCensusService(store *gorm.DB, backfillSvc *backfill.Backfiller) *CensusService {
return &CensusService{
+
store: store,
backfill: backfillSvc,
}
}
+
type relayCursor struct {
+
ID int `gorm:"primaryKey"`
+
Host string
+
Cursor string
+
}
+
+
type hostCursor struct {
+
ID int `gorm:"primaryKey"`
+
Host string
+
Cursor string
+
}
+
+
// fetch the PDSes known to the relay
+
func (cs *CensusService) listHosts(ctx context.Context, cctx *cli.Context) {
+
relay := cctx.String("relay-host")
+
maxCrawlers := cctx.Int64("max-repo-crawlers")
+
+
xrpcc := &xrpc.Client{
+
Host: "https://" + relay,
+
}
+
+
var rcur relayCursor
+
if err := cs.store.Where("host = ?", relay).Attrs(relayCursor{
+
Host: relay,
+
Cursor: "",
+
}).FirstOrCreate(&rcur).Error; err != nil {
+
slog.Error("error fetching relay cursor", "err", err)
+
}
+
+
var wg sync.WaitGroup
+
sem := semaphore.NewWeighted(maxCrawlers)
+
+
curs := rcur.Cursor
+
for {
+
select {
+
case <-ctx.Done():
+
slog.Info("stopping listHosts", "err", ctx.Err())
+
return
+
default:
+
}
+
+
slog.Info("listing hosts", "relay", relay, "curs", curs)
+
res, err := atproto.SyncListHosts(ctx, xrpcc, curs, 1000)
+
if err != nil {
+
slog.Error("error obtaining hosts from relay", "err", err)
+
continue
+
}
+
+
for _, host := range res.Hosts {
+
sem.Acquire(ctx, 1)
+
wg.Add(1) // TODO wg.Go
+
go func() {
+
defer sem.Release(1)
+
defer wg.Done()
+
slog.Info("adding host", "host", host.Hostname)
+
cs.listRepos(ctx, host.Hostname)
+
}()
+
}
+
+
if res.Cursor != nil && *res.Cursor != "" {
+
curs = *res.Cursor
+
if err := cs.store.Model(&rcur).Update("cursor", curs).Error; err != nil {
+
slog.Error("error updating cursor for relay", "err", err)
+
}
+
} else {
+
break
+
}
+
}
+
+
wg.Wait()
+
slog.Info("finished listing hosts", "relay", relay)
+
}
+
+
// fetch the repos known to the PDS
+
func (cs *CensusService) listRepos(ctx context.Context, host string) {
xrpcc := &xrpc.Client{
+
Host: "https://" + host,
}
jmstore, ok := cs.backfill.Store.(jobMaker)
···
return
}
+
cs.storeLk.Lock()
+
var hcur hostCursor
+
if err := cs.store.Where("host = ?", host).Attrs(hostCursor{
+
Host: host,
+
Cursor: "",
+
}).FirstOrCreate(&hcur).Error; err != nil {
+
slog.Error("error fetching host cursor", "err", err)
+
}
+
cs.storeLk.Unlock()
+
+
var added int
+
curs := hcur.Cursor
for {
select {
case <-ctx.Done():
+
slog.Info("stopping listRepos", "err", ctx.Err())
return
default:
}
+
slog.Info("listing repos", "host", host, "curs", curs, "added", added)
res, err := atproto.SyncListRepos(ctx, xrpcc, curs, 1000)
if err != nil {
+
slog.Error("error obtaining repos from host", "err", err)
+
continue
}
+
cs.storeLk.Lock()
for _, repo := range res.Repos {
_, err := jmstore.GetOrCreateJob(ctx, repo.Did, backfill.StateEnqueued)
if err != nil {
+
slog.Error("error adding repo to backfiller", "err", err)
+
} else {
+
added += 1
}
}
+
cs.storeLk.Unlock()
if res.Cursor != nil && *res.Cursor != "" {
curs = *res.Cursor
+
cs.storeLk.Lock()
+
if err := cs.store.Model(&hcur).Update("cursor", curs).Error; err != nil {
+
slog.Error("error updating cursor for host", "err", err)
+
}
+
cs.storeLk.Unlock()
} else {
break
}
+
}
+
+
slog.Info("finished listing repos", "host", host)
+
}
+
+
func (cs *CensusService) Start(ctx context.Context, cctx *cli.Context) {
+
slog.Info("starting initial hosts and repos crawl")
+
cs.listHosts(ctx, cctx)
+
+
slog.Info("finished with initial refresh, starting ticker")
+
t := time.NewTicker(time.Hour)
+
defer t.Stop()
+
+
for {
+
select {
+
case <-ctx.Done():
+
slog.Info("stopping census service", "err", ctx.Err())
+
return
+
case <-t.C:
+
}
+
+
slog.Info("refreshing hosts and repos")
+
cs.listHosts(ctx, cctx)
}
}
+2
cmd/monarch/cursors.go
···
func NewCursorService(store *gorm.DB) *CursorService {
store.AutoMigrate(&firehoseCursor{})
return &CursorService{
store: store,
···
func NewCursorService(store *gorm.DB) *CursorService {
store.AutoMigrate(&firehoseCursor{})
+
store.AutoMigrate(&relayCursor{})
+
store.AutoMigrate(&hostCursor{})
return &CursorService{
store: store,
+4 -2
cmd/monarch/handlers.go
···
}
func (hs *HandlerService) HandleCreate(ctx context.Context, repo string, rev string, path string, rec *[]byte, cid *cid.Cid) error {
-
return hs.HandleUpsert(ctx, repo, rev, path, rec, cid, ActionCreate)
}
func (hs *HandlerService) HandleUpdate(ctx context.Context, repo string, rev string, path string, rec *[]byte, cid *cid.Cid) error {
-
return hs.HandleUpsert(ctx, repo, rev, path, rec, cid, ActionUpdate)
}
func (hs *HandlerService) HandleDelete(ctx context.Context, repo string, rev string, path string) error {
···
}
func (hs *HandlerService) HandleCreate(ctx context.Context, repo string, rev string, path string, rec *[]byte, cid *cid.Cid) error {
+
// return hs.HandleUpsert(ctx, repo, rev, path, rec, cid, ActionCreate)
+
return nil
}
func (hs *HandlerService) HandleUpdate(ctx context.Context, repo string, rev string, path string, rec *[]byte, cid *cid.Cid) error {
+
// return hs.HandleUpsert(ctx, repo, rev, path, rec, cid, ActionUpdate)
+
return nil
}
func (hs *HandlerService) HandleDelete(ctx context.Context, repo string, rev string, path string) error {
+5 -1
cmd/monarch/main.go
···
app.backfill = NewBackfillService(backfill.NewGormstore(app.state), app.handler, cctx)
go app.backfill.Start()
-
app.census = NewCensusService(app.cursor, app.backfill)
go app.census.Start(ctx, cctx)
wsconn, err := NewFirehoseConnection(ctx, cctx, app.cursor)
···
&cli.IntFlag{
Name: "backfill-consumers",
Value: 100,
},
}
···
app.backfill = NewBackfillService(backfill.NewGormstore(app.state), app.handler, cctx)
go app.backfill.Start()
+
app.census = NewCensusService(app.state, app.backfill)
go app.census.Start(ctx, cctx)
wsconn, err := NewFirehoseConnection(ctx, cctx, app.cursor)
···
&cli.IntFlag{
Name: "backfill-consumers",
Value: 100,
+
},
+
&cli.IntFlag{
+
Name: "max-repo-crawlers",
+
Value: 4,
},
}
+1 -1
go.mod
···
github.com/gorilla/websocket v1.5.1
github.com/ipfs/go-cid v0.4.1
github.com/urfave/cli/v2 v2.25.7
gorm.io/gorm v1.25.9
)
···
go.uber.org/zap v1.26.0 // indirect
golang.org/x/crypto v0.21.0 // indirect
golang.org/x/net v0.23.0 // indirect
-
golang.org/x/sync v0.7.0 // indirect
golang.org/x/sys v0.22.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/time v0.3.0 // indirect
···
github.com/gorilla/websocket v1.5.1
github.com/ipfs/go-cid v0.4.1
github.com/urfave/cli/v2 v2.25.7
+
golang.org/x/sync v0.7.0
gorm.io/gorm v1.25.9
)
···
go.uber.org/zap v1.26.0 // indirect
golang.org/x/crypto v0.21.0 // indirect
golang.org/x/net v0.23.0 // indirect
golang.org/x/sys v0.22.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/time v0.3.0 // indirect