An app.bsky.* indexer.

Split the backend into separate services and wire them together with dependency injection (DI).

+15
cmd/backfiller2/backfill.go
···
+
package main
+
+
import "github.com/bluesky-social/indigo/backfill"
+
+
func NewBackfillService(store backfill.Store, h *HandlerService) *backfill.Backfiller {
+
opts := &backfill.BackfillOptions{
+
ParallelBackfills: 10,
+
ParallelRecordCreates: 1,
+
NSIDFilter: "",
+
SyncRequestsPerSecond: 5,
+
RelayHost: "https://bsky.network",
+
}
+
+
return backfill.NewBackfiller("backfiller", store, h.HandleCreate, h.HandleUpdate, h.HandleDelete, opts)
+
}
+71
cmd/backfiller2/census.go
···
+
package main
+
+
import (
+
"context"
+
"log/slog"
+
+
"github.com/bluesky-social/indigo/api/atproto"
+
"github.com/bluesky-social/indigo/backfill"
+
"github.com/bluesky-social/indigo/xrpc"
+
)
+
+
// CensusService walks the relay's full repo listing and enqueues every
// repo it finds as a backfill job.
type CensusService struct {
	cursor   *CursorService       // persisted listRepos pagination cursor
	backfill *backfill.Backfiller // consumer of the jobs this service enqueues
}
+
+
// jobMaker is the subset of the backfill store needed by the census to
// enqueue jobs; checked against the configured store at runtime.
// NOTE(review): string args are presumably (did, state) given the call
// site in Start — confirm against the backfill.Store implementation.
type jobMaker interface {
	GetOrCreateJob(context.Context, string, string) (backfill.Job, error)
}
+
+
func NewCensusService(cursorSvc *CursorService, backfillSvc *backfill.Backfiller) *CensusService {
+
return &CensusService{
+
cursor: cursorSvc,
+
backfill: backfillSvc,
+
}
+
}
+
+
func (cs *CensusService) Start(ctx context.Context) {
+
xrpcc := &xrpc.Client{
+
Host: "https://bsky.network",
+
}
+
+
jmstore, ok := cs.backfill.Store.(jobMaker)
+
if !ok {
+
slog.Error("configured job store doesn't support random job creation")
+
return
+
}
+
+
curs, _ := cs.cursor.Get("repos")
+
for {
+
select {
+
case <-ctx.Done():
+
slog.Info("stopping repo census")
+
return
+
default:
+
}
+
+
slog.Info("listing repos", "cursor", curs)
+
res, err := atproto.SyncListRepos(ctx, xrpcc, curs, 1000)
+
if err != nil {
+
slog.Error("error listing repos", "err", err)
+
return
+
}
+
+
for _, repo := range res.Repos {
+
_, err := jmstore.GetOrCreateJob(ctx, repo.Did, backfill.StateEnqueued)
+
if err != nil {
+
slog.Error("error adding listed repo to backfiller", "err", err)
+
}
+
}
+
+
if res.Cursor != nil && *res.Cursor != "" {
+
cs.cursor.reposLk.Lock()
+
curs = *res.Cursor
+
cs.cursor.reposSeq = curs
+
cs.cursor.reposLk.Unlock()
+
} else {
+
break
+
}
+
}
+
}
+109
cmd/backfiller2/cursors.go
···
+
package main
+
+
import (
+
"context"
+
"fmt"
+
"log/slog"
+
"strconv"
+
"sync"
+
"time"
+
+
"gorm.io/gorm"
+
)
+
+
// cursorRecord is the persisted key/value row backing a named cursor in
// the state database.
type cursorRecord struct {
	ID  uint `gorm:"primaryKey"`
	Key string // cursor name: "firehose" or "repos"
	Val string // cursor value as of the last Flush
}
+
+
// CursorService holds the firehose and repo-listing cursors in memory,
// each guarded by its own mutex, and periodically flushes them to the
// state database (see Flush / CheckpointCursors).
type CursorService struct {
	store *gorm.DB

	firehoseLk  sync.Mutex
	firehoseSeq string // latest firehose seq, formatted as a decimal string

	reposLk  sync.Mutex
	reposSeq string // latest listRepos pagination cursor
}
+
+
func NewCursorService(store *gorm.DB) *CursorService {
+
store.AutoMigrate(&cursorRecord{})
+
+
var rec cursorRecord
+
store.First(&rec, 1)
+
if rec.ID == 0 {
+
store.Create(&cursorRecord{ID: 1, Key: "firehose", Val: ""})
+
}
+
+
store.First(&rec, 2)
+
if rec.ID == 0 {
+
store.Create(&cursorRecord{ID: 2, Key: "repos", Val: ""})
+
}
+
+
return &CursorService{
+
store: store,
+
}
+
}
+
+
func (cs *CursorService) Get(key string) (string, error) {
+
var rec cursorRecord
+
if err := cs.store.Where("key = ?", key).First(&rec).Error; err != nil {
+
return "", fmt.Errorf("error fetching cursor record: %w", err)
+
}
+
return rec.Val, nil
+
}
+
+
func (cs *CursorService) SetFirehoseCursor(seq int64) {
+
cs.firehoseLk.Lock()
+
val := strconv.Itoa(int(seq))
+
cs.firehoseSeq = val
+
cs.firehoseLk.Unlock()
+
}
+
+
func (cs *CursorService) SetReposCursor(value string) {
+
cs.reposLk.Lock()
+
cs.reposSeq = value
+
cs.reposLk.Unlock()
+
}
+
+
func (cs *CursorService) Flush() error {
+
flusher := func(lk *sync.Mutex, key, value string) error {
+
lk.Lock()
+
if err := cs.store.Model(&cursorRecord{}).Where("key = ?", key).Update("val", value).Error; err != nil {
+
return fmt.Errorf("error updating cursor record: %+v: %w", cursorRecord{Key: key, Val: value}, err)
+
}
+
lk.Unlock()
+
return nil
+
}
+
+
if err := flusher(&cs.firehoseLk, "firehose", cs.firehoseSeq); err != nil {
+
return err
+
}
+
+
if err := flusher(&cs.reposLk, "repos", cs.reposSeq); err != nil {
+
return err
+
}
+
+
return nil
+
}
+
+
func (cs *CursorService) CheckpointCursors(ctx context.Context) {
+
t := time.NewTicker(time.Second * 5)
+
defer t.Stop()
+
+
for {
+
select {
+
case <-ctx.Done():
+
slog.Info("stopping cursor checkpointer")
+
return
+
case <-t.C:
+
}
+
+
slog.Info("flushing cursors")
+
if err := cs.Flush(); err != nil {
+
slog.Error("error flushing cursors", "err", err)
+
return
+
}
+
}
+
}
+31
cmd/backfiller2/database.go
···
+
package main
+
+
import (
+
"log"
+
"log/slog"
+
"os"
+
"time"
+
+
"gorm.io/driver/sqlite"
+
"gorm.io/gorm"
+
"gorm.io/gorm/logger"
+
)
+
+
func NewDatabase(path string) *gorm.DB {
+
sl := slog.With("source", "database")
+
l := logger.New(
+
log.New(os.Stdout, "\r\n", log.LstdFlags),
+
logger.Config{
+
SlowThreshold: time.Second,
+
Colorful: false,
+
},
+
)
+
db, err := gorm.Open(sqlite.Open(path), &gorm.Config{
+
Logger: l,
+
})
+
if err != nil {
+
sl.Error("failed to open database", "err", err)
+
}
+
db.Exec("PRAGMA journal_mode=WAL")
+
return db
+
}
+25
cmd/backfiller2/firehose.go
···
+
package main
+
+
import (
+
"context"
+
"net/http"
+
+
"github.com/gorilla/websocket"
+
)
+
+
func NewFirehoseConnection(ctx context.Context, cursorSvc *CursorService) (*websocket.Conn, error) {
+
url := "wss://bsky.network/xrpc/com.atproto.sync.subscribeRepos"
+
curs, _ := cursorSvc.Get("firehose")
+
if curs != "" {
+
url += "?cursor=" + curs
+
}
+
+
conn, _, err := websocket.DefaultDialer.DialContext(ctx, url, http.Header{
+
"User-Agent": []string{"backfiller/0.1 (@edavis.dev)"},
+
})
+
if err != nil {
+
return nil, err
+
}
+
+
return conn, nil
+
}
+55
cmd/backfiller2/handlers.go
···
+
package main
+
+
import (
+
"bytes"
+
"context"
+
"fmt"
+
"strings"
+
+
appbsky "github.com/bluesky-social/indigo/api/bsky"
+
"github.com/ipfs/go-cid"
+
"gorm.io/gorm"
+
)
+
+
// HandlerService persists records received from the backfiller and
// firehose into the content database.
type HandlerService struct {
	store *gorm.DB // content database (Profile, Feedgen, ...)
}
+
+
func NewHandlerService(store *gorm.DB) *HandlerService {
+
store.AutoMigrate(&Profile{})
+
store.AutoMigrate(&Feedgen{})
+
// TODO the rest
+
+
return &HandlerService{
+
store: store,
+
}
+
}
+
+
func (hs *HandlerService) HandleCreate(ctx context.Context, repo string, rev string, path string, rec *[]byte, cid *cid.Cid) error {
+
if !strings.HasPrefix(path, "app.bsky.feed.generator/") {
+
return nil
+
}
+
+
var out appbsky.FeedGenerator
+
if err := out.UnmarshalCBOR(bytes.NewReader(*rec)); err != nil {
+
return fmt.Errorf("error unmarshalling record: %w", err)
+
}
+
+
feedgen := Feedgen{
+
DisplayName: out.DisplayName,
+
}
+
+
if err := hs.store.Create(&feedgen).Error; err != nil {
+
return fmt.Errorf("error saving feedgen: %w", err)
+
}
+
+
return nil
+
}
+
+
// HandleUpdate is the backfiller's record-update callback.
// TODO: currently a no-op — updates are silently dropped.
func (hs *HandlerService) HandleUpdate(ctx context.Context, repo string, rev string, path string, rec *[]byte, cid *cid.Cid) error {
	return nil
}
+
+
// HandleDelete is the backfiller's record-delete callback.
// TODO: currently a no-op — deletes are silently dropped.
func (hs *HandlerService) HandleDelete(ctx context.Context, repo string, rev string, path string) error {
	return nil
}
+122
cmd/backfiller2/main.go
···
+
package main
+
+
import (
+
"context"
+
"fmt"
+
"log/slog"
+
"os/signal"
+
"syscall"
+
"time"
+
+
comatproto "github.com/bluesky-social/indigo/api/atproto"
+
"github.com/bluesky-social/indigo/backfill"
+
"github.com/bluesky-social/indigo/events"
+
"github.com/bluesky-social/indigo/events/schedulers/parallel"
+
"github.com/gorilla/websocket"
+
"gorm.io/gorm"
+
)
+
+
// App owns every service and connection used by the backfiller binary;
// fields other than the databases are populated in Start.
type App struct {
	backfill *backfill.Backfiller
	cursor   *CursorService
	handler  *HandlerService
	census   *CensusService
	wsconn   *websocket.Conn
	state    *gorm.DB // backfill jobs + cursors (state.db)
	content  *gorm.DB // indexed records (content.db)
}
+
+
func NewApp() *App {
+
stateDatabase := NewDatabase("state.db")
+
stateDatabase.AutoMigrate(&backfill.GormDBJob{})
+
+
contentDatabase := NewDatabase("content.db")
+
+
return &App{
+
state: stateDatabase,
+
content: contentDatabase,
+
}
+
}
+
+
// Start wires the services together and then consumes the relay
// firehose. It blocks inside HandleRepoStream until the stream ends or
// ctx is cancelled; the cursor checkpointer, backfiller, and census run
// as background goroutines.
func (app *App) Start(ctx context.Context) error {
	app.cursor = NewCursorService(app.state)
	go app.cursor.CheckpointCursors(ctx)

	app.handler = NewHandlerService(app.content)

	// Backfill jobs live in the state DB; record handlers write content.
	app.backfill = NewBackfillService(backfill.NewGormstore(app.state), app.handler)
	go app.backfill.Start()

	app.census = NewCensusService(app.cursor, app.backfill)
	go app.census.Start(ctx)

	wsconn, err := NewFirehoseConnection(ctx, app.cursor)
	if err != nil {
		return fmt.Errorf("error connecting to relay: %w", err)
	}
	app.wsconn = wsconn

	rsc := events.RepoStreamCallbacks{
		RepoCommit: func(evt *comatproto.SyncSubscribeRepos_Commit) error {
			// Record the seq first so the periodic Flush persists our
			// stream position independently of event handling.
			app.cursor.SetFirehoseCursor(evt.Seq)
			return app.backfill.HandleEvent(ctx, evt)
		},
		// TODO account
		// TODO identity
	}

	sched := parallel.NewScheduler(4, 50, "firehose", rsc.EventHandler)

	if err := events.HandleRepoStream(ctx, app.wsconn, sched, nil); err != nil {
		return fmt.Errorf("error starting repo stream handler: %w", err)
	}

	return nil
}
+
+
func (app *App) Stop(ctx context.Context) error {
+
closeDatabase := func(db *gorm.DB) error {
+
raw, err := db.DB()
+
if err != nil {
+
return fmt.Errorf("error getting raw DB: %w", err)
+
}
+
if err := raw.Close(); err != nil {
+
return fmt.Errorf("error closing DB: %w", err)
+
}
+
return nil
+
}
+
+
if err := closeDatabase(app.state); err != nil {
+
return err
+
}
+
+
if err := closeDatabase(app.content); err != nil {
+
return err
+
}
+
+
if err := app.backfill.Stop(ctx); err != nil {
+
return err
+
}
+
+
return nil
+
}
+
+
func main() {
+
ctx, cancel := signal.NotifyContext(context.TODO(), syscall.SIGINT, syscall.SIGTERM)
+
defer cancel()
+
+
app := NewApp()
+
if err := app.Start(ctx); err != nil {
+
slog.Error("failed to start backfiller", "err", err)
+
}
+
+
<-ctx.Done()
+
slog.Info("shutting down")
+
+
endctx, cancel := context.WithTimeout(context.TODO(), time.Second*15)
+
defer cancel()
+
+
if err := app.Stop(endctx); err != nil {
+
slog.Error("error during shutdown", "err", err)
+
}
+
}
+13
cmd/backfiller2/models.go
···
+
package main
+
+
// Placeholder content-database models, one per indexed collection.
// Only Feedgen has fields so far; the rest are still to be filled in.
type Account struct{}
type Profile struct{}
type List struct{}
type Labeler struct{}

// Feedgen is a stored app.bsky.feed.generator record.
type Feedgen struct {
	ID          uint `gorm:"primaryKey"`
	DisplayName string
}
type StarterPack struct{}
type Verification struct{}
type Lexicon struct{}