Constellation, Spacedust, Slingshot, UFOs: atproto crates and services for microcosm

import cars: extract links. also fmt

Changed files
+235 -79
links
spacedust
+60 -9
Cargo.lock
···
checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457"
[[package]]
+
name = "arrayref"
+
version = "0.3.9"
+
source = "registry+https://github.com/rust-lang/crates.io-index"
+
checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
+
+
[[package]]
name = "arrayvec"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
···
checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd"
[[package]]
+
name = "blake3"
+
version = "1.8.2"
+
source = "registry+https://github.com/rust-lang/crates.io-index"
+
checksum = "3888aaa89e4b2a40fca9848e400f6a658a5a3978de7be858e209cafa8be9a4a0"
+
dependencies = [
+
"arrayref",
+
"arrayvec",
+
"cc",
+
"cfg-if",
+
"constant_time_eq",
+
]
+
+
[[package]]
name = "block-buffer"
version = "0.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
···
]
[[package]]
+
name = "cbor4ii"
+
version = "1.2.0"
+
source = "registry+https://github.com/rust-lang/crates.io-index"
+
checksum = "b28d2802395e3bccd95cc4ae984bff7444b6c1f5981da46a41360c42a2c7e2d9"
+
+
[[package]]
name = "cc"
version = "1.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
···
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f421161cb492475f1661ddc9815a745a1c894592070661180fdec3d4872e9c3"
+
+
[[package]]
+
name = "constant_time_eq"
+
version = "0.3.1"
+
source = "registry+https://github.com/rust-lang/crates.io-index"
+
checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
[[package]]
name = "constellation"
···
[[package]]
+
name = "dasl"
+
version = "0.2.0"
+
source = "registry+https://github.com/rust-lang/crates.io-index"
+
checksum = "b59666035a4386b0fd272bd78da4cbc3ccb558941e97579ab00f0eb4639f2a49"
+
dependencies = [
+
"blake3",
+
"cbor4ii 1.2.0",
+
"data-encoding",
+
"data-encoding-macro",
+
"scopeguard",
+
"serde",
+
"serde_bytes",
+
"sha2",
+
"thiserror 2.0.17",
+
]
+
+
[[package]]
name = "data-encoding"
-
version = "2.8.0"
+
version = "2.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-
checksum = "575f75dfd25738df5b91b8e43e14d44bda14637a58fae779fd2b064f8bf3e010"
+
checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476"
[[package]]
name = "data-encoding-macro"
-
version = "0.1.17"
+
version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
-
checksum = "9f9724adfcf41f45bf652b3995837669d73c4d49a1b5ac1ff82905ac7d9b5558"
+
checksum = "47ce6c96ea0102f01122a185683611bd5ac8d99e62bc59dd12e6bda344ee673d"
dependencies = [
"data-encoding",
"data-encoding-macro-internal",
···
[[package]]
name = "data-encoding-macro-internal"
-
version = "0.1.15"
+
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
-
checksum = "18e4fdb82bd54a12e42fb58a800dcae6b9e13982238ce2296dc3570b92148e1f"
+
checksum = "8d162beedaa69905488a8da94f5ac3edb4dd4788b732fadb7bd120b2625c1976"
dependencies = [
"data-encoding",
"syn 2.0.106",
···
version = "0.1.0"
dependencies = [
"anyhow",
+
"dasl",
"fluent-uri",
"nom",
+
"serde",
"thiserror 2.0.17",
"tinyjson",
···
[[package]]
name = "repo-stream"
-
version = "0.2.1"
+
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-
checksum = "727a78c392bd51b1af938e4383f2f6f46ae727cb38394136d1aebab0633faf8e"
+
checksum = "093b48e604c138949bf3d4a1a9bc1165feb1db28a73af0101c84eb703d279f43"
dependencies = [
"bincode 2.0.1",
"futures",
···
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46182f4f08349a02b45c998ba3215d3f9de826246ba02bb9dddfe9a2a2100778"
dependencies = [
-
"cbor4ii",
+
"cbor4ii 0.2.14",
"ipld-core",
"scopeguard",
"serde",
···
"async-trait",
"clap",
"ctrlc",
+
"dasl",
"dropshot",
"env_logger",
"fjall 3.0.0-pre.0",
+2
links/Cargo.toml
···
[dependencies]
anyhow = "1.0.95"
+dasl = "0.2.0"
fluent-uri = "0.3.2"
nom = "7.1.3"
+serde = { version = "1.0.228", features = ["derive"] }
thiserror = "2.0.9"
tinyjson = "2.5.1"
+3 -2
links/src/lib.rs
···
use fluent_uri::Uri;
+use serde::{Deserialize, Serialize};
pub mod at_uri;
pub mod did;
···
pub use record::collect_links;
-#[derive(Debug, Clone, Ord, Eq, PartialOrd, PartialEq)]
+#[derive(Debug, Clone, Ord, Eq, PartialOrd, PartialEq, Serialize, Deserialize)]
pub enum Link {
    AtUri(String),
    Uri(String),
···
    }
}
-#[derive(Debug, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct CollectedLink {
    pub path: String,
    pub target: Link,
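
Worth noting: the new `Clone`, `Serialize`, and `Deserialize` derives are what let the spacedust importer further down wrap `CollectedLink` in a repo-stream `Processable` type, since repo-stream serializes processed values when it spills a large repo to disk. For a post record embedding an external link, the walkers would push something along these lines (illustrative values, not taken from this diff):

    use links::{CollectedLink, Link};

    // Hypothetical example of a collected link; the path is built from the
    // record's map keys as the walker descends.
    let example = CollectedLink {
        path: ".embed.external.uri".to_string(),
        target: Link::Uri("https://example.com/".to_string()),
    };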
+41
links/src/record.rs
···
+use dasl::drisl::Value as DrislValue;
use tinyjson::JsonValue;
use crate::{parse_any_link, CollectedLink};
···
    }
}
+pub fn walk_drisl(path: &str, v: &DrislValue, found: &mut Vec<CollectedLink>) {
+    match v {
+        DrislValue::Map(o) => {
+            for (key, child) in o {
+                walk_drisl(&format!("{path}.{key}"), child, found)
+            }
+        }
+        DrislValue::Array(a) => {
+            for child in a {
+                let child_p = match child {
+                    DrislValue::Map(o) => {
+                        if let Some(DrislValue::Text(t)) = o.get("$type") {
+                            format!("{path}[{t}]")
+                        } else {
+                            format!("{path}[]")
+                        }
+                    }
+                    _ => format!("{path}[]"),
+                };
+                walk_drisl(&child_p, child, found)
+            }
+        }
+        DrislValue::Text(s) => {
+            if let Some(link) = parse_any_link(s) {
+                found.push(CollectedLink {
+                    path: path.to_string(),
+                    target: link,
+                });
+            }
+        }
+        _ => {}
+    }
+}
+
pub fn collect_links(v: &JsonValue) -> Vec<CollectedLink> {
    let mut found = vec![];
    walk_record("", v, &mut found);
+    found
+}
+
+pub fn collect_links_drisl(v: &DrislValue) -> Vec<CollectedLink> {
+    let mut found = vec![];
+    walk_drisl("", v, &mut found);
    found
}
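
The new walker is the drisl twin of the JSON-based `walk_record`: `collect_links_drisl` lets callers extract links directly from records decoded out of CAR blocks instead of going through JSON first. A minimal usage sketch (the helper name is hypothetical; it mirrors the `process` function in `import_scraped.rs` below):

    // Decode one record block with dasl's drisl, then collect any links in it.
    fn links_from_block(block: &[u8]) -> anyhow::Result<Vec<links::CollectedLink>> {
        let value: dasl::drisl::Value = dasl::drisl::from_slice(block)
            .map_err(|e| anyhow::anyhow!("failed to parse block with drisl: {e:?}"))?;
        Ok(links::record::collect_links_drisl(&value))
    }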
+2 -1
spacedust/Cargo.toml
···
async-trait = "0.1.88"
clap = { version = "4.5.40", features = ["derive"] }
ctrlc = "3.4.7"
+dasl = "0.2.0"
dropshot = "0.16.2"
env_logger = "0.11.8"
fjall = "3.0.0-pre.0"
···
metrics = "0.24.2"
metrics-exporter-prometheus = { version = "0.17.1", features = ["http-listener"] }
rand = "0.9.1"
-repo-stream = "0.2.1"
+repo-stream = "0.2.2"
reqwest = { version = "0.12.24", features = ["json", "stream"] }
schemars = "0.8.22"
semver = "1.0.26"
+1 -4
spacedust/src/bin/import_car_file.rs
···
use clap::Parser;
use std::path::PathBuf;
-use spacedust::storage::car;
type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
···
    let Args { file } = Args::parse();
-    let reader = tokio::fs::File::open(file).await?;
-
-    car::import(reader).await?;
+    let _reader = tokio::fs::File::open(file).await?;
    Ok(())
}
+108 -44
spacedust/src/bin/import_scraped.rs
···
use clap::Parser;
-use std::sync::{Arc, atomic::{AtomicUsize, Ordering}};
+use links::CollectedLink;
+use repo_stream::{
+    DiskBuilder, DiskStore, Driver, DriverBuilder, Processable, drive::DriverBuilderWithProcessor,
+    drive::NeedDisk,
+};
use std::path::PathBuf;
-use tokio::{task::JoinSet, io::AsyncRead};
-use repo_stream::{DriverBuilder, Driver, DiskBuilder, DiskStore, drive::NeedDisk};
-
+use std::sync::{
+    Arc,
+    atomic::{AtomicUsize, Ordering},
+};
+use tokio::{io::AsyncRead, task::JoinSet};
type Result<T> = anyhow::Result<T>; //std::result::Result<T, Box<dyn std::error::Error>>;
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+struct CollectedProcessed(CollectedLink);
+
+impl Processable for CollectedProcessed {
+    fn get_size(&self) -> usize {
+        self.0.path.capacity() + self.0.target.as_str().len()
+    }
+}
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+struct ErrString(String);
+
+impl Processable for ErrString {
+    fn get_size(&self) -> usize {
+        self.0.capacity()
+    }
+}
+
+type Processed = std::result::Result<Vec<CollectedProcessed>, ErrString>;
+
+/// hacky for now: put errors in strings 🤷‍♀️
+fn process(block: Vec<u8>) -> Processed {
+    let value: dasl::drisl::Value = dasl::drisl::from_slice(&block)
+        .map_err(|e| ErrString(format!("failed to parse block with drisl: {e:?}")))?;
+    let links = links::record::collect_links_drisl(&value)
+        .into_iter()
+        .map(CollectedProcessed)
+        .collect();
+    Ok(links)
+}
#[derive(Debug, Parser)]
struct Args {
···
    disk_folder: PathBuf,
}
-async fn get_cars(cars_folder: PathBuf, tx: async_channel::Sender<tokio::io::BufReader<tokio::fs::File>>) -> Result<()> {
+async fn get_cars(
+    cars_folder: PathBuf,
+    tx: async_channel::Sender<tokio::io::BufReader<tokio::fs::File>>,
+) -> Result<()> {
    let mut dir = tokio::fs::read_dir(cars_folder).await?;
    while let Some(entry) = dir.next_entry().await? {
        if !entry.file_type().await?.is_file() {
···
async fn drive_mem<R: AsyncRead + Unpin + Send + Sync + 'static>(
    f: R,
-    disk_tx: &async_channel::Sender<NeedDisk<R, usize>>,
-) -> Result<Option<usize>> {
+    builder: &DriverBuilderWithProcessor<Processed>,
+    disk_tx: &async_channel::Sender<NeedDisk<R, Processed>>,
+) -> Result<Option<(usize, usize)>> {
    let mut n = 0;
-    match DriverBuilder::new()
-        .with_block_processor(|_| 0_usize) // don't care just counting records
-        .with_mem_limit_mb(32)
-        .load_car(f)
-        .await?
-    {
+    let mut n_records = 0;
+    match builder.load_car(f).await? {
        Driver::Memory(_commit, mut driver) => {
            while let Some(chunk) = driver.next_chunk(512).await? {
-                n += chunk.len();
+                n_records += chunk.len();
+                for (_key, links) in chunk {
+                    match links {
+                        Ok(links) => n += links.len(),
+                        Err(e) => eprintln!("wat: {e:?}"),
+                    }
+                }
            }
-            Ok(Some(n))
+            Ok(Some((n, n_records)))
        }
        Driver::Disk(need_disk) => {
            disk_tx.send(need_disk).await?;
···
async fn mem_worker<R: AsyncRead + Unpin + Send + Sync + 'static>(
    car_rx: async_channel::Receiver<R>,
-    disk_tx: async_channel::Sender<NeedDisk<R, usize>>,
+    disk_tx: async_channel::Sender<NeedDisk<R, Processed>>,
    n: Arc<AtomicUsize>,
+    n_records: Arc<AtomicUsize>,
) -> Result<()> {
+    let builder = DriverBuilder::new()
+        .with_block_processor(process) // extract links from each record block
+        .with_mem_limit_mb(128);
    while let Ok(f) = car_rx.recv().await {
-        let driven = match drive_mem(f, &disk_tx).await {
+        let driven = match drive_mem(f, &builder, &disk_tx).await {
            Ok(d) => d,
            Err(e) => {
                eprintln!("failed to drive mem: {e:?}. skipping...");
                continue;
            }
        };
-        if let Some(drove) = driven {
+        if let Some((drove, recs)) = driven {
            n.fetch_add(drove, Ordering::Relaxed);
+            n_records.fetch_add(recs, Ordering::Relaxed);
        }
    }
    Ok(())
}
async fn drive_disk<R: AsyncRead + Unpin>(
-    needed: NeedDisk<R, usize>,
+    needed: NeedDisk<R, Processed>,
    store: DiskStore,
-) -> Result<(usize, DiskStore)> {
+) -> Result<(usize, usize, DiskStore)> {
    let (_commit, mut driver) = needed.finish_loading(store).await?;
    let mut n = 0;
+    let mut n_records = 0;
    while let Some(chunk) = driver.next_chunk(512).await? {
-        n += chunk.len();
+        n_records += chunk.len();
+        for (_key, links) in chunk {
+            match links {
+                Ok(links) => n += links.len(),
+                Err(e) => eprintln!("wat: {e:?}"),
+            }
+        }
    }
    let store = driver.reset_store().await?;
-    Ok((n, store))
+    Ok((n, n_records, store))
}
async fn disk_worker<R: AsyncRead + Unpin>(
    worker_id: usize,
-    disk_rx: async_channel::Receiver<NeedDisk<R, usize>>,
+    disk_rx: async_channel::Receiver<NeedDisk<R, Processed>>,
    folder: PathBuf,
    n: Arc<AtomicUsize>,
+    n_records: Arc<AtomicUsize>,
    disk_workers_active: Arc<AtomicUsize>,
) -> Result<()> {
    let mut file = folder;
    file.push(format!("disk-worker-{worker_id}.sqlite"));
-    let mut store = DiskBuilder::new()
-        .with_cache_size_mb(128)
-        .open(file.clone())
-        .await?;
+    let builder = DiskBuilder::new().with_cache_size_mb(128);
+    let mut store = builder.open(file.clone()).await?;
    while let Ok(needed) = disk_rx.recv().await {
        let active = disk_workers_active.fetch_add(1, Ordering::AcqRel);
        println!("-> disk workers active: {}", active + 1);
-        let drove = match drive_disk(needed, store).await {
-            Ok((d, s)) => {
+        let (drove, records) = match drive_disk(needed, store).await {
+            Ok((d, r, s)) => {
                store = s;
-                d
+                (d, r)
            }
            Err(e) => {
                eprintln!("failed to drive disk: {e:?}. skipping...");
-                store = DiskBuilder::new()
-                    .with_cache_size_mb(128)
-                    .open(file.clone())
-                    .await?;
+                store = builder.open(file.clone()).await?;
                continue;
            }
        };
        n.fetch_add(drove, Ordering::Relaxed);
+        n_records.fetch_add(records, Ordering::Relaxed);
        let were_active = disk_workers_active.fetch_sub(1, Ordering::AcqRel);
        println!("<- disk workers active: {}", were_active - 1);
    }
    Ok(())
}
-
#[tokio::main]
async fn main() -> Result<()> {
    env_logger::init();
-    let Args { cars_folder, disk_folder, disk_workers, mem_workers } = Args::parse();
+    let Args {
+        cars_folder,
+        disk_folder,
+        disk_workers,
+        mem_workers,
+    } = Args::parse();
    let mut set = JoinSet::<Result<()>>::new();
-
    let (cars_tx, cars_rx) = async_channel::bounded(2);
    set.spawn(get_cars(cars_folder, cars_tx));
    let n: Arc<AtomicUsize> = Arc::new(0.into());
+    let n_records: Arc<AtomicUsize> = Arc::new(0.into());
    let disk_workers_active: Arc<AtomicUsize> = Arc::new(0.into());
    set.spawn({
        let n = n.clone();
+        let n_records = n_records.clone();
        let mut interval = tokio::time::interval(std::time::Duration::from_secs(10));
        async move {
            let mut last_n = n.load(Ordering::Relaxed);
+            let mut last_n_records = n_records.load(Ordering::Relaxed);
            loop {
                interval.tick().await;
                let n = n.load(Ordering::Relaxed);
-                let diff = n - last_n;
-                println!("rate: {} rec/sec", diff / 10);
-                if diff == 0 {
+                let n_records = n_records.load(Ordering::Relaxed);
+                let diff_n = n - last_n;
+                let diff_records = n_records - last_n_records;
+                println!("rate: {} rec/sec; {} n/sec", diff_records / 10, diff_n / 10);
+                if n_records > 0 && diff_records == 0 {
                    println!("zero encountered, stopping rate calculation polling.");
                    break Ok(());
                }
                last_n = n;
+                last_n_records = n_records;
            }
        }
    });
-
    let (needs_disk_tx, needs_disk_rx) = async_channel::bounded(disk_workers);
-
    for _ in 0..mem_workers {
-        set.spawn(mem_worker(cars_rx.clone(), needs_disk_tx.clone(), n.clone()));
+        set.spawn(mem_worker(
+            cars_rx.clone(),
+            needs_disk_tx.clone(),
+            n.clone(),
+            n_records.clone(),
+        ));
    }
    drop(cars_rx);
    drop(needs_disk_tx);
···
            needs_disk_rx.clone(),
            disk_folder.clone(),
            n.clone(),
+            n_records.clone(),
            disk_workers_active.clone(),
        ));
    }
···
        println!("task from set joined: {res:?}");
    }
-    eprintln!("total records processed: {n:?}");
+    eprintln!("total records processed: {n_records:?}; total n: {n:?}");
    Ok(())
}
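
In outline, the importer now does more than count: each mem worker asks repo-stream to load a CAR with the `process` block processor attached, so every record block is decoded with drisl and its links collected; when a repo is too large for the in-memory driver, the pending `NeedDisk` state is handed over a bounded channel to a disk worker backed by its own `disk-worker-{worker_id}.sqlite` store. The two atomic counters track records seen (`n_records`) and links found (`n`), which is what the 10-second rate printout reports.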
+12 -13
spacedust/src/bin/scrape_pds.rs
···
-use tokio::io::AsyncWriteExt;
use clap::Parser;
-use std::path::PathBuf;
use reqwest::Url;
-use tokio::{sync::mpsc, time};
use serde::Deserialize;
+use std::path::PathBuf;
+use tokio::io::AsyncWriteExt;
+use tokio::{sync::mpsc, time};
type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
···
    pds.set_path("/xrpc/com.atproto.sync.getRepo");
    pds.set_query(Some(&format!("did={did}")));
-    let mut byte_stream = client
-        .get(pds)
-        .send()
-        .await?
-        .bytes_stream();
+    let mut byte_stream = client.get(pds).send().await?.bytes_stream();
    while let Some(stuff) = byte_stream.next().await {
        tokio::io::copy(&mut stuff?.as_ref(), &mut w).await?;
···
    Ok(())
}
-
#[derive(Debug, Deserialize)]
struct RepoInfo {
···
            .expect("json response");
        for repo in res.repos {
            if repo.active {
-                tx.send(repo.did).await.expect("to be able to send on the channel");
+                tx.send(repo.did)
+                    .await
+                    .expect("to be able to send on the channel");
            }
        }
        cursor = res.cursor;
···
                break;
            }
        }
-
    });
    rx
}
-
#[tokio::main]
async fn main() -> Result<()> {
    env_logger::init();
-    let Args { pds, throttle_ms, folder } = Args::parse();
+    let Args {
+        pds,
+        throttle_ms,
+        folder,
+    } = Args::parse();
    tokio::fs::create_dir_all(folder.clone()).await?;
+1
spacedust/src/storage/car/mod.rs
···
+
+4 -3
spacedust/src/storage/fjall/mod.rs
···
use crate::storage::Storage;
-struct FjallStorage {
-}
+pub struct FjallStorage {}
impl Storage for FjallStorage {
-    fn import_car() { todo!() }
+    fn import_car() {
+        todo!()
+    }
}
+1 -3
spacedust/src/storage/mod.rs
···
pub mod fjall;
pub trait Storage {
-    fn import_car() {
-
-    }
+    fn import_car() {}
}