Constellation, Spacedust, Slingshot, UFOs: atproto crates and services for microcosm

Merge pull request #39 from at-microcosm/many-to-many-counts

many-to-many counts

Cargo.lock  +9 -9
···
[[package]]
name = "clap"
- version = "4.5.47"
+ version = "4.5.48"
source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "7eac00902d9d136acd712710d71823fb8ac8004ca445a89e73a41d45aa712931"
+ checksum = "e2134bb3ea021b78629caa971416385309e0131b351b25e01dc16fb54e1b5fae"
dependencies = [
"clap_builder",
"clap_derive",
···
[[package]]
name = "clap_builder"
- version = "4.5.47"
+ version = "4.5.48"
source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "2ad9bbf750e73b5884fb8a211a9424a1906c1e156724260fdae972f31d70e1d6"
+ checksum = "c2ba64afa3c0a6df7fa517765e31314e983f51dda798ffba27b988194fb65dc9"
dependencies = [
"anstream",
"anstyle",
···
checksum = "18e4fdb82bd54a12e42fb58a800dcae6b9e13982238ce2296dc3570b92148e1f"
dependencies = [
"data-encoding",
- "syn 1.0.109",
+ "syn 2.0.106",
[[package]]
···
checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34"
dependencies = [
"cfg-if",
- "windows-targets 0.48.5",
+ "windows-targets 0.52.6",
[[package]]
···
[[package]]
name = "reqwest"
- version = "0.12.22"
+ version = "0.12.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "cbc931937e6ca3a06e3b6c0aa7841849b160a90351d6ab467a8b9b9959767531"
+ checksum = "d429f34c8092b2d42c7c93cec323bb4adeb7c67698f70839adec842ec10c7ceb"
dependencies = [
"async-compression",
"base64 0.22.1",
···
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
dependencies = [
- "windows-sys 0.48.0",
+ "windows-sys 0.59.0",
[[package]]
constellation/src/bin/main.rs  +29 -10
···
/// Saved jsonl from jetstream to use instead of a live subscription
#[arg(short, long)]
fixture: Option<PathBuf>,
+ /// run a scan across the target id table, writing every key -> id entry back as id -> key
+ #[arg(long, action)]
+ repair_target_ids: bool,
}
#[derive(Debug, Clone, ValueEnum)]
···
rocks.start_backup(backup_dir, auto_backup, stay_alive.clone())?;
}
println!("rocks ready.");
-     run(
-         rocks,
-         fixture,
-         args.data,
-         stream,
-         bind,
-         metrics_bind,
-         stay_alive,
-     )
+     std::thread::scope(|s| {
+         if args.repair_target_ids {
+             let rocks = rocks.clone();
+             let stay_alive = stay_alive.clone();
+             s.spawn(move || {
+                 let rep = rocks.run_repair(time::Duration::from_millis(0), stay_alive);
+                 eprintln!("repair finished: {rep:?}");
+                 rep
+             });
+         }
+         s.spawn(|| {
+             let r = run(
+                 rocks,
+                 fixture,
+                 args.data,
+                 stream,
+                 bind,
+                 metrics_bind,
+                 stay_alive,
+             );
+             eprintln!("run finished: {r:?}");
+             r
+         });
+     });
+     Ok(())
}
}
}
···
'monitor: loop {
match readable.get_stats() {
- Ok(StorageStats { dids, targetables, linking_records }) => {
+ Ok(StorageStats { dids, targetables, linking_records, .. }) => {
metrics::gauge!("storage.stats.dids").set(dids as f64);
metrics::gauge!("storage.stats.targetables").set(targetables as f64);
metrics::gauge!("storage.stats.linking_records").set(linking_records as f64);
constellation/src/server/filters.rs  +4
···
pub fn human_number(n: &u64) -> askama::Result<String> {
Ok(n.to_formatted_string(&Locale::en))
}
+
+ pub fn to_u64(n: usize) -> askama::Result<u64> {
+     Ok(n as u64)
+ }
constellation/src/server/mod.rs  +145 -8
···
DEFAULT_CURSOR_LIMIT
}
- const INDEX_BEGAN_AT_TS: u64 = 1738083600; // TODO: not this
- fn to500(e: tokio::task::JoinError) -> http::StatusCode {
-     eprintln!("handler join error: {e}");
+     eprintln!("handler error: {e}");
http::StatusCode::INTERNAL_SERVER_ERROR
}
···
let store = store.clone();
move |accept| async {
spawn_blocking(|| hello(accept, store))
+             .await
+             .map_err(to500)?
+         }
+     }),
+ )
+ .route(
+     "/xrpc/blue.microcosm.links.getManyToManyCounts",
+     get({
+         let store = store.clone();
+         move |accept, query| async {
+             spawn_blocking(|| get_many_to_many_counts(accept, query, store))
.await
.map_err(to500)?
}
···
#[template(path = "hello.html.j2")]
struct HelloReponse {
help: &'static str,
- days_indexed: u64,
+ days_indexed: Option<u64>,
stats: StorageStats,
}
fn hello(
···
let stats = store
.get_stats()
.map_err(|_| http::StatusCode::INTERNAL_SERVER_ERROR)?;
- let days_indexed = (UNIX_EPOCH + Duration::from_secs(INDEX_BEGAN_AT_TS))
-     .elapsed()
+ let days_indexed = stats
+     .started_at
+     .map(|c| (UNIX_EPOCH + Duration::from_micros(c)).elapsed())
+     .transpose()
.map_err(|_| http::StatusCode::INTERNAL_SERVER_ERROR)?
-     .as_secs()
-     / 86400;
+     .map(|d| d.as_secs() / 86_400);
Ok(acceptable(accept, HelloReponse {
help: "open this URL in a web browser (or request with Accept: text/html) for information about this API.",
days_indexed,
stats,
}))
+ }
+
+ #[derive(Clone, Deserialize)]
+ #[serde(rename_all = "camelCase")]
+ struct GetManyToManyCountsQuery {
+     subject: String,
+     source: String,
+     /// path to the secondary link in the linking record
+     path_to_other: String,
+     /// filter to linking records (join of the m2m) by these DIDs
+     #[serde(default)]
+     did: Vec<String>,
+     /// filter to specific secondary records
+     #[serde(default)]
+     other_subject: Vec<String>,
+     cursor: Option<OpaqueApiCursor>,
+     /// Set the max number of links to return per page of results
+     #[serde(default = "get_default_cursor_limit")]
+     limit: u64,
+ }
+ #[derive(Serialize)]
+ struct OtherSubjectCount {
+     subject: String,
+     total: u64,
+     distinct: u64,
+ }
+ #[derive(Template, Serialize)]
+ #[template(path = "get-many-to-many-counts.html.j2")]
+ struct GetManyToManyCountsResponse {
+     counts_by_other_subject: Vec<OtherSubjectCount>,
+     cursor: Option<OpaqueApiCursor>,
+     #[serde(skip_serializing)]
+     query: GetManyToManyCountsQuery,
+ }
+ fn get_many_to_many_counts(
+     accept: ExtractAccept,
+     query: axum_extra::extract::Query<GetManyToManyCountsQuery>,
+     store: impl LinkReader,
+ ) -> Result<impl IntoResponse, http::StatusCode> {
+     let cursor_key = query
+         .cursor
+         .clone()
+         .map(|oc| ApiKeyedCursor::try_from(oc).map_err(|_| http::StatusCode::BAD_REQUEST))
+         .transpose()?
+         .map(|c| c.next);
+
+     let limit = query.limit;
+     if limit > DEFAULT_CURSOR_LIMIT_MAX {
+         return Err(http::StatusCode::BAD_REQUEST);
+     }
+
+     let filter_dids: HashSet<Did> = HashSet::from_iter(
+         query
+             .did
+             .iter()
+             .map(|d| d.trim())
+             .filter(|d| !d.is_empty())
+             .map(|d| Did(d.to_string())),
+     );
+
+     let filter_other_subjects: HashSet<String> = HashSet::from_iter(
+         query
+             .other_subject
+             .iter()
+             .map(|s| s.trim().to_string())
+             .filter(|s| !s.is_empty()),
+     );
+
+     let Some((collection, path)) = query.source.split_once(':') else {
+         return Err(http::StatusCode::BAD_REQUEST);
+     };
+     let path = format!(".{path}");
+
+     let path_to_other = format!(".{}", query.path_to_other);
+
+     let paged = store
+         .get_many_to_many_counts(
+             &query.subject,
+             collection,
+             &path,
+             &path_to_other,
+             limit,
+             cursor_key,
+             &filter_dids,
+             &filter_other_subjects,
+         )
+         .map_err(|_| http::StatusCode::INTERNAL_SERVER_ERROR)?;
+
+     let cursor = paged.next.map(|next| ApiKeyedCursor { next }.into());
+
+     let items = paged
+         .items
+         .into_iter()
+         .map(|(subject, total, distinct)| OtherSubjectCount {
+             subject,
+             total,
+             distinct,
+         })
+         .collect();
+
+     Ok(acceptable(
+         accept,
+         GetManyToManyCountsResponse {
+             counts_by_other_subject: items,
+             cursor,
+             query: (*query).clone(),
+         },
+     ))
}
#[derive(Clone, Deserialize)]
···
OpaqueApiCursor(bincode::DefaultOptions::new().serialize(&item).unwrap())
}
}
+
+ #[derive(Serialize, Deserialize)] // for bincode
+ struct ApiKeyedCursor {
+     next: String, // the key
+ }
+
+ impl TryFrom<OpaqueApiCursor> for ApiKeyedCursor {
+     type Error = bincode::Error;
+
+     fn try_from(item: OpaqueApiCursor) -> Result<Self, Self::Error> {
+         bincode::DefaultOptions::new().deserialize(&item.0)
+     }
+ }
+
+ impl From<ApiKeyedCursor> for OpaqueApiCursor {
+     fn from(item: ApiKeyedCursor) -> Self {
+         OpaqueApiCursor(bincode::DefaultOptions::new().serialize(&item).unwrap())
+     }
+ }
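For reference, the keyed cursor is nothing more than the last target key of a page, bincode-encoded into the opaque wrapper. A minimal round-trip sketch using the impls above (the handler maps a failed decode to 400 BAD_REQUEST; the "12345" value is illustrative):

    let out = ApiKeyedCursor { next: "12345".to_string() };
    let opaque: OpaqueApiCursor = out.into();
    // ...serialized into the response, echoed back by the client on the next request...
    let back = ApiKeyedCursor::try_from(opaque).expect("invalid cursor bytes");
    assert_eq!(back.next, "12345");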
constellation/src/storage/mem_store.rs  +78 -1
···
- use super::{LinkReader, LinkStorage, PagedAppendingCollection, StorageStats};
+ use super::{
+     LinkReader, LinkStorage, PagedAppendingCollection, PagedOrderedCollection, StorageStats,
+ };
use crate::{ActionableEvent, CountsByCount, Did, RecordId};
use anyhow::Result;
use links::CollectedLink;
···
}
impl LinkReader for MemStorage {
+     fn get_many_to_many_counts(
+         &self,
+         target: &str,
+         collection: &str,
+         path: &str,
+         path_to_other: &str,
+         limit: u64,
+         after: Option<String>,
+         filter_dids: &HashSet<Did>,
+         filter_to_targets: &HashSet<String>,
+     ) -> Result<PagedOrderedCollection<(String, u64, u64), String>> {
+         let data = self.0.lock().unwrap();
+         let Some(paths) = data.targets.get(&Target::new(target)) else {
+             return Ok(PagedOrderedCollection::default());
+         };
+         let Some(linkers) = paths.get(&Source::new(collection, path)) else {
+             return Ok(PagedOrderedCollection::default());
+         };
+
+         let path_to_other = RecordPath::new(path_to_other);
+         let filter_to_targets: HashSet<Target> =
+             HashSet::from_iter(filter_to_targets.iter().map(|s| Target::new(s)));
+
+         let mut grouped_counts: HashMap<Target, (u64, HashSet<Did>)> = HashMap::new();
+         for (did, rkey) in linkers.iter().flatten().cloned() {
+             if !filter_dids.is_empty() && !filter_dids.contains(&did) {
+                 continue;
+             }
+             if let Some(fwd_target) = data
+                 .links
+                 .get(&did)
+                 .unwrap_or(&HashMap::new())
+                 .get(&RepoId {
+                     collection: collection.to_string(),
+                     rkey,
+                 })
+                 .unwrap_or(&Vec::new())
+                 .iter()
+                 .filter_map(|(path, target)| {
+                     if *path == path_to_other
+                         && (filter_to_targets.is_empty() || filter_to_targets.contains(target))
+                     {
+                         Some(target)
+                     } else {
+                         None
+                     }
+                 })
+                 .take(1)
+                 .next()
+             {
+                 let e = grouped_counts.entry(fwd_target.clone()).or_default();
+                 e.0 += 1;
+                 e.1.insert(did.clone());
+             }
+         }
+         let mut items: Vec<(String, u64, u64)> = grouped_counts
+             .iter()
+             .map(|(k, (n, u))| (k.0.clone(), *n, u.len() as u64))
+             .collect();
+         items.sort();
+         items = items
+             .into_iter()
+             .skip_while(|(t, _, _)| after.as_ref().map(|a| t <= a).unwrap_or(false))
+             .take(limit as usize)
+             .collect();
+         let next = if items.len() as u64 >= limit {
+             items.last().map(|(t, _, _)| t.clone())
+         } else {
+             None
+         };
+         Ok(PagedOrderedCollection { items, next })
+     }
+
fn get_count(&self, target: &str, collection: &str, path: &str) -> Result<u64> {
let data = self.0.lock().unwrap();
let Some(paths) = data.targets.get(&Target::new(target)) else {
···
dids,
targetables,
linking_records,
+             started_at: None,
+             other_data: Default::default(),
})
}
}
constellation/src/storage/mod.rs  +225
···
pub total: u64,
}
+ /// A paged collection whose keys are sorted instead of indexed
+ ///
+ /// this has weaker guarantees than PagedAppendingCollection: it might
+ /// not return a totally consistent snapshot, but it should avoid duplicates
+ /// and each page should at least be internally consistent.
+ #[derive(Debug, PartialEq, Default)]
+ pub struct PagedOrderedCollection<T, K: Ord> {
+     pub items: Vec<T>,
+     pub next: Option<K>,
+ }
+ #[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct StorageStats {
/// estimate of how many accounts we've seen create links. the _subjects_ of any links are not represented here.
···
/// records with multiple links are single-counted.
/// for LSM stores, deleted links don't decrement this, and updated records with any links will likely increment it.
pub linking_records: u64,
+
+     /// first jetstream cursor when this instance first started
+     pub started_at: Option<u64>,
+
+     /// anything else we want to throw in
+     pub other_data: HashMap<String, u64>,
}
pub trait LinkStorage: Send + Sync {
···
}
pub trait LinkReader: Clone + Send + Sync + 'static {
+     #[allow(clippy::too_many_arguments)]
+     fn get_many_to_many_counts(
+         &self,
+         target: &str,
+         collection: &str,
+         path: &str,
+         path_to_other: &str,
+         limit: u64,
+         after: Option<String>,
+         filter_dids: &HashSet<Did>,
+         filter_to_targets: &HashSet<String>,
+     ) -> Result<PagedOrderedCollection<(String, u64, u64), String>>;
+
fn get_count(&self, target: &str, collection: &str, path: &str) -> Result<u64>;
fn get_distinct_did_count(&self, target: &str, collection: &str, path: &str) -> Result<u64>;
···
counts
});
assert_stats(storage.get_stats()?, 1..=1, 2..=2, 1..=1);
+ });
+
+ //////// many-to-many /////////
+
+ test_each_storage!(get_m2m_counts_empty, |storage| {
+     assert_eq!(
+         storage.get_many_to_many_counts(
+             "a.com",
+             "a.b.c",
+             ".d.e",
+             ".f.g",
+             10,
+             None,
+             &HashSet::new(),
+             &HashSet::new(),
+         )?,
+         PagedOrderedCollection {
+             items: vec![],
+             next: None,
+         }
+     );
+ });
+
+ test_each_storage!(get_m2m_counts_single, |storage| {
+     storage.push(
+         &ActionableEvent::CreateLinks {
+             record_id: RecordId {
+                 did: "did:plc:asdf".into(),
+                 collection: "app.t.c".into(),
+                 rkey: "asdf".into(),
+             },
+             links: vec![
+                 CollectedLink {
+                     target: Link::Uri("a.com".into()),
+                     path: ".abc.uri".into(),
+                 },
+                 CollectedLink {
+                     target: Link::Uri("b.com".into()),
+                     path: ".def.uri".into(),
+                 },
+                 CollectedLink {
+                     target: Link::Uri("b.com".into()),
+                     path: ".ghi.uri".into(),
+                 },
+             ],
+         },
+         0,
+     )?;
+     assert_eq!(
+         storage.get_many_to_many_counts(
+             "a.com",
+             "app.t.c",
+             ".abc.uri",
+             ".def.uri",
+             10,
+             None,
+             &HashSet::new(),
+             &HashSet::new(),
+         )?,
+         PagedOrderedCollection {
+             items: vec![("b.com".to_string(), 1, 1)],
+             next: None,
+         }
+     );
+ });
+
+ test_each_storage!(get_m2m_counts_filters, |storage| {
+     storage.push(
+         &ActionableEvent::CreateLinks {
+             record_id: RecordId {
+                 did: "did:plc:asdf".into(),
+                 collection: "app.t.c".into(),
+                 rkey: "asdf".into(),
+             },
+             links: vec![
+                 CollectedLink {
+                     target: Link::Uri("a.com".into()),
+                     path: ".abc.uri".into(),
+                 },
+                 CollectedLink {
+                     target: Link::Uri("b.com".into()),
+                     path: ".def.uri".into(),
+                 },
+             ],
+         },
+         0,
+     )?;
+     storage.push(
+         &ActionableEvent::CreateLinks {
+             record_id: RecordId {
+                 did: "did:plc:asdfasdf".into(),
+                 collection: "app.t.c".into(),
+                 rkey: "asdf".into(),
+             },
+             links: vec![
+                 CollectedLink {
+                     target: Link::Uri("a.com".into()),
+                     path: ".abc.uri".into(),
+                 },
+                 CollectedLink {
+                     target: Link::Uri("b.com".into()),
+                     path: ".def.uri".into(),
+                 },
+             ],
+         },
+         1,
+     )?;
+     storage.push(
+         &ActionableEvent::CreateLinks {
+             record_id: RecordId {
+                 did: "did:plc:fdsa".into(),
+                 collection: "app.t.c".into(),
+                 rkey: "asdf".into(),
+             },
+             links: vec![
+                 CollectedLink {
+                     target: Link::Uri("a.com".into()),
+                     path: ".abc.uri".into(),
+                 },
+                 CollectedLink {
+                     target: Link::Uri("c.com".into()),
+                     path: ".def.uri".into(),
+                 },
+             ],
+         },
+         2,
+     )?;
+     storage.push(
+         &ActionableEvent::CreateLinks {
+             record_id: RecordId {
+                 did: "did:plc:fdsa".into(),
+                 collection: "app.t.c".into(),
+                 rkey: "asdf2".into(),
+             },
+             links: vec![
+                 CollectedLink {
+                     target: Link::Uri("a.com".into()),
+                     path: ".abc.uri".into(),
+                 },
+                 CollectedLink {
+                     target: Link::Uri("c.com".into()),
+                     path: ".def.uri".into(),
+                 },
+             ],
+         },
+         3,
+     )?;
+     assert_eq!(
+         storage.get_many_to_many_counts(
+             "a.com",
+             "app.t.c",
+             ".abc.uri",
+             ".def.uri",
+             10,
+             None,
+             &HashSet::new(),
+             &HashSet::new(),
+         )?,
+         PagedOrderedCollection {
+             items: vec![("b.com".to_string(), 2, 2), ("c.com".to_string(), 2, 1)],
+             next: None,
+         }
+     );
+     assert_eq!(
+         storage.get_many_to_many_counts(
+             "a.com",
+             "app.t.c",
+             ".abc.uri",
+             ".def.uri",
+             10,
+             None,
+             &HashSet::from_iter([Did("did:plc:fdsa".to_string())]),
+             &HashSet::new(),
+         )?,
+         PagedOrderedCollection {
+             items: vec![("c.com".to_string(), 2, 1)],
+             next: None,
+         }
+     );
+     assert_eq!(
+         storage.get_many_to_many_counts(
+             "a.com",
+             "app.t.c",
+             ".abc.uri",
+             ".def.uri",
+             10,
+             None,
+             &HashSet::new(),
+             &HashSet::from_iter(["b.com".to_string()]),
+         )?,
+         PagedOrderedCollection {
+             items: vec![("b.com".to_string(), 2, 2)],
+             next: None,
+         }
+     );
});
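A consumer-side paging sketch against the new trait method (argument values borrowed from the tests above; `store` is any LinkReader, imports elided). Each item is (other subject, total joining records, distinct linking DIDs), and each page's `next` feeds the following call's `after` until it comes back None:

    let mut after: Option<String> = None;
    loop {
        let page = store.get_many_to_many_counts(
            "a.com",         // subject / target
            "app.t.c",       // collection half of the source
            ".abc.uri",      // path half of the source
            ".def.uri",      // path to the secondary link
            100,             // limit
            after.clone(),
            &HashSet::new(), // no did filter
            &HashSet::new(), // no other-subject filter
        )?;
        for (other, total, distinct) in page.items {
            println!("{other}: {total} joins from {distinct} identities");
        }
        match page.next {
            Some(k) => after = Some(k),
            None => break,
        }
    }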
constellation/src/storage/rocks_store.rs  +342 -40
···
- use super::{ActionableEvent, LinkReader, LinkStorage, PagedAppendingCollection, StorageStats};
+ use super::{
+     ActionableEvent, LinkReader, LinkStorage, PagedAppendingCollection, PagedOrderedCollection,
+     StorageStats,
+ };
use crate::{CountsByCount, Did, RecordId};
use anyhow::{bail, Result};
use bincode::Options as BincodeOptions;
···
MultiThreaded, Options, PrefixRange, ReadOptions, WriteBatch,
};
use serde::{Deserialize, Serialize};
- use std::collections::{HashMap, HashSet};
+ use std::collections::{BTreeMap, HashMap, HashSet};
use std::io::Read;
use std::marker::PhantomData;
use std::path::{Path, PathBuf};
···
Arc,
};
use std::thread;
- use std::time::{Duration, Instant};
+ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use tokio_util::sync::CancellationToken;
static DID_IDS_CF: &str = "did_ids";
···
static LINK_TARGETS_CF: &str = "link_targets";
static JETSTREAM_CURSOR_KEY: &str = "jetstream_cursor";
+ static STARTED_AT_KEY: &str = "jetstream_first_cursor";
+ // add reverse mappings for targets if this db was running before that was a thing
+ static TARGET_ID_REPAIR_STATE_KEY: &str = "target_id_table_repair_state";
+
+ static COZY_FIRST_CURSOR: u64 = 1_738_083_600_000_000; // constellation.microcosm.blue started
+
+ #[derive(Debug, Clone, Serialize, Deserialize)]
+ struct TargetIdRepairState {
+     /// start time for repair, microseconds timestamp
+     current_us_started_at: u64,
+     /// id table's latest id when repair started
+     id_when_started: u64,
+     /// id table id
+     latest_repaired_i: u64,
+ }
+ impl AsRocksValue for TargetIdRepairState {}
+ impl ValueFromRocks for TargetIdRepairState {}
// todo: actually understand and set these options probably better
fn rocks_opts_base() -> Options {
···
#[derive(Debug, Clone)]
pub struct RocksStorage {
pub db: Arc<DBWithThreadMode<MultiThreaded>>, // TODO: mov seqs here (concat merge op will be fun)
- did_id_table: IdTable<Did, DidIdValue, true>,
- target_id_table: IdTable<TargetKey, TargetId, false>,
+ did_id_table: IdTable<Did, DidIdValue>,
+ target_id_table: IdTable<TargetKey, TargetId>,
is_writer: bool,
backup_task: Arc<Option<thread::JoinHandle<Result<()>>>>,
}
···
fn cf_descriptor(&self) -> ColumnFamilyDescriptor {
ColumnFamilyDescriptor::new(&self.name, rocks_opts_base())
}
- fn init<const WITH_REVERSE: bool>(
-     self,
-     db: &DBWithThreadMode<MultiThreaded>,
- ) -> Result<IdTable<Orig, IdVal, WITH_REVERSE>> {
+ fn init(self, db: &DBWithThreadMode<MultiThreaded>) -> Result<IdTable<Orig, IdVal>> {
if db.cf_handle(&self.name).is_none() {
bail!("failed to get cf handle from db -- was the db open with our .cf_descriptor()?");
}
···
}
}
#[derive(Debug, Clone)]
- struct IdTable<Orig, IdVal: IdTableValue, const WITH_REVERSE: bool>
+ struct IdTable<Orig, IdVal: IdTableValue>
where
Orig: KeyFromRocks,
for<'a> &'a Orig: AsRocksKey,
···
base: IdTableBase<Orig, IdVal>,
priv_id_seq: u64,
}
- impl<Orig: Clone, IdVal: IdTableValue, const WITH_REVERSE: bool> IdTable<Orig, IdVal, WITH_REVERSE>
+ impl<Orig: Clone, IdVal: IdTableValue> IdTable<Orig, IdVal>
where
Orig: KeyFromRocks,
for<'v> &'v IdVal: AsRocksValue,
···
_key_marker: PhantomData,
_val_marker: PhantomData,
name: name.into(),
- id_seq: Arc::new(AtomicU64::new(0)), // zero is "uninint", first seq num will be 1
+ id_seq: Arc::new(AtomicU64::new(0)), // zero is "uninit", first seq num will be 1
}
}
fn get_id_val(
···
id_value
}))
}
+ fn estimate_count(&self) -> u64 {
self.base.id_seq.load(Ordering::SeqCst) - 1 // -1 because seq zero is reserved
}
- }
- impl<Orig: Clone, IdVal: IdTableValue> IdTable<Orig, IdVal, true>
- where
-     Orig: KeyFromRocks,
-     for<'v> &'v IdVal: AsRocksValue,
-     for<'k> &'k Orig: AsRocksKey,
- {
+ fn get_or_create_id_val(
&mut self,
db: &DBWithThreadMode<MultiThreaded>,
···
}
}
}
- impl<Orig: Clone, IdVal: IdTableValue> IdTable<Orig, IdVal, false>
- where
-     Orig: KeyFromRocks,
-     for<'v> &'v IdVal: AsRocksValue,
-     for<'k> &'k Orig: AsRocksKey,
- {
-     fn get_or_create_id_val(
-         &mut self,
-         db: &DBWithThreadMode<MultiThreaded>,
-         batch: &mut WriteBatch,
-         orig: &Orig,
-     ) -> Result<IdVal> {
-         let cf = db.cf_handle(&self.base.name).unwrap();
-         self.__get_or_create_id_val(&cf, db, batch, orig)
-     }
- }
impl IdTableValue for DidIdValue {
fn new(v: u64) -> Self {
···
}
}
+ fn now() -> u64 {
+     SystemTime::now()
+         .duration_since(UNIX_EPOCH)
+         .unwrap()
+         .as_micros() as u64
+ }
+
impl RocksStorage {
pub fn new(path: impl AsRef<Path>) -> Result<Self> {
Self::describe_metrics();
-     RocksStorage::open_readmode(path, false)
+     let me = RocksStorage::open_readmode(path, false)?;
+     me.global_init()?;
+     Ok(me)
}
pub fn open_readonly(path: impl AsRef<Path>) -> Result<Self> {
···
}
fn open_readmode(path: impl AsRef<Path>, readonly: bool) -> Result<Self> {
- let did_id_table = IdTable::<_, _, true>::setup(DID_IDS_CF);
- let target_id_table = IdTable::<_, _, false>::setup(TARGET_IDS_CF);
+ let did_id_table = IdTable::setup(DID_IDS_CF);
+ let target_id_table = IdTable::setup(TARGET_IDS_CF);
+ // note: global stuff like jetstream cursor goes in the default cf
+ // these are bonus extra cfs
let cfs = vec![
// id reference tables
did_id_table.cf_descriptor(),
···
is_writer: !readonly,
backup_task: None.into(),
})
+ }
+
+ fn global_init(&self) -> Result<()> {
+     let first_run = self.db.get(JETSTREAM_CURSOR_KEY)?.is_none();
+     if first_run {
+         self.db.put(STARTED_AT_KEY, _rv(now()))?;
+
+         // hack / temporary: if we're a new db, put in a completed repair
+         // state so we don't run repairs (repairs are for old-code dbs)
+         let completed = TargetIdRepairState {
+             id_when_started: 0,
+             current_us_started_at: 0,
+             latest_repaired_i: 0,
+         };
+         self.db.put(TARGET_ID_REPAIR_STATE_KEY, _rv(completed))?;
+     }
+     Ok(())
+ }
+
+ pub fn run_repair(&self, breather: Duration, stay_alive: CancellationToken) -> Result<bool> {
+     let mut state = match self
+         .db
+         .get(TARGET_ID_REPAIR_STATE_KEY)?
+         .map(|s| _vr(&s))
+         .transpose()?
+     {
+         Some(s) => s,
+         None => TargetIdRepairState {
+             id_when_started: self.did_id_table.priv_id_seq,
+             current_us_started_at: now(),
+             latest_repaired_i: 0,
+         },
+     };
+
+     eprintln!("initial repair state: {state:?}");
+
+     let cf = self.db.cf_handle(TARGET_IDS_CF).unwrap();
+
+     let mut iter = self.db.raw_iterator_cf(&cf);
+     iter.seek_to_first();
+
+     eprintln!("repair iterator sent to first key");
+
+     // skip ahead if we've already done some, or take a single first step
+     for _ in 0..state.latest_repaired_i {
+         iter.next();
+     }
+
+     eprintln!(
+         "repair iterator skipped to {}th key",
+         state.latest_repaired_i
+     );
+
+     let mut maybe_done = false;
+
+     let mut write_fast = rocksdb::WriteOptions::default();
+     write_fast.set_sync(false);
+     write_fast.disable_wal(true);
+
+     while !stay_alive.is_cancelled() && !maybe_done {
+         // let mut batch = WriteBatch::default();
+
+         let mut any_written = false;
+
+         for _ in 0..1000 {
+             if state.latest_repaired_i % 1_000_000 == 0 {
+                 eprintln!("target iter at {}", state.latest_repaired_i);
+             }
+             state.latest_repaired_i += 1;
+
+             if !iter.valid() {
+                 eprintln!("invalid iter, are we done repairing?");
+                 maybe_done = true;
+                 break;
+             };
+
+             // eprintln!("iterator seems to be valid! getting the key...");
+             let raw_key = iter.key().unwrap();
+             if raw_key.len() == 8 {
+                 // eprintln!("found an 8-byte key, skipping it since it's probably an id...");
+                 iter.next();
+                 continue;
+             }
+             let target: TargetKey = _kr::<TargetKey>(raw_key)?;
+             let target_id: TargetId = _vr(iter.value().unwrap())?;
+
+             self.db
+                 .put_cf_opt(&cf, target_id.id().to_be_bytes(), _rv(&target), &write_fast)?;
+             any_written = true;
+             iter.next();
+         }
+
+         if any_written {
+             self.db
+                 .put(TARGET_ID_REPAIR_STATE_KEY, _rv(state.clone()))?;
+             std::thread::sleep(breather);
+         }
+     }
+
+     eprintln!("repair iterator done.");
+
+     Ok(false)
}
pub fn start_backup(
···
}
impl LinkReader for RocksStorage {
+     fn get_many_to_many_counts(
+         &self,
+         target: &str,
+         collection: &str,
+         path: &str,
+         path_to_other: &str,
+         limit: u64,
+         after: Option<String>,
+         filter_dids: &HashSet<Did>,
+         filter_to_targets: &HashSet<String>,
+     ) -> Result<PagedOrderedCollection<(String, u64, u64), String>> {
+         let collection = Collection(collection.to_string());
+         let path = RPath(path.to_string());
+
+         let target_key = TargetKey(Target(target.to_string()), collection.clone(), path.clone());
+
+         // unfortunately the cursor is a, uh, stringified number.
+         // this was easier for the memstore (plain target, not target id), and
+         // making it generic is a bit awful.
+         // so... parse the number out of a string here :(
+         // TODO: this should bubble up to a BAD_REQUEST response
+         let after = after.map(|s| s.parse::<u64>().map(TargetId)).transpose()?;
+
+         let Some(target_id) = self.target_id_table.get_id_val(&self.db, &target_key)? else {
+             eprintln!("nothin doin for this target, {target_key:?}");
+             return Ok(Default::default());
+         };
+
+         let filter_did_ids: HashMap<DidId, bool> = filter_dids
+             .iter()
+             .filter_map(|did| self.did_id_table.get_id_val(&self.db, did).transpose())
+             .collect::<Result<Vec<DidIdValue>>>()?
+             .into_iter()
+             .map(|DidIdValue(id, active)| (id, active))
+             .collect();
+
+         // stored targets are keyed by triples of (target, collection, path).
+         // target filtering only considers the target itself, so we actually
+         // need to do a prefix iteration of all target ids for this target and
+         // keep them all.
+         // i *think* the number of keys at a target prefix should usually be
+         // pretty small, so this is hopefully fine. but if it turns out to be
+         // large, we can push this filtering back into the main links loop and
+         // do forward db queries per backlink to get the raw target back out.
+         let mut filter_to_target_ids: HashSet<TargetId> = HashSet::new();
+         for t in filter_to_targets {
+             for (_, target_id) in self.iter_targets_for_target(&Target(t.to_string())) {
+                 filter_to_target_ids.insert(target_id);
+             }
+         }
+
+         let linkers = self.get_target_linkers(&target_id)?;
+
+         let mut grouped_counts: BTreeMap<TargetId, (u64, HashSet<DidId>)> = BTreeMap::new();
+
+         for (did_id, rkey) in linkers.0 {
+             if did_id.is_empty() {
+                 continue;
+             }
+
+             if !filter_did_ids.is_empty() && filter_did_ids.get(&did_id) != Some(&true) {
+                 continue;
+             }
+
+             let record_link_key = RecordLinkKey(did_id, collection.clone(), rkey);
+             let Some(targets) = self.get_record_link_targets(&record_link_key)? else {
+                 continue;
+             };
+
+             let Some(fwd_target) = targets
+                 .0
+                 .into_iter()
+                 .filter_map(|RecordLinkTarget(rpath, target_id)| {
+                     if rpath.0 == path_to_other
+                         && (filter_to_target_ids.is_empty()
+                             || filter_to_target_ids.contains(&target_id))
+                     {
+                         Some(target_id)
+                     } else {
+                         None
+                     }
+                 })
+                 .take(1)
+                 .next()
+             else {
+                 eprintln!("no forward match");
+                 continue;
+             };
+
+             // small relief: we page over target ids, so we can already bail
+             // reprocessing previous pages here
+             if after.as_ref().map(|a| fwd_target <= *a).unwrap_or(false) {
+                 continue;
+             }
+
+             // and we can skip target ids that must be on future pages
+             // (this check continues after the did-lookup, which we have to do)
+             let page_is_full = grouped_counts.len() as u64 >= limit;
+             if page_is_full {
+                 let current_max = grouped_counts.keys().next_back().unwrap(); // limit should be non-zero bleh
+                 if fwd_target > *current_max {
+                     continue;
+                 }
+             }
+
+             // bit painful: 2-step lookup to make sure this did is active
+             let Some(did) = self.did_id_table.get_val_from_id(&self.db, did_id.0)? else {
+                 eprintln!("failed to look up did from did_id {did_id:?}");
+                 continue;
+             };
+             let Some(DidIdValue(_, active)) = self.did_id_table.get_id_val(&self.db, &did)? else {
+                 eprintln!("failed to look up did_value from did_id {did_id:?}: {did:?}: data consistency bug?");
+                 continue;
+             };
+             if !active {
+                 continue;
+             }
+
+             // page-management, continued
+             // if we have a full page, and we're inserting a *new* key less than
+             // the current max, then we can evict the current max
+             let mut should_evict = false;
+             let entry = grouped_counts.entry(fwd_target.clone()).or_insert_with(|| {
+                 // this is a *new* key, so kick the max if we're full
+                 should_evict = page_is_full;
+                 Default::default()
+             });
+             entry.0 += 1;
+             entry.1.insert(did_id);
+
+             if should_evict {
+                 grouped_counts.pop_last();
+             }
+         }
+
+         let mut items: Vec<(String, u64, u64)> = Vec::with_capacity(grouped_counts.len());
+         for (target_id, (n, dids)) in &grouped_counts {
+             let Some(target) = self
+                 .target_id_table
+                 .get_val_from_id(&self.db, target_id.0)?
+             else {
+                 eprintln!("failed to look up target from target_id {target_id:?}");
+                 continue;
+             };
+             items.push((target.0 .0, *n, dids.len() as u64));
+         }
+
+         let next = if grouped_counts.len() as u64 >= limit {
+             // yeah.... it's a number saved as a string......sorry
+             grouped_counts
+                 .keys()
+                 .next_back()
+                 .map(|k| format!("{}", k.0))
+         } else {
+             None
+         };
+
+         Ok(PagedOrderedCollection { items, next })
+     }
+
fn get_count(&self, target: &str, collection: &str, path: &str) -> Result<u64> {
let target_key = TargetKey(
Target(target.to_string()),
···
.map(|s| s.parse::<u64>())
.transpose()?
.unwrap_or(0);
+     let started_at = self
+         .db
+         .get(STARTED_AT_KEY)?
+         .map(|c| _vr(&c))
+         .transpose()?
+         .unwrap_or(COZY_FIRST_CURSOR);
+
+     let other_data = self
+         .db
+         .get(TARGET_ID_REPAIR_STATE_KEY)?
+         .map(|s| _vr(&s))
+         .transpose()?
+         .map(
+             |TargetIdRepairState {
+                  current_us_started_at,
+                  id_when_started,
+                  latest_repaired_i,
+              }| {
+                 HashMap::from([
+                     ("current_us_started_at".to_string(), current_us_started_at),
+                     ("id_when_started".to_string(), id_when_started),
+                     ("latest_repaired_i".to_string(), latest_repaired_i),
+                 ])
+             },
+         )
+         .unwrap_or(HashMap::default());
+
Ok(StorageStats {
dids,
targetables,
linking_records,
+         started_at: Some(started_at),
+         other_data,
})
···
impl AsRocksValue for &TargetId {}
impl KeyFromRocks for TargetKey {}
impl ValueFromRocks for TargetId {}
+
+ // temp?
+ impl KeyFromRocks for TargetId {}
+ impl AsRocksValue for &TargetKey {}
// target_links table
impl AsRocksKey for &TargetId {}
···
// target ids
- #[derive(Debug, Clone, Serialize, Deserialize)]
+ #[derive(Debug, Clone, Serialize, Deserialize, PartialOrd, Ord, PartialEq, Eq, Hash)]
struct TargetId(u64); // key
- #[derive(Debug, Clone, Serialize, Deserialize)]
+ #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct Target(pub String); // the actual target/uri
// targets (uris, dids, etc.): the reverse index
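Aside: the page limit in the rocks implementation above is enforced with a bounded BTreeMap: once `limit` distinct keys are held, keys above the current max are skipped, and a smaller new key evicts the max. A condensed, self-contained illustration of that invariant (not the actual code; plain counts stand in for the (total, dids) pairs):

    use std::collections::BTreeMap;

    fn main() {
        let limit = 3;
        let mut page: BTreeMap<u64, u64> = BTreeMap::new();
        for key in [5u64, 2, 9, 1, 7, 3] {
            let full = page.len() >= limit;
            if full && key > *page.keys().next_back().unwrap() {
                continue; // belongs on a later page
            }
            let mut should_evict = false;
            let entry = page.entry(key).or_insert_with(|| {
                should_evict = full; // a *new* key while full kicks out the max
                0
            });
            *entry += 1;
            if should_evict {
                page.pop_last();
            }
        }
        assert_eq!(page.keys().copied().collect::<Vec<_>>(), vec![1, 2, 3]);
    }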
constellation/templates/get-backlinks.html.j2  +1 -1
···
{% extends "base.html.j2" %}
{% import "try-it-macros.html.j2" as try_it %}
- {% block title %}Links{% endblock %}
+ {% block title %}Backlinks{% endblock %}
{% block description %}All {{ query.source }} records with links to {{ query.subject }}{% endblock %}
{% block content %}
constellation/templates/get-many-to-many-counts.html.j2  +67
···
+ {% extends "base.html.j2" %}
+ {% import "try-it-macros.html.j2" as try_it %}
+
+ {% block title %}Many to Many counts{% endblock %}
+ {% block description %}Counts of many-to-many {{ query.source }} join records with links to {{ query.subject }} and a secondary target at {{ query.path_to_other }}{% endblock %}
+
+ {% block content %}
+
+ {% call try_it::get_many_to_many_counts(
+     query.subject,
+     query.source,
+     query.path_to_other,
+     query.did,
+     query.other_subject,
+     query.limit,
+ ) %}
+
+ <h2>
+     Many-to-many links to <code>{{ query.subject }}</code> joining through <code>{{ query.path_to_other }}</code>
+     {% if let Some(browseable_uri) = query.subject|to_browseable %}
+         <small style="font-weight: normal; font-size: 1rem"><a href="{{ browseable_uri }}">browse record</a></small>
+     {% endif %}
+ </h2>
+
+ <p><strong>{% if cursor.is_some() || query.cursor.is_some() %}more than {% endif %}{{ counts_by_other_subject.len()|to_u64|human_number }} joins</strong> <code>{{ query.source }}→{{ query.path_to_other }}</code></p>
+
+ <ul>
+     <li>See direct backlinks at <code>/xrpc/blue.microcosm.links.getBacklinks</code>: <a href="/xrpc/blue.microcosm.links.getBacklinks?subject={{ query.subject|urlencode }}&source={{ query.source|urlencode }}">/xrpc/blue.microcosm.links.getBacklinks?subject={{ query.subject }}&source={{ query.source }}</a></li>
+     <li>See all links to this target at <code>/links/all</code>: <a href="/links/all?target={{ query.subject|urlencode }}">/links/all?target={{ query.subject }}</a></li>
+ </ul>
+
+ <h3>Counts by other subject:</h3>
+
+ {% for counts in counts_by_other_subject %}
+ <pre style="display: block; margin: 1em 2em" class="code"><strong>Joined subject</strong>: {{ counts.subject }}
+ <strong>Joining records</strong>: {{ counts.total }}
+ <strong>Unique joiner ids</strong>: {{ counts.distinct }}
+ -> {% if let Some(browseable_uri) = counts.subject|to_browseable -%}
+ <a href="{{ browseable_uri }}">browse record</a>
+ {%- endif %}</pre>
+ {% endfor %}
+
+ {% if let Some(c) = cursor %}
+ <form method="get" action="/xrpc/blue.microcosm.links.getManyToManyCounts">
+     <input type="hidden" name="subject" value="{{ query.subject }}" />
+     <input type="hidden" name="source" value="{{ query.source }}" />
+     <input type="hidden" name="pathToOther" value="{{ query.path_to_other }}" />
+     {% for did in query.did %}
+         <input type="hidden" name="did" value="{{ did }}" />
+     {% endfor %}
+     {% for otherSubject in query.other_subject %}
+         <input type="hidden" name="otherSubject" value="{{ otherSubject }}" />
+     {% endfor %}
+     <input type="hidden" name="limit" value="{{ query.limit }}" />
+     <input type="hidden" name="cursor" value={{ c|json|safe }} />
+     <button type="submit">next page&hellip;</button>
+ </form>
+ {% else %}
+ <button disabled><em>end of results</em></button>
+ {% endif %}
+
+ <details>
+     <summary>Raw JSON response</summary>
+     <pre class="code">{{ self|tojson }}</pre>
+ </details>
+
+ {% endblock %}
constellation/templates/hello.html.j2  +38 -2
···
<p>It works by recursively walking <em>all</em> records coming through the firehose, searching for anything that looks like a link. Links are indexed by the target they point at, the collection the record came from, and the JSON path to the link in that record.</p>
<p>
- This server has indexed <span class="stat">{{ stats.linking_records|human_number }}</span> links between <span class="stat">{{ stats.targetables|human_number }}</span> targets and sources from <span class="stat">{{ stats.dids|human_number }}</span> identities over <span class="stat">{{ days_indexed|human_number }}</span> days.<br/>
+ This server has indexed <span class="stat">{{ stats.linking_records|human_number }}</span> links between <span class="stat">{{ stats.targetables|human_number }}</span> targets and sources from <span class="stat">{{ stats.dids|human_number }}</span> identities over <span class="stat">
+ {%- if let Some(days) = days_indexed %}
+     {{ days|human_number }}
+ {% else %}
+     ???
+ {% endif -%}
+ </span> days.<br/>
<small>(indexing new records in real time, backfill coming soon!)</small>
</p>
- <p>But feel free to use it! If you want to be nice, put your project name and bsky username (or email) in your user-agent header for api requests.</p>
+ {# {% for k, v in stats.other_data.iter() %}
+     <p><strong>{{ k }}</strong>: {{ v }}</p>
+ {% endfor %} #}
+
+ <p>You're welcome to use this public instance! Please do not build the torment nexus. If you want to be nice, put your project name and bsky username (or email) in your user-agent header for api requests.</p>
<h2>API Endpoints</h2>
···
<p style="margin-bottom: 0"><strong>Try it:</strong></p>
{% call try_it::get_backlinks("at://did:plc:a4pqq234yw7fqbddawjo7y35/app.bsky.feed.post/3m237ilwc372e", "app.bsky.feed.like:subject.uri", [""], 16) %}
+
+
+ <h3 class="route"><code>GET /xrpc/blue.microcosm.links.getManyToManyCounts</code></h3>
+
+ <p>Count many-to-many join records: for all <code>source</code> records linking to <code>subject</code>, group and count by the secondary link found at <code>pathToOther</code>.</p>
+
+ <h4>Query parameters:</h4>
+
+ <ul>
+     <li><p><code>subject</code>: required, must url-encode. Example: <code>at://did:plc:vc7f4oafdgxsihk4cry2xpze/app.bsky.feed.post/3lgwdn7vd722r</code></p></li>
+     <li><p><code>source</code>: required. Example: <code>app.bsky.feed.like:subject.uri</code></p></li>
+     <li><p><code>pathToOther</code>: required. Path to the secondary link in the many-to-many record. Example: <code>otherThing.uri</code></p></li>
+     <li><p><code>did</code>: optional, filter links to those from specific users. Include multiple times to filter by multiple users. Example: <code>did=did:plc:vc7f4oafdgxsihk4cry2xpze&did=did:plc:vc7f4oafdgxsihk4cry2xpze</code></p></li>
+     <li><p><code>otherSubject</code>: optional, filter secondary links to specific subjects. Include multiple times to filter by multiple subjects. Example: <code>at://did:plc:vc7f4oafdgxsihk4cry2xpze/app.bsky.feed.post/3lgwdn7vd722r</code></p></li>
+     <li><p><code>limit</code>: optional. Default: <code>16</code>. Maximum: <code>100</code></p></li>
+ </ul>
+
+ <p style="margin-bottom: 0"><strong>Try it:</strong></p>
+ {% call try_it::get_many_to_many_counts(
+     "at://did:plc:wshs7t2adsemcrrd4snkeqli/sh.tangled.label.definition/good-first-issue",
+     "sh.tangled.label.op:add[].key",
+     "subject",
+     [""],
+     [""],
+     25,
+ ) %}
<h3 class="route"><code>GET /links</code></h3>
constellation/templates/try-it-macros.html.j2  +43 -1
···
{% macro get_backlinks(subject, source, dids, limit) %}
<form method="get" action="/xrpc/blue.microcosm.links.getBacklinks">
- <pre class="code"><strong>GET</strong> /links
+ <pre class="code"><strong>GET</strong> /xrpc/blue.microcosm.links.getBacklinks
?subject= <input type="text" name="subject" value="{{ subject }}" placeholder="at-uri, did, uri..." />
&source= <input type="text" name="source" value="{{ source }}" placeholder="app.bsky.feed.like:subject.uri" />
{%- for did in dids %}{% if !did.is_empty() %}
···
p.insertBefore(document.createTextNode('&did= '), didPlaceholder);
p.insertBefore(i, didPlaceholder);
p.insertBefore(document.createTextNode('\n '), didPlaceholder);
+ });
+ </script>
+ {% endmacro %}
+
+ {% macro get_many_to_many_counts(subject, source, pathToOther, dids, otherSubjects, limit) %}
+ <form method="get" action="/xrpc/blue.microcosm.links.getManyToManyCounts">
+ <pre class="code"><strong>GET</strong> /xrpc/blue.microcosm.links.getManyToManyCounts
+ ?subject= <input type="text" name="subject" value="{{ subject }}" placeholder="at-uri, did, uri..." />
+ &source= <input type="text" name="source" value="{{ source }}" placeholder="app.bsky.feed.like:subject.uri" />
+ &pathToOther= <input type="text" name="pathToOther" value="{{ pathToOther }}" placeholder="otherThing.uri" />
+ {%- for did in dids %}{% if !did.is_empty() %}
+ &did= <input type="text" name="did" value="{{ did }}" placeholder="did:plc:..." />{% endif %}{% endfor %}
+ <span id="m2m-did-placeholder"></span> <button id="m2m-add-did">+ did filter</button>
+ {%- for otherSubject in otherSubjects %}{% if !otherSubject.is_empty() %}
+ &otherSubject= <input type="text" name="otherSubject" value="{{ otherSubject }}" placeholder="at-uri, did, uri..." />{% endif %}{% endfor %}
+ <span id="m2m-subject-placeholder"></span> <button id="m2m-add-subject">+ other subject filter</button>
+ &limit= <input type="number" name="limit" value="{{ limit }}" max="100" placeholder="100" /> <button type="submit">get links</button></pre>
+ </form>
+ <script>
+     const m2mAddDidButton = document.getElementById('m2m-add-did');
+     const m2mDidPlaceholder = document.getElementById('m2m-did-placeholder');
+     m2mAddDidButton.addEventListener('click', e => {
+         e.preventDefault();
+         const i = document.createElement('input');
+         i.placeholder = 'did:plc:...';
+         i.name = "did";
+         const p = m2mAddDidButton.parentNode;
+         p.insertBefore(document.createTextNode('&did= '), m2mDidPlaceholder);
+         p.insertBefore(i, m2mDidPlaceholder);
+         p.insertBefore(document.createTextNode('\n '), m2mDidPlaceholder);
+     });
+     const m2mAddSubjectButton = document.getElementById('m2m-add-subject');
+     const m2mSubjectPlaceholder = document.getElementById('m2m-subject-placeholder');
+     m2mAddSubjectButton.addEventListener('click', e => {
+         e.preventDefault();
+         const i = document.createElement('input');
+         i.placeholder = 'at-uri, did, uri...';
+         i.name = "otherSubject";
+         const p = m2mAddSubjectButton.parentNode;
+         p.insertBefore(document.createTextNode('&otherSubject= '), m2mSubjectPlaceholder);
+         p.insertBefore(i, m2mSubjectPlaceholder);
+         p.insertBefore(document.createTextNode('\n '), m2mSubjectPlaceholder);
+     });
});
</script>
{% endmacro %}