Constellation, Spacedust, Slingshot, UFOs: atproto crates and services for microcosm

Merge branch 'main' into constellation/filtering

+63 -24
Cargo.lock
···
[[package]]
name = "atrium-api"
-version = "0.25.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d4eb9b4787aba546015c8ccda1d3924c157cee13d67848997fba74ac8144a07"
dependencies = [
"atrium-common",
"atrium-xrpc",
···
[[package]]
name = "atrium-common"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba30d2f9e1a8b3db8fc97d0a5f91ee5a28f8acdddb771ad74c1b08eda357ca3d"
dependencies = [
"dashmap",
"lru",
···
[[package]]
name = "atrium-xrpc"
-version = "0.12.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "18a9e526cb2ed3e0a2ca78c3ce2a943d9041a68e067dadf42923b523771e07df"
dependencies = [
"http",
"serde",
···
]
[[package]]
name = "cc"
version = "1.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
···
[[package]]
name = "chrono"
-version = "0.4.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a7964611d71df112cb1730f2ee67324fcf4d0fc6606acbbe9bfe06df124637c"
dependencies = [
"android-tzdata",
"iana-time-zone",
···
[[package]]
name = "getrandom"
-version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0"
dependencies = [
"cfg-if",
"libc",
···
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a"
dependencies = [
-"getrandom 0.3.2",
"libc",
]
···
[[package]]
name = "rand"
-version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94"
dependencies = [
"rand_chacha 0.9.0",
"rand_core 0.9.3",
-"zerocopy 0.8.24",
]
[[package]]
···
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
dependencies = [
-"getrandom 0.3.2",
]
[[package]]
···
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615"
dependencies = [
"dyn-clone",
"schemars_derive",
"serde",
···
]
[[package]]
name = "serde_spanned"
version = "0.6.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
···
]
[[package]]
name = "sharded-slab"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
···
checksum = "7437ac7763b9b123ccf33c338a5cc1bac6f69b45a136c19bdd8a65e3916435bf"
dependencies = [
"fastrand",
-"getrandom 0.3.2",
"once_cell",
"rustix 1.0.5",
"windows-sys 0.59.0",
···
[[package]]
name = "tokio-util"
-version = "0.7.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b9590b93e6fcc1739458317cccd391ad3955e2bde8913edf6f95f9e65a8f034"
dependencies = [
"bytes",
"futures-core",
···
"httparse",
"log",
"native-tls",
-"rand 0.9.0",
"sha1",
"thiserror 2.0.12",
"url",
···
dependencies = [
"anyhow",
"async-trait",
"bincode 2.0.1",
-"cardinality-estimator",
"clap",
"dropshot",
"env_logger",
"fjall",
"jetstream",
"log",
"lsm-tree",
···
"semver",
"serde",
"serde_json",
"tempfile",
"thiserror 2.0.12",
"tikv-jemallocator",
"tokio",
]
[[package]]
···
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9"
dependencies = [
-"getrandom 0.3.2",
"serde",
]
···
[[package]]
name = "atrium-api"
+version = "0.25.3"
+source = "git+https://github.com/uniphil/atrium?branch=fix%2Fnsid-allow-nonleading-name-digits#c4364f318d337bbc3e3e3aaf97c9f971e95f5f7e"
dependencies = [
"atrium-common",
"atrium-xrpc",
···
[[package]]
name = "atrium-common"
+version = "0.1.2"
+source = "git+https://github.com/uniphil/atrium?branch=fix%2Fnsid-allow-nonleading-name-digits#c4364f318d337bbc3e3e3aaf97c9f971e95f5f7e"
dependencies = [
"dashmap",
"lru",
···
[[package]]
name = "atrium-xrpc"
+version = "0.12.3"
+source = "git+https://github.com/uniphil/atrium?branch=fix%2Fnsid-allow-nonleading-name-digits#c4364f318d337bbc3e3e3aaf97c9f971e95f5f7e"
dependencies = [
"http",
"serde",
···
]
[[package]]
+name = "cardinality-estimator-safe"
+version = "4.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b41ec0cd313b46ba3b508377544b25aa1d56d05ce9e657e77dfb001d5e726e53"
+dependencies = [
+"digest",
+"enum_dispatch",
+"serde",
+]
+
+[[package]]
name = "cc"
version = "1.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
···
[[package]]
name = "chrono"
+version = "0.4.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c469d952047f47f91b68d1cba3f10d63c11d73e4636f24f08daf0278abf01c4d"
dependencies = [
"android-tzdata",
"iana-time-zone",
···
[[package]]
name = "getrandom"
+version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
dependencies = [
"cfg-if",
"libc",
···
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a"
dependencies = [
+"getrandom 0.3.3",
"libc",
]
···
[[package]]
name = "rand"
+version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
dependencies = [
"rand_chacha 0.9.0",
"rand_core 0.9.3",
]
[[package]]
···
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
dependencies = [
+"getrandom 0.3.3",
]
[[package]]
···
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615"
dependencies = [
+"chrono",
"dyn-clone",
"schemars_derive",
"serde",
···
]
[[package]]
+name = "serde_qs"
+version = "1.0.0-rc.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4cb0b9062a400c31442e67d1f2b1e7746bebd691110ebee1b7d0c7293b04fab1"
+dependencies = [
+"itoa",
+"percent-encoding",
+"ryu",
+"serde",
+"thiserror 2.0.12",
+]
+
+[[package]]
name = "serde_spanned"
version = "0.6.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
···
]
[[package]]
+name = "sha2"
+version = "0.10.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
+dependencies = [
+"cfg-if",
+"cpufeatures",
+"digest",
+]
+
+[[package]]
name = "sharded-slab"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
···
checksum = "7437ac7763b9b123ccf33c338a5cc1bac6f69b45a136c19bdd8a65e3916435bf"
dependencies = [
"fastrand",
+"getrandom 0.3.3",
"once_cell",
"rustix 1.0.5",
"windows-sys 0.59.0",
···
[[package]]
name = "tokio-util"
+version = "0.7.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "66a539a9ad6d5d281510d5bd368c973d636c02dbf8a67300bfb6b950696ad7df"
dependencies = [
"bytes",
"futures-core",
···
"httparse",
"log",
"native-tls",
+"rand 0.9.1",
"sha1",
"thiserror 2.0.12",
"url",
···
dependencies = [
"anyhow",
"async-trait",
+"base64 0.22.1",
"bincode 2.0.1",
+"cardinality-estimator-safe",
+"chrono",
"clap",
"dropshot",
"env_logger",
"fjall",
+"getrandom 0.3.3",
+"http",
"jetstream",
"log",
"lsm-tree",
···
"semver",
"serde",
"serde_json",
+"serde_qs",
+"sha2",
"tempfile",
"thiserror 2.0.12",
"tikv-jemallocator",
"tokio",
+"tokio-util",
]
[[package]]
···
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9"
dependencies = [
+"getrandom 0.3.3",
"serde",
]
+2 -2
constellation/src/server/mod.rs
···
Ok(acceptable(
accept,
GetLinkItemsResponse {
-total: paged.version.0,
linking_records: paged.items,
cursor,
query: (*query).clone(),
···
Ok(acceptable(
accept,
GetDidItemsResponse {
-total: paged.version.0,
linking_dids: paged.items,
cursor,
query: (*query).clone(),
···
Ok(acceptable(
accept,
GetLinkItemsResponse {
+total: paged.total,
linking_records: paged.items,
cursor,
query: (*query).clone(),
···
Ok(acceptable(
accept,
GetDidItemsResponse {
+total: paged.total,
linking_dids: paged.items,
cursor,
query: (*query).clone(),
+6
constellation/src/storage/mem_store.rs
···
version: (0, 0),
items: Vec::new(),
next: None,
});
};
let Some(did_rkeys) = paths.get(&Source::new(collection, path)) else {
···
version: (0, 0),
items: Vec::new(),
next: None,
});
};
···
version: (total as u64, gone as u64),
items,
next,
})
}
···
version: (0, 0),
items: Vec::new(),
next: None,
});
};
let Some(did_rkeys) = paths.get(&Source::new(collection, path)) else {
···
version: (0, 0),
items: Vec::new(),
next: None,
});
};
···
version: (total as u64, gone as u64),
items,
next,
})
}
···
version: (0, 0),
items: Vec::new(),
next: None,
+total: 0,
});
};
let Some(did_rkeys) = paths.get(&Source::new(collection, path)) else {
···
version: (0, 0),
items: Vec::new(),
next: None,
+total: 0,
});
};
···
version: (total as u64, gone as u64),
items,
next,
+total: alive as u64,
})
}
···
version: (0, 0),
items: Vec::new(),
next: None,
+total: 0,
});
};
let Some(did_rkeys) = paths.get(&Source::new(collection, path)) else {
···
version: (0, 0),
items: Vec::new(),
next: None,
+total: 0,
});
};
···
version: (total as u64, gone as u64),
items,
next,
+total: alive as u64,
})
}
+19
constellation/src/storage/mod.rs
···
pub version: (u64, u64), // (collection length, deleted item count) // TODO: change to (total, active)? since dedups isn't "deleted"
pub items: Vec<T>,
pub next: Option<u64>,
}
#[derive(Debug, Deserialize, Serialize, PartialEq)]
···
version: (0, 0),
items: vec![],
next: None,
}
);
assert_eq!(
···
version: (0, 0),
items: vec![],
next: None,
}
);
assert_eq!(storage.get_all_counts("bad-example.com")?, HashMap::new());
···
rkey: "asdf".into(),
}],
next: None,
}
);
assert_eq!(
···
version: (1, 0),
items: vec!["did:plc:asdf".into()],
next: None,
}
);
assert_stats(storage.get_stats()?, 1..=1, 1..=1, 1..=1);
···
},
],
next: Some(3),
}
);
assert_eq!(
···
version: (5, 0),
items: vec!["did:plc:asdf-5".into(), "did:plc:asdf-4".into()],
next: Some(3),
}
);
let links = storage.get_links(
···
},
],
next: Some(1),
}
);
assert_eq!(
···
version: (5, 0),
items: vec!["did:plc:asdf-3".into(), "did:plc:asdf-2".into()],
next: Some(1),
}
);
let links = storage.get_links(
···
rkey: "asdf".into(),
},],
next: None,
}
);
assert_eq!(
···
version: (5, 0),
items: vec!["did:plc:asdf-1".into()],
next: None,
}
);
assert_stats(storage.get_stats()?, 5..=5, 1..=1, 5..=5);
···
},
],
next: Some(2),
}
);
let links = storage.get_links(
···
},
],
next: None,
}
);
assert_stats(storage.get_stats()?, 4..=4, 1..=1, 4..=4);
···
},
],
next: Some(2),
}
);
storage.push(
···
},
],
next: None,
}
);
assert_stats(storage.get_stats()?, 5..=5, 1..=1, 5..=5);
···
},
],
next: Some(2),
}
);
storage.push(
···
rkey: "asdf".into(),
},],
next: None,
}
);
assert_stats(storage.get_stats()?, 4..=4, 1..=1, 3..=3);
···
},
],
next: Some(2),
}
);
storage.push(
···
rkey: "asdf".into(),
},],
next: None,
}
);
assert_stats(storage.get_stats()?, 4..=4, 1..=1, 4..=4);
···
pub version: (u64, u64), // (collection length, deleted item count) // TODO: change to (total, active)? since dedups isn't "deleted"
pub items: Vec<T>,
pub next: Option<u64>,
+pub total: u64,
}
#[derive(Debug, Deserialize, Serialize, PartialEq)]
···
version: (0, 0),
items: vec![],
next: None,
+total: 0,
}
);
assert_eq!(
···
version: (0, 0),
items: vec![],
next: None,
+total: 0,
}
);
assert_eq!(storage.get_all_counts("bad-example.com")?, HashMap::new());
···
rkey: "asdf".into(),
}],
next: None,
+total: 1,
}
);
assert_eq!(
···
version: (1, 0),
items: vec!["did:plc:asdf".into()],
next: None,
+total: 1,
}
);
assert_stats(storage.get_stats()?, 1..=1, 1..=1, 1..=1);
···
},
],
next: Some(3),
+total: 5,
}
);
assert_eq!(
···
version: (5, 0),
items: vec!["did:plc:asdf-5".into(), "did:plc:asdf-4".into()],
next: Some(3),
+total: 5,
}
);
let links = storage.get_links(
···
},
],
next: Some(1),
+total: 5,
}
);
assert_eq!(
···
version: (5, 0),
items: vec!["did:plc:asdf-3".into(), "did:plc:asdf-2".into()],
next: Some(1),
+total: 5,
}
);
let links = storage.get_links(
···
rkey: "asdf".into(),
},],
next: None,
+total: 5,
}
);
assert_eq!(
···
version: (5, 0),
items: vec!["did:plc:asdf-1".into()],
next: None,
+total: 5,
}
);
assert_stats(storage.get_stats()?, 5..=5, 1..=1, 5..=5);
···
},
],
next: Some(2),
+total: 4,
}
);
let links = storage.get_links(
···
},
],
next: None,
+total: 4,
}
);
assert_stats(storage.get_stats()?, 4..=4, 1..=1, 4..=4);
···
},
],
next: Some(2),
+total: 4,
}
);
storage.push(
···
},
],
next: None,
+total: 5,
}
);
assert_stats(storage.get_stats()?, 5..=5, 1..=1, 5..=5);
···
},
],
next: Some(2),
+total: 4,
}
);
storage.push(
···
rkey: "asdf".into(),
},],
next: None,
+total: 3,
}
);
assert_stats(storage.get_stats()?, 4..=4, 1..=1, 3..=3);
···
},
],
next: Some(2),
+total: 4,
}
);
storage.push(
···
rkey: "asdf".into(),
},],
next: None,
+total: 4,
}
);
assert_stats(storage.get_stats()?, 4..=4, 1..=1, 4..=4);
+4
constellation/src/storage/rocks_store.rs
···
version: (0, 0),
items: Vec::new(),
next: None,
});
};
···
version: (total, gone),
items,
next,
})
}
···
version: (0, 0),
items: Vec::new(),
next: None,
});
};
···
version: (total, gone),
items,
next,
})
}
···
version: (0, 0),
items: Vec::new(),
next: None,
+total: 0,
});
};
···
version: (total, gone),
items,
next,
+total: alive,
})
}
···
version: (0, 0),
items: Vec::new(),
next: None,
+total: 0,
});
};
···
version: (total, gone),
items,
next,
+total: alive,
})
}
+1 -1
jetstream/Cargo.toml
···
[dependencies]
async-trait = "0.1.83"
-atrium-api = { version = "0.25.2", default-features = false, features = [
"namespace-appbsky",
] }
tokio = { version = "1.44.2", features = ["full", "sync", "time"] }
···
[dependencies]
async-trait = "0.1.83"
+atrium-api = { git = "https://github.com/uniphil/atrium", branch = "fix/nsid-allow-nonleading-name-digits", default-features = false, features = [
"namespace-appbsky",
] }
tokio = { version = "1.44.2", features = ["full", "sync", "time"] }
+2 -1
jetstream/src/events.rs
···
///
/// Warning: this exploits the internal implementation detail of jetstream cursors
/// being ~microsecond timestamps.
-pub fn at(t: SystemTime) -> Self {
let unix_dt = t
.duration_since(UNIX_EPOCH)
.expect("cannot set jetstream cursor earlier than unix epoch");
Self(unix_dt.as_micros() as u64)
···
///
/// Warning: this exploits the internal implementation detail of jetstream cursors
/// being ~microsecond timestamps.
+pub fn at(t: impl Into<SystemTime>) -> Self {
let unix_dt = t
+.into()
.duration_since(UNIX_EPOCH)
.expect("cannot set jetstream cursor earlier than unix epoch");
Self(unix_dt.as_micros() as u64)
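
The widened signature above means `Cursor::at` now accepts anything convertible into `SystemTime`, not just `SystemTime` itself. A minimal sketch of the same conversion, using a stand-in `Cursor` type rather than the jetstream one (the chrono mention in the comment relies on chrono's `From<DateTime<Utc>> for SystemTime` impl):

```rust
use std::time::{SystemTime, UNIX_EPOCH};

// Stand-in for jetstream's Cursor: a ~microsecond unix timestamp.
#[derive(Debug)]
struct Cursor(u64);

impl Cursor {
    // Mirrors the widened signature in the diff: any `Into<SystemTime>` works.
    fn at(t: impl Into<SystemTime>) -> Self {
        let unix_dt = t
            .into()
            .duration_since(UNIX_EPOCH)
            .expect("cannot set jetstream cursor earlier than unix epoch");
        Self(unix_dt.as_micros() as u64)
    }
}

fn main() {
    // Plain SystemTime still works as before...
    let now = Cursor::at(SystemTime::now());
    // ...and so would e.g. a chrono::DateTime<Utc>, since chrono converts
    // into SystemTime: `Cursor::at(chrono::Utc::now())`.
    println!("{now:?}");
}
```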
+74 -81
jetstream/src/lib.rs
···
retry_attempt += 1;
if let Ok((ws_stream, _)) = connect_async(req).await {
let t_connected = Instant::now();
-log::trace!("jetstream connected. starting websocket task...");
if let Err(e) =
websocket_task(dict, ws_stream, send_channel.clone(), &mut last_cursor)
.await
···
}
log::error!("Jetstream closed after encountering error: {e:?}");
} else {
-log::error!("Jetstream connection closed cleanly");
}
if t_connected.elapsed() > Duration::from_secs(success_threshold_s) {
retry_attempt = 0;
}
}
if retry_attempt >= max_retries {
-log::error!("hit max retries, bye");
break;
}
···
let mut closing_connection = false;
loop {
match socket_read.next().await {
-
Some(Ok(message)) => {
-
match message {
-
Message::Text(json) => {
-
let event: JetstreamEvent = match serde_json::from_str(&json) {
-
Ok(ev) => ev,
-
Err(e) => {
-
log::warn!(
-
"failed to parse json: {e:?} (from {})",
-
json.get(..24).unwrap_or(&json)
-
);
-
continue;
-
}
-
};
-
let event_cursor = event.cursor;
-
if let Some(last) = last_cursor {
-
if event_cursor <= *last {
-
log::warn!("event cursor {event_cursor:?} was older than the last one: {last:?}. dropping event.");
-
continue;
-
}
}
-
if send_channel.send(event).await.is_err() {
-
// We can assume that all receivers have been dropped, so we can close
-
// the connection and exit the task.
-
log::info!(
"All receivers for the Jetstream connection have been dropped, closing connection."
);
-
socket_write.close().await?;
-
return Err(JetstreamEventError::ReceiverClosedError);
-
} else if let Some(last) = last_cursor.as_mut() {
-
*last = event_cursor;
-
}
}
-
Message::Binary(zstd_json) => {
-
let mut cursor = IoCursor::new(zstd_json);
-
let decoder = zstd::stream::Decoder::with_prepared_dictionary(
-
&mut cursor,
-
&dictionary,
-
)
-
.map_err(JetstreamEventError::CompressionDictionaryError)?;
-
let event: JetstreamEvent = match serde_json::from_reader(decoder) {
-
Ok(ev) => ev,
-
Err(e) => {
-
log::warn!("failed to parse json: {e:?}");
-
continue;
-
}
-
};
-
let event_cursor = event.cursor;
-
if let Some(last) = last_cursor {
-
if event_cursor <= *last {
-
log::warn!("event cursor {event_cursor:?} was older than the last one: {last:?}. dropping event.");
-
continue;
-
}
}
-
if send_channel.send(event).await.is_err() {
-
// We can assume that all receivers have been dropped, so we can close
-
// the connection and exit the task.
-
log::info!(
"All receivers for the Jetstream connection have been dropped, closing connection."
);
-
socket_write.close().await?;
-
return Err(JetstreamEventError::ReceiverClosedError);
-
} else if let Some(last) = last_cursor.as_mut() {
-
*last = event_cursor;
-
}
}
-
Message::Ping(vec) => {
-
log::trace!("Ping recieved, responding");
-
socket_write
-
.send(Message::Pong(vec))
-
.await
-
.map_err(JetstreamEventError::PingPongError)?;
-
}
-
Message::Close(close_frame) => {
-
log::trace!("Close recieved. I guess we just log here?");
-
if let Some(close_frame) = close_frame {
-
let reason = close_frame.reason;
-
let code = close_frame.code;
-
log::trace!("Connection closed. Reason: {reason}, Code: {code}");
-
}
-
}
-
Message::Pong(pong) => {
-
let pong_payload = String::from_utf8(pong.to_vec())
-
.unwrap_or("Invalid payload".to_string());
-
log::trace!("Pong recieved. Payload: {pong_payload}");
}
-
Message::Frame(_) => (),
}
-
}
Some(Err(error)) => {
log::error!("Web socket error: {error}");
closing_connection = true;
···
retry_attempt += 1;
if let Ok((ws_stream, _)) = connect_async(req).await {
let t_connected = Instant::now();
+log::info!("jetstream connected. starting websocket task...");
if let Err(e) =
websocket_task(dict, ws_stream, send_channel.clone(), &mut last_cursor)
.await
···
}
log::error!("Jetstream closed after encountering error: {e:?}");
} else {
+log::warn!("Jetstream connection closed cleanly");
}
if t_connected.elapsed() > Duration::from_secs(success_threshold_s) {
+log::warn!("Jetstream: more than {success_threshold_s}s since last reconnect, reconnecting immediately.");
retry_attempt = 0;
}
}
if retry_attempt >= max_retries {
+log::error!("jetstream: hit max retries, bye");
break;
}
···
let mut closing_connection = false;
loop {
match socket_read.next().await {
+
Some(Ok(message)) => match message {
+
Message::Text(json) => {
+
let event: JetstreamEvent = match serde_json::from_str(&json) {
+
Ok(ev) => ev,
+
Err(e) => {
+
log::warn!(
+
"failed to parse json: {e:?} (from {})",
+
json.get(..24).unwrap_or(&json)
+
);
+
continue;
+
}
+
};
+
let event_cursor = event.cursor;
+
if let Some(last) = last_cursor {
+
if event_cursor <= *last {
+
log::warn!("event cursor {event_cursor:?} was not newer than the last one: {last:?}. dropping event.");
+
continue;
}
+
}
+
if send_channel.send(event).await.is_err() {
+
log::warn!(
"All receivers for the Jetstream connection have been dropped, closing connection."
);
+
socket_write.close().await?;
+
return Err(JetstreamEventError::ReceiverClosedError);
+
} else if let Some(last) = last_cursor.as_mut() {
+
*last = event_cursor;
}
+
}
+
Message::Binary(zstd_json) => {
+
let mut cursor = IoCursor::new(zstd_json);
+
let decoder =
+
zstd::stream::Decoder::with_prepared_dictionary(&mut cursor, &dictionary)
+
.map_err(JetstreamEventError::CompressionDictionaryError)?;
+
let event: JetstreamEvent = match serde_json::from_reader(decoder) {
+
Ok(ev) => ev,
+
Err(e) => {
+
log::warn!("failed to parse json: {e:?}");
+
continue;
+
}
+
};
+
let event_cursor = event.cursor;
+
if let Some(last) = last_cursor {
+
if event_cursor <= *last {
+
log::warn!("event cursor {event_cursor:?} was not newer than the last one: {last:?}. dropping event.");
+
continue;
}
+
}
+
if send_channel.send(event).await.is_err() {
+
log::warn!(
"All receivers for the Jetstream connection have been dropped, closing connection."
);
+
socket_write.close().await?;
+
return Err(JetstreamEventError::ReceiverClosedError);
+
} else if let Some(last) = last_cursor.as_mut() {
+
*last = event_cursor;
}
+
}
+
Message::Ping(vec) => {
+
log::trace!("Ping recieved, responding");
+
socket_write
+
.send(Message::Pong(vec))
+
.await
+
.map_err(JetstreamEventError::PingPongError)?;
+
}
+
Message::Close(close_frame) => {
+
log::trace!("Close recieved. I guess we just log here?");
+
if let Some(close_frame) = close_frame {
+
let reason = close_frame.reason;
+
let code = close_frame.code;
+
log::trace!("Connection closed. Reason: {reason}, Code: {code}");
}
+
}
+
Message::Pong(pong) => {
+
let pong_payload =
+
String::from_utf8(pong.to_vec()).unwrap_or("Invalid payload".to_string());
+
log::trace!("Pong recieved. Payload: {pong_payload}");
}
+
Message::Frame(_) => (),
+
},
Some(Err(error)) => {
log::error!("Web socket error: {error}");
closing_connection = true;
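
The reconnect bookkeeping touched above follows a simple rule: count consecutive attempts, but forget them once a connection has stayed up past the success threshold, so a long-lived connection that later drops reconnects immediately instead of resuming backoff. A minimal sketch of just that rule, with a hypothetical `run_connection` standing in for `connect_async` plus `websocket_task`:

```rust
use std::time::Duration;

// Hypothetical stand-in: returns how long the connection stayed up,
// or Err(()) if it never connected.
fn run_connection(attempt: u32) -> Result<Duration, ()> {
    if attempt < 2 {
        Err(())
    } else {
        Ok(Duration::from_secs(120))
    }
}

fn main() {
    let max_retries = 5u32;
    let success_threshold = Duration::from_secs(60);
    let mut retry_attempt = 0u32;

    loop {
        retry_attempt += 1;
        if let Ok(lived) = run_connection(retry_attempt) {
            // In the real loop this is t_connected.elapsed() after the
            // websocket task returns.
            if lived > success_threshold {
                // Healthy for long enough: reset the counter so the next
                // drop reconnects without burning through max_retries.
                retry_attempt = 0;
            }
            break; // keep the sketch finite; the real loop keeps reconnecting
        }
        if retry_attempt >= max_retries {
            eprintln!("hit max retries, bye");
            break;
        }
    }
}
```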
+9 -2
ufos/Cargo.toml
···
[dependencies]
anyhow = "1.0.97"
async-trait = "0.1.88"
bincode = { version = "2.0.1", features = ["serde"] }
-cardinality-estimator = { version = "1.0.2", features = ["with_serde"] }
clap = { version = "4.5.31", features = ["derive"] }
dropshot = "0.16.0"
env_logger = "0.11.7"
fjall = { version = "2.8.0", features = ["lz4"] }
jetstream = { path = "../jetstream" }
log = "0.4.26"
lsm-tree = "2.6.6"
-schemars = { version = "0.8.22", features = ["raw_value"] }
semver = "1.0.26"
serde = "1.0.219"
serde_json = "1.0.140"
thiserror = "2.0.12"
tokio = { version = "1.44.2", features = ["full", "sync", "time"] }
[target.'cfg(not(target_env = "msvc"))'.dependencies]
tikv-jemallocator = "0.6.0"
···
[dependencies]
anyhow = "1.0.97"
async-trait = "0.1.88"
+base64 = "0.22.1"
bincode = { version = "2.0.1", features = ["serde"] }
+cardinality-estimator-safe = { version = "4.0.1", features = ["with_serde", "with_digest"] }
+chrono = { version = "0.4.41", features = ["serde"] }
clap = { version = "4.5.31", features = ["derive"] }
dropshot = "0.16.0"
env_logger = "0.11.7"
fjall = { version = "2.8.0", features = ["lz4"] }
+getrandom = "0.3.3"
+http = "1.3.1"
jetstream = { path = "../jetstream" }
log = "0.4.26"
lsm-tree = "2.6.6"
+schemars = { version = "0.8.22", features = ["raw_value", "chrono"] }
semver = "1.0.26"
serde = "1.0.219"
serde_json = "1.0.140"
+serde_qs = "1.0.0-rc.3"
+sha2 = "0.10.9"
thiserror = "2.0.12"
tokio = { version = "1.44.2", features = ["full", "sync", "time"] }
+tokio-util = "0.7.15"
[target.'cfg(not(target_env = "msvc"))'.dependencies]
tikv-jemallocator = "0.6.0"
+1 -1
ufos/fuzz/fuzz_targets/counts_value.rs
···
assert_eq!(serialized.len(), n);
let (and_back, n_again) = CountsValue::from_db_bytes(&serialized).unwrap();
assert_eq!(n_again, n);
-assert_eq!(and_back.records(), counts_value.records());
assert_eq!(and_back.dids().estimate(), counts_value.dids().estimate());
}
});
···
assert_eq!(serialized.len(), n);
let (and_back, n_again) = CountsValue::from_db_bytes(&serialized).unwrap();
assert_eq!(n_again, n);
+assert_eq!(and_back.counts(), counts_value.counts());
assert_eq!(and_back.dids().estimate(), counts_value.dids().estimate());
}
});
+13
ufos/readme.md
···
cargo clean
```
nginx forward proxy for websocket (run this on another host):
```nginx
···
cargo clean
```
+for bonilla but 64-bit? (rp4)
+```bash
+cross build --release --target aarch64-unknown-linux-gnu && scp ../target/aarch64-unknown-linux-gnu/release/ufos pi@bonilla.local:ufos
+# ^^ fails due to linker?
+
+cross build --release --target aarch64-unknown-linux-musl && scp ../target/aarch64-unknown-linux-musl/release/ufos pi@bonilla.local:ufos
+# seems to work
+
+rsync -avhP ufos-bff-rl/ pi@bonilla:/mnt/ufos-db/
+
+RUST_LOG=info ./ufos --jetstream us-west-2 --data /mnt/ufos-db/
+```
+
nginx forward proxy for websocket (run this on another host):
```nginx
+45 -15
ufos/src/consumer.rs
···
use jetstream::{
events::{Cursor, EventKind, JetstreamEvent},
exports::{Did, Nsid},
···
use std::mem;
use std::time::Duration;
use tokio::sync::mpsc::{channel, Receiver, Sender};
use crate::error::{BatchInsertError, FirehoseEventError};
use crate::{DeleteAccount, EventBatch, UFOsCommit};
···
pub const MAX_BATCHED_COLLECTIONS: usize = 64; // hard limit, MAX_BATCHED_RECORDS applies per-collection
pub const MIN_BATCH_SPAN_SECS: f64 = 2.; // breathe
pub const MAX_BATCH_SPAN_SECS: f64 = 60.; // hard limit, pause consumer if we're unable to send by now
-pub const SEND_TIMEOUT_S: f64 = 15.; // if the channel is blocked longer than this, something is probably up
-pub const BATCH_QUEUE_SIZE: usize = 1; // nearly-rendez-vous
pub type LimitedBatch = EventBatch<MAX_BATCHED_RECORDS>;
···
jetstream_receiver: JetstreamReceiver,
batch_sender: Sender<LimitedBatch>,
current_batch: CurrentBatch,
}
pub async fn consume(
jetstream_endpoint: &str,
cursor: Option<Cursor>,
no_compress: bool,
) -> anyhow::Result<Receiver<LimitedBatch>> {
let endpoint = DefaultJetstreamEndpoints::endpoint_or_shortcut(jetstream_endpoint);
if endpoint == jetstream_endpoint {
···
.connect_cursor(cursor)
.await?;
let (batch_sender, batch_reciever) = channel::<LimitedBatch>(BATCH_QUEUE_SIZE);
-let mut batcher = Batcher::new(jetstream_receiver, batch_sender);
-tokio::task::spawn(async move { batcher.run().await });
Ok(batch_reciever)
}
impl Batcher {
-pub fn new(jetstream_receiver: JetstreamReceiver, batch_sender: Sender<LimitedBatch>) -> Self {
Self {
jetstream_receiver,
batch_sender,
current_batch: Default::default(),
}
}
pub async fn run(&mut self) -> anyhow::Result<()> {
loop {
-
if let Some(event) = self.jetstream_receiver.recv().await {
-
self.handle_event(event).await?
-
} else {
-
anyhow::bail!("channel closed");
}
}
}
async fn handle_event(&mut self, event: JetstreamEvent) -> anyhow::Result<()> {
if let Some(earliest) = &self.current_batch.initial_cursor {
if event.cursor.duration_since(earliest)? > Duration::from_secs_f64(MAX_BATCH_SPAN_SECS)
{
-
self.send_current_batch_now(false).await?;
}
} else {
self.current_batch.initial_cursor = Some(event.cursor);
···
if event.cursor.duration_since(earliest)?.as_secs_f64() > MIN_BATCH_SPAN_SECS
&& self.batch_sender.capacity() == BATCH_QUEUE_SIZE
{
-
self.send_current_batch_now(true).await?;
}
}
Ok(())
···
&collection,
commit,
MAX_BATCHED_COLLECTIONS,
);
if let Err(BatchInsertError::BatchFull(commit)) = optimistic_res {
-
self.send_current_batch_now(false).await?;
self.current_batch.batch.insert_commit_by_nsid(
&collection,
commit,
MAX_BATCHED_COLLECTIONS,
)?;
} else {
optimistic_res?;
···
async fn handle_delete_account(&mut self, did: Did, cursor: Cursor) -> anyhow::Result<()> {
if self.current_batch.batch.account_removes.len() >= MAX_ACCOUNT_REMOVES {
-
self.send_current_batch_now(false).await?;
}
self.current_batch
.batch
···
// holds up all consumer progress until it can send to the channel
// use this when the current batch is too full to add more to it
-
async fn send_current_batch_now(&mut self, small: bool) -> anyhow::Result<()> {
let beginning = match self.current_batch.initial_cursor.map(|c| c.elapsed()) {
None => "unknown".to_string(),
Some(Ok(t)) => format!("{:?}", t),
Some(Err(e)) => format!("+{:?}", e.duration()),
};
log::info!(
-
"sending batch now from {beginning}, {}, queue capacity: {}",
if small { "small" } else { "full" },
self.batch_sender.capacity(),
);
let current = mem::take(&mut self.current_batch);
self.batch_sender
.send_timeout(current.batch, Duration::from_secs_f64(SEND_TIMEOUT_S))
.await?;
···
+use crate::store_types::SketchSecretPrefix;
use jetstream::{
events::{Cursor, EventKind, JetstreamEvent},
exports::{Did, Nsid},
···
use std::mem;
use std::time::Duration;
use tokio::sync::mpsc::{channel, Receiver, Sender};
+use tokio::time::{timeout, Interval};
use crate::error::{BatchInsertError, FirehoseEventError};
use crate::{DeleteAccount, EventBatch, UFOsCommit};
···
pub const MAX_BATCHED_COLLECTIONS: usize = 64; // hard limit, MAX_BATCHED_RECORDS applies per-collection
pub const MIN_BATCH_SPAN_SECS: f64 = 2.; // breathe
pub const MAX_BATCH_SPAN_SECS: f64 = 60.; // hard limit, pause consumer if we're unable to send by now
+pub const SEND_TIMEOUT_S: f64 = 150.; // if the channel is blocked longer than this, something is probably up
+pub const BATCH_QUEUE_SIZE: usize = 64; // used to be 1, but sometimes inserts are just really slow????????
pub type LimitedBatch = EventBatch<MAX_BATCHED_RECORDS>;
···
jetstream_receiver: JetstreamReceiver,
batch_sender: Sender<LimitedBatch>,
current_batch: CurrentBatch,
+sketch_secret: SketchSecretPrefix,
+rate_limit: Interval,
}
pub async fn consume(
jetstream_endpoint: &str,
cursor: Option<Cursor>,
no_compress: bool,
+sketch_secret: SketchSecretPrefix,
) -> anyhow::Result<Receiver<LimitedBatch>> {
let endpoint = DefaultJetstreamEndpoints::endpoint_or_shortcut(jetstream_endpoint);
if endpoint == jetstream_endpoint {
···
.connect_cursor(cursor)
.await?;
let (batch_sender, batch_reciever) = channel::<LimitedBatch>(BATCH_QUEUE_SIZE);
+let mut batcher = Batcher::new(jetstream_receiver, batch_sender, sketch_secret);
+tokio::task::spawn(async move {
+let r = batcher.run().await;
+log::warn!("batcher ended: {r:?}");
+});
Ok(batch_reciever)
}
impl Batcher {
+
pub fn new(
+
jetstream_receiver: JetstreamReceiver,
+
batch_sender: Sender<LimitedBatch>,
+
sketch_secret: SketchSecretPrefix,
+
) -> Self {
+
let mut rate_limit = tokio::time::interval(std::time::Duration::from_millis(3));
+
rate_limit.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
Self {
jetstream_receiver,
batch_sender,
current_batch: Default::default(),
+
sketch_secret,
+
rate_limit,
}
}
pub async fn run(&mut self) -> anyhow::Result<()> {
+
// TODO: report errors *from here* probably, since this gets shipped off into a spawned task that might just vanish
loop {
+
match timeout(Duration::from_secs_f64(30.), self.jetstream_receiver.recv()).await {
+
Err(_elapsed) => self.no_events_step().await?,
+
Ok(Some(event)) => self.handle_event(event).await?,
+
Ok(None) => anyhow::bail!("channel closed"),
}
}
}
+
async fn no_events_step(&mut self) -> anyhow::Result<()> {
+
let empty = self.current_batch.batch.is_empty();
+
log::info!("no events received, stepping batcher (empty? {empty})");
+
if !empty {
+
self.send_current_batch_now(true, "no events step").await?;
+
}
+
Ok(())
+
}
+
async fn handle_event(&mut self, event: JetstreamEvent) -> anyhow::Result<()> {
if let Some(earliest) = &self.current_batch.initial_cursor {
if event.cursor.duration_since(earliest)? > Duration::from_secs_f64(MAX_BATCH_SPAN_SECS)
{
+
self.send_current_batch_now(false, "time since event")
+
.await?;
}
} else {
self.current_batch.initial_cursor = Some(event.cursor);
···
if event.cursor.duration_since(earliest)?.as_secs_f64() > MIN_BATCH_SPAN_SECS
&& self.batch_sender.capacity() == BATCH_QUEUE_SIZE
{
+
self.send_current_batch_now(true, "available queue").await?;
}
}
Ok(())
···
&collection,
commit,
MAX_BATCHED_COLLECTIONS,
+
&self.sketch_secret,
);
if let Err(BatchInsertError::BatchFull(commit)) = optimistic_res {
+
self.send_current_batch_now(false, "handle commit").await?;
self.current_batch.batch.insert_commit_by_nsid(
&collection,
commit,
MAX_BATCHED_COLLECTIONS,
+
&self.sketch_secret,
)?;
} else {
optimistic_res?;
···
async fn handle_delete_account(&mut self, did: Did, cursor: Cursor) -> anyhow::Result<()> {
if self.current_batch.batch.account_removes.len() >= MAX_ACCOUNT_REMOVES {
+
self.send_current_batch_now(false, "delete account").await?;
}
self.current_batch
.batch
···
// holds up all consumer progress until it can send to the channel
// use this when the current batch is too full to add more to it
+
async fn send_current_batch_now(&mut self, small: bool, referrer: &str) -> anyhow::Result<()> {
let beginning = match self.current_batch.initial_cursor.map(|c| c.elapsed()) {
None => "unknown".to_string(),
Some(Ok(t)) => format!("{:?}", t),
Some(Err(e)) => format!("+{:?}", e.duration()),
};
log::info!(
+
"sending batch now from {beginning}, {}, queue capacity: {}, referrer: {referrer}",
if small { "small" } else { "full" },
self.batch_sender.capacity(),
);
let current = mem::take(&mut self.current_batch);
+
self.rate_limit.tick().await;
self.batch_sender
.send_timeout(current.batch, Duration::from_secs_f64(SEND_TIMEOUT_S))
.await?;
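
The new consumer loop combines two tokio primitives: a `timeout` around `recv()` so a quiet firehose still flushes a partial batch, and an `Interval` with `MissedTickBehavior::Delay` as a minimum spacing between batch sends. A self-contained sketch of that combination with a toy channel and short durations (not the real `Batcher`; assumes tokio with the `full` feature set, as used in this workspace):

```rust
use std::time::Duration;
use tokio::sync::mpsc;
use tokio::time::{interval, timeout, MissedTickBehavior};

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<u64>(16);

    // Toy producer standing in for the jetstream receiver.
    tokio::spawn(async move {
        for i in 0..5u64 {
            if tx.send(i).await.is_err() {
                break;
            }
        }
        // tx dropped here, so recv() eventually yields None
    });

    // Minimum spacing between downstream sends; Delay means a missed tick
    // pushes the schedule back instead of firing a burst to catch up.
    let mut rate_limit = interval(Duration::from_millis(3));
    rate_limit.set_missed_tick_behavior(MissedTickBehavior::Delay);

    loop {
        // Bound the wait for the next event so an idle stream still lets us
        // flush (the diff uses 30s; 100ms keeps the demo quick).
        match timeout(Duration::from_millis(100), rx.recv()).await {
            Err(_elapsed) => {
                println!("no events for a while: flush whatever is batched");
                break;
            }
            Ok(Some(event)) => {
                rate_limit.tick().await; // enforce the minimum send spacing
                println!("handling event {event}");
            }
            Ok(None) => {
                println!("channel closed");
                break;
            }
        }
    }
}
```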
+112 -33
ufos/src/db_types.rs
···
pub enum EncodingError {
#[error("failed to parse Atrium string type: {0}")]
BadAtriumStringType(&'static str),
#[error("failed to bincode-encode: {0}")]
BincodeEncodeFailed(#[from] EncodeError),
#[error("failed to bincode-decode: {0}")]
···
InvalidTruncated(u64, u64),
}
fn bincode_conf() -> impl Config {
standard()
.with_big_endian()
···
}
pub trait DbBytes {
-
fn to_db_bytes(&self) -> Result<Vec<u8>, EncodingError>;
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError>
where
Self: Sized;
}
#[derive(PartialEq)]
···
pub fn from_pair(prefix: P, suffix: S) -> Self {
Self { prefix, suffix }
}
-
pub fn from_prefix_to_db_bytes(prefix: &P) -> Result<Vec<u8>, EncodingError> {
prefix.to_db_bytes()
}
-
pub fn to_prefix_db_bytes(&self) -> Result<Vec<u8>, EncodingError> {
self.prefix.to_db_bytes()
}
-
pub fn prefix_range_end(prefix: &P) -> Result<Vec<u8>, EncodingError> {
-
let prefix_bytes = prefix.to_db_bytes()?;
-
let (_, Bound::Excluded(range_end)) = prefix_to_range(&prefix_bytes) else {
-
return Err(EncodingError::BadRangeBound);
-
};
-
Ok(range_end.to_vec())
}
-
pub fn range_end(&self) -> Result<Vec<u8>, EncodingError> {
Self::prefix_range_end(&self.prefix)
}
pub fn range(&self) -> Result<Range<Vec<u8>>, EncodingError> {
···
}
}
impl<P: DbBytes + std::fmt::Debug, S: DbBytes + std::fmt::Debug> fmt::Debug for DbConcat<P, S> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "DbConcat<{:?} || {:?}>", self.prefix, self.suffix)
···
}
impl<P: DbBytes, S: DbBytes> DbBytes for DbConcat<P, S> {
-
fn to_db_bytes(&self) -> Result<Vec<u8>, EncodingError> {
let mut combined = self.prefix.to_db_bytes()?;
combined.append(&mut self.suffix.to_db_bytes()?);
Ok(combined)
···
#[derive(Debug, Default, PartialEq)]
pub struct DbEmpty(());
impl DbBytes for DbEmpty {
-
fn to_db_bytes(&self) -> Result<Vec<u8>, EncodingError> {
Ok(vec![])
}
fn from_db_bytes(_: &[u8]) -> Result<(Self, usize), EncodingError> {
···
}
}
impl<S: StaticStr> DbBytes for DbStaticStr<S> {
-
fn to_db_bytes(&self) -> Result<Vec<u8>, EncodingError> {
S::static_str().to_string().to_db_bytes()
}
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError> {
···
where
T: BincodeEncode + BincodeDecode<()> + UseBincodePlz + Sized + std::fmt::Debug,
{
-
fn to_db_bytes(&self) -> Result<Vec<u8>, EncodingError> {
Ok(encode_to_vec(self, bincode_conf())?)
}
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError> {
···
/// helper trait: impl on a type to get helpers to implement DbBytes
pub trait SerdeBytes: serde::Serialize + for<'a> serde::Deserialize<'a> {
-
fn to_bytes(&self) -> Result<Vec<u8>, EncodingError>
where
Self: std::fmt::Debug,
{
···
//////
impl DbBytes for Vec<u8> {
-
fn to_db_bytes(&self) -> Result<Vec<u8>, EncodingError> {
Ok(self.to_vec())
}
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError> {
Ok((bytes.to_owned(), bytes.len()))
}
···
/// TODO: wrap in another type. it's actually probably not desirable to serialize strings this way
/// *except* where needed as a prefix.
impl DbBytes for String {
-
fn to_db_bytes(&self) -> Result<Vec<u8>, EncodingError> {
let mut v = self.as_bytes().to_vec();
if v.contains(&0x00) {
return Err(EncodingError::StringContainedNull);
···
}
}
impl DbBytes for Did {
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError> {
let (s, n) = decode_from_slice(bytes, bincode_conf())?;
let me = Self::new(s).map_err(EncodingError::BadAtriumStringType)?;
Ok((me, n))
}
-
fn to_db_bytes(&self) -> Result<Vec<u8>, EncodingError> {
Ok(encode_to_vec(self.as_ref(), bincode_conf())?)
}
}
impl DbBytes for Nsid {
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError> {
-
let (s, n) = decode_from_slice(bytes, bincode_conf())?;
let me = Self::new(s).map_err(EncodingError::BadAtriumStringType)?;
Ok((me, n))
}
-
fn to_db_bytes(&self) -> Result<Vec<u8>, EncodingError> {
-
Ok(encode_to_vec(self.as_ref(), bincode_conf())?)
}
}
···
let me = Self::new(s).map_err(EncodingError::BadAtriumStringType)?;
Ok((me, n))
}
-
fn to_db_bytes(&self) -> Result<Vec<u8>, EncodingError> {
Ok(encode_to_vec(self.as_ref(), bincode_conf())?)
}
}
impl DbBytes for Cursor {
-
fn to_db_bytes(&self) -> Result<Vec<u8>, EncodingError> {
Ok(self.to_raw_u64().to_be_bytes().to_vec())
}
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError> {
···
}
impl DbBytes for serde_json::Value {
-
fn to_db_bytes(&self) -> Result<Vec<u8>, EncodingError> {
self.to_string().to_db_bytes()
}
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError> {
···
#[cfg(test)]
mod test {
-
use super::{Cursor, DbBytes, DbConcat, DbEmpty, DbStaticStr, EncodingError, StaticStr};
#[test]
-
fn test_db_empty() -> Result<(), EncodingError> {
let original = DbEmpty::default();
let serialized = original.to_db_bytes()?;
assert_eq!(serialized.len(), 0);
···
}
#[test]
-
fn test_string_roundtrip() -> Result<(), EncodingError> {
for (case, desc) in [
("", "empty string"),
("a", "basic string"),
···
}
#[test]
-
fn test_string_serialized_lexicographic_sort() -> Result<(), EncodingError> {
let aa = "aa".to_string().to_db_bytes()?;
let b = "b".to_string().to_db_bytes()?;
assert!(b > aa);
···
}
#[test]
-
fn test_string_cursor_prefix_roundtrip() -> Result<(), EncodingError> {
type TwoThings = DbConcat<String, Cursor>;
for (lazy_prefix, tired_suffix, desc) in [
("", 0, "empty string and cursor"),
···
}
#[test]
-
fn test_cursor_string_prefix_roundtrip() -> Result<(), EncodingError> {
type TwoThings = DbConcat<Cursor, String>;
for (tired_prefix, sad_suffix, desc) in [
(0, "", "empty string and cursor"),
···
}
#[test]
-
fn test_static_str() -> Result<(), EncodingError> {
#[derive(Debug, PartialEq)]
struct AStaticStr {}
impl StaticStr for AStaticStr {
···
}
#[test]
-
fn test_static_str_empty() -> Result<(), EncodingError> {
#[derive(Debug, PartialEq)]
struct AnEmptyStr {}
impl StaticStr for AnEmptyStr {
···
}
#[test]
-
fn test_static_prefix() -> Result<(), EncodingError> {
#[derive(Debug, PartialEq)]
struct AStaticPrefix {}
impl StaticStr for AStaticPrefix {
···
pub enum EncodingError {
#[error("failed to parse Atrium string type: {0}")]
BadAtriumStringType(&'static str),
+
#[error("Not enough NSID segments for a usable prefix")]
+
NotEnoughNsidSegments,
#[error("failed to bincode-encode: {0}")]
BincodeEncodeFailed(#[from] EncodeError),
#[error("failed to bincode-decode: {0}")]
···
InvalidTruncated(u64, u64),
}
+
pub type EncodingResult<T> = Result<T, EncodingError>;
+
fn bincode_conf() -> impl Config {
standard()
.with_big_endian()
···
}
pub trait DbBytes {
+
fn to_db_bytes(&self) -> EncodingResult<Vec<u8>>;
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError>
where
Self: Sized;
+
fn as_prefix_range_end(&self) -> EncodingResult<Vec<u8>> {
+
let bytes = self.to_db_bytes()?;
+
let (_, Bound::Excluded(range_end)) = prefix_to_range(&bytes) else {
+
return Err(EncodingError::BadRangeBound);
+
};
+
Ok(range_end.to_vec())
+
}
+
}
+
+
pub trait SubPrefixBytes<T> {
+
fn sub_prefix(input: T) -> EncodingResult<Vec<u8>>;
}
#[derive(PartialEq)]
···
pub fn from_pair(prefix: P, suffix: S) -> Self {
Self { prefix, suffix }
}
+
pub fn from_prefix_to_db_bytes(prefix: &P) -> EncodingResult<Vec<u8>> {
prefix.to_db_bytes()
}
+
pub fn to_prefix_db_bytes(&self) -> EncodingResult<Vec<u8>> {
self.prefix.to_db_bytes()
}
+
pub fn prefix_range_end(prefix: &P) -> EncodingResult<Vec<u8>> {
+
prefix.as_prefix_range_end()
}
+
pub fn range_end(&self) -> EncodingResult<Vec<u8>> {
Self::prefix_range_end(&self.prefix)
}
pub fn range(&self) -> Result<Range<Vec<u8>>, EncodingError> {
···
}
}
+
impl<P: DbBytes + Default, S: DbBytes + Default> Default for DbConcat<P, S> {
+
fn default() -> Self {
+
Self {
+
prefix: Default::default(),
+
suffix: Default::default(),
+
}
+
}
+
}
+
impl<P: DbBytes + std::fmt::Debug, S: DbBytes + std::fmt::Debug> fmt::Debug for DbConcat<P, S> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "DbConcat<{:?} || {:?}>", self.prefix, self.suffix)
···
}
impl<P: DbBytes, S: DbBytes> DbBytes for DbConcat<P, S> {
+
fn to_db_bytes(&self) -> EncodingResult<Vec<u8>> {
let mut combined = self.prefix.to_db_bytes()?;
combined.append(&mut self.suffix.to_db_bytes()?);
Ok(combined)
···
#[derive(Debug, Default, PartialEq)]
pub struct DbEmpty(());
impl DbBytes for DbEmpty {
+
fn to_db_bytes(&self) -> EncodingResult<Vec<u8>> {
Ok(vec![])
}
fn from_db_bytes(_: &[u8]) -> Result<(Self, usize), EncodingError> {
···
}
}
impl<S: StaticStr> DbBytes for DbStaticStr<S> {
+
fn to_db_bytes(&self) -> EncodingResult<Vec<u8>> {
S::static_str().to_string().to_db_bytes()
}
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError> {
···
where
T: BincodeEncode + BincodeDecode<()> + UseBincodePlz + Sized + std::fmt::Debug,
{
+
fn to_db_bytes(&self) -> EncodingResult<Vec<u8>> {
Ok(encode_to_vec(self, bincode_conf())?)
}
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError> {
···
/// helper trait: impl on a type to get helpers to implement DbBytes
pub trait SerdeBytes: serde::Serialize + for<'a> serde::Deserialize<'a> {
+
fn to_bytes(&self) -> EncodingResult<Vec<u8>>
where
Self: std::fmt::Debug,
{
···
//////
+
impl<const N: usize> UseBincodePlz for [u8; N] {}
+
+
// bare bytes (NOT prefix-encoded!)
impl DbBytes for Vec<u8> {
+
fn to_db_bytes(&self) -> EncodingResult<Vec<u8>> {
Ok(self.to_vec())
}
+
// greedy, consumes ALL remaining bytes
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError> {
Ok((bytes.to_owned(), bytes.len()))
}
···
/// TODO: wrap in another type. it's actually probably not desirable to serialize strings this way
/// *except* where needed as a prefix.
impl DbBytes for String {
+
fn to_db_bytes(&self) -> EncodingResult<Vec<u8>> {
let mut v = self.as_bytes().to_vec();
if v.contains(&0x00) {
return Err(EncodingError::StringContainedNull);
···
}
}
+
impl SubPrefixBytes<&str> for String {
+
fn sub_prefix(input: &str) -> EncodingResult<Vec<u8>> {
+
let v = input.as_bytes();
+
if v.contains(&0x00) {
+
return Err(EncodingError::StringContainedNull);
+
}
+
// NO null terminator!!
+
Ok(v.to_vec())
+
}
+
}
+
impl DbBytes for Did {
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError> {
let (s, n) = decode_from_slice(bytes, bincode_conf())?;
let me = Self::new(s).map_err(EncodingError::BadAtriumStringType)?;
Ok((me, n))
}
+
fn to_db_bytes(&self) -> EncodingResult<Vec<u8>> {
Ok(encode_to_vec(self.as_ref(), bincode_conf())?)
}
}
impl DbBytes for Nsid {
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError> {
+
let (s, n) = String::from_db_bytes(bytes)?; // null-terminated DbBytes impl!!
let me = Self::new(s).map_err(EncodingError::BadAtriumStringType)?;
Ok((me, n))
}
+
fn to_db_bytes(&self) -> EncodingResult<Vec<u8>> {
+
String::to_db_bytes(&self.to_string()) // null-terminated DbBytes impl!!!!
+
}
+
}
+
impl SubPrefixBytes<&str> for Nsid {
+
fn sub_prefix(input: &str) -> EncodingResult<Vec<u8>> {
+
String::sub_prefix(input)
}
}
···
let me = Self::new(s).map_err(EncodingError::BadAtriumStringType)?;
Ok((me, n))
}
+
fn to_db_bytes(&self) -> EncodingResult<Vec<u8>> {
Ok(encode_to_vec(self.as_ref(), bincode_conf())?)
}
}
impl DbBytes for Cursor {
+
fn to_db_bytes(&self) -> EncodingResult<Vec<u8>> {
Ok(self.to_raw_u64().to_be_bytes().to_vec())
}
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError> {
···
}
impl DbBytes for serde_json::Value {
+
fn to_db_bytes(&self) -> EncodingResult<Vec<u8>> {
self.to_string().to_db_bytes()
}
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError> {
···
#[cfg(test)]
mod test {
+
use super::{
+
Cursor, DbBytes, DbConcat, DbEmpty, DbStaticStr, EncodingResult, Nsid, StaticStr,
+
SubPrefixBytes,
+
};
#[test]
+
fn test_db_empty() -> EncodingResult<()> {
let original = DbEmpty::default();
let serialized = original.to_db_bytes()?;
assert_eq!(serialized.len(), 0);
···
}
#[test]
+
fn test_string_roundtrip() -> EncodingResult<()> {
for (case, desc) in [
("", "empty string"),
("a", "basic string"),
···
}
#[test]
+
fn test_string_serialized_lexicographic_sort() -> EncodingResult<()> {
let aa = "aa".to_string().to_db_bytes()?;
let b = "b".to_string().to_db_bytes()?;
assert!(b > aa);
···
}
#[test]
+
fn test_nullstring_can_prefix() -> EncodingResult<()> {
+
for (s, pre, is_pre, desc) in [
+
("", "", true, "empty strings"),
+
("", "a", false, "longer prefix"),
+
("a", "", true, "empty prefix matches"),
+
("a", "a", true, "whole string matches"),
+
("a", "b", false, "entirely different"),
+
("ab", "a", true, "prefix matches"),
+
("ab", "b", false, "shorter and entirely different"),
+
] {
+
let serialized = s.to_string().to_db_bytes()?;
+
let prefixed = String::sub_prefix(pre)?;
+
assert_eq!(serialized.starts_with(&prefixed), is_pre, "{}", desc);
+
}
+
Ok(())
+
}
+
+
#[test]
+
fn test_nsid_can_prefix() -> EncodingResult<()> {
+
for (s, pre, is_pre, desc) in [
+
("ab.cd.ef", "", true, "empty prefix"),
+
("ab.cd.ef", "a", true, "tiny prefix"),
+
("ab.cd.ef", "abc", false, "bad prefix"),
+
("ab.cd.ef", "ab", true, "segment prefix"),
+
("ab.cd.ef", "ab.cd", true, "multi-segment prefix"),
+
("ab.cd.ef", "ab.cd.ef", true, "full match"),
+
("ab.cd.ef", "ab.cd.ef.g", false, "prefix longer"),
+
] {
+
let serialized = Nsid::new(s.to_string()).unwrap().to_db_bytes()?;
+
let prefixed = Nsid::sub_prefix(pre)?;
+
assert_eq!(serialized.starts_with(&prefixed), is_pre, "{}", desc);
+
}
+
Ok(())
+
}
+
+
#[test]
+
fn test_string_cursor_prefix_roundtrip() -> EncodingResult<()> {
type TwoThings = DbConcat<String, Cursor>;
for (lazy_prefix, tired_suffix, desc) in [
("", 0, "empty string and cursor"),
···
}
#[test]
+
fn test_cursor_string_prefix_roundtrip() -> EncodingResult<()> {
type TwoThings = DbConcat<Cursor, String>;
for (tired_prefix, sad_suffix, desc) in [
(0, "", "empty string and cursor"),
···
}
#[test]
+
fn test_static_str() -> EncodingResult<()> {
#[derive(Debug, PartialEq)]
struct AStaticStr {}
impl StaticStr for AStaticStr {
···
}
#[test]
+
fn test_static_str_empty() -> EncodingResult<()> {
#[derive(Debug, PartialEq)]
struct AnEmptyStr {}
impl StaticStr for AnEmptyStr {
···
}
#[test]
+
fn test_static_prefix() -> EncodingResult<()> {
#[derive(Debug, PartialEq)]
struct AStaticPrefix {}
impl StaticStr for AStaticPrefix {
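
The new `as_prefix_range_end` helper turns a serialized key prefix into an exclusive upper bound, so a store can be range-scanned for every key that starts with that prefix. A rough standalone illustration of the idea using a `BTreeMap` and a naive last-byte-increment bound (this is only the concept, not the `prefix_to_range` helper the crate actually calls):

```rust
use std::collections::BTreeMap;

// Naive exclusive upper bound for a byte prefix: increment the last byte,
// carrying past trailing 0xff bytes. Returns None if no finite bound exists.
fn naive_range_end(prefix: &[u8]) -> Option<Vec<u8>> {
    let mut end = prefix.to_vec();
    while let Some(&last) = end.last() {
        if last < 0xff {
            *end.last_mut().unwrap() += 1;
            return Some(end);
        }
        end.pop(); // carry past a trailing 0xff
    }
    None // prefix was empty or all 0xff
}

fn main() {
    // Keys mimic the null-terminated string encoding used by the DbBytes impls.
    let mut db = BTreeMap::new();
    db.insert(b"app.bsky.feed.like\0".to_vec(), 1);
    db.insert(b"app.bsky.feed.post\0".to_vec(), 2);
    db.insert(b"app.bsky.graph.follow\0".to_vec(), 3);

    let prefix = b"app.bsky.feed".to_vec();
    let end = naive_range_end(&prefix).unwrap();
    let hits: Vec<_> = db.range(prefix..end).map(|(_, v)| *v).collect();
    assert_eq!(hits, vec![1, 2]); // only the app.bsky.feed.* keys
}
```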
+4
ufos/src/error.rs
···
Stolen,
#[error("Failed to join tokio task: {0}")]
JoinError(#[from] tokio::task::JoinError),
}
···
Stolen,
#[error("Failed to join tokio task: {0}")]
JoinError(#[from] tokio::task::JoinError),
+#[error("Background task already started")]
+BackgroundAlreadyStarted,
+#[error("Batch sender exited")]
+BatchSenderExited,
}
+57 -11
ufos/src/file_consumer.rs
···
use crate::consumer::{Batcher, LimitedBatch, BATCH_QUEUE_SIZE};
use anyhow::Result;
use jetstream::{error::JetstreamEventError, events::JetstreamEvent};
use std::path::PathBuf;
···
sync::mpsc::{channel, Receiver, Sender},
};
-
async fn read_jsonl(f: File, sender: Sender<JetstreamEvent>) -> Result<()> {
let mut lines = BufReader::new(f).lines();
while let Some(line) = lines.next_line().await? {
-
let event: JetstreamEvent =
-
serde_json::from_str(&line).map_err(JetstreamEventError::ReceivedMalformedJSON)?;
-
if sender.send(event).await.is_err() {
-
log::warn!("All receivers for the jsonl fixture have been dropped, bye.");
-
return Err(JetstreamEventError::ReceiverClosedError.into());
}
}
-
Ok(())
}
-
pub async fn consume(p: PathBuf) -> Result<Receiver<LimitedBatch>> {
let f = File::open(p).await?;
let (jsonl_sender, jsonl_receiver) = channel::<JetstreamEvent>(16);
let (batch_sender, batch_reciever) = channel::<LimitedBatch>(BATCH_QUEUE_SIZE);
-
let mut batcher = Batcher::new(jsonl_receiver, batch_sender);
-
tokio::task::spawn(async move { read_jsonl(f, jsonl_sender).await });
-
tokio::task::spawn(async move { batcher.run().await });
Ok(batch_reciever)
}
···
use crate::consumer::{Batcher, LimitedBatch, BATCH_QUEUE_SIZE};
+
use crate::store_types::SketchSecretPrefix;
+
use crate::Cursor;
use anyhow::Result;
use jetstream::{error::JetstreamEventError, events::JetstreamEvent};
use std::path::PathBuf;
···
sync::mpsc::{channel, Receiver, Sender},
};
+
async fn read_jsonl(f: File, sender: Sender<JetstreamEvent>, cursor: Option<Cursor>) -> Result<()> {
let mut lines = BufReader::new(f).lines();
+
if let Some(db_cursor) = cursor {
+
log::info!("jsonl fixture: skipping events before cursor {db_cursor:?}");
+
let mut bad_lines = 0;
+
let mut skipped = 0;
+
while let Some(line) = lines.next_line().await? {
+
let Ok(event) = serde_json::from_str::<JetstreamEvent>(&line) else {
+
bad_lines += 1;
+
continue;
+
};
+
if event.cursor < db_cursor {
+
skipped += 1;
+
continue;
+
}
+
if event.cursor == db_cursor {
+
log::info!("jsonl fixture: found existing db cursor! skipped {skipped} old events and failed parsing {bad_lines} lines");
+
break;
+
}
+
anyhow::bail!("jsonl fixture: did not find existing db cursor, found event cursor {:?} which is newer. bailing.", event.cursor);
+
}
+
} else {
+
log::info!("jsonl fixture: no cursor provided, sending every event");
+
}
+
+
log::info!("jsonl fixture: now sending events");
while let Some(line) = lines.next_line().await? {
+
match serde_json::from_str::<JetstreamEvent>(&line) {
+
Ok(event) => match sender.send(event).await {
+
Ok(_) => {}
+
Err(e) => {
+
log::warn!("All receivers for the jsonl fixture have been dropped, bye: {e:?}");
+
return Err(JetstreamEventError::ReceiverClosedError.into());
+
}
+
},
+
Err(parse_err) => {
+
log::warn!("failed to parse event: {parse_err:?} from event:\n{line}");
+
continue;
+
}
}
}
+
log::info!("reached end of jsonl file, looping on noop to keep server alive.");
+
loop {
+
tokio::time::sleep(std::time::Duration::from_secs_f64(10.)).await;
+
}
}
+
pub async fn consume(
+
p: PathBuf,
+
sketch_secret: SketchSecretPrefix,
+
cursor: Option<Cursor>,
+
) -> Result<Receiver<LimitedBatch>> {
let f = File::open(p).await?;
let (jsonl_sender, jsonl_receiver) = channel::<JetstreamEvent>(16);
let (batch_sender, batch_reciever) = channel::<LimitedBatch>(BATCH_QUEUE_SIZE);
+
let mut batcher = Batcher::new(jsonl_receiver, batch_sender, sketch_secret);
+
tokio::task::spawn(async move {
+
let r = read_jsonl(f, jsonl_sender, cursor).await;
+
log::warn!("read_jsonl finished: {r:?}");
+
});
+
tokio::task::spawn(async move {
+
let r = batcher.run().await;
+
log::warn!("batcher finished: {r:?}");
+
});
Ok(batch_reciever)
}
+51
ufos/src/index_html.rs
···
···
+pub const INDEX_HTML: &str = r#"<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8" />
+<title>UFOs API Documentation</title>
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+<meta name="description" content="API Documentation for UFOs: Samples and stats for all atproto lexicons." />
+<style>
+.custom-header {
+height: 42px;
+background-color: var(--scalar-background-1);
+box-shadow: inset 0 -1px 0 var(--scalar-border-color);
+color: var(--scalar-color-1);
+font-size: var(--scalar-font-size-3);
+font-family: 'Iowan Old Style', 'Palatino Linotype', 'URW Palladio L', P052, serif;
+padding: 0 18px;
+justify-content: space-between;
+}
+.custom-header,
+.custom-header nav {
+display: flex;
+align-items: center;
+gap: 18px;
+}
+.custom-header a:hover {
+color: var(--scalar-color-2);
+}
+</style>
+</head>
+<body>
+<header class="custom-header scalar-app">
+<b>a <a href="https://microcosm.blue">microcosm</a> project</b>
+<nav>
+<a href="https://bsky.app/profile/microcosm.blue">@microcosm.blue</a>
+<a href="https://github.com/at-microcosm">github</a>
+</nav>
+</header>
+
+<script id="api-reference" type="application/json" data-url="/openapi""></script>
+
+<script>
+var configuration = {
+theme: 'purple',
+}
+document.getElementById('api-reference').dataset.configuration = JSON.stringify(configuration)
+</script>
+
+<script src="https://cdn.jsdelivr.net/npm/@scalar/api-reference"></script>
+</body>
+</html>
+"#;
+297 -164
ufos/src/lib.rs
···
pub mod db_types;
pub mod error;
pub mod file_consumer;
pub mod server;
pub mod storage;
pub mod storage_fjall;
-
pub mod storage_mem;
pub mod store_types;
use crate::error::BatchInsertError;
-
use cardinality_estimator::CardinalityEstimator;
use error::FirehoseEventError;
use jetstream::events::{CommitEvent, CommitOp, Cursor};
use jetstream::exports::{Did, Nsid, RecordKey};
use schemars::JsonSchema;
use serde::Serialize;
use serde_json::value::RawValue;
use std::collections::HashMap;
#[derive(Debug, Default, Clone)]
pub struct CollectionCommits<const LIMIT: usize> {
-
pub total_seen: usize,
-
pub dids_estimate: CardinalityEstimator<Did>,
pub commits: Vec<UFOsCommit>,
head: usize,
-
non_creates: usize,
}
impl<const LIMIT: usize> CollectionCommits<LIMIT> {
···
self.head = 0;
}
}
-
pub fn truncating_insert(&mut self, commit: UFOsCommit) -> Result<(), BatchInsertError> {
-
if self.non_creates == LIMIT {
return Err(BatchInsertError::BatchFull(commit));
}
-
let did = commit.did.clone();
-
let is_create = commit.action.is_create();
if self.commits.len() < LIMIT {
self.commits.push(commit);
-
if self.commits.capacity() > LIMIT {
-
self.commits.shrink_to(LIMIT); // save mem?????? maybe??
-
}
} else {
let head_started_at = self.head;
loop {
let candidate = self
···
return Err(BatchInsertError::BatchForever);
}
}
-
}
-
-
if is_create {
-
self.total_seen += 1;
-
self.dids_estimate.insert(&did);
-
} else {
-
self.non_creates += 1;
}
Ok(())
···
collection: &Nsid,
commit: UFOsCommit,
max_collections: usize,
) -> Result<(), BatchInsertError> {
let map = &mut self.commits_by_nsid;
if !map.contains_key(collection) && map.len() >= max_collections {
···
}
map.entry(collection.clone())
.or_default()
-
.truncating_insert(commit)?;
Ok(())
-
}
-
pub fn total_records(&self) -> usize {
-
self.commits_by_nsid.values().map(|v| v.commits.len()).sum()
-
}
-
pub fn total_seen(&self) -> usize {
-
self.commits_by_nsid.values().map(|v| v.total_seen).sum()
}
pub fn total_collections(&self) -> usize {
self.commits_by_nsid.len()
···
self.account_removes.len()
}
pub fn estimate_dids(&self) -> usize {
-
let mut estimator = CardinalityEstimator::<Did>::new();
for commits in self.commits_by_nsid.values() {
estimator.merge(&commits.dids_estimate);
}
···
}
#[derive(Debug, Serialize, JsonSchema)]
pub enum ConsumerInfo {
Jetstream {
endpoint: String,
started_at: u64,
latest_cursor: Option<u64>,
},
}
-
#[derive(Debug, Default, PartialEq, Serialize, JsonSchema)]
-
pub struct TopCollections {
-
total_records: u64,
dids_estimate: u64,
-
nsid_child_segments: HashMap<String, TopCollections>,
}
-
// this is not safe from ~DOS
-
// todo: remove this and just iterate the all-time rollups to get nsids? (or recent rollups?)
-
impl From<TopCollections> for Vec<String> {
-
fn from(tc: TopCollections) -> Self {
-
let mut me = vec![];
-
for (segment, children) in tc.nsid_child_segments {
-
let child_segments: Self = children.into();
-
if child_segments.is_empty() {
-
me.push(segment);
-
} else {
-
for ch in child_segments {
-
let nsid = format!("{segment}.{ch}");
-
me.push(nsid);
-
}
-
}
}
-
me
}
}
···
use super::*;
#[test]
-
fn test_top_collections_to_nsids() {
-
let empty_tc = TopCollections::default();
-
assert_eq!(Into::<Vec<String>>::into(empty_tc), Vec::<String>::new());
-
-
let tc = TopCollections {
-
nsid_child_segments: HashMap::from([
-
(
-
"a".to_string(),
-
TopCollections {
-
nsid_child_segments: HashMap::from([
-
("b".to_string(), TopCollections::default()),
-
("c".to_string(), TopCollections::default()),
-
]),
-
..Default::default()
-
},
-
),
-
("z".to_string(), TopCollections::default()),
-
]),
-
..Default::default()
-
};
-
-
let mut nsids: Vec<String> = tc.into();
-
nsids.sort();
-
assert_eq!(nsids, ["a.b", "a.c", "z"]);
-
}
-
-
#[test]
fn test_truncating_insert_truncates() -> anyhow::Result<()> {
let mut commits: CollectionCommits<2> = Default::default();
-
commits.truncating_insert(UFOsCommit {
-
cursor: Cursor::from_raw_u64(100),
-
did: Did::new("did:plc:whatever".to_string()).unwrap(),
-
rkey: RecordKey::new("rkey-asdf-a".to_string()).unwrap(),
-
rev: "rev-asdf".to_string(),
-
action: CommitAction::Put(PutAction {
-
record: RawValue::from_string("{}".to_string())?,
-
is_update: false,
-
}),
-
})?;
-
commits.truncating_insert(UFOsCommit {
-
cursor: Cursor::from_raw_u64(101),
-
did: Did::new("did:plc:whatever".to_string()).unwrap(),
-
rkey: RecordKey::new("rkey-asdf-b".to_string()).unwrap(),
-
rev: "rev-asdg".to_string(),
-
action: CommitAction::Put(PutAction {
-
record: RawValue::from_string("{}".to_string())?,
-
is_update: false,
-
}),
-
})?;
-
commits.truncating_insert(UFOsCommit {
-
cursor: Cursor::from_raw_u64(102),
-
did: Did::new("did:plc:whatever".to_string()).unwrap(),
-
rkey: RecordKey::new("rkey-asdf-c".to_string()).unwrap(),
-
rev: "rev-asdh".to_string(),
-
action: CommitAction::Put(PutAction {
-
record: RawValue::from_string("{}".to_string())?,
-
is_update: false,
-
}),
-
})?;
-
assert_eq!(commits.total_seen, 3);
assert_eq!(commits.dids_estimate.estimate(), 1);
assert_eq!(commits.commits.len(), 2);
···
}
#[test]
fn test_truncating_insert_does_not_truncate_deletes() -> anyhow::Result<()> {
let mut commits: CollectionCommits<2> = Default::default();
-
commits.truncating_insert(UFOsCommit {
-
cursor: Cursor::from_raw_u64(100),
-
did: Did::new("did:plc:whatever".to_string()).unwrap(),
-
rkey: RecordKey::new("rkey-asdf-a".to_string()).unwrap(),
-
rev: "rev-asdf".to_string(),
-
action: CommitAction::Cut,
-
})?;
-
commits.truncating_insert(UFOsCommit {
-
cursor: Cursor::from_raw_u64(101),
-
did: Did::new("did:plc:whatever".to_string()).unwrap(),
-
rkey: RecordKey::new("rkey-asdf-b".to_string()).unwrap(),
-
rev: "rev-asdg".to_string(),
-
action: CommitAction::Put(PutAction {
-
record: RawValue::from_string("{}".to_string())?,
-
is_update: false,
-
}),
-
})?;
-
commits.truncating_insert(UFOsCommit {
-
cursor: Cursor::from_raw_u64(102),
-
did: Did::new("did:plc:whatever".to_string()).unwrap(),
-
rkey: RecordKey::new("rkey-asdf-c".to_string()).unwrap(),
-
rev: "rev-asdh".to_string(),
-
action: CommitAction::Put(PutAction {
-
record: RawValue::from_string("{}".to_string())?,
-
is_update: false,
-
}),
-
})?;
-
assert_eq!(commits.total_seen, 2);
assert_eq!(commits.dids_estimate.estimate(), 1);
assert_eq!(commits.commits.len(), 2);
···
let mut commits: CollectionCommits<2> = Default::default();
commits
-
.truncating_insert(UFOsCommit {
-
cursor: Cursor::from_raw_u64(100),
-
did: Did::new("did:plc:whatever".to_string()).unwrap(),
-
rkey: RecordKey::new("rkey-asdf-a".to_string()).unwrap(),
-
rev: "rev-asdf".to_string(),
-
action: CommitAction::Cut,
-
})
.unwrap();
// this create will just be discarded
commits
-
.truncating_insert(UFOsCommit {
-
cursor: Cursor::from_raw_u64(80),
-
did: Did::new("did:plc:whatever".to_string()).unwrap(),
-
rkey: RecordKey::new("rkey-asdf-zzz".to_string()).unwrap(),
-
rev: "rev-asdzzz".to_string(),
-
action: CommitAction::Put(PutAction {
-
record: RawValue::from_string("{}".to_string())?,
-
is_update: false,
-
}),
-
})
.unwrap();
commits
-
.truncating_insert(UFOsCommit {
-
cursor: Cursor::from_raw_u64(101),
-
did: Did::new("did:plc:whatever".to_string()).unwrap(),
-
rkey: RecordKey::new("rkey-asdf-b".to_string()).unwrap(),
-
rev: "rev-asdg".to_string(),
-
action: CommitAction::Cut,
-
})
.unwrap();
-
let res = commits.truncating_insert(UFOsCommit {
-
cursor: Cursor::from_raw_u64(102),
-
did: Did::new("did:plc:whatever".to_string()).unwrap(),
-
rkey: RecordKey::new("rkey-asdf-c".to_string()).unwrap(),
-
rev: "rev-asdh".to_string(),
-
action: CommitAction::Cut,
-
});
assert!(res.is_err());
let overflowed = match res {
···
pub mod db_types;
pub mod error;
pub mod file_consumer;
+
pub mod index_html;
pub mod server;
pub mod storage;
pub mod storage_fjall;
pub mod store_types;
+
use crate::db_types::{EncodingError, EncodingResult};
use crate::error::BatchInsertError;
+
use crate::store_types::SketchSecretPrefix;
+
use cardinality_estimator_safe::{Element, Sketch};
use error::FirehoseEventError;
use jetstream::events::{CommitEvent, CommitOp, Cursor};
use jetstream::exports::{Did, Nsid, RecordKey};
use schemars::JsonSchema;
use serde::Serialize;
use serde_json::value::RawValue;
+
use sha2::Sha256;
use std::collections::HashMap;
+
use std::time::Duration;
+
+
fn did_element(sketch_secret: &SketchSecretPrefix, did: &Did) -> Element<14> {
+
Element::from_digest_with_prefix::<Sha256>(sketch_secret, did.as_bytes())
+
}
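A minimal sketch of how these keyed sketch elements compose, using only calls that appear elsewhere in this diff (`Sketch::<14>::default`, `insert`, `merge`, `estimate`); the helper below is hypothetical and assumes it lives in this module next to `did_element`, with the `usize` return mirroring `estimate_dids` further down:
// Hypothetical helper: estimate distinct DIDs across two slices of DIDs.
// The secret prefix keys every hash, so only sketches built with the same
// prefix can be meaningfully merged.
fn estimate_unique_dids(secret: &SketchSecretPrefix, likers: &[Did], posters: &[Did]) -> usize {
    let mut likes = Sketch::<14>::default();
    for did in likers {
        likes.insert(did_element(secret, did));
    }
    let mut posts = Sketch::<14>::default();
    for did in posters {
        posts.insert(did_element(secret, did));
    }
    // merging approximates the cardinality of the union of both DID sets
    likes.merge(&posts);
    likes.estimate()
}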
+
+
pub fn nice_duration(dt: Duration) -> String {
+
let secs = dt.as_secs_f64();
+
if secs < 1. {
+
return format!("{:.0}ms", secs * 1000.);
+
}
+
if secs < 60. {
+
return format!("{secs:.02}s");
+
}
+
let mins = (secs / 60.).floor();
+
let rsecs = secs - (mins * 60.);
+
if mins < 60. {
+
return format!("{mins:.0}m{rsecs:.0}s");
+
}
+
let hrs = (mins / 60.).floor();
+
let rmins = mins - (hrs * 60.);
+
if hrs < 24. {
+
return format!("{hrs:.0}h{rmins:.0}m{rsecs:.0}s");
+
}
+
let days = (hrs / 24.).floor();
+
let rhrs = hrs - (days * 24.);
+
format!("{days:.0}d{rhrs:.0}h{rmins:.0}m{rsecs:.0}s")
+
}
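A few expected outputs for `nice_duration`, as a test-style sketch of the branches above (the module name is made up):
#[cfg(test)]
mod nice_duration_examples {
    use super::nice_duration;
    use std::time::Duration;

    #[test]
    fn formats_each_scale() {
        assert_eq!(nice_duration(Duration::from_millis(500)), "500ms");
        assert_eq!(nice_duration(Duration::from_secs(59)), "59.00s");
        assert_eq!(nice_duration(Duration::from_secs(75)), "1m15s");
        assert_eq!(nice_duration(Duration::from_secs(3700)), "1h1m40s");
        assert_eq!(nice_duration(Duration::from_secs(90_061)), "1d1h1m1s");
    }
}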
#[derive(Debug, Default, Clone)]
pub struct CollectionCommits<const LIMIT: usize> {
+
pub creates: usize,
+
pub updates: usize,
+
pub deletes: usize,
+
pub dids_estimate: Sketch<14>,
pub commits: Vec<UFOsCommit>,
head: usize,
}
impl<const LIMIT: usize> CollectionCommits<LIMIT> {
···
self.head = 0;
}
}
+
/// lossy-ish commit insertion
+
///
+
/// - new commits are *always* added to the batch or else rejected as full.
+
/// - when LIMIT is reached, new commits can displace existing `creates`.
+
/// `update`s and `delete`s are *never* displaced.
+
/// - if all batched `creates` have been displaced, the batch is full.
+
///
+
/// in general it's rare for commits to be displaced except for very high-
+
/// volume collections such as `app.bsky.feed.like`.
+
///
+
/// it could be nice in the future to retain all batched commits and just
+
/// drop new `creates` after a limit instead.
+
pub fn truncating_insert(
+
&mut self,
+
commit: UFOsCommit,
+
sketch_secret: &SketchSecretPrefix,
+
) -> Result<(), BatchInsertError> {
+
if (self.updates + self.deletes) == LIMIT {
+
// nothing can be displaced (only `create`s may be displaced)
return Err(BatchInsertError::BatchFull(commit));
}
+
+
// every kind of commit counts as "user activity"
+
self.dids_estimate
+
.insert(did_element(sketch_secret, &commit.did));
+
+
match commit.action {
+
CommitAction::Put(PutAction {
+
is_update: false, ..
+
}) => {
+
self.creates += 1;
+
}
+
CommitAction::Put(PutAction {
+
is_update: true, ..
+
}) => {
+
self.updates += 1;
+
}
+
CommitAction::Cut => {
+
self.deletes += 1;
+
}
+
}
+
if self.commits.len() < LIMIT {
+
// normal insert: there's space left to put a new commit at the end
self.commits.push(commit);
} else {
+
// displacement insert: find an old `create` we can displace
let head_started_at = self.head;
loop {
let candidate = self
···
return Err(BatchInsertError::BatchForever);
}
}
}
Ok(())
···
collection: &Nsid,
commit: UFOsCommit,
max_collections: usize,
+
sketch_secret: &SketchSecretPrefix,
) -> Result<(), BatchInsertError> {
let map = &mut self.commits_by_nsid;
if !map.contains_key(collection) && map.len() >= max_collections {
···
}
map.entry(collection.clone())
.or_default()
+
.truncating_insert(commit, sketch_secret)?;
Ok(())
}
pub fn total_collections(&self) -> usize {
self.commits_by_nsid.len()
···
self.account_removes.len()
}
pub fn estimate_dids(&self) -> usize {
+
let mut estimator = Sketch::<14>::default();
for commits in self.commits_by_nsid.values() {
estimator.merge(&commits.dids_estimate);
}
···
}
#[derive(Debug, Serialize, JsonSchema)]
+
#[serde(rename_all = "camelCase")]
pub enum ConsumerInfo {
Jetstream {
endpoint: String,
started_at: u64,
latest_cursor: Option<u64>,
+
rollup_cursor: Option<u64>,
},
}
+
#[derive(Debug, PartialEq, Serialize, JsonSchema)]
+
pub struct NsidCount {
+
nsid: String,
+
creates: u64,
+
// TODO: add updates and deletes
+
dids_estimate: u64,
+
}
+
+
#[derive(Debug, PartialEq, Serialize, JsonSchema)]
+
pub struct PrefixCount {
+
prefix: String,
+
creates: u64,
+
// TODO: add updates and deletes
dids_estimate: u64,
}
+
#[derive(Debug, PartialEq, Serialize, JsonSchema)]
+
#[serde(tag = "type", rename_all = "camelCase")]
+
pub enum PrefixChild {
+
Collection(NsidCount),
+
Prefix(PrefixCount),
+
}
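For reference, a sketch of the JSON these serde attributes should produce: the variant tag is camelCased by `rename_all`, while the inner struct fields keep their snake_case names since the structs themselves carry no rename. The test name and values are made up:
#[test]
fn prefix_child_json_shape() {
    let child = PrefixChild::Collection(NsidCount {
        nsid: "app.bsky.feed.post".to_string(),
        creates: 123,
        dids_estimate: 45,
    });
    // internally-tagged enum: the tag sits alongside the inner struct's fields
    assert_eq!(
        serde_json::to_value(&child).unwrap(),
        serde_json::json!({
            "type": "collection",
            "nsid": "app.bsky.feed.post",
            "creates": 123,
            "dids_estimate": 45
        })
    );
}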
+
+
#[derive(Debug, Serialize, JsonSchema)]
+
pub struct NsidPrefix(String);
+
impl NsidPrefix {
+
/// Input must not include a trailing dot.
+
pub fn new(pre: &str) -> EncodingResult<Self> {
+
// it's a valid prefix if appending `.name` makes it a valid NSID
+
Nsid::new(format!("{pre}.name")).map_err(EncodingError::BadAtriumStringType)?;
+
// hack (shouldn't really be here): reject prefixes that aren't at least 2 segments long
+
if !pre.contains('.') {
+
return Err(EncodingError::NotEnoughNsidSegments);
}
+
Ok(Self(pre.to_string()))
+
}
+
pub fn is_group_of(&self, other: &Nsid) -> bool {
+
assert!(
+
other.as_str().starts_with(&self.0),
+
"must be a prefix of other"
+
);
+
self.0 == other.domain_authority()
+
}
+
/// The prefix as initialized (no trailing dot)
+
pub fn as_str(&self) -> &str {
+
self.0.as_str()
+
}
+
/// The prefix with a trailing `.` appended to avoid matching a longer segment
+
pub fn terminated(&self) -> String {
+
format!("{}.", self.0)
+
}
+
}
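A quick sketch of the prefix rules above (hypothetical test; values chosen for illustration):
#[test]
fn nsid_prefix_examples() {
    let prefix = NsidPrefix::new("app.bsky.feed").unwrap();
    assert_eq!(prefix.as_str(), "app.bsky.feed");
    // terminated() is what range scans should match against
    assert_eq!(prefix.terminated(), "app.bsky.feed.");
    // too short to be a lexicon group: rejected
    assert!(NsidPrefix::new("app").is_err());
}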
+
+
#[derive(Debug, Serialize, JsonSchema)]
+
pub struct JustCount {
+
creates: u64,
+
updates: u64,
+
deletes: u64,
+
dids_estimate: u64,
+
}
+
+
#[derive(Debug)]
+
pub enum OrderCollectionsBy {
+
Lexi { cursor: Option<Vec<u8>> },
+
RecordsCreated,
+
DidsEstimate,
+
}
+
impl Default for OrderCollectionsBy {
+
fn default() -> Self {
+
Self::Lexi { cursor: None }
}
}
···
use super::*;
#[test]
fn test_truncating_insert_truncates() -> anyhow::Result<()> {
let mut commits: CollectionCommits<2> = Default::default();
+
commits.truncating_insert(
+
UFOsCommit {
+
cursor: Cursor::from_raw_u64(100),
+
did: Did::new("did:plc:whatever".to_string()).unwrap(),
+
rkey: RecordKey::new("rkey-asdf-a".to_string()).unwrap(),
+
rev: "rev-asdf".to_string(),
+
action: CommitAction::Put(PutAction {
+
record: RawValue::from_string("{}".to_string())?,
+
is_update: false,
+
}),
+
},
+
&[0u8; 16],
+
)?;
+
commits.truncating_insert(
+
UFOsCommit {
+
cursor: Cursor::from_raw_u64(101),
+
did: Did::new("did:plc:whatever".to_string()).unwrap(),
+
rkey: RecordKey::new("rkey-asdf-b".to_string()).unwrap(),
+
rev: "rev-asdg".to_string(),
+
action: CommitAction::Put(PutAction {
+
record: RawValue::from_string("{}".to_string())?,
+
is_update: false,
+
}),
+
},
+
&[0u8; 16],
+
)?;
+
commits.truncating_insert(
+
UFOsCommit {
+
cursor: Cursor::from_raw_u64(102),
+
did: Did::new("did:plc:whatever".to_string()).unwrap(),
+
rkey: RecordKey::new("rkey-asdf-c".to_string()).unwrap(),
+
rev: "rev-asdh".to_string(),
+
action: CommitAction::Put(PutAction {
+
record: RawValue::from_string("{}".to_string())?,
+
is_update: false,
+
}),
+
},
+
&[0u8; 16],
+
)?;
+
assert_eq!(commits.creates, 3);
assert_eq!(commits.dids_estimate.estimate(), 1);
assert_eq!(commits.commits.len(), 2);
···
}
#[test]
+
fn test_truncating_insert_counts_updates() -> anyhow::Result<()> {
+
let mut commits: CollectionCommits<2> = Default::default();
+
+
commits.truncating_insert(
+
UFOsCommit {
+
cursor: Cursor::from_raw_u64(100),
+
did: Did::new("did:plc:whatever".to_string()).unwrap(),
+
rkey: RecordKey::new("rkey-asdf-a".to_string()).unwrap(),
+
rev: "rev-asdf".to_string(),
+
action: CommitAction::Put(PutAction {
+
record: RawValue::from_string("{}".to_string())?,
+
is_update: true,
+
}),
+
},
+
&[0u8; 16],
+
)?;
+
+
assert_eq!(commits.creates, 0);
+
assert_eq!(commits.updates, 1);
+
assert_eq!(commits.deletes, 0);
+
assert_eq!(commits.dids_estimate.estimate(), 1);
+
assert_eq!(commits.commits.len(), 1);
+
Ok(())
+
}
+
+
#[test]
fn test_truncating_insert_does_not_truncate_deletes() -> anyhow::Result<()> {
let mut commits: CollectionCommits<2> = Default::default();
+
commits.truncating_insert(
+
UFOsCommit {
+
cursor: Cursor::from_raw_u64(100),
+
did: Did::new("did:plc:whatever".to_string()).unwrap(),
+
rkey: RecordKey::new("rkey-asdf-a".to_string()).unwrap(),
+
rev: "rev-asdf".to_string(),
+
action: CommitAction::Cut,
+
},
+
&[0u8; 16],
+
)?;
+
commits.truncating_insert(
+
UFOsCommit {
+
cursor: Cursor::from_raw_u64(101),
+
did: Did::new("did:plc:whatever".to_string()).unwrap(),
+
rkey: RecordKey::new("rkey-asdf-b".to_string()).unwrap(),
+
rev: "rev-asdg".to_string(),
+
action: CommitAction::Put(PutAction {
+
record: RawValue::from_string("{}".to_string())?,
+
is_update: false,
+
}),
+
},
+
&[0u8; 16],
+
)?;
+
commits.truncating_insert(
+
UFOsCommit {
+
cursor: Cursor::from_raw_u64(102),
+
did: Did::new("did:plc:whatever".to_string()).unwrap(),
+
rkey: RecordKey::new("rkey-asdf-c".to_string()).unwrap(),
+
rev: "rev-asdh".to_string(),
+
action: CommitAction::Put(PutAction {
+
record: RawValue::from_string("{}".to_string())?,
+
is_update: false,
+
}),
+
},
+
&[0u8; 16],
+
)?;
+
assert_eq!(commits.creates, 2);
+
assert_eq!(commits.deletes, 1);
assert_eq!(commits.dids_estimate.estimate(), 1);
assert_eq!(commits.commits.len(), 2);
···
let mut commits: CollectionCommits<2> = Default::default();
commits
+
.truncating_insert(
+
UFOsCommit {
+
cursor: Cursor::from_raw_u64(100),
+
did: Did::new("did:plc:whatever".to_string()).unwrap(),
+
rkey: RecordKey::new("rkey-asdf-a".to_string()).unwrap(),
+
rev: "rev-asdf".to_string(),
+
action: CommitAction::Cut,
+
},
+
&[0u8; 16],
+
)
.unwrap();
// this create will just be discarded
commits
+
.truncating_insert(
+
UFOsCommit {
+
cursor: Cursor::from_raw_u64(80),
+
did: Did::new("did:plc:whatever".to_string()).unwrap(),
+
rkey: RecordKey::new("rkey-asdf-zzz".to_string()).unwrap(),
+
rev: "rev-asdzzz".to_string(),
+
action: CommitAction::Put(PutAction {
+
record: RawValue::from_string("{}".to_string())?,
+
is_update: false,
+
}),
+
},
+
&[0u8; 16],
+
)
.unwrap();
commits
+
.truncating_insert(
+
UFOsCommit {
+
cursor: Cursor::from_raw_u64(101),
+
did: Did::new("did:plc:whatever".to_string()).unwrap(),
+
rkey: RecordKey::new("rkey-asdf-b".to_string()).unwrap(),
+
rev: "rev-asdg".to_string(),
+
action: CommitAction::Cut,
+
},
+
&[0u8; 16],
+
)
.unwrap();
+
let res = commits.truncating_insert(
+
UFOsCommit {
+
cursor: Cursor::from_raw_u64(102),
+
did: Did::new("did:plc:whatever".to_string()).unwrap(),
+
rkey: RecordKey::new("rkey-asdf-c".to_string()).unwrap(),
+
rev: "rev-asdh".to_string(),
+
action: CommitAction::Cut,
+
},
+
&[0u8; 16],
+
);
assert!(res.is_err());
let overflowed = match res {
+148 -84
ufos/src/main.rs
···
use clap::Parser;
use jetstream::events::Cursor;
use std::path::PathBuf;
use ufos::consumer;
-
use ufos::error::StorageError;
use ufos::file_consumer;
use ufos::server;
-
use ufos::storage::{StorageWhatever, StoreReader, StoreWriter};
use ufos::storage_fjall::FjallStorage;
-
use ufos::storage_mem::MemStorage;
#[cfg(not(target_env = "msvc"))]
use tikv_jemallocator::Jemalloc;
···
static GLOBAL: Jemalloc = Jemalloc;
/// Aggregate links in the at-mosphere
-
#[derive(Parser, Debug)]
#[command(version, about, long_about = None)]
struct Args {
/// Jetstream server to connect to (exclusive with --fixture). Provide either a wss:// URL, or a shorthand value:
···
#[arg(long)]
data: PathBuf,
/// DEBUG: don't start the jetstream consumer or its write loop
-
/// todo: restore this
#[arg(long, action)]
pause_writer: bool,
/// DEBUG: force the rw loop to fall behind by pausing it
/// todo: restore this
#[arg(long, action)]
pause_rw: bool,
-
/// DEBUG: use an in-memory store instead of fjall
#[arg(long, action)]
-
in_mem: bool,
/// DEBUG: interpret jetstream as a file fixture
#[arg(long, action)]
jetstream_fixture: bool,
}
-
// #[tokio::main(flavor = "current_thread")] // TODO: move this to config via args
#[tokio::main]
async fn main() -> anyhow::Result<()> {
env_logger::init();
let args = Args::parse();
let jetstream = args.jetstream.clone();
-
if args.in_mem {
-
let (read_store, write_store, cursor) = MemStorage::init(
-
args.data,
-
jetstream,
-
args.jetstream_force,
-
Default::default(),
-
)?;
-
go(
-
args.jetstream,
-
args.jetstream_fixture,
-
args.pause_writer,
-
read_store,
-
write_store,
-
cursor,
-
)
-
.await?;
-
} else {
-
let (read_store, write_store, cursor) = FjallStorage::init(
-
args.data,
-
jetstream,
-
args.jetstream_force,
-
Default::default(),
-
)?;
-
go(
-
args.jetstream,
-
args.jetstream_fixture,
-
args.pause_writer,
-
read_store,
-
write_store,
-
cursor,
-
)
-
.await?;
-
}
-
Ok(())
}
-
async fn go(
-
jetstream: String,
-
jetstream_fixture: bool,
-
pause_writer: bool,
-
read_store: impl StoreReader + 'static,
-
mut write_store: impl StoreWriter + 'static,
cursor: Option<Cursor>,
) -> anyhow::Result<()> {
println!("starting server with storage...");
-
let serving = server::serve(read_store);
-
let t1 = tokio::task::spawn(async {
-
let r = serving.await;
-
log::warn!("serving ended with: {r:?}");
-
});
-
let t2: tokio::task::JoinHandle<anyhow::Result<()>> = tokio::task::spawn({
-
async move {
-
if !pause_writer {
-
println!(
-
"starting consumer with cursor: {cursor:?} from {:?} ago",
-
cursor.map(|c| c.elapsed())
-
);
-
let mut batches = if jetstream_fixture {
-
file_consumer::consume(jetstream.into()).await?
-
} else {
-
consumer::consume(&jetstream, cursor, false).await?
-
};
-
tokio::task::spawn_blocking(move || {
-
while let Some(event_batch) = batches.blocking_recv() {
-
write_store.insert_batch(event_batch)?;
-
write_store
-
.step_rollup()
-
.inspect_err(|e| log::error!("laksjdfl: {e:?}"))?;
-
}
-
Ok::<(), StorageError>(())
-
})
-
.await??;
-
log::warn!("storage.receive ended with");
-
} else {
-
log::info!("not starting jetstream or the write loop.");
-
}
-
Ok(())
-
}
-
});
tokio::select! {
-
z = t1 => log::warn!("serve task ended: {z:?}"),
-
z = t2 => log::warn!("storage task ended: {z:?}"),
};
println!("bye!");
Ok(())
}
···
use clap::Parser;
use jetstream::events::Cursor;
use std::path::PathBuf;
+
use std::time::{Duration, SystemTime};
use ufos::consumer;
use ufos::file_consumer;
use ufos::server;
+
use ufos::storage::{StorageWhatever, StoreBackground, StoreReader, StoreWriter};
use ufos::storage_fjall::FjallStorage;
+
use ufos::store_types::SketchSecretPrefix;
+
use ufos::{nice_duration, ConsumerInfo};
#[cfg(not(target_env = "msvc"))]
use tikv_jemallocator::Jemalloc;
···
static GLOBAL: Jemalloc = Jemalloc;
/// Aggregate links in the at-mosphere
+
#[derive(Parser, Debug, Clone)]
#[command(version, about, long_about = None)]
struct Args {
/// Jetstream server to connect to (exclusive with --fixture). Provide either a wss:// URL, or a shorthand value:
···
#[arg(long)]
data: PathBuf,
/// DEBUG: don't start the jetstream consumer or its write loop
#[arg(long, action)]
pause_writer: bool,
+
/// Adjust runtime settings like background task intervals for efficient backfill
+
#[arg(long, action)]
+
backfill: bool,
/// DEBUG: force the rw loop to fall behind by pausing it
/// todo: restore this
#[arg(long, action)]
pause_rw: bool,
+
/// Reset the rollup cursor and scrape back through missed events in the past (backfill)
#[arg(long, action)]
+
reroll: bool,
/// DEBUG: interpret jetstream as a file fixture
#[arg(long, action)]
jetstream_fixture: bool,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
env_logger::init();
let args = Args::parse();
let jetstream = args.jetstream.clone();
+
let (read_store, write_store, cursor, sketch_secret) = FjallStorage::init(
+
args.data.clone(),
+
jetstream,
+
args.jetstream_force,
+
Default::default(),
+
)?;
+
go(args, read_store, write_store, cursor, sketch_secret).await?;
Ok(())
}
+
async fn go<B: StoreBackground>(
+
args: Args,
+
read_store: impl StoreReader + 'static + Clone,
+
mut write_store: impl StoreWriter<B> + 'static,
cursor: Option<Cursor>,
+
sketch_secret: SketchSecretPrefix,
) -> anyhow::Result<()> {
println!("starting server with storage...");
+
let serving = server::serve(read_store.clone());
+
if args.pause_writer {
+
log::info!("not starting jetstream or the write loop.");
+
serving.await.map_err(|e| anyhow::anyhow!(e))?;
+
return Ok(());
+
}
+
let batches = if args.jetstream_fixture {
+
log::info!("starting with jestream file fixture: {:?}", args.jetstream);
+
file_consumer::consume(args.jetstream.into(), sketch_secret, cursor).await?
+
} else {
+
log::info!(
+
"starting consumer with cursor: {cursor:?} from {:?} ago",
+
cursor.map(|c| c.elapsed())
+
);
+
consumer::consume(&args.jetstream, cursor, false, sketch_secret).await?
+
};
+
let rolling = write_store
+
.background_tasks(args.reroll)?
+
.run(args.backfill);
+
let consuming = write_store.receive_batches(batches);
+
let stating = do_update_stuff(read_store);
tokio::select! {
+
z = serving => log::warn!("serve task ended: {z:?}"),
+
z = rolling => log::warn!("rollup task ended: {z:?}"),
+
z = consuming => log::warn!("consuming task ended: {z:?}"),
+
z = stating => log::warn!("status task ended: {z:?}"),
};
println!("bye!");
Ok(())
}
+
+
async fn do_update_stuff(read_store: impl StoreReader) {
+
let started_at = std::time::SystemTime::now();
+
let mut first_cursor = None;
+
let mut first_rollup = None;
+
let mut last_at = std::time::SystemTime::now();
+
let mut last_cursor = None;
+
let mut last_rollup = None;
+
let mut interval = tokio::time::interval(std::time::Duration::from_secs(4));
+
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
+
loop {
+
interval.tick().await;
+
match read_store.get_consumer_info().await {
+
Err(e) => log::warn!("failed to get jetstream consumer info: {e:?}"),
+
Ok(ConsumerInfo::Jetstream {
+
latest_cursor,
+
rollup_cursor,
+
..
+
}) => {
+
let now = std::time::SystemTime::now();
+
let latest_cursor = latest_cursor.map(Cursor::from_raw_u64);
+
let rollup_cursor = rollup_cursor.map(Cursor::from_raw_u64);
+
backfill_info(
+
latest_cursor,
+
rollup_cursor,
+
last_cursor,
+
last_rollup,
+
last_at,
+
first_cursor,
+
first_rollup,
+
started_at,
+
now,
+
);
+
first_cursor = first_cursor.or(latest_cursor);
+
first_rollup = first_rollup.or(rollup_cursor);
+
last_cursor = latest_cursor;
+
last_rollup = rollup_cursor;
+
last_at = now;
+
}
+
}
+
}
+
}
+
+
#[allow(clippy::too_many_arguments)]
+
fn backfill_info(
+
latest_cursor: Option<Cursor>,
+
rollup_cursor: Option<Cursor>,
+
last_cursor: Option<Cursor>,
+
last_rollup: Option<Cursor>,
+
last_at: SystemTime,
+
first_cursor: Option<Cursor>,
+
first_rollup: Option<Cursor>,
+
started_at: SystemTime,
+
now: SystemTime,
+
) {
+
let nice_dt_two_maybes = |earlier: Option<Cursor>, later: Option<Cursor>| match (earlier, later)
+
{
+
(Some(earlier), Some(later)) => match later.duration_since(&earlier) {
+
Ok(dt) => nice_duration(dt),
+
Err(e) => {
+
let rev_dt = e.duration();
+
format!("+{}", nice_duration(rev_dt))
+
}
+
},
+
_ => "unknown".to_string(),
+
};
+
+
let rate = |mlatest: Option<Cursor>, msince: Option<Cursor>, real: Duration| {
+
mlatest
+
.zip(msince)
+
.map(|(latest, since)| {
+
latest
+
.duration_since(&since)
+
.unwrap_or(Duration::from_millis(1))
+
})
+
.map(|dtc| format!("{:.2}", dtc.as_secs_f64() / real.as_secs_f64()))
+
.unwrap_or("??".into())
+
};
+
+
let dt_real = now
+
.duration_since(last_at)
+
.unwrap_or(Duration::from_millis(1));
+
+
let dt_real_total = now
+
.duration_since(started_at)
+
.unwrap_or(Duration::from_millis(1));
+
+
let cursor_rate = rate(latest_cursor, last_cursor, dt_real);
+
let cursor_avg = rate(latest_cursor, first_cursor, dt_real_total);
+
+
let rollup_rate = rate(rollup_cursor, last_rollup, dt_real);
+
let rollup_avg = rate(rollup_cursor, first_rollup, dt_real_total);
+
+
log::info!(
+
"cursor: {} behind (→{}, {cursor_rate}x, {cursor_avg}x avg). rollup: {} behind (→{}, {rollup_rate}x, {rollup_avg}x avg).",
+
latest_cursor.map(|c| c.elapsed().map(nice_duration).unwrap_or("++".to_string())).unwrap_or("?".to_string()),
+
nice_dt_two_maybes(last_cursor, latest_cursor),
+
rollup_cursor.map(|c| c.elapsed().map(nice_duration).unwrap_or("++".to_string())).unwrap_or("?".to_string()),
+
nice_dt_two_maybes(last_rollup, rollup_cursor),
+
);
+
}
-249
ufos/src/server.rs
···
-
use crate::storage::StoreReader;
-
use crate::{ConsumerInfo, Nsid, TopCollections, UFOsRecord};
-
use dropshot::endpoint;
-
use dropshot::ApiDescription;
-
use dropshot::ConfigDropshot;
-
use dropshot::ConfigLogging;
-
use dropshot::ConfigLoggingLevel;
-
use dropshot::HttpError;
-
use dropshot::HttpResponseHeaders;
-
use dropshot::HttpResponseOk;
-
use dropshot::Query;
-
use dropshot::RequestContext;
-
use dropshot::ServerBuilder;
-
use schemars::JsonSchema;
-
use serde::{Deserialize, Serialize};
-
use std::collections::HashMap;
-
use std::sync::Arc;
-
-
struct Context {
-
pub spec: Arc<serde_json::Value>,
-
storage: Box<dyn StoreReader>,
-
}
-
-
/// Meta: get the openapi spec for this api
-
#[endpoint {
-
method = GET,
-
path = "/openapi",
-
}]
-
async fn get_openapi(ctx: RequestContext<Context>) -> OkCorsResponse<serde_json::Value> {
-
let spec = (*ctx.context().spec).clone();
-
ok_cors(spec)
-
}
-
-
#[derive(Debug, Serialize, JsonSchema)]
-
struct MetaInfo {
-
storage_name: String,
-
storage: serde_json::Value,
-
consumer: ConsumerInfo,
-
}
-
/// Get meta information about UFOs itself
-
#[endpoint {
-
method = GET,
-
path = "/meta"
-
}]
-
async fn get_meta_info(ctx: RequestContext<Context>) -> OkCorsResponse<MetaInfo> {
-
let Context { storage, .. } = ctx.context();
-
let failed_to_get =
-
|what| move |e| HttpError::for_internal_error(format!("failed to get {what}: {e:?}"));
-
-
let storage_info = storage
-
.get_storage_stats()
-
.await
-
.map_err(failed_to_get("storage info"))?;
-
-
let consumer = storage
-
.get_consumer_info()
-
.await
-
.map_err(failed_to_get("consumer info"))?;
-
-
ok_cors(MetaInfo {
-
storage_name: storage.name(),
-
storage: storage_info,
-
consumer,
-
})
-
}
-
fn to_multiple_nsids(s: &str) -> Result<Vec<Nsid>, String> {
-
let mut out = Vec::new();
-
for collection in s.split(',') {
-
let Ok(nsid) = Nsid::new(collection.to_string()) else {
-
return Err(format!("collection {collection:?} was not a valid NSID"));
-
};
-
out.push(nsid);
-
}
-
Ok(out)
-
}
-
-
#[derive(Debug, Deserialize, JsonSchema)]
-
struct RecordsCollectionsQuery {
-
collection: Option<String>, // JsonSchema not implemented for Nsid :(
-
}
-
#[derive(Debug, Serialize, JsonSchema)]
-
struct ApiRecord {
-
did: String,
-
collection: String,
-
rkey: String,
-
record: Box<serde_json::value::RawValue>,
-
time_us: u64,
-
}
-
impl From<UFOsRecord> for ApiRecord {
-
fn from(ufo: UFOsRecord) -> Self {
-
Self {
-
did: ufo.did.to_string(),
-
collection: ufo.collection.to_string(),
-
rkey: ufo.rkey.to_string(),
-
record: ufo.record,
-
time_us: ufo.cursor.to_raw_u64(),
-
}
-
}
-
}
-
/// Get recent records by collection
-
///
-
/// Multiple collections are supported. they will be delivered in one big array with no
-
/// specified order.
-
#[endpoint {
-
method = GET,
-
path = "/records",
-
}]
-
async fn get_records_by_collections(
-
ctx: RequestContext<Context>,
-
collection_query: Query<RecordsCollectionsQuery>,
-
) -> OkCorsResponse<Vec<ApiRecord>> {
-
let Context { storage, .. } = ctx.context();
-
let mut limit = 42;
-
let query = collection_query.into_inner();
-
let collections = if let Some(provided_collection) = query.collection {
-
to_multiple_nsids(&provided_collection)
-
.map_err(|reason| HttpError::for_bad_request(None, reason))?
-
} else {
-
let all_collections_should_be_nsids: Vec<String> = storage
-
.get_top_collections()
-
.await
-
.map_err(|e| {
-
HttpError::for_internal_error(format!("failed to get top collections: {e:?}"))
-
})?
-
.into();
-
let mut all_collections = Vec::with_capacity(all_collections_should_be_nsids.len());
-
for raw_nsid in all_collections_should_be_nsids {
-
let nsid = Nsid::new(raw_nsid).map_err(|e| {
-
HttpError::for_internal_error(format!("failed to parse nsid: {e:?}"))
-
})?;
-
all_collections.push(nsid);
-
}
-
-
limit = 12;
-
all_collections
-
};
-
-
let records = storage
-
.get_records_by_collections(&collections, limit, true)
-
.await
-
.map_err(|e| HttpError::for_internal_error(e.to_string()))?
-
.into_iter()
-
.map(|r| r.into())
-
.collect();
-
-
ok_cors(records)
-
}
-
-
#[derive(Debug, Deserialize, JsonSchema)]
-
struct TotalSeenCollectionsQuery {
-
collection: String, // JsonSchema not implemented for Nsid :(
-
}
-
#[derive(Debug, Serialize, JsonSchema)]
-
struct TotalCounts {
-
total_records: u64,
-
dids_estimate: u64,
-
}
-
/// Get total records seen by collection
-
#[endpoint {
-
method = GET,
-
path = "/records/total-seen"
-
}]
-
async fn get_records_total_seen(
-
ctx: RequestContext<Context>,
-
collection_query: Query<TotalSeenCollectionsQuery>,
-
) -> OkCorsResponse<HashMap<String, TotalCounts>> {
-
let Context { storage, .. } = ctx.context();
-
-
let query = collection_query.into_inner();
-
let collections = to_multiple_nsids(&query.collection)
-
.map_err(|reason| HttpError::for_bad_request(None, reason))?;
-
-
let mut seen_by_collection = HashMap::with_capacity(collections.len());
-
-
for collection in &collections {
-
let (total_records, dids_estimate) = storage
-
.get_counts_by_collection(collection)
-
.await
-
.map_err(|e| HttpError::for_internal_error(format!("boooo: {e:?}")))?;
-
-
seen_by_collection.insert(
-
collection.to_string(),
-
TotalCounts {
-
total_records,
-
dids_estimate,
-
},
-
);
-
}
-
-
ok_cors(seen_by_collection)
-
}
-
-
/// Get top collections
-
#[endpoint {
-
method = GET,
-
path = "/collections"
-
}]
-
async fn get_top_collections(ctx: RequestContext<Context>) -> OkCorsResponse<TopCollections> {
-
let Context { storage, .. } = ctx.context();
-
let collections = storage
-
.get_top_collections()
-
.await
-
.map_err(|e| HttpError::for_internal_error(format!("boooo: {e:?}")))?;
-
-
ok_cors(collections)
-
}
-
-
pub async fn serve(storage: impl StoreReader + 'static) -> Result<(), String> {
-
let log = ConfigLogging::StderrTerminal {
-
level: ConfigLoggingLevel::Info,
-
}
-
.to_logger("hello-ufos")
-
.map_err(|e| e.to_string())?;
-
-
let mut api = ApiDescription::new();
-
-
api.register(get_openapi).unwrap();
-
api.register(get_meta_info).unwrap();
-
api.register(get_records_by_collections).unwrap();
-
api.register(get_records_total_seen).unwrap();
-
api.register(get_top_collections).unwrap();
-
-
let context = Context {
-
spec: Arc::new(
-
api.openapi("UFOs", semver::Version::new(0, 0, 0))
-
.json()
-
.map_err(|e| e.to_string())?,
-
),
-
storage: Box::new(storage),
-
};
-
-
ServerBuilder::new(api, context, log)
-
.config(ConfigDropshot {
-
bind_address: "0.0.0.0:9999".parse().unwrap(),
-
..Default::default()
-
})
-
.start()
-
.map_err(|error| format!("failed to start server: {}", error))?
-
.await
-
}
-
-
/// awkward helpers
-
type OkCorsResponse<T> = Result<HttpResponseHeaders<HttpResponseOk<T>>, HttpError>;
-
fn ok_cors<T: Send + Sync + Serialize + JsonSchema>(t: T) -> OkCorsResponse<T> {
-
let mut res = HttpResponseHeaders::new_unnamed(HttpResponseOk(t));
-
res.headers_mut()
-
.insert("access-control-allow-origin", "*".parse().unwrap());
-
Ok(res)
-
}
···
+72
ufos/src/server/collections_query.rs
···
···
+
use crate::Nsid;
+
use async_trait::async_trait;
+
use dropshot::{
+
ApiEndpointBodyContentType, ExtractorMetadata, HttpError, Query, RequestContext, ServerContext,
+
SharedExtractor,
+
};
+
use schemars::JsonSchema;
+
use serde::Deserialize;
+
use std::collections::HashSet;
+
+
/// The real type that gets deserialized
+
#[derive(Debug, Deserialize, JsonSchema)]
+
pub struct MultiCollectionQuery {
+
pub collection: Vec<String>,
+
}
+
+
/// A fake corresponding type, used only for docs, so dropshot won't freak
+
/// out about the `Vec`
+
#[derive(Deserialize, JsonSchema)]
+
#[allow(dead_code)]
+
struct MultiCollectionQueryForDocs {
+
/// One or more collection [NSID](https://atproto.com/specs/nsid)s
+
///
+
/// Pass this parameter multiple times to specify multiple collections, like
+
/// `collection=app.bsky.feed.like&collection=app.bsky.feed.post`
+
collection: String,
+
}
+
+
impl TryFrom<MultiCollectionQuery> for HashSet<Nsid> {
+
type Error = HttpError;
+
fn try_from(mcq: MultiCollectionQuery) -> Result<Self, Self::Error> {
+
let mut out = HashSet::with_capacity(mcq.collection.len());
+
for c in mcq.collection {
+
let nsid = Nsid::new(c).map_err(|e| {
+
HttpError::for_bad_request(
+
None,
+
format!("failed to convert collection to an NSID: {e:?}"),
+
)
+
})?;
+
out.insert(nsid);
+
}
+
Ok(out)
+
}
+
}
+
+
// dropshot's `SharedExtractor` impl for `Query<QueryType>` works by parsing
+
// the query string into a `QueryType`; this impl does the same directly for
+
// `MultiCollectionQuery`, so the repeated `collection` parameter lands in a `Vec`.
+
#[async_trait]
+
impl SharedExtractor for MultiCollectionQuery {
+
async fn from_request<Context: ServerContext>(
+
ctx: &RequestContext<Context>,
+
) -> Result<MultiCollectionQuery, HttpError> {
+
let raw_query = ctx.request.uri().query().unwrap_or("");
+
let q = serde_qs::from_str(raw_query).map_err(|e| {
+
HttpError::for_bad_request(None, format!("unable to parse query string: {}", e))
+
})?;
+
Ok(q)
+
}
+
+
fn metadata(body_content_type: ApiEndpointBodyContentType) -> ExtractorMetadata {
+
// HACK: query type switcheroo: passing MultiCollectionQuery to
+
// `metadata` would "helpfully" panic because dropshot believes we can
+
// only have scalar types in a query.
+
//
+
// so instead we have a fake second type whose only job is to look the
+
// same as MultiCollectionQuery except that it has `String` instead of
+
// `Vec<String>`, which dropshot will accept, and generate ~close-enough
+
// docs for.
+
<Query<MultiCollectionQueryForDocs> as SharedExtractor>::metadata(body_content_type)
+
}
+
}
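A sketch of the conversion this extractor feeds into, exercised directly (the test module is hypothetical; parsing of the raw query string itself is left to `serde_qs` as above):
#[cfg(test)]
mod multi_collection_examples {
    use super::MultiCollectionQuery;
    use crate::Nsid;
    use std::collections::HashSet;

    #[test]
    fn collections_become_a_deduplicated_nsid_set() {
        // roughly what `?collection=app.bsky.feed.like&collection=app.bsky.feed.post`
        // is expected to deserialize into
        let mcq = MultiCollectionQuery {
            collection: vec![
                "app.bsky.feed.like".to_string(),
                "app.bsky.feed.post".to_string(),
                "app.bsky.feed.like".to_string(), // duplicates collapse in the set
            ],
        };
        let nsids: HashSet<Nsid> = mcq.try_into().unwrap();
        assert_eq!(nsids.len(), 2);

        // invalid NSIDs are rejected (surfaced to the client as a 400)
        let bad = MultiCollectionQuery {
            collection: vec!["not an nsid".to_string()],
        };
        assert!(HashSet::<Nsid>::try_from(bad).is_err());
    }
}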
+23
ufos/src/server/cors.rs
···
···
+
use dropshot::{HttpError, HttpResponseHeaders, HttpResponseOk};
+
use schemars::JsonSchema;
+
use serde::Serialize;
+
+
pub type OkCorsResponse<T> = Result<HttpResponseHeaders<HttpResponseOk<T>>, HttpError>;
+
+
/// Helper for constructing Ok responses: return OkCors(T).into()
+
/// (not happy with this yet)
+
pub struct OkCors<T: Serialize + JsonSchema + Send + Sync>(pub T);
+
+
impl<T> From<OkCors<T>> for OkCorsResponse<T>
+
where
+
T: Serialize + JsonSchema + Send + Sync,
+
{
+
fn from(ok: OkCors<T>) -> OkCorsResponse<T> {
+
let mut res = HttpResponseHeaders::new_unnamed(HttpResponseOk(ok.0));
+
res.headers_mut()
+
.insert("access-control-allow-origin", "*".parse().unwrap());
+
Ok(res)
+
}
+
}
+
+
// TODO: cors for HttpError
+638
ufos/src/server/mod.rs
···
···
+
mod collections_query;
+
mod cors;
+
+
use crate::index_html::INDEX_HTML;
+
use crate::storage::StoreReader;
+
use crate::store_types::{HourTruncatedCursor, WeekTruncatedCursor};
+
use crate::{
+
ConsumerInfo, Cursor, JustCount, Nsid, NsidCount, NsidPrefix, OrderCollectionsBy, PrefixChild,
+
UFOsRecord,
+
};
+
use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine as _};
+
use chrono::{DateTime, Utc};
+
use collections_query::MultiCollectionQuery;
+
use cors::{OkCors, OkCorsResponse};
+
use dropshot::endpoint;
+
use dropshot::ApiDescription;
+
use dropshot::Body;
+
use dropshot::ConfigDropshot;
+
use dropshot::ConfigLogging;
+
use dropshot::ConfigLoggingLevel;
+
use dropshot::HttpError;
+
use dropshot::Query;
+
use dropshot::RequestContext;
+
use dropshot::ServerBuilder;
+
+
use http::{Response, StatusCode};
+
use schemars::JsonSchema;
+
use serde::{Deserialize, Serialize};
+
use std::collections::{HashMap, HashSet};
+
use std::sync::Arc;
+
use std::time::{Duration, SystemTime, UNIX_EPOCH};
+
+
struct Context {
+
pub spec: Arc<serde_json::Value>,
+
storage: Box<dyn StoreReader>,
+
}
+
+
fn dt_to_cursor(dt: DateTime<Utc>) -> Result<HourTruncatedCursor, HttpError> {
+
let t = dt.timestamp_micros();
+
if t < 0 {
+
Err(HttpError::for_bad_request(None, "timestamp too old".into()))
+
} else {
+
let t = t as u64;
+
let t_now = SystemTime::now()
+
.duration_since(UNIX_EPOCH)
+
.unwrap()
+
.as_micros() as u64;
+
const ONE_HOUR: u64 = 60 * 60 * 1_000_000;
+
if t > t_now && (t - t_now > 2 * ONE_HOUR) {
+
Err(HttpError::for_bad_request(None, "future timestamp".into()))
+
} else {
+
Ok(HourTruncatedCursor::truncate_raw_u64(t))
+
}
+
}
+
}
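A sketch of the boundary behaviour (hypothetical test; exact cursor values depend on the wall clock):
#[cfg(test)]
mod dt_to_cursor_examples {
    use super::*;

    #[test]
    fn rejects_only_far_future_timestamps() {
        let now = Utc::now();
        // past and near-future datetimes truncate to an hourly cursor
        assert!(dt_to_cursor(now - chrono::Duration::days(7)).is_ok());
        assert!(dt_to_cursor(now).is_ok());
        // anything more than ~2h ahead of the wall clock is rejected
        assert!(dt_to_cursor(now + chrono::Duration::hours(3)).is_err());
    }
}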
+
+
/// Serve index page as html
+
#[endpoint {
+
method = GET,
+
path = "/",
+
/*
+
* not useful to have this in openapi
+
*/
+
unpublished = true,
+
}]
+
async fn index(_ctx: RequestContext<Context>) -> Result<Response<Body>, HttpError> {
+
Ok(Response::builder()
+
.status(StatusCode::OK)
+
.header(http::header::CONTENT_TYPE, "text/html")
+
.body(INDEX_HTML.into())?)
+
}
+
+
/// Meta: get the openapi spec for this api
+
#[endpoint {
+
method = GET,
+
path = "/openapi",
+
/*
+
* not useful to have this in openapi
+
*/
+
unpublished = true,
+
}]
+
async fn get_openapi(ctx: RequestContext<Context>) -> OkCorsResponse<serde_json::Value> {
+
let spec = (*ctx.context().spec).clone();
+
OkCors(spec).into()
+
}
+
+
#[derive(Debug, Serialize, JsonSchema)]
+
struct MetaInfo {
+
storage_name: String,
+
storage: serde_json::Value,
+
consumer: ConsumerInfo,
+
}
+
/// UFOs meta-info
+
#[endpoint {
+
method = GET,
+
path = "/meta"
+
}]
+
async fn get_meta_info(ctx: RequestContext<Context>) -> OkCorsResponse<MetaInfo> {
+
let Context { storage, .. } = ctx.context();
+
let failed_to_get =
+
|what| move |e| HttpError::for_internal_error(format!("failed to get {what}: {e:?}"));
+
+
let storage_info = storage
+
.get_storage_stats()
+
.await
+
.map_err(failed_to_get("storage info"))?;
+
+
let consumer = storage
+
.get_consumer_info()
+
.await
+
.map_err(failed_to_get("consumer info"))?;
+
+
OkCors(MetaInfo {
+
storage_name: storage.name(),
+
storage: storage_info,
+
consumer,
+
})
+
.into()
+
}
+
+
// TODO: replace with normal (🙃) multi-qs value somehow
+
fn to_multiple_nsids(s: &str) -> Result<HashSet<Nsid>, String> {
+
let mut out = HashSet::new();
+
for collection in s.split(',') {
+
let Ok(nsid) = Nsid::new(collection.to_string()) else {
+
return Err(format!("collection {collection:?} was not a valid NSID"));
+
};
+
out.insert(nsid);
+
}
+
Ok(out)
+
}
+
+
#[derive(Debug, Deserialize, JsonSchema)]
+
struct RecordsCollectionsQuery {
+
collection: Option<String>, // JsonSchema not implemented for Nsid :(
+
}
+
#[derive(Debug, Serialize, JsonSchema)]
+
struct ApiRecord {
+
did: String,
+
collection: String,
+
rkey: String,
+
record: Box<serde_json::value::RawValue>,
+
time_us: u64,
+
}
+
impl From<UFOsRecord> for ApiRecord {
+
fn from(ufo: UFOsRecord) -> Self {
+
Self {
+
did: ufo.did.to_string(),
+
collection: ufo.collection.to_string(),
+
rkey: ufo.rkey.to_string(),
+
record: ufo.record,
+
time_us: ufo.cursor.to_raw_u64(),
+
}
+
}
+
}
+
/// Record samples
+
///
+
/// Get most recent records seen in the firehose, by collection NSID
+
///
+
/// Multiple collections are supported. They will be delivered in one big array with no
+
/// specified order.
+
#[endpoint {
+
method = GET,
+
path = "/records",
+
}]
+
async fn get_records_by_collections(
+
ctx: RequestContext<Context>,
+
collection_query: Query<RecordsCollectionsQuery>,
+
) -> OkCorsResponse<Vec<ApiRecord>> {
+
let Context { storage, .. } = ctx.context();
+
let mut limit = 42;
+
let query = collection_query.into_inner();
+
let collections = if let Some(provided_collection) = query.collection {
+
to_multiple_nsids(&provided_collection)
+
.map_err(|reason| HttpError::for_bad_request(None, reason))?
+
} else {
+
limit = 12;
+
let min_time_ago = SystemTime::now() - Duration::from_secs(86_400 * 3); // we want at least 3 days of data
+
let since: WeekTruncatedCursor = Cursor::at(min_time_ago).into();
+
let (collections, _) = storage
+
.get_collections(
+
1000,
+
Default::default(),
+
Some(since.try_as().unwrap()),
+
None,
+
)
+
.await
+
.map_err(|e| HttpError::for_internal_error(e.to_string()))?;
+
collections
+
.into_iter()
+
.map(|c| Nsid::new(c.nsid).unwrap())
+
.collect()
+
};
+
+
let records = storage
+
.get_records_by_collections(collections, limit, true)
+
.await
+
.map_err(|e| HttpError::for_internal_error(e.to_string()))?
+
.into_iter()
+
.map(|r| r.into())
+
.collect();
+
+
OkCors(records).into()
+
}
+
+
#[derive(Debug, Deserialize, JsonSchema)]
+
struct CollectionsStatsQuery {
+
/// Limit stats to those seen after this UTC datetime
+
///
+
/// default: 1 week ago
+
since: Option<DateTime<Utc>>,
+
/// Limit stats to those seen before this UTC datetime
+
///
+
/// default: now
+
until: Option<DateTime<Utc>>,
+
}
+
/// Collection stats
+
///
+
/// Get record statistics for collections during a specific time period.
+
///
+
/// Note: the statistics are "rolled up" into hourly buckets in the background,
+
/// so the data here can be as stale as that background task is behind. See the
+
/// meta info endpoint to find out how up-to-date the rollup currently is. (In
+
/// general it should be pretty close to live)
+
#[endpoint {
+
method = GET,
+
path = "/collections/stats"
+
}]
+
async fn get_collection_stats(
+
ctx: RequestContext<Context>,
+
collections_query: MultiCollectionQuery,
+
query: Query<CollectionsStatsQuery>,
+
) -> OkCorsResponse<HashMap<String, JustCount>> {
+
let Context { storage, .. } = ctx.context();
+
let q = query.into_inner();
+
let collections: HashSet<Nsid> = collections_query.try_into()?;
+
+
let since = q.since.map(dt_to_cursor).transpose()?.unwrap_or_else(|| {
+
let week_ago_secs = 7 * 86_400;
+
let week_ago = SystemTime::now() - Duration::from_secs(week_ago_secs);
+
Cursor::at(week_ago).into()
+
});
+
+
let until = q.until.map(dt_to_cursor).transpose()?;
+
+
let mut seen_by_collection = HashMap::with_capacity(collections.len());
+
+
for collection in &collections {
+
let counts = storage
+
.get_collection_counts(collection, since, until)
+
.await
+
.map_err(|e| HttpError::for_internal_error(format!("boooo: {e:?}")))?;
+
+
seen_by_collection.insert(collection.to_string(), counts);
+
}
+
+
OkCors(seen_by_collection).into()
+
}
+
+
#[derive(Debug, Serialize, JsonSchema)]
+
struct CollectionsResponse {
+
/// Each known collection and its associated statistics
+
///
+
/// The order is unspecified.
+
collections: Vec<NsidCount>,
+
/// Include in a follow-up request to get the next page of results, if more are available
+
cursor: Option<String>,
+
}
+
#[derive(Debug, Deserialize, JsonSchema)]
+
#[serde(rename_all = "kebab-case")]
+
pub enum CollectionsQueryOrder {
+
RecordsCreated,
+
DidsEstimate,
+
}
+
impl From<&CollectionsQueryOrder> for OrderCollectionsBy {
+
fn from(q: &CollectionsQueryOrder) -> Self {
+
match q {
+
CollectionsQueryOrder::RecordsCreated => OrderCollectionsBy::RecordsCreated,
+
CollectionsQueryOrder::DidsEstimate => OrderCollectionsBy::DidsEstimate,
+
}
+
}
+
}
+
#[derive(Debug, Deserialize, JsonSchema)]
+
struct CollectionsQuery {
+
/// The maximum number of collections to return in one request.
+
///
+
/// Default: `100` normally, `32` if `order` is specified.
+
#[schemars(range(min = 1, max = 200))]
+
limit: Option<usize>,
+
/// Get a paginated response with more collections.
+
///
+
/// Always omit the cursor for the first request. If more collections than the limit are available, the response will contain a non-null `cursor` to include with the next request.
+
///
+
/// `cursor` is mutually exclusive with `order`.
+
cursor: Option<String>,
+
/// Limit collections and statistics to those seen after this UTC datetime
+
since: Option<DateTime<Utc>>,
+
/// Limit collections and statistics to those seen before this UTC datetime
+
until: Option<DateTime<Utc>>,
+
/// Get a limited, sorted list
+
///
+
/// Mutually exclusive with `cursor` -- sorted results cannot be paged.
+
order: Option<CollectionsQueryOrder>,
+
}
+
+
/// List collections
+
///
+
/// With statistics.
+
///
+
/// ## To fetch a full list:
+
///
+
/// Omit the `order` parameter and page through the results using the `cursor`. There have been a lot of collections seen in the ATmosphere, well over 400 at time of writing, so you *will* need to make a series of paginated requests with `cursor`s to get them all.
+
///
+
/// The set of collections across multiple requests is not guaranteed to be a perfectly consistent snapshot:
+
///
+
/// - all collection NSIDs observed before the first request will be included in the results
+
///
+
/// - *new* NSIDs observed in the firehose *while paging* might be included or excluded from the final set
+
///
+
/// - no duplicate NSIDs will occur in the combined results
+
///
+
/// In practice this is close enough for most use-cases to not worry about.
+
///
+
/// ## To fetch the top collection NSIDs:
+
///
+
/// Specify the `order` parameter (must be either `records-created` or `dids-estimate`). Note that ordered results cannot be paged.
+
///
+
/// All statistics are bucketed hourly, so the most granular effective time boundary for `since` and `until` is one hour.
+
#[endpoint {
+
method = GET,
+
path = "/collections"
+
}]
+
async fn get_collections(
+
ctx: RequestContext<Context>,
+
query: Query<CollectionsQuery>,
+
) -> OkCorsResponse<CollectionsResponse> {
+
let Context { storage, .. } = ctx.context();
+
let q = query.into_inner();
+
+
if q.cursor.is_some() && q.order.is_some() {
+
let msg = "`cursor` is mutually exclusive with `order`. ordered results cannot be paged.";
+
return Err(HttpError::for_bad_request(None, msg.to_string()));
+
}
+
+
let order = if let Some(ref o) = q.order {
+
o.into()
+
} else {
+
let cursor = q
+
.cursor
+
.and_then(|c| if c.is_empty() { None } else { Some(c) })
+
.map(|c| URL_SAFE_NO_PAD.decode(&c))
+
.transpose()
+
.map_err(|e| HttpError::for_bad_request(None, format!("invalid cursor: {e:?}")))?;
+
OrderCollectionsBy::Lexi { cursor }
+
};
+
+
let limit = match (q.limit, q.order) {
+
(Some(limit), _) => limit,
+
(None, Some(_)) => 32,
+
(None, None) => 100,
+
};
+
+
if !(1..=200).contains(&limit) {
+
let msg = format!("limit not in 1..=200: {}", limit);
+
return Err(HttpError::for_bad_request(None, msg));
+
}
+
+
let since = q.since.map(dt_to_cursor).transpose()?;
+
let until = q.until.map(dt_to_cursor).transpose()?;
+
+
let (collections, next_cursor) = storage
+
.get_collections(limit, order, since, until)
+
.await
+
.map_err(|e| HttpError::for_internal_error(format!("oh shoot: {e:?}")))?;
+
+
let next_cursor = next_cursor.map(|c| URL_SAFE_NO_PAD.encode(c));
+
+
OkCors(CollectionsResponse {
+
collections,
+
cursor: next_cursor,
+
})
+
.into()
+
}
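A hypothetical client-side sketch of the paging contract described above; the `reqwest` usage (with its `json` feature enabled), the `Page` struct, and the base URL are assumptions, while the `/collections` parameters and the cursor-is-null termination come from this handler:
use serde::Deserialize;

#[derive(Deserialize)]
struct Page {
    collections: Vec<serde_json::Value>,
    cursor: Option<String>,
}

// Page through every known collection by following `cursor` until it is null.
async fn fetch_all_collections(base: &str) -> Result<Vec<serde_json::Value>, reqwest::Error> {
    let mut all = Vec::new();
    let mut cursor: Option<String> = None;
    loop {
        let mut url = format!("{base}/collections?limit=200");
        if let Some(c) = &cursor {
            url.push_str(&format!("&cursor={c}")); // URL-safe base64, no escaping needed
        }
        let page: Page = reqwest::get(&url).await?.json().await?;
        all.extend(page.collections);
        match page.cursor {
            Some(next) => cursor = Some(next),
            None => break, // no more pages
        }
    }
    Ok(all)
}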
+
+
#[derive(Debug, Serialize, JsonSchema)]
+
struct PrefixResponse {
+
/// Note that total may not include counts beyond the current page (TODO)
+
total: JustCount,
+
children: Vec<PrefixChild>,
+
/// Include in a follow-up request to get the next page of results, if more are available
+
cursor: Option<String>,
+
}
+
#[derive(Debug, Deserialize, JsonSchema)]
+
struct PrefixQuery {
+
/// The NSID prefix (lexicon group) to list collections under.
+
/// The final segment of a collection NSID is the `name`, and everything before it is called its `group`. eg:
+
///
+
/// - `app.bsky.feed.post` and `app.bsky.feed.like` are both in the _lexicon group_ "`app.bsky.feed`".
+
///
+
prefix: String,
+
/// The maximum number of collections to return in one request.
+
///
+
/// The number of items actually returned may be less than the limit. If paginating, this does **not** indicate that no
+
/// more items are available! Check if the `cursor` in the response is `null` to determine the end of items.
+
///
+
/// Default: `100` normally, `32` if `order` is specified.
+
#[schemars(range(min = 1, max = 200))]
+
limit: Option<usize>,
+
/// Get a paginated response with more collections.
+
///
+
/// Always omit the cursor for the first request. If more collections than the limit are available, the response will contain a non-null `cursor` to include with the next request.
+
///
+
/// `cursor` is mutually exclusive with `order`.
+
cursor: Option<String>,
+
/// Limit collections and statistics to those seen after this UTC datetime
+
///
+
/// Default: all-time
+
since: Option<DateTime<Utc>>,
+
/// Limit collections and statistics to those seen before this UTC datetime
+
///
+
/// Default: now
+
until: Option<DateTime<Utc>>,
+
/// Get a limited, sorted list
+
///
+
/// Mutually exclusive with `cursor` -- sorted results cannot be paged.
+
order: Option<CollectionsQueryOrder>,
+
}
+
/// Prefix-filter collections list
+
///
+
/// This endpoint enumerates all collection NSIDs for a lexicon group.
+
///
+
/// ## To fetch a full list:
+
///
+
/// Omit the `order` parameter and page through the results using the `cursor`. There have been a lot of collections seen in the ATmosphere, well over 400 at time of writing, so you *will* need to make a series of paginated requests with `cursor`s to get them all.
+
///
+
/// The set of collections across multiple requests is not guaranteed to be a perfectly consistent snapshot:
+
///
+
/// - all collection NSIDs observed before the first request will be included in the results
+
///
+
/// - *new* NSIDs observed in the firehose *while paging* might be included or excluded from the final set
+
///
+
/// - no duplicate NSIDs will occur in the combined results
+
///
+
/// In practice this is close enough for most use-cases to not worry about.
+
///
+
/// ## To fetch the top collection NSIDs:
+
///
+
/// Specify the `order` parameter (must be either `records-created` or `dids-estimate`). Note that ordered results cannot be paged.
+
///
+
/// All statistics are bucketed hourly, so the most granular effective time boundary for `since` and `until` is one hour.
+
#[endpoint {
+
method = GET,
+
path = "/prefix"
+
}]
+
async fn get_prefix(
+
ctx: RequestContext<Context>,
+
query: Query<PrefixQuery>,
+
) -> OkCorsResponse<PrefixResponse> {
+
let Context { storage, .. } = ctx.context();
+
let q = query.into_inner();
+
+
let prefix = NsidPrefix::new(&q.prefix).map_err(|e| {
+
HttpError::for_bad_request(
+
None,
+
format!("{:?} was not a valid NSID prefix: {e:?}", q.prefix),
+
)
+
})?;
+
+
if q.cursor.is_some() && q.order.is_some() {
+
let msg = "`cursor` is mutually exclusive with `order`. ordered results cannot be paged.";
+
return Err(HttpError::for_bad_request(None, msg.to_string()));
+
}
+
+
let order = if let Some(ref o) = q.order {
+
o.into()
+
} else {
+
let cursor = q
+
.cursor
+
.and_then(|c| if c.is_empty() { None } else { Some(c) })
+
.map(|c| URL_SAFE_NO_PAD.decode(&c))
+
.transpose()
+
.map_err(|e| HttpError::for_bad_request(None, format!("invalid cursor: {e:?}")))?;
+
OrderCollectionsBy::Lexi { cursor }
+
};
+
+
let limit = match (q.limit, q.order) {
+
(Some(limit), _) => limit,
+
(None, Some(_)) => 32,
+
(None, None) => 100,
+
};
+
+
if !(1..=200).contains(&limit) {
+
let msg = format!("limit not in 1..=200: {}", limit);
+
return Err(HttpError::for_bad_request(None, msg));
+
}
+
+
let since = q.since.map(dt_to_cursor).transpose()?;
+
let until = q.until.map(dt_to_cursor).transpose()?;
+
+
let (total, children, next_cursor) = storage
+
.get_prefix(prefix, limit, order, since, until)
+
.await
+
.map_err(|e| HttpError::for_internal_error(format!("oh shoot: {e:?}")))?;
+
+
let next_cursor = next_cursor.map(|c| URL_SAFE_NO_PAD.encode(c));
+
+
OkCors(PrefixResponse {
+
total,
+
children,
+
cursor: next_cursor,
+
})
+
.into()
+
}
+
+
#[derive(Debug, Deserialize, JsonSchema)]
+
struct CollectionTimeseriesQuery {
+
collection: String, // JsonSchema not implemented for Nsid :(
+
/// Limit collections and statistics to those seen after this UTC datetime
+
///
+
/// default: 1 week ago
+
since: Option<DateTime<Utc>>,
+
/// Limit collections and statistics to those seen before this UTC datetime
+
///
+
/// default: now
+
until: Option<DateTime<Utc>>,
+
/// time step between data points, in seconds
+
///
+
/// the step will be rounded down to the nearest hour
+
///
+
/// default: 86400 (24hrs)
+
#[schemars(range(min = 3600))]
+
step: Option<u64>,
+
// todo: rolling averages
+
}
+
#[derive(Debug, Serialize, JsonSchema)]
+
struct CollectionTimeseriesResponse {
+
range: Vec<DateTime<Utc>>,
+
series: HashMap<String, Vec<JustCount>>,
+
}
+
/// Collection timeseries stats
+
#[endpoint {
+
method = GET,
+
path = "/timeseries"
+
}]
+
async fn get_timeseries(
+
ctx: RequestContext<Context>,
+
query: Query<CollectionTimeseriesQuery>,
+
) -> OkCorsResponse<CollectionTimeseriesResponse> {
+
let Context { storage, .. } = ctx.context();
+
let q = query.into_inner();
+
+
let since = q.since.map(dt_to_cursor).transpose()?.unwrap_or_else(|| {
+
let week_ago_secs = 7 * 86_400;
+
let week_ago = SystemTime::now() - Duration::from_secs(week_ago_secs);
+
Cursor::at(week_ago).into()
+
});
+
+
let until = q.until.map(dt_to_cursor).transpose()?;
+
+
let step = if let Some(secs) = q.step {
+
if secs < 3600 {
+
let msg = format!("step is too small: {}", secs);
+
Err(HttpError::for_bad_request(None, msg))?;
+
}
+
(secs / 3600) * 3600 // truncate to hour
+
} else {
+
86_400
+
};
+
+
let nsid = Nsid::new(q.collection).map_err(|e| {
+
HttpError::for_bad_request(None, format!("collection was not a valid NSID: {:?}", e))
+
})?;
+
+
let (range_cursors, series) = storage
+
.get_timeseries(vec![nsid], since, until, step)
+
.await
+
.map_err(|e| HttpError::for_internal_error(format!("oh shoot: {e:?}")))?;
+
+
let range = range_cursors
+
.into_iter()
+
.map(|c| DateTime::<Utc>::from_timestamp_micros(c.to_raw_u64() as i64).unwrap())
+
.collect();
+
+
let series = series
+
.into_iter()
+
.map(|(k, v)| (k.to_string(), v.iter().map(Into::into).collect()))
+
.collect();
+
+
OkCors(CollectionTimeseriesResponse { range, series }).into()
+
}
+
+
pub async fn serve(storage: impl StoreReader + 'static) -> Result<(), String> {
+
let log = ConfigLogging::StderrTerminal {
+
level: ConfigLoggingLevel::Info,
+
}
+
.to_logger("hello-ufos")
+
.map_err(|e| e.to_string())?;
+
+
let mut api = ApiDescription::new();
+
+
api.register(index).unwrap();
+
api.register(get_openapi).unwrap();
+
api.register(get_meta_info).unwrap();
+
api.register(get_records_by_collections).unwrap();
+
api.register(get_collection_stats).unwrap();
+
api.register(get_collections).unwrap();
+
api.register(get_prefix).unwrap();
+
api.register(get_timeseries).unwrap();
+
+
let context = Context {
+
spec: Arc::new(
+
api.openapi(
+
"UFOs: Every lexicon in the ATmosphere",
+
env!("CARGO_PKG_VERSION")
+
.parse()
+
.inspect_err(|e| {
+
log::warn!("failed to parse cargo package version for openapi: {e:?}")
+
})
+
.unwrap_or(semver::Version::new(0, 0, 1)),
+
)
+
.description("Samples and statistics of atproto records by their collection NSID")
+
.contact_name("part of @microcosm.blue")
+
.contact_url("https://microcosm.blue")
+
.json()
+
.map_err(|e| e.to_string())?,
+
),
+
storage: Box::new(storage),
+
};
+
+
ServerBuilder::new(api, context, log)
+
.config(ConfigDropshot {
+
bind_address: "0.0.0.0:9999".parse().unwrap(),
+
..Default::default()
+
})
+
.start()
+
.map_err(|error| format!("failed to start server: {}", error))?
+
.await
+
}
+99 -10
ufos/src/storage.rs
···
-
// use crate::store_types::CountsValue;
-
use crate::{error::StorageError, ConsumerInfo, Cursor, EventBatch, TopCollections, UFOsRecord};
use async_trait::async_trait;
use jetstream::exports::{Did, Nsid};
use std::path::Path;
pub type StorageResult<T> = Result<T, StorageError>;
-
pub trait StorageWhatever<R: StoreReader, W: StoreWriter, C> {
fn init(
path: impl AsRef<Path>,
endpoint: String,
force_endpoint: bool,
config: C,
-
) -> StorageResult<(R, W, Option<Cursor>)>
where
Self: Sized;
}
-
pub trait StoreWriter: Send + Sync {
fn insert_batch<const LIMIT: usize>(
&mut self,
event_batch: EventBatch<LIMIT>,
) -> StorageResult<()>;
-
fn step_rollup(&mut self) -> StorageResult<usize>;
-
fn trim_collection(&mut self, collection: &Nsid, limit: usize) -> StorageResult<()>;
fn delete_account(&mut self, did: &Did) -> StorageResult<usize>;
}
#[async_trait]
···
async fn get_consumer_info(&self) -> StorageResult<ConsumerInfo>;
-
async fn get_top_collections(&self) -> StorageResult<TopCollections>;
-
async fn get_counts_by_collection(&self, collection: &Nsid) -> StorageResult<(u64, u64)>;
async fn get_records_by_collections(
&self,
-
collections: &[Nsid],
limit: usize,
expand_each_collection: bool,
) -> StorageResult<Vec<UFOsRecord>>;
···
+
use crate::store_types::{CountsValue, HourTruncatedCursor, SketchSecretPrefix};
+
use crate::{
+
error::StorageError, ConsumerInfo, Cursor, EventBatch, JustCount, NsidCount, NsidPrefix,
+
OrderCollectionsBy, PrefixChild, UFOsRecord,
+
};
use async_trait::async_trait;
use jetstream::exports::{Did, Nsid};
+
use std::collections::{HashMap, HashSet};
use std::path::Path;
+
use std::time::{Duration, SystemTime};
+
use tokio::sync::mpsc::Receiver;
+
use tokio_util::sync::CancellationToken;
pub type StorageResult<T> = Result<T, StorageError>;
+
pub trait StorageWhatever<R: StoreReader, W: StoreWriter<B>, B: StoreBackground, C> {
fn init(
path: impl AsRef<Path>,
endpoint: String,
force_endpoint: bool,
config: C,
+
) -> StorageResult<(R, W, Option<Cursor>, SketchSecretPrefix)>
where
Self: Sized;
}
+
#[async_trait]
+
pub trait StoreWriter<B: StoreBackground>: Clone + Send + Sync
+
where
+
Self: 'static,
+
{
+
fn background_tasks(&mut self, reroll: bool) -> StorageResult<B>;
+
+
async fn receive_batches<const LIMIT: usize>(
+
self,
+
mut batches: Receiver<EventBatch<LIMIT>>,
+
) -> StorageResult<()> {
+
while let Some(event_batch) = batches.recv().await {
+
let token = CancellationToken::new();
+
let cancelled = token.clone();
+
tokio::spawn(async move {
+
let started = SystemTime::now();
+
let mut concerned = false;
+
loop {
+
tokio::select! {
+
_ = tokio::time::sleep(Duration::from_secs_f64(3.)) => {
+
log::warn!("taking a long time to insert an event batch ({:?})...", started.elapsed());
+
concerned = true;
+
}
+
_ = cancelled.cancelled() => {
+
if concerned {
+
log::warn!("finally inserted slow event batch (or failed) after {:?}", started.elapsed());
+
}
+
break
+
}
+
}
+
}
+
});
+
tokio::task::spawn_blocking({
+
let mut me = self.clone();
+
move || {
+
let _guard = token.drop_guard();
+
me.insert_batch(event_batch)
+
}
+
})
+
.await??;
+
}
+
+
Err(StorageError::BatchSenderExited)
+
}
+
fn insert_batch<const LIMIT: usize>(
&mut self,
event_batch: EventBatch<LIMIT>,
) -> StorageResult<()>;
+
fn step_rollup(&mut self) -> StorageResult<(usize, HashSet<Nsid>)>;
+
fn trim_collection(
+
&mut self,
+
collection: &Nsid,
+
limit: usize,
+
full_scan: bool,
+
) -> StorageResult<(usize, usize, bool)>;
fn delete_account(&mut self, did: &Did) -> StorageResult<usize>;
+
}
+
+
#[async_trait]
+
pub trait StoreBackground: Send + Sync {
+
async fn run(mut self, backfill: bool) -> StorageResult<()>;
}
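// Illustrative only, not part of this change: one way a caller might wire these
// traits together, assuming a concrete writer named `writer`, a batch LIMIT of
// 64, and a channel capacity of 16 (all placeholder values).
//
//     let bg = writer.background_tasks(false)?; // may only be taken once
//     let (tx, rx) = tokio::sync::mpsc::channel::<EventBatch<64>>(16);
//     tokio::spawn(bg.run(false));                       // rollups, trims, account deletes
//     tokio::spawn(writer.clone().receive_batches(rx));  // insert loop with slow-batch watchdog
//     // `tx` goes to the jetstream consumer; receive_batches only returns on error.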
#[async_trait]
···
async fn get_consumer_info(&self) -> StorageResult<ConsumerInfo>;
+
async fn get_collections(
+
&self,
+
limit: usize,
+
order: OrderCollectionsBy,
+
since: Option<HourTruncatedCursor>,
+
until: Option<HourTruncatedCursor>,
+
) -> StorageResult<(Vec<NsidCount>, Option<Vec<u8>>)>;
+
async fn get_prefix(
+
&self,
+
prefix: NsidPrefix,
+
limit: usize,
+
order: OrderCollectionsBy,
+
since: Option<HourTruncatedCursor>,
+
until: Option<HourTruncatedCursor>,
+
) -> StorageResult<(JustCount, Vec<PrefixChild>, Option<Vec<u8>>)>;
+
+
async fn get_timeseries(
+
&self,
+
collections: Vec<Nsid>,
+
since: HourTruncatedCursor,
+
until: Option<HourTruncatedCursor>,
+
step: u64,
+
) -> StorageResult<(Vec<HourTruncatedCursor>, HashMap<Nsid, Vec<CountsValue>>)>;
+
+
async fn get_collection_counts(
+
&self,
+
collection: &Nsid,
+
since: HourTruncatedCursor,
+
until: Option<HourTruncatedCursor>,
+
) -> StorageResult<JustCount>;
async fn get_records_by_collections(
&self,
+
collections: HashSet<Nsid>,
limit: usize,
expand_each_collection: bool,
) -> StorageResult<Vec<UFOsRecord>>;
+1367 -364
ufos/src/storage_fjall.rs
···
-
use crate::db_types::{db_complete, DbBytes, DbStaticStr, StaticStr};
use crate::error::StorageError;
-
use crate::storage::{StorageResult, StorageWhatever, StoreReader, StoreWriter};
use crate::store_types::{
-
AllTimeRollupKey, CountsValue, DeleteAccountQueueKey, DeleteAccountQueueVal,
-
HourTruncatedCursor, HourlyRollupKey, JetstreamCursorKey, JetstreamCursorValue,
-
JetstreamEndpointKey, JetstreamEndpointValue, LiveCountsKey, NewRollupCursorKey,
-
NewRollupCursorValue, NsidRecordFeedKey, NsidRecordFeedVal, RecordLocationKey,
-
RecordLocationMeta, RecordLocationVal, RecordRawValue, TakeoffKey, TakeoffValue,
-
WeekTruncatedCursor, WeeklyRollupKey,
};
-
use crate::{CommitAction, ConsumerInfo, Did, EventBatch, Nsid, TopCollections, UFOsRecord};
use async_trait::async_trait;
-
use fjall::{Batch as FjallBatch, Config, Keyspace, PartitionCreateOptions, PartitionHandle};
use jetstream::events::Cursor;
-
use std::collections::HashMap;
use std::path::Path;
-
use std::time::SystemTime;
-
const MAX_BATCHED_CLEANUP_SIZE: usize = 1024; // try to commit progress for longer feeds
const MAX_BATCHED_ACCOUNT_DELETE_RECORDS: usize = 1024;
const MAX_BATCHED_ROLLUP_COUNTS: usize = 256;
···
/// - key: "takeoff" (literal)
/// - val: u64 (micros timestamp, not from jetstream for now so not precise)
///
/// - Rollup cursor (bg work: roll stats into hourlies, delete accounts, old record deletes)
/// - key: "rollup_cursor" (literal)
/// - val: u64 (tracks behind js_cursor)
///
///
/// Partition: 'feed'
///
···
/// - key: "live_counts" || u64 || nullstr (js_cursor, nsid)
/// - val: u64 || HLL (count (not cursor), estimator)
///
/// - Hourly total record counts and dids estimate per collection
/// - key: "hourly_counts" || u64 || nullstr (hour, nsid)
/// - val: u64 || HLL (count (not cursor), estimator)
///
/// - Weekly total record counts and dids estimate per collection
-
/// - key: "weekly_counts" || u64 || nullstr (hour, nsid)
/// - val: u64 || HLL (count (not cursor), estimator)
///
/// - All-time total record counts and dids estimate per collection
/// - key: "ever_counts" || nullstr (nsid)
/// - val: u64 || HLL (count (not cursor), estimator)
///
-
/// - TODO: sorted indexes for all-times?
///
///
/// Partition: 'queues'
···
pub temp: bool,
}
-
impl StorageWhatever<FjallReader, FjallWriter, FjallConfig> for FjallStorage {
fn init(
path: impl AsRef<Path>,
endpoint: String,
force_endpoint: bool,
_config: FjallConfig,
-
) -> StorageResult<(FjallReader, FjallWriter, Option<Cursor>)> {
let keyspace = {
let config = Config::new(path);
-
#[cfg(not(test))]
-
let config = config.fsync_ms(Some(4_000));
config.open()?
};
···
let js_cursor = get_static_neu::<JetstreamCursorKey, JetstreamCursorValue>(&global)?;
-
if js_cursor.is_some() {
let stored_endpoint =
get_static_neu::<JetstreamEndpointKey, JetstreamEndpointValue>(&global)?;
-
let JetstreamEndpointValue(stored) = stored_endpoint.ok_or(StorageError::InitError(
"found cursor but missing js_endpoint, refusing to start.".to_string(),
))?;
if stored != endpoint {
if force_endpoint {
···
)?;
} else {
return Err(StorageError::InitError(format!(
-
"stored js_endpoint {stored:?} differs from provided {endpoint:?}, refusing to start.")));
}
}
} else {
-
insert_static_neu::<JetstreamEndpointKey>(
&global,
JetstreamEndpointValue(endpoint.to_string()),
)?;
-
insert_static_neu::<TakeoffKey>(&global, Cursor::at(SystemTime::now()))?;
-
insert_static_neu::<NewRollupCursorKey>(&global, Cursor::from_start())?;
-
}
let reader = FjallReader {
keyspace: keyspace.clone(),
···
rollups: rollups.clone(),
};
let writer = FjallWriter {
keyspace,
global,
feeds,
···
rollups,
queues,
};
-
Ok((reader, writer, js_cursor))
}
}
···
}
}
impl FjallReader {
fn get_storage_stats(&self) -> StorageResult<serde_json::Value> {
let rollup_cursor =
···
get_snapshot_static_neu::<JetstreamCursorKey, JetstreamCursorValue>(&global)?
.map(|c| c.to_raw_u64());
Ok(ConsumerInfo::Jetstream {
endpoint,
started_at,
latest_cursor,
})
}
-
fn get_top_collections(&self) -> Result<TopCollections, StorageError> {
-
// TODO: limit nsid traversal depth
-
// TODO: limit nsid traversal breadth
-
// TODO: be serious about anything
-
// TODO: probably use a stack of segments to reduce to ~log-n merges
-
#[derive(Default)]
-
struct Blah {
-
counts: CountsValue,
-
children: HashMap<String, Blah>,
}
-
impl From<&Blah> for TopCollections {
-
fn from(bla: &Blah) -> Self {
-
Self {
-
total_records: bla.counts.records(),
-
dids_estimate: bla.counts.dids().estimate() as u64,
-
nsid_child_segments: HashMap::from_iter(
-
bla.children.iter().map(|(k, v)| (k.to_string(), v.into())),
-
),
}
}
}
-
let mut b = Blah::default();
-
let prefix = AllTimeRollupKey::from_prefix_to_db_bytes(&Default::default())?;
-
for kv in self.rollups.prefix(&prefix.to_db_bytes()?) {
-
let (key_bytes, val_bytes) = kv?;
-
let key = db_complete::<AllTimeRollupKey>(&key_bytes)?;
-
let val = db_complete::<CountsValue>(&val_bytes)?;
-
let mut node = &mut b;
-
node.counts.merge(&val);
-
for segment in key.collection().split('.') {
-
node = node.children.entry(segment.to_string()).or_default();
-
node.counts.merge(&val);
}
}
-
Ok((&b).into())
}
-
fn get_counts_by_collection(&self, collection: &Nsid) -> StorageResult<(u64, u64)> {
-
// 0. grab a snapshot in case rollups happen while we're working
-
let instant = self.keyspace.instant();
-
let global = self.global.snapshot_at(instant);
-
let rollups = self.rollups.snapshot_at(instant);
-
// 1. all-time counts
-
let all_time_key = AllTimeRollupKey::new(collection).to_db_bytes()?;
-
let mut total_counts = rollups
-
.get(&all_time_key)?
-
.as_deref()
-
.map(db_complete::<CountsValue>)
-
.transpose()?
-
.unwrap_or_default();
-
// 2. live counts that haven't been rolled into all-time yet.
-
let rollup_cursor =
-
get_snapshot_static_neu::<NewRollupCursorKey, NewRollupCursorValue>(&global)?.ok_or(
-
StorageError::BadStateError("Could not find current rollup cursor".to_string()),
-
)?;
-
let full_range = LiveCountsKey::range_from_cursor(rollup_cursor)?;
-
for kv in rollups.range(full_range) {
-
let (key_bytes, val_bytes) = kv?;
-
let key = db_complete::<LiveCountsKey>(&key_bytes)?;
-
if key.collection() == collection {
-
let counts = db_complete::<CountsValue>(&val_bytes)?;
-
total_counts.merge(&counts);
}
}
-
Ok((
-
total_counts.records(),
-
total_counts.dids().estimate() as u64,
-
))
}
fn get_records_by_collections(
&self,
-
collections: &[Nsid],
limit: usize,
expand_each_collection: bool,
) -> StorageResult<Vec<UFOsRecord>> {
···
}
let mut record_iterators = Vec::new();
for collection in collections {
-
let iter = RecordIterator::new(&self.feeds, self.records.clone(), collection, limit)?;
record_iterators.push(iter.peekable());
}
let mut merged = Vec::new();
···
let s = self.clone();
tokio::task::spawn_blocking(move || FjallReader::get_consumer_info(&s)).await?
}
-
async fn get_top_collections(&self) -> Result<TopCollections, StorageError> {
let s = self.clone();
-
tokio::task::spawn_blocking(move || FjallReader::get_top_collections(&s)).await?
}
-
async fn get_counts_by_collection(&self, collection: &Nsid) -> StorageResult<(u64, u64)> {
let s = self.clone();
let collection = collection.clone();
-
tokio::task::spawn_blocking(move || FjallReader::get_counts_by_collection(&s, &collection))
-
.await?
}
async fn get_records_by_collections(
&self,
-
collections: &[Nsid],
limit: usize,
expand_each_collection: bool,
) -> StorageResult<Vec<UFOsRecord>> {
let s = self.clone();
-
let collections = collections.to_vec();
tokio::task::spawn_blocking(move || {
-
FjallReader::get_records_by_collections(&s, &collections, limit, expand_each_collection)
})
.await?
}
}
pub struct FjallWriter {
keyspace: Keyspace,
global: PartitionHandle,
feeds: PartitionHandle,
···
timelies: impl Iterator<Item = Result<(fjall::Slice, fjall::Slice), fjall::Error>>,
cursor_exclusive_limit: Option<Cursor>,
rollup_limit: usize,
-
) -> StorageResult<usize> {
// current strategy is to buffer counts in mem before writing the rollups
// we *could* read+write every single batch to rollup.. but their merge is associative so
// ...so save the db some work up front? is this worth it? who knows...
#[derive(Eq, Hash, PartialEq)]
enum Rollup {
···
break;
}
batch.remove(&self.rollups, key_bytes);
let val = db_complete::<CountsValue>(&val_bytes)?;
counts_by_rollup
···
last_cursor = key.cursor();
}
for ((nsid, rollup), counts) in counts_by_rollup {
-
let key_bytes = match rollup {
Rollup::Hourly(hourly_cursor) => {
-
let k = HourlyRollupKey::new(hourly_cursor, &nsid);
-
k.to_db_bytes()?
}
Rollup::Weekly(weekly_cursor) => {
-
let k = WeeklyRollupKey::new(weekly_cursor, &nsid);
-
k.to_db_bytes()?
}
-
Rollup::AllTime => {
-
let k = AllTimeRollupKey::new(&nsid);
-
k.to_db_bytes()?
-
}
};
let mut rolled: CountsValue = self
.rollups
-
.get(&key_bytes)?
.as_deref()
.map(db_complete::<CountsValue>)
.transpose()?
.unwrap_or_default();
-
// try to round-trip before inserting, for funsies
-
let tripppin = counts.to_db_bytes()?;
-
let (and_back, n) = CountsValue::from_db_bytes(&tripppin)?;
-
assert_eq!(n, tripppin.len());
-
assert_eq!(counts.prefix, and_back.prefix);
-
assert_eq!(counts.dids().estimate(), and_back.dids().estimate());
-
if counts.records() > 200_000_000_000 {
-
panic!("COUNTS maybe wtf? {counts:?}")
}
-
rolled.merge(&counts);
-
batch.insert(&self.rollups, &key_bytes, &rolled.to_db_bytes()?);
}
insert_batch_static_neu::<NewRollupCursorKey>(&mut batch, &self.global, last_cursor)?;
batch.commit()?;
-
Ok(cursors_advanced)
}
}
-
impl StoreWriter for FjallWriter {
fn insert_batch<const LIMIT: usize>(
&mut self,
event_batch: EventBatch<LIMIT>,
···
}
}
let live_counts_key: LiveCountsKey = (latest, &nsid).into();
-
let counts_value = CountsValue::new(commits.total_seen as u64, commits.dids_estimate);
batch.insert(
&self.rollups,
&live_counts_key.to_db_bytes()?,
···
Ok(())
}
-
fn step_rollup(&mut self) -> StorageResult<usize> {
let rollup_cursor =
get_static_neu::<NewRollupCursorKey, NewRollupCursorValue>(&self.global)?.ok_or(
StorageError::BadStateError("Could not find current rollup cursor".to_string()),
···
let live_counts_range = LiveCountsKey::range_from_cursor(rollup_cursor)?;
let mut timely_iter = self.rollups.range(live_counts_range).peekable();
-
let timely_next_cursor = timely_iter
.peek_mut()
-
.map(|kv| -> StorageResult<Cursor> {
match kv {
Err(e) => Err(std::mem::replace(e, fjall::Error::Poisoned))?,
Ok((key_bytes, _)) => {
let key = db_complete::<LiveCountsKey>(key_bytes)?;
-
Ok(key.cursor())
}
}
})
···
})
.transpose()?;
-
let cursors_stepped = match (timely_next_cursor, next_delete) {
-
(
-
Some(timely_next_cursor),
-
Some((delete_cursor, delete_key_bytes, delete_val_bytes)),
-
) => {
-
if timely_next_cursor < delete_cursor {
-
self.rollup_live_counts(
timely_iter,
Some(delete_cursor),
MAX_BATCHED_ROLLUP_COUNTS,
-
)?
} else {
self.rollup_delete_account(delete_cursor, &delete_key_bytes, &delete_val_bytes)?
}
}
(Some(_), None) => {
-
self.rollup_live_counts(timely_iter, None, MAX_BATCHED_ROLLUP_COUNTS)?
}
(None, Some((delete_cursor, delete_key_bytes, delete_val_bytes))) => {
self.rollup_delete_account(delete_cursor, &delete_key_bytes, &delete_val_bytes)?
···
(None, None) => 0,
};
-
Ok(cursors_stepped)
}
fn trim_collection(
&mut self,
collection: &Nsid,
limit: usize,
-
// TODO: could add a start cursor limit to avoid iterating deleted stuff at the start (/end)
-
) -> StorageResult<()> {
let mut dangling_feed_keys_cleaned = 0;
let mut records_deleted = 0;
-
let mut batch = self.keyspace.batch();
-
let prefix = NsidRecordFeedKey::from_prefix_to_db_bytes(collection)?;
-
let mut found = 0;
-
for kv in self.feeds.prefix(prefix).rev() {
let (key_bytes, val_bytes) = kv?;
let feed_key = db_complete::<NsidRecordFeedKey>(&key_bytes)?;
let feed_val = db_complete::<NsidRecordFeedVal>(&val_bytes)?;
···
let Some(location_val_bytes) = self.records.get(&location_key_bytes)? else {
// record was deleted (hopefully)
-
batch.remove(&self.feeds, &location_key_bytes);
dangling_feed_keys_cleaned += 1;
continue;
};
let (meta, _) = RecordLocationMeta::from_db_bytes(&location_val_bytes)?;
if meta.cursor() != feed_key.cursor() {
// older/different version
-
batch.remove(&self.feeds, &location_key_bytes);
dangling_feed_keys_cleaned += 1;
continue;
}
if meta.rev != feed_val.rev() {
// weird...
log::warn!("record lookup: cursor match but rev did not...? removing.");
-
batch.remove(&self.feeds, &location_key_bytes);
dangling_feed_keys_cleaned += 1;
continue;
}
-
if batch.len() >= MAX_BATCHED_CLEANUP_SIZE {
-
batch.commit()?;
-
batch = self.keyspace.batch();
-
}
-
-
found += 1;
-
if found <= limit {
continue;
}
-
batch.remove(&self.feeds, &location_key_bytes);
-
batch.remove(&self.records, &location_key_bytes);
records_deleted += 1;
}
-
batch.commit()?;
-
log::info!("trim_collection ({collection:?}) removed {dangling_feed_keys_cleaned} dangling feed entries and {records_deleted} records");
-
Ok(())
}
fn delete_account(&mut self, did: &Did) -> Result<usize, StorageError> {
···
}
batch.commit()?;
Ok(records_deleted)
}
}
···
Ok(())
}
/// Set a value to a fixed key
fn insert_batch_static_neu<K: StaticStr>(
batch: &mut FjallBatch,
···
////////// temp stuff to remove:
-
// fn summarize_batch<const LIMIT: usize>(batch: &EventBatch<LIMIT>) -> String {
-
// format!(
-
// "batch of {: >3} samples from {: >4} records in {: >2} collections from ~{: >4} DIDs, {} acct removes, cursor {: <12?}",
-
// batch.total_records(),
-
// batch.total_seen(),
-
// batch.total_collections(),
-
// batch.estimate_dids(),
-
// batch.account_removes(),
-
// batch.latest_cursor().map(|c| c.elapsed()),
-
// )
-
// }
-
#[cfg(test)]
mod tests {
use super::*;
···
use serde_json::value::RawValue;
fn fjall_db() -> (FjallReader, FjallWriter) {
-
let (read, write, _) = FjallStorage::init(
tempfile::tempdir().unwrap(),
"offline test (no real jetstream endpoint)".to_string(),
false,
···
}
const TEST_BATCH_LIMIT: usize = 16;
#[derive(Debug, Default)]
struct TestBatch {
···
.commits_by_nsid
.entry(collection.clone())
.or_default()
-
.truncating_insert(commit)
.unwrap();
collection
···
.commits_by_nsid
.entry(collection.clone())
.or_default()
-
.truncating_insert(commit)
.unwrap();
collection
···
.commits_by_nsid
.entry(collection.clone())
.or_default()
-
.truncating_insert(commit)
.unwrap();
collection
···
fn test_hello() -> anyhow::Result<()> {
let (read, mut write) = fjall_db();
write.insert_batch::<TEST_BATCH_LIMIT>(EventBatch::default())?;
-
let (records, dids) =
-
read.get_counts_by_collection(&Nsid::new("a.b.c".to_string()).unwrap())?;
-
assert_eq!(records, 0);
-
assert_eq!(dids, 0);
Ok(())
}
···
100,
);
write.insert_batch(batch.batch)?;
-
let (records, dids) = read.get_counts_by_collection(&collection)?;
-
assert_eq!(records, 1);
-
assert_eq!(dids, 1);
-
let (records, dids) =
-
read.get_counts_by_collection(&Nsid::new("d.e.f".to_string()).unwrap())?;
-
assert_eq!(records, 0);
-
assert_eq!(dids, 0);
-
let records = read.get_records_by_collections(&[collection], 2, false)?;
assert_eq!(records.len(), 1);
let rec = &records[0];
assert_eq!(rec.record.get(), "{}");
assert!(!rec.is_update);
-
let records =
-
read.get_records_by_collections(&[Nsid::new("d.e.f".to_string()).unwrap()], 2, false)?;
assert_eq!(records.len(), 0);
Ok(())
···
write.insert_batch(batch.batch)?;
let records = read.get_records_by_collections(
-
&[
Nsid::new("a.a.a".to_string()).unwrap(),
Nsid::new("a.a.b".to_string()).unwrap(),
Nsid::new("a.a.c".to_string()).unwrap(),
-
],
100,
false,
)?;
···
write.insert_batch(batch.batch)?;
let records = read.get_records_by_collections(
-
&[
Nsid::new("a.a.a".to_string()).unwrap(),
Nsid::new("a.a.b".to_string()).unwrap(),
Nsid::new("a.a.c".to_string()).unwrap(),
-
],
2,
true,
)?;
···
101,
);
write.insert_batch(batch.batch)?;
-
let (records, dids) = read.get_counts_by_collection(&collection)?;
-
assert_eq!(records, 1);
-
assert_eq!(dids, 1);
-
let records = read.get_records_by_collections(&[collection], 2, false)?;
assert_eq!(records.len(), 1);
let rec = &records[0];
assert_eq!(rec.record.get(), r#"{"ch": "ch-ch-ch-changes"}"#);
···
101,
);
write.insert_batch(batch.batch)?;
-
let (records, dids) = read.get_counts_by_collection(&collection)?;
-
assert_eq!(records, 1);
-
assert_eq!(dids, 1);
-
let records = read.get_records_by_collections(&[collection], 2, false)?;
assert_eq!(records.len(), 0);
Ok(())
···
write.insert_batch(batch.batch)?;
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.a".to_string()).unwrap()],
100,
false,
)?;
assert_eq!(records.len(), 1);
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.b".to_string()).unwrap()],
100,
false,
)?;
assert_eq!(records.len(), 10);
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.c".to_string()).unwrap()],
100,
false,
)?;
assert_eq!(records.len(), 1);
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.d".to_string()).unwrap()],
100,
false,
)?;
assert_eq!(records.len(), 0);
-
write.trim_collection(&Nsid::new("a.a.a".to_string()).unwrap(), 6)?;
-
write.trim_collection(&Nsid::new("a.a.b".to_string()).unwrap(), 6)?;
-
write.trim_collection(&Nsid::new("a.a.c".to_string()).unwrap(), 6)?;
-
write.trim_collection(&Nsid::new("a.a.d".to_string()).unwrap(), 6)?;
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.a".to_string()).unwrap()],
100,
false,
)?;
assert_eq!(records.len(), 1);
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.b".to_string()).unwrap()],
100,
false,
)?;
assert_eq!(records.len(), 6);
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.c".to_string()).unwrap()],
100,
false,
)?;
assert_eq!(records.len(), 1);
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.d".to_string()).unwrap()],
100,
false,
)?;
···
write.insert_batch(batch.batch)?;
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.a".to_string()).unwrap()],
100,
false,
)?;
···
assert_eq!(records_deleted, 2);
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.a".to_string()).unwrap()],
100,
false,
)?;
···
write.step_rollup()?;
-
let records =
-
read.get_records_by_collections(&[Nsid::new("a.a.a".to_string()).unwrap()], 1, false)?;
assert_eq!(records.len(), 0);
Ok(())
···
);
write.insert_batch(batch.batch)?;
-
let n = write.step_rollup()?;
assert_eq!(n, 1);
let mut batch = TestBatch::default();
batch.delete_account("did:plc:person-a", 10_001);
write.insert_batch(batch.batch)?;
-
let records =
-
read.get_records_by_collections(&[Nsid::new("a.a.a".to_string()).unwrap()], 1, false)?;
assert_eq!(records.len(), 1);
-
let n = write.step_rollup()?;
assert_eq!(n, 1);
-
let records =
-
read.get_records_by_collections(&[Nsid::new("a.a.a".to_string()).unwrap()], 1, false)?;
assert_eq!(records.len(), 0);
let mut batch = TestBatch::default();
batch.delete_account("did:plc:person-a", 9_999);
write.insert_batch(batch.batch)?;
-
let n = write.step_rollup()?;
assert_eq!(n, 0);
Ok(())
···
);
write.insert_batch(batch.batch)?;
-
let n = write.step_rollup()?;
assert_eq!(n, 2);
-
let n = write.step_rollup()?;
assert_eq!(n, 0);
Ok(())
···
write.insert_batch(batch.batch)?;
// before any rollup
-
let (records, dids) =
-
read.get_counts_by_collection(&Nsid::new("a.a.a".to_string()).unwrap())?;
-
assert_eq!(records, 3);
-
assert_eq!(dids, 2);
// first batch rolled up
-
let n = write.step_rollup()?;
assert_eq!(n, 1);
-
let (records, dids) =
-
read.get_counts_by_collection(&Nsid::new("a.a.a".to_string()).unwrap())?;
-
assert_eq!(records, 3);
-
assert_eq!(dids, 2);
// delete account rolled up
-
let n = write.step_rollup()?;
assert_eq!(n, 1);
-
let (records, dids) =
-
read.get_counts_by_collection(&Nsid::new("a.a.a".to_string()).unwrap())?;
-
assert_eq!(records, 3);
-
assert_eq!(dids, 2);
// second batch rolled up
-
let n = write.step_rollup()?;
assert_eq!(n, 1);
-
let (records, dids) =
-
read.get_counts_by_collection(&Nsid::new("a.a.a".to_string()).unwrap())?;
-
assert_eq!(records, 3);
-
assert_eq!(dids, 2);
// no more rollups left
-
let n = write.step_rollup()?;
assert_eq!(n, 0);
Ok(())
}
#[test]
-
fn get_top_collections() -> anyhow::Result<()> {
let (read, mut write) = fjall_db();
let mut batch = TestBatch::default();
···
None,
10_000,
);
batch.create(
-
"did:plc:person-b",
-
"a.a.b",
-
"rkey-bbb",
"{}",
-
Some("rev-bbb"),
None,
-
10_001,
);
batch.create(
-
"did:plc:person-c",
-
"a.b.c",
-
"rkey-ccc",
"{}",
-
Some("rev-ccc"),
None,
-
10_002,
);
batch.create(
"did:plc:person-a",
-
"a.a.a",
-
"rkey-aaa-2",
"{}",
-
Some("rev-aaa-2"),
None,
-
10_003,
);
write.insert_batch(batch.batch)?;
-
let n = write.step_rollup()?;
-
assert_eq!(n, 3); // 3 collections
-
-
let tops = read.get_top_collections()?;
assert_eq!(
-
tops,
-
TopCollections {
-
total_records: 4,
-
dids_estimate: 3,
-
nsid_child_segments: HashMap::from([(
-
"a".to_string(),
-
TopCollections {
-
total_records: 4,
-
dids_estimate: 3,
-
nsid_child_segments: HashMap::from([
-
(
-
"a".to_string(),
-
TopCollections {
-
total_records: 3,
-
dids_estimate: 2,
-
nsid_child_segments: HashMap::from([
-
(
-
"a".to_string(),
-
TopCollections {
-
total_records: 2,
-
dids_estimate: 1,
-
nsid_child_segments: HashMap::from([]),
-
},
-
),
-
(
-
"b".to_string(),
-
TopCollections {
-
total_records: 1,
-
dids_estimate: 1,
-
nsid_child_segments: HashMap::from([]),
-
}
-
),
-
]),
-
},
-
),
-
(
-
"b".to_string(),
-
TopCollections {
-
total_records: 1,
-
dids_estimate: 1,
-
nsid_child_segments: HashMap::from([(
-
"c".to_string(),
-
TopCollections {
-
total_records: 1,
-
dids_estimate: 1,
-
nsid_child_segments: HashMap::from([]),
-
},
-
),]),
-
},
-
),
-
]),
-
},
-
),]),
-
}
);
Ok(())
}
#[test]
-
fn get_top_collections_with_parent_nsid() -> anyhow::Result<()> {
let (read, mut write) = fjall_db();
let mut batch = TestBatch::default();
batch.create(
-
"did:plc:inze6wrmsm7pjl7yta3oig77",
"a.a.a.a",
-
"aaaa",
-
r#""child nsid""#,
Some("rev-aaaa"),
None,
-
100,
);
batch.create(
-
"did:plc:inze6wrmsm7pjl7yta3oig77",
-
"a.a.a",
-
"aaa",
-
r#""parent nsid""#,
-
Some("rev-aaa"),
None,
-
101,
);
write.insert_batch(batch.batch)?;
-
-
let n = write.step_rollup()?;
-
assert_eq!(n, 2); // 3 collections
-
let tops = read.get_top_collections()?;
assert_eq!(
-
tops,
-
TopCollections {
-
total_records: 2,
-
dids_estimate: 1,
-
nsid_child_segments: HashMap::from([(
-
"a".to_string(),
-
TopCollections {
-
total_records: 2,
-
dids_estimate: 1,
-
nsid_child_segments: HashMap::from([(
-
"a".to_string(),
-
TopCollections {
-
total_records: 2,
-
dids_estimate: 1,
-
nsid_child_segments: HashMap::from([(
-
"a".to_string(),
-
TopCollections {
-
total_records: 2,
-
dids_estimate: 1,
-
nsid_child_segments: HashMap::from([(
-
"a".to_string(),
-
TopCollections {
-
total_records: 1,
-
dids_estimate: 1,
-
nsid_child_segments: HashMap::from([]),
-
},
-
),]),
-
},
-
),]),
-
},
-
),]),
-
},
-
),]),
-
}
);
-
// TODO: handle leaf node counts explicitly, since parent NSIDs can be leaves themselves
Ok(())
}
}
···
+
use crate::db_types::{
+
db_complete, DbBytes, DbStaticStr, EncodingResult, StaticStr, SubPrefixBytes,
+
};
use crate::error::StorageError;
+
use crate::storage::{StorageResult, StorageWhatever, StoreBackground, StoreReader, StoreWriter};
use crate::store_types::{
+
AllTimeDidsKey, AllTimeRecordsKey, AllTimeRollupKey, CommitCounts, CountsValue, CursorBucket,
+
DeleteAccountQueueKey, DeleteAccountQueueVal, HourTruncatedCursor, HourlyDidsKey,
+
HourlyRecordsKey, HourlyRollupKey, HourlyRollupStaticPrefix, JetstreamCursorKey,
+
JetstreamCursorValue, JetstreamEndpointKey, JetstreamEndpointValue, LiveCountsKey,
+
NewRollupCursorKey, NewRollupCursorValue, NsidRecordFeedKey, NsidRecordFeedVal,
+
RecordLocationKey, RecordLocationMeta, RecordLocationVal, RecordRawValue, SketchSecretKey,
+
SketchSecretPrefix, TakeoffKey, TakeoffValue, TrimCollectionCursorKey, WeekTruncatedCursor,
+
WeeklyDidsKey, WeeklyRecordsKey, WeeklyRollupKey, WithCollection, WithRank, HOUR_IN_MICROS,
+
WEEK_IN_MICROS,
};
+
use crate::{
+
nice_duration, CommitAction, ConsumerInfo, Did, EncodingError, EventBatch, JustCount, Nsid,
+
NsidCount, NsidPrefix, OrderCollectionsBy, PrefixChild, PrefixCount, UFOsRecord,
+
};
use async_trait::async_trait;
+
use fjall::{
+
Batch as FjallBatch, Config, Keyspace, PartitionCreateOptions, PartitionHandle, Snapshot,
+
};
use jetstream::events::Cursor;
+
use std::collections::{HashMap, HashSet};
+
use std::iter::Peekable;
+
use std::ops::Bound;
use std::path::Path;
+
use std::sync::{
+
atomic::{AtomicBool, Ordering},
+
Arc,
+
};
+
use std::time::{Duration, Instant, SystemTime};
const MAX_BATCHED_ACCOUNT_DELETE_RECORDS: usize = 1024;
const MAX_BATCHED_ROLLUP_COUNTS: usize = 256;
···
/// - key: "takeoff" (literal)
/// - val: u64 (micros timestamp, not from jetstream for now so not precise)
///
+
/// - Cardinality estimator secret
+
/// - key: "sketch_secret" (literal)
+
/// - val: [u8; 16]
+
///
/// - Rollup cursor (bg work: roll stats into hourlies, delete accounts, old record deletes)
/// - key: "rollup_cursor" (literal)
/// - val: u64 (tracks behind js_cursor)
///
+
/// - Feed trim cursor (bg work: delete oldest excess records)
+
/// - key: "trim_cursor" || nullstr (nsid)
+
/// - val: u64 (earliest previously-removed feed entry jetstream cursor)
///
/// Partition: 'feed'
///
···
/// - key: "live_counts" || u64 || nullstr (js_cursor, nsid)
/// - val: u64 || HLL (count (not cursor), estimator)
///
+
///
/// - Hourly total record counts and dids estimate per collection
/// - key: "hourly_counts" || u64 || nullstr (hour, nsid)
/// - val: u64 || HLL (count (not cursor), estimator)
///
+
/// - Hourly record count ranking
+
/// - key: "hourly_rank_records" || u64 || u64 || nullstr (hour, count, nsid)
+
/// - val: [empty]
+
///
+
/// - Hourly did estimate ranking
+
/// - key: "hourly_rank_dids" || u64 || u64 || nullstr (hour, dids estimate, nsid)
+
/// - val: [empty]
+
///
+
///
/// - Weekly total record counts and dids estimate per collection
+
/// - key: "weekly_counts" || u64 || nullstr (week, nsid)
/// - val: u64 || HLL (count (not cursor), estimator)
+
///
+
/// - Weekly record count ranking
+
/// - key: "weekly_rank_records" || u64 || u64 || nullstr (week, count, nsid)
+
/// - val: [empty]
+
///
+
/// - Weekly did estimate ranking
+
/// - key: "weekly_rank_dids" || u64 || u64 || nullstr (week, dids estimate, nsid)
+
/// - val: [empty]
+
///
///
/// - All-time total record counts and dids estimate per collection
/// - key: "ever_counts" || nullstr (nsid)
/// - val: u64 || HLL (count (not cursor), estimator)
///
+
/// - All-time total record count ranking
+
/// - key: "ever_rank_records" || u64 || nullstr (count, nsid)
+
/// - val: [empty]
+
///
+
/// - All-time did estimate ranking
+
/// - key: "ever_rank_dids" || u64 || nullstr (dids estimate, nsid)
+
/// - val: [empty]
///
///
/// Partition: 'queues'
···
pub temp: bool,
}
+
impl StorageWhatever<FjallReader, FjallWriter, FjallBackground, FjallConfig> for FjallStorage {
fn init(
path: impl AsRef<Path>,
endpoint: String,
force_endpoint: bool,
_config: FjallConfig,
+
) -> StorageResult<(FjallReader, FjallWriter, Option<Cursor>, SketchSecretPrefix)> {
let keyspace = {
let config = Config::new(path);
+
// #[cfg(not(test))]
+
// let config = config.fsync_ms(Some(4_000));
config.open()?
};
···
let js_cursor = get_static_neu::<JetstreamCursorKey, JetstreamCursorValue>(&global)?;
+
let sketch_secret = if js_cursor.is_some() {
let stored_endpoint =
get_static_neu::<JetstreamEndpointKey, JetstreamEndpointValue>(&global)?;
let JetstreamEndpointValue(stored) = stored_endpoint.ok_or(StorageError::InitError(
"found cursor but missing js_endpoint, refusing to start.".to_string(),
))?;
+
+
let Some(stored_secret) =
+
get_static_neu::<SketchSecretKey, SketchSecretPrefix>(&global)?
+
else {
+
return Err(StorageError::InitError(
+
"found cursor but missing sketch_secret, refusing to start.".to_string(),
+
));
+
};
if stored != endpoint {
if force_endpoint {
···
)?;
} else {
return Err(StorageError::InitError(format!(
+
"stored js_endpoint {stored:?} differs from provided {endpoint:?}, refusing to start without --jetstream-force.")));
}
}
+
stored_secret
} else {
+
log::info!("initializing a fresh db!");
+
init_static_neu::<JetstreamEndpointKey>(
&global,
JetstreamEndpointValue(endpoint.to_string()),
)?;
+
+
log::info!("generating new secret for cardinality sketches...");
+
let mut sketch_secret: SketchSecretPrefix = [0u8; 16];
+
getrandom::fill(&mut sketch_secret).map_err(|e| {
+
StorageError::InitError(format!(
+
"failed to get a random secret for cardinality sketches: {e:?}"
+
))
+
})?;
+
init_static_neu::<SketchSecretKey>(&global, sketch_secret)?;
+
+
init_static_neu::<TakeoffKey>(&global, Cursor::at(SystemTime::now()))?;
+
init_static_neu::<NewRollupCursorKey>(&global, Cursor::from_start())?;
+
+
sketch_secret
+
};
let reader = FjallReader {
keyspace: keyspace.clone(),
···
rollups: rollups.clone(),
};
let writer = FjallWriter {
+
bg_taken: Arc::new(AtomicBool::new(false)),
keyspace,
global,
feeds,
···
rollups,
queues,
};
+
Ok((reader, writer, js_cursor, sketch_secret))
}
}
···
}
}
+
type GetCounts = Box<dyn FnOnce() -> StorageResult<CountsValue>>;
+
type GetByterCounts = StorageResult<(Nsid, GetCounts)>;
+
type NsidCounter = Box<dyn Iterator<Item = GetByterCounts>>;
+
fn get_lexi_iter<T: WithCollection + DbBytes + 'static>(
+
snapshot: &Snapshot,
+
start: Bound<Vec<u8>>,
+
end: Bound<Vec<u8>>,
+
) -> StorageResult<NsidCounter> {
+
Ok(Box::new(snapshot.range((start, end)).map(|kv| {
+
let (k_bytes, v_bytes) = kv?;
+
let key = db_complete::<T>(&k_bytes)?;
+
let nsid = key.collection().clone();
+
let get_counts: GetCounts = Box::new(move || Ok(db_complete::<CountsValue>(&v_bytes)?));
+
Ok((nsid, get_counts))
+
})))
+
}
+
type GetRollupKey = Arc<dyn Fn(&Nsid) -> EncodingResult<Vec<u8>>>;
+
fn get_lookup_iter<T: WithCollection + WithRank + DbBytes + 'static>(
+
snapshot: lsm_tree::Snapshot,
+
start: Bound<Vec<u8>>,
+
end: Bound<Vec<u8>>,
+
get_rollup_key: GetRollupKey,
+
) -> StorageResult<NsidCounter> {
+
Ok(Box::new(snapshot.range((start, end)).rev().map(
+
move |kv| {
+
let (k_bytes, _) = kv?;
+
let key = db_complete::<T>(&k_bytes)?;
+
let nsid = key.collection().clone();
+
let get_counts: GetCounts = Box::new({
+
let nsid = nsid.clone();
+
let snapshot = snapshot.clone();
+
let get_rollup_key = get_rollup_key.clone();
+
move || {
+
let db_count_bytes = snapshot.get(get_rollup_key(&nsid)?)?.expect(
+
"integrity: all-time rank rollup must have corresponding all-time count rollup",
+
);
+
Ok(db_complete::<CountsValue>(&db_count_bytes)?)
+
}
+
});
+
Ok((nsid, get_counts))
+
},
+
)))
+
}
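// Note on the ranked iterators above: rank keys store an empty value and embed
// the count in the key bytes, so a reverse range scan yields collections in
// descending rank order; the closure then re-reads the full CountsValue
// (commit counts + dids sketch) from the corresponding counts rollup key
// produced by `get_rollup_key`.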
+
+
type CollectionSerieses = HashMap<Nsid, Vec<CountsValue>>;
+
impl FjallReader {
fn get_storage_stats(&self) -> StorageResult<serde_json::Value> {
let rollup_cursor =
···
get_snapshot_static_neu::<JetstreamCursorKey, JetstreamCursorValue>(&global)?
.map(|c| c.to_raw_u64());
+
let rollup_cursor =
+
get_snapshot_static_neu::<NewRollupCursorKey, NewRollupCursorValue>(&global)?
+
.map(|c| c.to_raw_u64());
+
Ok(ConsumerInfo::Jetstream {
endpoint,
started_at,
latest_cursor,
+
rollup_cursor,
})
}
+
fn get_earliest_hour(&self, rollups: Option<&Snapshot>) -> StorageResult<HourTruncatedCursor> {
+
let cursor = rollups
+
.unwrap_or(&self.rollups.snapshot())
+
.prefix(HourlyRollupStaticPrefix::default().to_db_bytes()?)
+
.next()
+
.transpose()?
+
.map(|(key_bytes, _)| db_complete::<HourlyRollupKey>(&key_bytes))
+
.transpose()?
+
.map(|key| key.cursor())
+
.unwrap_or_else(|| Cursor::from_start().into());
+
Ok(cursor)
+
}
+
+
fn get_lexi_collections(
+
&self,
+
snapshot: Snapshot,
+
limit: usize,
+
cursor: Option<Vec<u8>>,
+
buckets: Vec<CursorBucket>,
+
) -> StorageResult<(Vec<NsidCount>, Option<Vec<u8>>)> {
+
let cursor_nsid = cursor.as_deref().map(db_complete::<Nsid>).transpose()?;
+
let mut iters: Vec<Peekable<NsidCounter>> = Vec::with_capacity(buckets.len());
+
for bucket in &buckets {
+
let it: NsidCounter = match bucket {
+
CursorBucket::Hour(t) => {
+
let start = cursor_nsid
+
.as_ref()
+
.map(|nsid| HourlyRollupKey::after_nsid(*t, nsid))
+
.unwrap_or_else(|| HourlyRollupKey::start(*t))?;
+
let end = HourlyRollupKey::end(*t)?;
+
get_lexi_iter::<HourlyRollupKey>(&snapshot, start, end)?
+
}
+
CursorBucket::Week(t) => {
+
let start = cursor_nsid
+
.as_ref()
+
.map(|nsid| WeeklyRollupKey::after_nsid(*t, nsid))
+
.unwrap_or_else(|| WeeklyRollupKey::start(*t))?;
+
let end = WeeklyRollupKey::end(*t)?;
+
get_lexi_iter::<WeeklyRollupKey>(&snapshot, start, end)?
+
}
+
CursorBucket::AllTime => {
+
let start = cursor_nsid
+
.as_ref()
+
.map(AllTimeRollupKey::after_nsid)
+
.unwrap_or_else(AllTimeRollupKey::start)?;
+
let end = AllTimeRollupKey::end()?;
+
get_lexi_iter::<AllTimeRollupKey>(&snapshot, start, end)?
+
}
+
};
+
iters.push(it.peekable());
+
}
+
let mut out = Vec::new();
+
let mut current_nsid = None;
+
for _ in 0..limit {
+
// double-scan the iters for each element: this could be eliminated but we're starting simple.
+
// first scan: find the lowest nsid
+
// second scan: take + merge, and advance all iters with lowest nsid
+
let mut lowest: Option<Nsid> = None;
+
for iter in &mut iters {
+
if let Some(bla) = iter.peek_mut() {
+
let (nsid, _) = match bla {
+
Ok(v) => v,
+
Err(e) => Err(std::mem::replace(e, StorageError::Stolen))?,
+
};
+
lowest = match lowest {
+
Some(ref current) if nsid.as_str() > current.as_str() => lowest,
+
_ => Some(nsid.clone()),
+
};
+
}
+
}
+
current_nsid = lowest.clone();
+
let Some(nsid) = lowest else { break };
+
let mut merged = CountsValue::default();
+
for iter in &mut iters {
+
// unwrap: any fjall error was already checked and bailed on when peeking in the first loop
+
if let Some(Ok((_, get_counts))) = iter.next_if(|v| v.as_ref().unwrap().0 == nsid) {
+
let counts = get_counts()?;
+
merged.merge(&counts);
+
}
+
}
+
out.push(NsidCount {
+
nsid: nsid.to_string(),
+
creates: merged.counts().creates,
+
dids_estimate: merged.dids().estimate() as u64,
+
});
}
+
+
let next_cursor = current_nsid.map(|s| s.to_db_bytes()).transpose()?;
+
Ok((out, next_cursor))
+
}
+
+
fn get_ordered_collections(
+
&self,
+
snapshot: Snapshot,
+
limit: usize,
+
order: OrderCollectionsBy,
+
buckets: Vec<CursorBucket>,
+
) -> StorageResult<Vec<NsidCount>> {
+
let mut iters: Vec<NsidCounter> = Vec::with_capacity(buckets.len());
+
+
for bucket in buckets {
+
let it: NsidCounter = match (&order, bucket) {
+
(OrderCollectionsBy::RecordsCreated, CursorBucket::Hour(t)) => {
+
get_lookup_iter::<HourlyRecordsKey>(
+
snapshot.clone(),
+
HourlyRecordsKey::start(t)?,
+
HourlyRecordsKey::end(t)?,
+
Arc::new({
+
move |collection| HourlyRollupKey::new(t, collection).to_db_bytes()
+
}),
+
)?
}
+
(OrderCollectionsBy::DidsEstimate, CursorBucket::Hour(t)) => {
+
get_lookup_iter::<HourlyDidsKey>(
+
snapshot.clone(),
+
HourlyDidsKey::start(t)?,
+
HourlyDidsKey::end(t)?,
+
Arc::new({
+
move |collection| HourlyRollupKey::new(t, collection).to_db_bytes()
+
}),
+
)?
+
}
+
(OrderCollectionsBy::RecordsCreated, CursorBucket::Week(t)) => {
+
get_lookup_iter::<WeeklyRecordsKey>(
+
snapshot.clone(),
+
WeeklyRecordsKey::start(t)?,
+
WeeklyRecordsKey::end(t)?,
+
Arc::new({
+
move |collection| WeeklyRollupKey::new(t, collection).to_db_bytes()
+
}),
+
)?
+
}
+
(OrderCollectionsBy::DidsEstimate, CursorBucket::Week(t)) => {
+
get_lookup_iter::<WeeklyDidsKey>(
+
snapshot.clone(),
+
WeeklyDidsKey::start(t)?,
+
WeeklyDidsKey::end(t)?,
+
Arc::new({
+
move |collection| WeeklyRollupKey::new(t, collection).to_db_bytes()
+
}),
+
)?
+
}
+
(OrderCollectionsBy::RecordsCreated, CursorBucket::AllTime) => {
+
get_lookup_iter::<AllTimeRecordsKey>(
+
snapshot.clone(),
+
AllTimeRecordsKey::start()?,
+
AllTimeRecordsKey::end()?,
+
Arc::new(|collection| AllTimeRollupKey::new(collection).to_db_bytes()),
+
)?
+
}
+
(OrderCollectionsBy::DidsEstimate, CursorBucket::AllTime) => {
+
get_lookup_iter::<AllTimeDidsKey>(
+
snapshot.clone(),
+
AllTimeDidsKey::start()?,
+
AllTimeDidsKey::end()?,
+
Arc::new(|collection| AllTimeRollupKey::new(collection).to_db_bytes()),
+
)?
+
}
+
(OrderCollectionsBy::Lexi { .. }, _) => unreachable!(),
+
};
+
iters.push(it);
+
}
+
+
// overfetch by taking a bit more than the limit
+
// merge by collection
+
// sort by requested order, take limit, discard all remaining
+
//
+
// this isn't guaranteed to be correct, but it will hopefully be close most of the time:
+
// - it's possible that some NSIDs might score low during some time-buckets, and miss being merged
+
// - overfetching hopefully helps a bit by catching nsids near the threshold more often, but. yeah.
+
//
+
// this thing is heavy, there's probably a better way
+
let mut ranked: HashMap<Nsid, CountsValue> = HashMap::with_capacity(limit * 2);
+
for iter in iters {
+
for pair in iter.take((limit as f64 * 1.3).ceil() as usize) {
+
let (nsid, get_counts) = pair?;
+
let counts = get_counts()?;
+
ranked.entry(nsid).or_default().merge(&counts);
}
}
+
let mut ranked: Vec<(Nsid, CountsValue)> = ranked.into_iter().collect();
+
match order {
+
OrderCollectionsBy::RecordsCreated => ranked.sort_by_key(|(_, c)| c.counts().creates),
+
OrderCollectionsBy::DidsEstimate => ranked.sort_by_key(|(_, c)| c.dids().estimate()),
+
OrderCollectionsBy::Lexi { .. } => unreachable!(),
+
}
+
let counts = ranked
+
.into_iter()
+
.rev()
+
.take(limit)
+
.map(|(nsid, cv)| NsidCount {
+
nsid: nsid.to_string(),
+
creates: cv.counts().creates,
+
dids_estimate: cv.dids().estimate() as u64,
+
})
+
.collect();
+
Ok(counts)
+
}
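// Worked example of the overfetch above (illustrative numbers): with
// `limit = 25`, each bucket iterator contributes up to ceil(25 * 1.3) = 33
// top-ranked entries to `ranked` before the merged map is sorted by the
// requested order and truncated back to 25.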
+
fn get_collections(
+
&self,
+
limit: usize,
+
order: OrderCollectionsBy,
+
since: Option<HourTruncatedCursor>,
+
until: Option<HourTruncatedCursor>,
+
) -> StorageResult<(Vec<NsidCount>, Option<Vec<u8>>)> {
+
let snapshot = self.rollups.snapshot();
+
let buckets = if let (None, None) = (since, until) {
+
vec![CursorBucket::AllTime]
+
} else {
+
let mut lower = self.get_earliest_hour(Some(&snapshot))?;
+
if let Some(specified) = since {
+
if specified > lower {
+
lower = specified;
+
}
+
}
+
let upper = until.unwrap_or_else(|| Cursor::at(SystemTime::now()).into());
+
CursorBucket::buckets_spanning(lower, upper)
+
};
+
match order {
+
OrderCollectionsBy::Lexi { cursor } => {
+
self.get_lexi_collections(snapshot, limit, cursor, buckets)
+
}
+
_ => Ok((
+
self.get_ordered_collections(snapshot, limit, order, buckets)?,
+
None,
+
)),
+
}
+
}
+
fn get_lexi_prefix(
+
&self,
+
snapshot: Snapshot,
+
prefix: NsidPrefix,
+
limit: usize,
+
cursor: Option<Vec<u8>>,
+
buckets: Vec<CursorBucket>,
+
) -> StorageResult<(JustCount, Vec<PrefixChild>, Option<Vec<u8>>)> {
+
// let prefix_sub_with_null = prefix.as_str().to_string().to_db_bytes()?;
+
let prefix_sub = String::sub_prefix(&prefix.terminated())?; // with trailing dot to ensure full segment match
+
let cursor_child = cursor
+
.as_deref()
+
.map(|encoded_bytes| {
+
let decoded: String = db_complete(encoded_bytes)?;
+
// TODO: write some tests for cursors, there's probably bugs here
+
let as_sub_prefix_with_null = decoded.to_db_bytes()?;
+
Ok::<_, EncodingError>(as_sub_prefix_with_null)
+
})
+
.transpose()?;
+
let mut iters: Vec<NsidCounter> = Vec::with_capacity(buckets.len());
+
for bucket in &buckets {
+
let it: NsidCounter = match bucket {
+
CursorBucket::Hour(t) => {
+
let start = cursor_child
+
.as_ref()
+
.map(|child| HourlyRollupKey::after_nsid_prefix(*t, child))
+
.unwrap_or_else(|| HourlyRollupKey::after_nsid_prefix(*t, &prefix_sub))?;
+
let end = HourlyRollupKey::nsid_prefix_end(*t, &prefix_sub)?;
+
get_lexi_iter::<HourlyRollupKey>(&snapshot, start, end)?
+
}
+
CursorBucket::Week(t) => {
+
let start = cursor_child
+
.as_ref()
+
.map(|child| WeeklyRollupKey::after_nsid_prefix(*t, child))
+
.unwrap_or_else(|| WeeklyRollupKey::after_nsid_prefix(*t, &prefix_sub))?;
+
let end = WeeklyRollupKey::nsid_prefix_end(*t, &prefix_sub)?;
+
get_lexi_iter::<WeeklyRollupKey>(&snapshot, start, end)?
+
}
+
CursorBucket::AllTime => {
+
let start = cursor_child
+
.as_ref()
+
.map(|child| AllTimeRollupKey::after_nsid_prefix(child))
+
.unwrap_or_else(|| AllTimeRollupKey::after_nsid_prefix(&prefix_sub))?;
+
let end = AllTimeRollupKey::nsid_prefix_end(&prefix_sub)?;
+
get_lexi_iter::<AllTimeRollupKey>(&snapshot, start, end)?
+
}
+
};
+
iters.push(it);
+
}
+
+
// with apologies
+
let mut iters: Vec<_> = iters
+
.into_iter()
+
.map(|it| {
+
it.map(|bla| {
+
bla.map(|(nsid, v)| {
+
let Some(child) = Child::from_prefix(&nsid, &prefix) else {
+
panic!("failed from_prefix: {nsid:?} {prefix:?} (bad iter bounds?)");
+
};
+
(child, v)
+
})
+
})
+
.peekable()
+
})
+
.collect();
+
+
let mut items = Vec::new();
+
let mut prefix_count = CountsValue::default();
+
#[derive(Debug, Clone, PartialEq)]
+
enum Child {
+
FullNsid(String),
+
ChildPrefix(String),
+
}
+
impl Child {
+
fn from_prefix(nsid: &Nsid, prefix: &NsidPrefix) -> Option<Self> {
+
if prefix.is_group_of(nsid) {
+
return Some(Child::FullNsid(nsid.to_string()));
+
}
+
let suffix = nsid.as_str().strip_prefix(&format!("{}.", prefix.0))?;
+
let (segment, _) = suffix.split_once('.').unwrap();
+
let child_prefix = format!("{}.{segment}", prefix.0);
+
Some(Child::ChildPrefix(child_prefix))
+
}
+
fn is_before(&self, other: &Child) -> bool {
+
match (self, other) {
+
(Child::FullNsid(s), Child::ChildPrefix(o)) if s == o => true,
+
(Child::ChildPrefix(s), Child::FullNsid(o)) if s == o => false,
+
(Child::FullNsid(s), Child::FullNsid(o)) => s < o,
+
(Child::ChildPrefix(s), Child::ChildPrefix(o)) => s < o,
+
(Child::FullNsid(s), Child::ChildPrefix(o)) => s < o,
+
(Child::ChildPrefix(s), Child::FullNsid(o)) => s < o,
+
}
+
}
+
fn into_inner(self) -> String {
+
match self {
+
Child::FullNsid(s) => s,
+
Child::ChildPrefix(s) => s,
+
}
}
}
+
let mut current_child: Option<Child> = None;
+
for _ in 0..limit {
+
// double-scan the iters for each element: this could be eliminated but we're starting simple.
+
// first scan: find the lowest nsid
+
// second scan: take + merge, and advance all iters with lowest nsid
+
let mut lowest: Option<Child> = None;
+
for iter in &mut iters {
+
if let Some(bla) = iter.peek_mut() {
+
let (child, _) = match bla {
+
Ok(v) => v,
+
Err(e) => Err(std::mem::replace(e, StorageError::Stolen))?,
+
};
+
lowest = match lowest {
+
Some(ref current) if current.is_before(child) => lowest,
+
_ => Some(child.clone()),
+
};
+
}
+
}
+
current_child = lowest.clone();
+
let Some(child) = lowest else { break };
+
+
let mut merged = CountsValue::default();
+
for iter in &mut iters {
+
// unwrap: any fjall error was already checked and bailed on when peeking in the first loop
+
while let Some(Ok((_, get_counts))) =
+
iter.next_if(|v| v.as_ref().unwrap().0 == child)
+
{
+
let counts = get_counts()?;
+
prefix_count.merge(&counts);
+
merged.merge(&counts);
+
}
+
}
+
items.push(match child {
+
Child::FullNsid(nsid) => PrefixChild::Collection(NsidCount {
+
nsid,
+
creates: merged.counts().creates,
+
dids_estimate: merged.dids().estimate() as u64,
+
}),
+
Child::ChildPrefix(prefix) => PrefixChild::Prefix(PrefixCount {
+
prefix,
+
creates: merged.counts().creates,
+
dids_estimate: merged.dids().estimate() as u64,
+
}),
+
});
+
}
+
+
// TODO: could serialize the prefix count (with sketch) into the cursor so that uniqs can actually count up?
+
// ....er the sketch is probably too big
+
// TODO: this is probably buggy on child-type boundaries bleh
+
let next_cursor = current_child
+
.map(|s| s.into_inner().to_db_bytes())
+
.transpose()?;
+
+
Ok(((&prefix_count).into(), items, next_cursor))
}
+
fn get_prefix(
+
&self,
+
prefix: NsidPrefix,
+
limit: usize,
+
order: OrderCollectionsBy,
+
since: Option<HourTruncatedCursor>,
+
until: Option<HourTruncatedCursor>,
+
) -> StorageResult<(JustCount, Vec<PrefixChild>, Option<Vec<u8>>)> {
+
let snapshot = self.rollups.snapshot();
+
let buckets = if let (None, None) = (since, until) {
+
vec![CursorBucket::AllTime]
+
} else {
+
let mut lower = self.get_earliest_hour(Some(&snapshot))?;
+
if let Some(specified) = since {
+
if specified > lower {
+
lower = specified;
+
}
+
}
+
let upper = until.unwrap_or_else(|| Cursor::at(SystemTime::now()).into());
+
CursorBucket::buckets_spanning(lower, upper)
+
};
+
match order {
+
OrderCollectionsBy::Lexi { cursor } => {
+
self.get_lexi_prefix(snapshot, prefix, limit, cursor, buckets)
+
}
+
_ => todo!(),
+
}
+
}
+
/// - step: output series time step, in seconds
+
fn get_timeseries(
+
&self,
+
collections: Vec<Nsid>,
+
since: HourTruncatedCursor,
+
until: Option<HourTruncatedCursor>,
+
step: u64,
+
) -> StorageResult<(Vec<HourTruncatedCursor>, CollectionSerieses)> {
+
if step > WEEK_IN_MICROS {
+
panic!("week-stepping is todo");
+
}
+
let until = until.unwrap_or_else(|| Cursor::at(SystemTime::now()).into());
+
let Ok(dt) = Cursor::from(until).duration_since(&Cursor::from(since)) else {
+
return Ok((
+
// empty: until < since
+
vec![],
+
collections.into_iter().map(|c| (c, vec![])).collect(),
+
));
+
};
+
let n_hours = (dt.as_micros() as u64) / HOUR_IN_MICROS;
+
let mut counts_by_hour = Vec::with_capacity(n_hours as usize);
+
let snapshot = self.rollups.snapshot();
+
for hour in (0..n_hours).map(|i| since.nth_next(i)) {
+
let mut counts = Vec::with_capacity(collections.len());
+
for nsid in &collections {
+
let count = snapshot
+
.get(&HourlyRollupKey::new(hour, nsid).to_db_bytes()?)?
+
.as_deref()
+
.map(db_complete::<CountsValue>)
+
.transpose()?
+
.unwrap_or_default();
+
counts.push(count);
+
}
+
counts_by_hour.push((hour, counts));
+
}
+
let step_hours = step / (HOUR_IN_MICROS / 1_000_000);
+
let mut output_hours = Vec::with_capacity(step_hours as usize);
+
let mut output_series: CollectionSerieses = collections
+
.iter()
+
.map(|c| (c.clone(), Vec::with_capacity(step_hours as usize)))
+
.collect();
+
for chunk in counts_by_hour.chunks(step_hours as usize) {
+
output_hours.push(chunk[0].0); // chunks() never yields an empty chunk, so the first element is always present
+
for (i, collection) in collections.iter().enumerate() {
+
let mut c = CountsValue::default();
+
for (_, counts) in chunk {
+
c.merge(&counts[i]);
+
}
+
output_series
+
.get_mut(collection)
+
.expect("output series is initialized with all collections")
+
.push(c);
}
}
+
+
Ok((output_hours, output_series))
+
}
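// Worked example (illustrative numbers): since = 24 hours ago, until = None,
// step = 21_600 seconds (6 hours) gives n_hours = 24 and step_hours = 6, so the
// response contains 4 output buckets, each merging 6 hourly CountsValue rollups
// per requested collection.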
+
+
fn get_collection_counts(
+
&self,
+
collection: &Nsid,
+
since: HourTruncatedCursor,
+
until: Option<HourTruncatedCursor>,
+
) -> StorageResult<JustCount> {
+
// grab snapshots in case rollups happen while we're working
+
let rollups = self.rollups.snapshot();
+
+
let until = until.unwrap_or_else(|| Cursor::at(SystemTime::now()).into());
+
let buckets = CursorBucket::buckets_spanning(since, until);
+
let mut total_counts = CountsValue::default();
+
+
for bucket in buckets {
+
let key = match bucket {
+
CursorBucket::Hour(t) => HourlyRollupKey::new(t, collection).to_db_bytes()?,
+
CursorBucket::Week(t) => WeeklyRollupKey::new(t, collection).to_db_bytes()?,
+
CursorBucket::AllTime => unreachable!(), // TODO: fall back on this if the time span spans the whole dataset?
+
};
+
let count = rollups
+
.get(&key)?
+
.as_deref()
+
.map(db_complete::<CountsValue>)
+
.transpose()?
+
.unwrap_or_default();
+
total_counts.merge(&count);
+
}
+
+
Ok((&total_counts).into())
}
fn get_records_by_collections(
&self,
+
collections: HashSet<Nsid>,
limit: usize,
expand_each_collection: bool,
) -> StorageResult<Vec<UFOsRecord>> {
···
}
let mut record_iterators = Vec::new();
for collection in collections {
+
let iter = RecordIterator::new(&self.feeds, self.records.clone(), &collection, limit)?;
record_iterators.push(iter.peekable());
}
let mut merged = Vec::new();
···
let s = self.clone();
tokio::task::spawn_blocking(move || FjallReader::get_consumer_info(&s)).await?
}
+
async fn get_collections(
+
&self,
+
limit: usize,
+
order: OrderCollectionsBy,
+
since: Option<HourTruncatedCursor>,
+
until: Option<HourTruncatedCursor>,
+
) -> StorageResult<(Vec<NsidCount>, Option<Vec<u8>>)> {
let s = self.clone();
+
tokio::task::spawn_blocking(move || {
+
FjallReader::get_collections(&s, limit, order, since, until)
+
})
+
.await?
}
+
async fn get_prefix(
+
&self,
+
prefix: NsidPrefix,
+
limit: usize,
+
order: OrderCollectionsBy,
+
since: Option<HourTruncatedCursor>,
+
until: Option<HourTruncatedCursor>,
+
) -> StorageResult<(JustCount, Vec<PrefixChild>, Option<Vec<u8>>)> {
+
let s = self.clone();
+
tokio::task::spawn_blocking(move || {
+
FjallReader::get_prefix(&s, prefix, limit, order, since, until)
+
})
+
.await?
+
}
+
async fn get_timeseries(
+
&self,
+
collections: Vec<Nsid>,
+
since: HourTruncatedCursor,
+
until: Option<HourTruncatedCursor>,
+
step: u64,
+
) -> StorageResult<(Vec<HourTruncatedCursor>, CollectionSerieses)> {
+
let s = self.clone();
+
tokio::task::spawn_blocking(move || {
+
FjallReader::get_timeseries(&s, collections, since, until, step)
+
})
+
.await?
+
}
+
async fn get_collection_counts(
+
&self,
+
collection: &Nsid,
+
since: HourTruncatedCursor,
+
until: Option<HourTruncatedCursor>,
+
) -> StorageResult<JustCount> {
let s = self.clone();
let collection = collection.clone();
+
tokio::task::spawn_blocking(move || {
+
FjallReader::get_collection_counts(&s, &collection, since, until)
+
})
+
.await?
}
async fn get_records_by_collections(
&self,
+
collections: HashSet<Nsid>,
limit: usize,
expand_each_collection: bool,
) -> StorageResult<Vec<UFOsRecord>> {
let s = self.clone();
tokio::task::spawn_blocking(move || {
+
FjallReader::get_records_by_collections(&s, collections, limit, expand_each_collection)
})
.await?
}
}
+
#[derive(Clone)]
pub struct FjallWriter {
+
bg_taken: Arc<AtomicBool>,
keyspace: Keyspace,
global: PartitionHandle,
feeds: PartitionHandle,
···
timelies: impl Iterator<Item = Result<(fjall::Slice, fjall::Slice), fjall::Error>>,
cursor_exclusive_limit: Option<Cursor>,
rollup_limit: usize,
+
) -> StorageResult<(usize, HashSet<Nsid>)> {
// current strategy is to buffer counts in mem before writing the rollups
// we *could* read+write every single batch to rollup.. but their merge is associative so
// ...so save the db some work up front? is this worth it? who knows...
+
+
let mut dirty_nsids = HashSet::new();
#[derive(Eq, Hash, PartialEq)]
enum Rollup {
···
break;
}
+
dirty_nsids.insert(key.collection().clone());
+
batch.remove(&self.rollups, key_bytes);
let val = db_complete::<CountsValue>(&val_bytes)?;
counts_by_rollup
···
last_cursor = key.cursor();
}
+
// go through each new rollup thing and merge it with whatever might already be in the db
for ((nsid, rollup), counts) in counts_by_rollup {
+
let rollup_key_bytes = match rollup {
Rollup::Hourly(hourly_cursor) => {
+
HourlyRollupKey::new(hourly_cursor, &nsid).to_db_bytes()?
}
Rollup::Weekly(weekly_cursor) => {
+
WeeklyRollupKey::new(weekly_cursor, &nsid).to_db_bytes()?
}
+
Rollup::AllTime => AllTimeRollupKey::new(&nsid).to_db_bytes()?,
};
let mut rolled: CountsValue = self
.rollups
+
.get(&rollup_key_bytes)?
.as_deref()
.map(db_complete::<CountsValue>)
.transpose()?
.unwrap_or_default();
+
// now that we have values, we can know the existing ranks
+
let before_creates_count = rolled.counts().creates;
+
let before_dids_estimate = rolled.dids().estimate() as u64;
+
+
// update the rollup
+
rolled.merge(&counts);
+
+
// new ranks
+
let new_creates_count = rolled.counts().creates;
+
let new_dids_estimate = rolled.dids().estimate() as u64;
+
+
// update create-ranked secondary index if rank changed
+
if new_creates_count != before_creates_count {
+
let (old_k, new_k) = match rollup {
+
Rollup::Hourly(cursor) => (
+
HourlyRecordsKey::new(cursor, before_creates_count.into(), &nsid)
+
.to_db_bytes()?,
+
HourlyRecordsKey::new(cursor, new_creates_count.into(), &nsid)
+
.to_db_bytes()?,
+
),
+
Rollup::Weekly(cursor) => (
+
WeeklyRecordsKey::new(cursor, before_creates_count.into(), &nsid)
+
.to_db_bytes()?,
+
WeeklyRecordsKey::new(cursor, new_creates_count.into(), &nsid)
+
.to_db_bytes()?,
+
),
+
Rollup::AllTime => (
+
AllTimeRecordsKey::new(before_creates_count.into(), &nsid).to_db_bytes()?,
+
AllTimeRecordsKey::new(new_creates_count.into(), &nsid).to_db_bytes()?,
+
),
+
};
+
batch.remove(&self.rollups, &old_k); // TODO: when fjall gets weak delete, this will hopefully work way better
+
batch.insert(&self.rollups, &new_k, "");
+
}
+
+
// update dids-ranked secondary index if rank changed
+
if new_dids_estimate != before_dids_estimate {
+
let (old_k, new_k) = match rollup {
+
Rollup::Hourly(cursor) => (
+
HourlyDidsKey::new(cursor, before_dids_estimate.into(), &nsid)
+
.to_db_bytes()?,
+
HourlyDidsKey::new(cursor, new_dids_estimate.into(), &nsid)
+
.to_db_bytes()?,
+
),
+
Rollup::Weekly(cursor) => (
+
WeeklyDidsKey::new(cursor, before_dids_estimate.into(), &nsid)
+
.to_db_bytes()?,
+
WeeklyDidsKey::new(cursor, new_dids_estimate.into(), &nsid)
+
.to_db_bytes()?,
+
),
+
Rollup::AllTime => (
+
AllTimeDidsKey::new(before_dids_estimate.into(), &nsid).to_db_bytes()?,
+
AllTimeDidsKey::new(new_dids_estimate.into(), &nsid).to_db_bytes()?,
+
),
+
};
+
batch.remove(&self.rollups, &old_k); // TODO: when fjall gets weak delete, this will hopefully work way better
+
batch.insert(&self.rollups, &new_k, "");
}
+
// replace the main counts rollup
+
batch.insert(&self.rollups, &rollup_key_bytes, &rolled.to_db_bytes()?);
}
insert_batch_static_neu::<NewRollupCursorKey>(&mut batch, &self.global, last_cursor)?;
batch.commit()?;
+
Ok((cursors_advanced, dirty_nsids))
}
}
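// Illustrative example of the rank-index maintenance above: if the hourly
// rollup for `app.bsky.feed.like` grows from 10 to 12 creates, the batch drops
// the stale "hourly_rank_records" || hour || 10 || nsid key, inserts
// "hourly_rank_records" || hour || 12 || nsid with an empty value, and writes
// the merged CountsValue back to the "hourly_counts" rollup key.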
+
impl StoreWriter<FjallBackground> for FjallWriter {
+
fn background_tasks(&mut self, reroll: bool) -> StorageResult<FjallBackground> {
+
if self.bg_taken.swap(true, Ordering::SeqCst) {
+
Err(StorageError::BackgroundAlreadyStarted)
+
} else {
+
if reroll {
+
log::info!("reroll: resetting rollup cursor...");
+
insert_static_neu::<NewRollupCursorKey>(&self.global, Cursor::from_start())?;
+
log::info!("reroll: clearing trim cursors...");
+
let mut batch = self.keyspace.batch();
+
for kv in self
+
.global
+
.prefix(TrimCollectionCursorKey::from_prefix_to_db_bytes(
+
&Default::default(),
+
)?)
+
{
+
let (k, _) = kv?;
+
batch.remove(&self.global, k);
+
}
+
let n = batch.len();
+
batch.commit()?;
+
log::info!("reroll: cleared {n} trim cursors.");
+
}
+
Ok(FjallBackground(self.clone()))
+
}
+
}
+
fn insert_batch<const LIMIT: usize>(
&mut self,
event_batch: EventBatch<LIMIT>,
···
}
}
let live_counts_key: LiveCountsKey = (latest, &nsid).into();
+
let counts_value = CountsValue::new(
+
CommitCounts {
+
creates: commits.creates as u64,
+
updates: commits.updates as u64,
+
deletes: commits.deletes as u64,
+
},
+
commits.dids_estimate,
+
);
batch.insert(
&self.rollups,
&live_counts_key.to_db_bytes()?,
···
Ok(())
}
+
fn step_rollup(&mut self) -> StorageResult<(usize, HashSet<Nsid>)> {
+
let mut dirty_nsids = HashSet::new();
+
let rollup_cursor =
get_static_neu::<NewRollupCursorKey, NewRollupCursorValue>(&self.global)?.ok_or(
StorageError::BadStateError("Could not find current rollup cursor".to_string()),
···
let live_counts_range = LiveCountsKey::range_from_cursor(rollup_cursor)?;
let mut timely_iter = self.rollups.range(live_counts_range).peekable();
+
let timely_next = timely_iter
.peek_mut()
+
.map(|kv| -> StorageResult<LiveCountsKey> {
match kv {
Err(e) => Err(std::mem::replace(e, fjall::Error::Poisoned))?,
Ok((key_bytes, _)) => {
let key = db_complete::<LiveCountsKey>(key_bytes)?;
+
Ok(key)
}
}
})
···
})
.transpose()?;
+
let cursors_stepped = match (timely_next, next_delete) {
+
(Some(timely), Some((delete_cursor, delete_key_bytes, delete_val_bytes))) => {
+
if timely.cursor() < delete_cursor {
+
let (n, dirty) = self.rollup_live_counts(
timely_iter,
Some(delete_cursor),
MAX_BATCHED_ROLLUP_COUNTS,
+
)?;
+
dirty_nsids.extend(dirty);
+
n
} else {
self.rollup_delete_account(delete_cursor, &delete_key_bytes, &delete_val_bytes)?
}
}
(Some(_), None) => {
+
let (n, dirty) =
+
self.rollup_live_counts(timely_iter, None, MAX_BATCHED_ROLLUP_COUNTS)?;
+
dirty_nsids.extend(dirty);
+
n
}
(None, Some((delete_cursor, delete_key_bytes, delete_val_bytes))) => {
self.rollup_delete_account(delete_cursor, &delete_key_bytes, &delete_val_bytes)?
···
(None, None) => 0,
};
+
Ok((cursors_stepped, dirty_nsids))
}
fn trim_collection(
&mut self,
collection: &Nsid,
limit: usize,
+
full_scan: bool,
+
) -> StorageResult<(usize, usize, bool)> {
let mut dangling_feed_keys_cleaned = 0;
let mut records_deleted = 0;
+
let live_range = if full_scan {
+
let start = NsidRecordFeedKey::from_prefix_to_db_bytes(collection)?;
+
let end = NsidRecordFeedKey::prefix_range_end(collection)?;
+
start..end
+
} else {
+
let feed_trim_cursor_key =
+
TrimCollectionCursorKey::new(collection.clone()).to_db_bytes()?;
+
let trim_cursor = self
+
.global
+
.get(&feed_trim_cursor_key)?
+
.map(|value_bytes| db_complete(&value_bytes))
+
.transpose()?
+
.unwrap_or(Cursor::from_start());
+
NsidRecordFeedKey::from_pair(collection.clone(), trim_cursor).range_to_prefix_end()?
+
};
+
let mut live_records_found = 0;
+
let mut candidate_new_feed_lower_cursor = None;
+
let ended_early = false;
+
let mut current_cursor: Option<Cursor> = None;
+
for (i, kv) in self.feeds.range(live_range).rev().enumerate() {
+
if i > 0 && i % 500_000 == 0 {
+
log::info!(
+
"trim: at {i} for {:?} (now at {})",
+
collection.to_string(),
+
current_cursor
+
.map(|c| c
+
.elapsed()
+
.map(nice_duration)
+
.unwrap_or("[not past]".into()))
+
.unwrap_or("??".into()),
+
);
+
}
let (key_bytes, val_bytes) = kv?;
let feed_key = db_complete::<NsidRecordFeedKey>(&key_bytes)?;
let feed_val = db_complete::<NsidRecordFeedVal>(&val_bytes)?;
···
let Some(location_val_bytes) = self.records.get(&location_key_bytes)? else {
// record was deleted (hopefully)
+
self.feeds.remove(&*key_bytes)?;
dangling_feed_keys_cleaned += 1;
continue;
};
let (meta, _) = RecordLocationMeta::from_db_bytes(&location_val_bytes)?;
+
current_cursor = Some(meta.cursor());
if meta.cursor() != feed_key.cursor() {
// older/different version
+
self.feeds.remove(&*key_bytes)?;
dangling_feed_keys_cleaned += 1;
continue;
}
if meta.rev != feed_val.rev() {
// weird...
log::warn!("record lookup: cursor match but rev did not...? removing.");
+
self.records.remove(&location_key_bytes)?;
+
self.feeds.remove(&*key_bytes)?;
dangling_feed_keys_cleaned += 1;
continue;
}
+
live_records_found += 1;
+
if live_records_found <= limit {
continue;
}
+
if candidate_new_feed_lower_cursor.is_none() {
+
candidate_new_feed_lower_cursor = Some(feed_key.cursor());
+
}
+
self.records.remove(&location_key_bytes)?;
+
self.feeds.remove(key_bytes)?;
records_deleted += 1;
}
+
if !ended_early {
+
if let Some(new_cursor) = candidate_new_feed_lower_cursor {
+
self.global.insert(
+
&TrimCollectionCursorKey::new(collection.clone()).to_db_bytes()?,
+
&new_cursor.to_db_bytes()?,
+
)?;
+
}
+
}
+
log::trace!("trim_collection ({collection:?}) removed {dangling_feed_keys_cleaned} dangling feed entries and {records_deleted} records (ended early? {ended_early})");
+
Ok((dangling_feed_keys_cleaned, records_deleted, ended_early))
}
fn delete_account(&mut self, did: &Did) -> Result<usize, StorageError> {
···
}
batch.commit()?;
Ok(records_deleted)
+
}
+
}
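
`trim_collection` above walks a collection's feed newest-first, prunes dangling entries, keeps the newest `limit` live records, deletes the rest, and remembers the cursor of the first deleted entry so the next incremental (non-full-scan) pass can start there instead of re-walking already-trimmed history. A stripped-down sketch of that keep-N-newest-plus-resume-cursor idea over an ordered map (a hypothetical stand-in; the real code also cross-checks the `records` partition and commits in batches):

```rust
use std::collections::BTreeMap;

/// Keep the `limit` newest entries of a cursor-ordered feed, delete the rest,
/// and return the cursor where the next incremental trim can resume.
fn trim_newest(feed: &mut BTreeMap<u64, String>, limit: usize) -> Option<u64> {
    let mut kept = 0;
    let mut resume_cursor: Option<u64> = None;
    let mut to_delete = Vec::new();
    for (&cursor, _) in feed.iter().rev() {
        kept += 1;
        if kept <= limit {
            continue;
        }
        // first entry past the limit marks the new lower bound for future trims
        resume_cursor.get_or_insert(cursor);
        to_delete.push(cursor);
    }
    for cursor in to_delete {
        feed.remove(&cursor);
    }
    resume_cursor
}

fn main() {
    let mut feed: BTreeMap<u64, String> =
        (1..=10u64).map(|c| (c, format!("record-{c}"))).collect();
    let resume = trim_newest(&mut feed, 6);
    assert_eq!(feed.len(), 6); // cursors 5..=10 remain
    assert_eq!(resume, Some(4)); // next trim starts at cursor 4; everything older is gone
}
```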
+
+
pub struct FjallBackground(FjallWriter);
+
+
#[async_trait]
+
impl StoreBackground for FjallBackground {
+
async fn run(mut self, backfill: bool) -> StorageResult<()> {
+
let mut dirty_nsids = HashSet::new();
+
+
// backfill interval choice here is iffy -- a longer interval is good during the main ingest and the collection trims that follow
+
// while a shorter one helps things catch up once those are done
+
// the best setting for non-backfill is non-obvious: it can be pretty slow and still be fine
+
let mut rollup =
+
tokio::time::interval(Duration::from_micros(if backfill { 100 } else { 32_000 }));
+
rollup.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
+
+
// backfill condition again iffy. collection trims should probably happen in their own phase.
+
let mut trim = tokio::time::interval(Duration::from_secs(if backfill { 18 } else { 9 }));
+
trim.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
+
+
loop {
+
tokio::select! {
+
_ = rollup.tick() => {
+
let mut db = self.0.clone();
+
let (n, dirty) = tokio::task::spawn_blocking(move || db.step_rollup()).await??;
+
if n == 0 {
+
rollup.reset_after(Duration::from_millis(1_200)); // we're caught up, take a break
+
}
+
dirty_nsids.extend(dirty);
+
log::trace!("rolled up {n} items ({} collections now dirty)", dirty_nsids.len());
+
},
+
_ = trim.tick() => {
+
let n = dirty_nsids.len();
+
log::trace!("trimming {n} nsids: {dirty_nsids:?}");
+
let t0 = Instant::now();
+
let (mut total_danglers, mut total_deleted) = (0, 0);
+
let mut completed = HashSet::new();
+
for collection in &dirty_nsids {
+
let mut db = self.0.clone();
+
let c = collection.clone();
+
let (danglers, deleted, ended_early) = tokio::task::spawn_blocking(move || db.trim_collection(&c, 512, false)).await??;
+
total_danglers += danglers;
+
total_deleted += deleted;
+
if !ended_early {
+
completed.insert(collection.clone());
+
}
+
if total_deleted > 10_000_000 {
+
log::info!("trim stopped early, more than 10M records already deleted.");
+
break;
+
}
+
}
+
for c in completed {
+
dirty_nsids.remove(&c);
+
}
+
log::info!("finished trimming {n} nsids in {:?}: {total_danglers} dangling and {total_deleted} total removed.", t0.elapsed());
+
},
+
};
+
}
}
}
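
The `run` loop above multiplexes two cadences with `tokio::select!`: a fast rollup tick (`MissedTickBehavior::Delay`, backing off via `reset_after` when nothing was rolled up) and a slower trim tick (`MissedTickBehavior::Skip`) that only revisits collections the rollups marked dirty. A runnable, stripped-down sketch of that shape, with dummy blocking functions standing in for `step_rollup` and `trim_collection`, and the loop bounded so the example terminates:

```rust
use std::collections::HashSet;
use std::time::Duration;
use tokio::time::MissedTickBehavior;

// Stand-ins for the real blocking storage calls.
fn step_rollup() -> (usize, HashSet<String>) {
    (0, HashSet::new())
}
fn trim_collection(_nsid: &str) {}

#[tokio::main]
async fn main() {
    let mut dirty: HashSet<String> = HashSet::new();

    let mut rollup = tokio::time::interval(Duration::from_millis(32));
    rollup.set_missed_tick_behavior(MissedTickBehavior::Delay);

    let mut trim = tokio::time::interval(Duration::from_secs(9));
    trim.set_missed_tick_behavior(MissedTickBehavior::Skip);

    for _ in 0..4 {
        // the real loop runs forever; bounded here so the sketch exits
        tokio::select! {
            _ = rollup.tick() => {
                // heavy DB work stays off the async runtime threads
                let (n, newly_dirty) = tokio::task::spawn_blocking(step_rollup).await.unwrap();
                if n == 0 {
                    rollup.reset_after(Duration::from_millis(1_200)); // caught up, back off
                }
                dirty.extend(newly_dirty);
            },
            _ = trim.tick() => {
                for nsid in dirty.drain() {
                    tokio::task::spawn_blocking(move || trim_collection(&nsid)).await.unwrap();
                }
            },
        }
    }
}
```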
···
Ok(())
}
+
/// Set a value to a fixed key, erroring if the value already exists
+
///
+
/// Intended for single-threaded init: not safe under concurrency, since there
+
/// is no transaction between checking whether the value already exists and writing it.
+
fn init_static_neu<K: StaticStr>(
+
global: &PartitionHandle,
+
value: impl DbBytes,
+
) -> StorageResult<()> {
+
let key_bytes = DbStaticStr::<K>::default().to_db_bytes()?;
+
if global.get(&key_bytes)?.is_some() {
+
return Err(StorageError::InitError(format!(
+
"init failed: value for key {key_bytes:?} already exists"
+
)));
+
}
+
let value_bytes = value.to_db_bytes()?;
+
global.insert(&key_bytes, &value_bytes)?;
+
Ok(())
+
}
+
/// Set a value to a fixed key
fn insert_batch_static_neu<K: StaticStr>(
batch: &mut FjallBatch,
···
////////// temp stuff to remove:
#[cfg(test)]
mod tests {
use super::*;
···
use serde_json::value::RawValue;
fn fjall_db() -> (FjallReader, FjallWriter) {
+
let (read, write, _, _) = FjallStorage::init(
tempfile::tempdir().unwrap(),
"offline test (no real jetstream endpoint)".to_string(),
false,
···
}
const TEST_BATCH_LIMIT: usize = 16;
+
fn beginning() -> HourTruncatedCursor {
+
Cursor::from_start().into()
+
}
#[derive(Debug, Default)]
struct TestBatch {
···
.commits_by_nsid
.entry(collection.clone())
.or_default()
+
.truncating_insert(commit, &[0u8; 16])
.unwrap();
collection
···
.commits_by_nsid
.entry(collection.clone())
.or_default()
+
.truncating_insert(commit, &[0u8; 16])
.unwrap();
collection
···
.commits_by_nsid
.entry(collection.clone())
.or_default()
+
.truncating_insert(commit, &[0u8; 16])
.unwrap();
collection
···
fn test_hello() -> anyhow::Result<()> {
let (read, mut write) = fjall_db();
write.insert_batch::<TEST_BATCH_LIMIT>(EventBatch::default())?;
+
let JustCount {
+
creates,
+
dids_estimate,
+
..
+
} = read.get_collection_counts(
+
&Nsid::new("a.b.c".to_string()).unwrap(),
+
beginning(),
+
None,
+
)?;
+
assert_eq!(creates, 0);
+
assert_eq!(dids_estimate, 0);
Ok(())
}
···
100,
);
write.insert_batch(batch.batch)?;
+
write.step_rollup()?;
+
let JustCount {
+
creates,
+
dids_estimate,
+
..
+
} = read.get_collection_counts(&collection, beginning(), None)?;
+
assert_eq!(creates, 1);
+
assert_eq!(dids_estimate, 1);
+
let JustCount {
+
creates,
+
dids_estimate,
+
..
+
} = read.get_collection_counts(
+
&Nsid::new("d.e.f".to_string()).unwrap(),
+
beginning(),
+
None,
+
)?;
+
assert_eq!(creates, 0);
+
assert_eq!(dids_estimate, 0);
+
let records = read.get_records_by_collections([collection].into(), 2, false)?;
assert_eq!(records.len(), 1);
let rec = &records[0];
assert_eq!(rec.record.get(), "{}");
assert!(!rec.is_update);
+
let records = read.get_records_by_collections(
+
[Nsid::new("d.e.f".to_string()).unwrap()].into(),
+
2,
+
false,
+
)?;
assert_eq!(records.len(), 0);
Ok(())
···
write.insert_batch(batch.batch)?;
let records = read.get_records_by_collections(
+
HashSet::from([
Nsid::new("a.a.a".to_string()).unwrap(),
Nsid::new("a.a.b".to_string()).unwrap(),
Nsid::new("a.a.c".to_string()).unwrap(),
+
]),
100,
false,
)?;
···
write.insert_batch(batch.batch)?;
let records = read.get_records_by_collections(
+
HashSet::from([
Nsid::new("a.a.a".to_string()).unwrap(),
Nsid::new("a.a.b".to_string()).unwrap(),
Nsid::new("a.a.c".to_string()).unwrap(),
+
]),
2,
true,
)?;
···
101,
);
write.insert_batch(batch.batch)?;
+
write.step_rollup()?;
+
let JustCount {
+
creates,
+
dids_estimate,
+
..
+
} = read.get_collection_counts(&collection, beginning(), None)?;
+
assert_eq!(creates, 1);
+
assert_eq!(dids_estimate, 1);
+
let records = read.get_records_by_collections([collection].into(), 2, false)?;
assert_eq!(records.len(), 1);
let rec = &records[0];
assert_eq!(rec.record.get(), r#"{"ch": "ch-ch-ch-changes"}"#);
···
101,
);
write.insert_batch(batch.batch)?;
+
write.step_rollup()?;
+
let JustCount {
+
creates,
+
dids_estimate,
+
..
+
} = read.get_collection_counts(&collection, beginning(), None)?;
+
assert_eq!(creates, 1);
+
assert_eq!(dids_estimate, 1);
+
let records = read.get_records_by_collections([collection].into(), 2, false)?;
assert_eq!(records.len(), 0);
Ok(())
···
write.insert_batch(batch.batch)?;
let records = read.get_records_by_collections(
+
HashSet::from([Nsid::new("a.a.a".to_string()).unwrap()]),
100,
false,
)?;
assert_eq!(records.len(), 1);
let records = read.get_records_by_collections(
+
HashSet::from([Nsid::new("a.a.b".to_string()).unwrap()]),
100,
false,
)?;
assert_eq!(records.len(), 10);
let records = read.get_records_by_collections(
+
HashSet::from([Nsid::new("a.a.c".to_string()).unwrap()]),
100,
false,
)?;
assert_eq!(records.len(), 1);
let records = read.get_records_by_collections(
+
HashSet::from([Nsid::new("a.a.d".to_string()).unwrap()]),
100,
false,
)?;
assert_eq!(records.len(), 0);
+
write.trim_collection(&Nsid::new("a.a.a".to_string()).unwrap(), 6, false)?;
+
write.trim_collection(&Nsid::new("a.a.b".to_string()).unwrap(), 6, false)?;
+
write.trim_collection(&Nsid::new("a.a.c".to_string()).unwrap(), 6, false)?;
+
write.trim_collection(&Nsid::new("a.a.d".to_string()).unwrap(), 6, false)?;
let records = read.get_records_by_collections(
+
HashSet::from([Nsid::new("a.a.a".to_string()).unwrap()]),
100,
false,
)?;
assert_eq!(records.len(), 1);
let records = read.get_records_by_collections(
+
HashSet::from([Nsid::new("a.a.b".to_string()).unwrap()]),
100,
false,
)?;
assert_eq!(records.len(), 6);
let records = read.get_records_by_collections(
+
HashSet::from([Nsid::new("a.a.c".to_string()).unwrap()]),
100,
false,
)?;
assert_eq!(records.len(), 1);
let records = read.get_records_by_collections(
+
HashSet::from([Nsid::new("a.a.d".to_string()).unwrap()]),
100,
false,
)?;
···
write.insert_batch(batch.batch)?;
let records = read.get_records_by_collections(
+
HashSet::from([Nsid::new("a.a.a".to_string()).unwrap()]),
100,
false,
)?;
···
assert_eq!(records_deleted, 2);
let records = read.get_records_by_collections(
+
HashSet::from([Nsid::new("a.a.a".to_string()).unwrap()]),
100,
false,
)?;
···
write.step_rollup()?;
+
let records = read.get_records_by_collections(
+
[Nsid::new("a.a.a".to_string()).unwrap()].into(),
+
1,
+
false,
+
)?;
assert_eq!(records.len(), 0);
Ok(())
···
);
write.insert_batch(batch.batch)?;
+
let (n, _) = write.step_rollup()?;
assert_eq!(n, 1);
let mut batch = TestBatch::default();
batch.delete_account("did:plc:person-a", 10_001);
write.insert_batch(batch.batch)?;
+
let records = read.get_records_by_collections(
+
[Nsid::new("a.a.a".to_string()).unwrap()].into(),
+
1,
+
false,
+
)?;
assert_eq!(records.len(), 1);
+
let (n, _) = write.step_rollup()?;
assert_eq!(n, 1);
+
let records = read.get_records_by_collections(
+
[Nsid::new("a.a.a".to_string()).unwrap()].into(),
+
1,
+
false,
+
)?;
assert_eq!(records.len(), 0);
let mut batch = TestBatch::default();
batch.delete_account("did:plc:person-a", 9_999);
write.insert_batch(batch.batch)?;
+
let (n, _) = write.step_rollup()?;
assert_eq!(n, 0);
Ok(())
···
);
write.insert_batch(batch.batch)?;
+
let (n, _) = write.step_rollup()?;
assert_eq!(n, 2);
+
let (n, _) = write.step_rollup()?;
assert_eq!(n, 0);
Ok(())
···
write.insert_batch(batch.batch)?;
// before any rollup
+
let JustCount {
+
creates,
+
dids_estimate,
+
..
+
} = read.get_collection_counts(
+
&Nsid::new("a.a.a".to_string()).unwrap(),
+
beginning(),
+
None,
+
)?;
+
assert_eq!(creates, 0);
+
assert_eq!(dids_estimate, 0);
// first batch rolled up
+
let (n, _) = write.step_rollup()?;
assert_eq!(n, 1);
+
let JustCount {
+
creates,
+
dids_estimate,
+
..
+
} = read.get_collection_counts(
+
&Nsid::new("a.a.a".to_string()).unwrap(),
+
beginning(),
+
None,
+
)?;
+
assert_eq!(creates, 2);
+
assert_eq!(dids_estimate, 2);
// delete account rolled up
+
let (n, _) = write.step_rollup()?;
assert_eq!(n, 1);
+
let JustCount {
+
creates,
+
dids_estimate,
+
..
+
} = read.get_collection_counts(
+
&Nsid::new("a.a.a".to_string()).unwrap(),
+
beginning(),
+
None,
+
)?;
+
assert_eq!(creates, 2);
+
assert_eq!(dids_estimate, 2);
// second batch rolled up
+
let (n, _) = write.step_rollup()?;
assert_eq!(n, 1);
+
let JustCount {
+
creates,
+
dids_estimate,
+
..
+
} = read.get_collection_counts(
+
&Nsid::new("a.a.a".to_string()).unwrap(),
+
beginning(),
+
None,
+
)?;
+
assert_eq!(creates, 3);
+
assert_eq!(dids_estimate, 2);
// no more rollups left
+
let (n, _) = write.step_rollup()?;
assert_eq!(n, 0);
Ok(())
}
#[test]
+
fn get_prefix_children_lexi_empty() {
+
let (read, _) = fjall_db();
+
let (
+
JustCount {
+
creates,
+
dids_estimate,
+
..
+
},
+
children,
+
cursor,
+
) = read
+
.get_prefix(
+
NsidPrefix::new("aaa.aaa").unwrap(),
+
10,
+
OrderCollectionsBy::Lexi { cursor: None },
+
None,
+
None,
+
)
+
.unwrap();
+
+
assert_eq!(creates, 0);
+
assert_eq!(dids_estimate, 0);
+
assert_eq!(children, vec![]);
+
assert_eq!(cursor, None);
+
}
+
+
#[test]
+
fn get_prefix_excludes_exact_collection() -> anyhow::Result<()> {
let (read, mut write) = fjall_db();
let mut batch = TestBatch::default();
···
None,
10_000,
);
+
write.insert_batch(batch.batch)?;
+
write.step_rollup()?;
+
+
let (
+
JustCount {
+
creates,
+
dids_estimate,
+
..
+
},
+
children,
+
cursor,
+
) = read.get_prefix(
+
NsidPrefix::new("a.a.a").unwrap(),
+
10,
+
OrderCollectionsBy::Lexi { cursor: None },
+
None,
+
None,
+
)?;
+
assert_eq!(creates, 0);
+
assert_eq!(dids_estimate, 0);
+
assert_eq!(children, vec![]);
+
assert_eq!(cursor, None);
+
Ok(())
+
}
+
+
#[test]
+
fn get_prefix_excludes_neighbour_collection() -> anyhow::Result<()> {
+
let (read, mut write) = fjall_db();
+
+
let mut batch = TestBatch::default();
batch.create(
+
"did:plc:person-a",
+
"a.a.aa",
+
"rkey-aaa",
"{}",
+
Some("rev-aaa"),
None,
+
10_000,
);
+
write.insert_batch(batch.batch)?;
+
write.step_rollup()?;
+
+
let (
+
JustCount {
+
creates,
+
dids_estimate,
+
..
+
},
+
children,
+
cursor,
+
) = read.get_prefix(
+
NsidPrefix::new("a.a.a").unwrap(),
+
10,
+
OrderCollectionsBy::Lexi { cursor: None },
+
None,
+
None,
+
)?;
+
assert_eq!(creates, 0);
+
assert_eq!(dids_estimate, 0);
+
assert_eq!(children, vec![]);
+
assert_eq!(cursor, None);
+
Ok(())
+
}
+
+
#[test]
+
fn get_prefix_includes_child_collection() -> anyhow::Result<()> {
+
let (read, mut write) = fjall_db();
+
+
let mut batch = TestBatch::default();
batch.create(
+
"did:plc:person-a",
+
"a.a.a",
+
"rkey-aaa",
"{}",
+
Some("rev-aaa"),
None,
+
10_000,
);
+
write.insert_batch(batch.batch)?;
+
write.step_rollup()?;
+
+
let (
+
JustCount {
+
creates,
+
dids_estimate,
+
..
+
},
+
children,
+
cursor,
+
) = read.get_prefix(
+
NsidPrefix::new("a.a").unwrap(),
+
10,
+
OrderCollectionsBy::Lexi { cursor: None },
+
None,
+
None,
+
)?;
+
assert_eq!(creates, 1);
+
assert_eq!(dids_estimate, 1);
+
assert_eq!(
+
children,
+
vec![PrefixChild::Collection(NsidCount {
+
nsid: "a.a.a".to_string(),
+
creates: 1,
+
dids_estimate: 1
+
}),]
+
);
+
assert_eq!(cursor, None);
+
Ok(())
+
}
+
+
#[test]
+
fn get_prefix_includes_child_prefix() -> anyhow::Result<()> {
+
let (read, mut write) = fjall_db();
+
+
let mut batch = TestBatch::default();
batch.create(
"did:plc:person-a",
+
"a.a.a.a",
+
"rkey-aaaa",
"{}",
+
Some("rev-aaaa"),
None,
+
10_000,
);
write.insert_batch(batch.batch)?;
+
write.step_rollup()?;
+
let (
+
JustCount {
+
creates,
+
dids_estimate,
+
..
+
},
+
children,
+
cursor,
+
) = read.get_prefix(
+
NsidPrefix::new("a.a").unwrap(),
+
10,
+
OrderCollectionsBy::Lexi { cursor: None },
+
None,
+
None,
+
)?;
+
assert_eq!(creates, 1);
+
assert_eq!(dids_estimate, 1);
assert_eq!(
+
children,
+
vec![PrefixChild::Prefix(PrefixCount {
+
prefix: "a.a.a".to_string(),
+
creates: 1,
+
dids_estimate: 1
+
}),]
);
+
assert_eq!(cursor, None);
Ok(())
}
#[test]
+
fn get_prefix_merges_child_prefixes() -> anyhow::Result<()> {
let (read, mut write) = fjall_db();
let mut batch = TestBatch::default();
batch.create(
+
"did:plc:person-a",
"a.a.a.a",
+
"rkey-aaaa",
+
"{}",
Some("rev-aaaa"),
None,
+
10_000,
);
batch.create(
+
"did:plc:person-a",
+
"a.a.a.b",
+
"rkey-aaab",
+
"{}",
+
Some("rev-aaab"),
None,
+
10_001,
);
write.insert_batch(batch.batch)?;
+
write.step_rollup()?;
+
let (
+
JustCount {
+
creates,
+
dids_estimate,
+
..
+
},
+
children,
+
cursor,
+
) = read.get_prefix(
+
NsidPrefix::new("a.a").unwrap(),
+
10,
+
OrderCollectionsBy::Lexi { cursor: None },
+
None,
+
None,
+
)?;
+
assert_eq!(creates, 2);
+
assert_eq!(dids_estimate, 1);
assert_eq!(
+
children,
+
vec![PrefixChild::Prefix(PrefixCount {
+
prefix: "a.a.a".to_string(),
+
creates: 2,
+
dids_estimate: 1
+
}),]
);
+
assert_eq!(cursor, None);
+
Ok(())
+
}
+
#[test]
+
fn get_prefix_exact_and_child_and_prefix() -> anyhow::Result<()> {
+
let (read, mut write) = fjall_db();
+
let mut batch = TestBatch::default();
+
// exact:
+
batch.create(
+
"did:plc:person-a",
+
"a.a.a",
+
"rkey-aaa",
+
"{}",
+
Some("rev-aaa"),
+
None,
+
10_000,
+
);
+
// child:
+
batch.create(
+
"did:plc:person-a",
+
"a.a.a.a",
+
"rkey-aaaa",
+
"{}",
+
Some("rev-aaaa"),
+
None,
+
10_001,
+
);
+
// prefix:
+
batch.create(
+
"did:plc:person-a",
+
"a.a.a.a.a",
+
"rkey-aaaaa",
+
"{}",
+
Some("rev-aaaaa"),
+
None,
+
10_002,
+
);
+
write.insert_batch(batch.batch)?;
+
write.step_rollup()?;
+
+
let (
+
JustCount {
+
creates,
+
dids_estimate,
+
..
+
},
+
children,
+
cursor,
+
) = read.get_prefix(
+
NsidPrefix::new("a.a.a").unwrap(),
+
10,
+
OrderCollectionsBy::Lexi { cursor: None },
+
None,
+
None,
+
)?;
+
assert_eq!(creates, 2);
+
assert_eq!(dids_estimate, 1);
+
assert_eq!(
+
children,
+
vec![
+
PrefixChild::Collection(NsidCount {
+
nsid: "a.a.a.a".to_string(),
+
creates: 1,
+
dids_estimate: 1
+
}),
+
PrefixChild::Prefix(PrefixCount {
+
prefix: "a.a.a.a".to_string(),
+
creates: 1,
+
dids_estimate: 1
+
}),
+
]
+
);
+
assert_eq!(cursor, None);
Ok(())
}
}
-1844
ufos/src/storage_mem.rs
···
-
use std::ops::Bound;
-
use std::sync::Arc;
-
-
use crate::db_types::{db_complete, DbBytes, DbStaticStr, StaticStr};
-
use crate::error::StorageError;
-
use crate::storage::{StorageResult, StorageWhatever, StoreReader, StoreWriter};
-
use crate::store_types::{
-
AllTimeRollupKey, CountsValue, DeleteAccountQueueKey, DeleteAccountQueueVal,
-
HourTruncatedCursor, HourlyRollupKey, JetstreamCursorKey, JetstreamCursorValue,
-
JetstreamEndpointKey, JetstreamEndpointValue, LiveCountsKey, NewRollupCursorKey,
-
NewRollupCursorValue, NsidRecordFeedKey, NsidRecordFeedVal, RecordLocationKey,
-
RecordLocationMeta, RecordLocationVal, RecordRawValue, TakeoffKey, TakeoffValue,
-
WeekTruncatedCursor, WeeklyRollupKey,
-
};
-
use crate::{CommitAction, ConsumerInfo, Did, EventBatch, Nsid, TopCollections, UFOsRecord};
-
use async_trait::async_trait;
-
use jetstream::events::Cursor;
-
use lsm_tree::range::prefix_to_range;
-
use std::collections::BTreeMap;
-
use std::collections::HashMap;
-
use std::path::Path;
-
use std::sync::Mutex;
-
use std::sync::RwLock;
-
use std::time::SystemTime;
-
-
const MAX_BATCHED_CLEANUP_SIZE: usize = 1024; // try to commit progress for longer feeds
-
const MAX_BATCHED_ACCOUNT_DELETE_RECORDS: usize = 1024;
-
const MAX_BATCHED_ROLLUP_COUNTS: usize = 256;
-
-
///
-
/// new data format, roughly:
-
///
-
/// Partion: 'global'
-
///
-
/// - Global sequence counter (is the jetstream cursor -- monotonic with many gaps)
-
/// - key: "js_cursor" (literal)
-
/// - val: u64
-
///
-
/// - Jetstream server endpoint (persisted because the cursor can't be used on another instance without data loss)
-
/// - key: "js_endpoint" (literal)
-
/// - val: string (URL of the instance)
-
///
-
/// - Launch date
-
/// - key: "takeoff" (literal)
-
/// - val: u64 (micros timestamp, not from jetstream for now so not precise)
-
///
-
/// - Rollup cursor (bg work: roll stats into hourlies, delete accounts, old record deletes)
-
/// - key: "rollup_cursor" (literal)
-
/// - val: u64 (tracks behind js_cursor)
-
///
-
///
-
/// Partition: 'feed'
-
///
-
/// - Per-collection list of record references ordered by jetstream cursor
-
/// - key: nullstr || u64 (collection nsid null-terminated, jetstream cursor)
-
/// - val: nullstr || nullstr || nullstr (did, rkey, rev. rev is mostly a sanity-check for now.)
-
///
-
///
-
/// Partition: 'records'
-
///
-
/// - Actual records by their atproto location
-
/// - key: nullstr || nullstr || nullstr (did, collection, rkey)
-
/// - val: u64 || bool || nullstr || rawval (js_cursor, is_update, rev, actual record)
-
///
-
///
-
/// Partition: 'rollups'
-
///
-
/// - Live (batched) records counts and dids estimate per collection
-
/// - key: "live_counts" || u64 || nullstr (js_cursor, nsid)
-
/// - val: u64 || HLL (count (not cursor), estimator)
-
///
-
/// - Hourly total record counts and dids estimate per collection
-
/// - key: "hourly_counts" || u64 || nullstr (hour, nsid)
-
/// - val: u64 || HLL (count (not cursor), estimator)
-
///
-
/// - Weekly total record counts and dids estimate per collection
-
/// - key: "weekly_counts" || u64 || nullstr (hour, nsid)
-
/// - val: u64 || HLL (count (not cursor), estimator)
-
///
-
/// - All-time total record counts and dids estimate per collection
-
/// - key: "ever_counts" || nullstr (nsid)
-
/// - val: u64 || HLL (count (not cursor), estimator)
-
///
-
/// - TODO: sorted indexes for all-times?
-
///
-
///
-
/// Partition: 'queues'
-
///
-
/// - Delete account queue
-
/// - key: "delete_acount" || u64 (js_cursor)
-
/// - val: nullstr (did)
-
///
-
///
-
/// TODO: moderation actions
-
/// TODO: account privacy preferences. Might wait for the protocol-level (PDS-level?) stuff to land. Will probably do lazy fetching + caching on read.
-
#[derive(Debug)]
-
pub struct MemStorage {}
-
-
#[derive(Debug, Default)]
-
pub struct MemConfig {
-
/// drop the db when the storage is dropped
-
///
-
/// this is only meant for tests
-
#[cfg(test)]
-
pub temp: bool,
-
}
-
-
////////////
-
////////////
-
////////////
-
////////////
-
////////////
-
////////////
-
-
struct BatchSentinel {}
-
-
#[derive(Clone)]
-
struct MemKeyspace {
-
keyspace_guard: Arc<RwLock<BatchSentinel>>,
-
}
-
-
impl MemKeyspace {
-
pub fn open() -> Self {
-
Self {
-
keyspace_guard: Arc::new(RwLock::new(BatchSentinel {})),
-
}
-
}
-
pub fn open_partition(&self, _name: &str) -> StorageResult<MemPartion> {
-
Ok(MemPartion {
-
// name: name.to_string(),
-
keyspace_guard: self.keyspace_guard.clone(),
-
contents: Default::default(),
-
})
-
}
-
pub fn batch(&self) -> MemBatch {
-
MemBatch {
-
keyspace_guard: self.keyspace_guard.clone(),
-
tasks: Vec::new(),
-
}
-
}
-
pub fn instant(&self) -> u64 {
-
1
-
}
-
}
-
-
enum BatchTask {
-
Insert {
-
p: MemPartion,
-
key: Vec<u8>,
-
val: Vec<u8>,
-
},
-
Remove {
-
p: MemPartion,
-
key: Vec<u8>,
-
},
-
}
-
struct MemBatch {
-
keyspace_guard: Arc<RwLock<BatchSentinel>>,
-
tasks: Vec<BatchTask>,
-
}
-
impl MemBatch {
-
pub fn insert(&mut self, p: &MemPartion, key: &[u8], val: &[u8]) {
-
self.tasks.push(BatchTask::Insert {
-
p: p.clone(),
-
key: key.to_vec(),
-
val: val.to_vec(),
-
});
-
}
-
pub fn remove(&mut self, p: &MemPartion, key: &[u8]) {
-
self.tasks.push(BatchTask::Remove {
-
p: p.clone(),
-
key: key.to_vec(),
-
});
-
}
-
pub fn len(&self) -> usize {
-
self.tasks.len()
-
}
-
pub fn commit(&mut self) -> StorageResult<()> {
-
let _guard = self.keyspace_guard.write().unwrap();
-
for task in &mut self.tasks {
-
match task {
-
BatchTask::Insert { p, key, val } => p
-
.contents
-
.try_lock()
-
.unwrap()
-
.insert(key.to_vec(), val.to_vec()),
-
BatchTask::Remove { p, key } => p.contents.try_lock().unwrap().remove(key),
-
};
-
}
-
Ok(())
-
}
-
}
-
-
#[derive(Clone)]
-
struct MemPartion {
-
// name: String,
-
keyspace_guard: Arc<RwLock<BatchSentinel>>,
-
contents: Arc<Mutex<BTreeMap<Vec<u8>, Vec<u8>>>>,
-
}
-
impl MemPartion {
-
pub fn get(&self, key: &[u8]) -> StorageResult<Option<Vec<u8>>> {
-
let _guard = self.keyspace_guard.read().unwrap();
-
Ok(self.contents.lock().unwrap().get(key).cloned())
-
}
-
pub fn prefix(&self, pre: &[u8]) -> Vec<StorageResult<(Vec<u8>, Vec<u8>)>> {
-
// let prefix_bytes = prefix.to_db_bytes()?;
-
let (_, Bound::Excluded(range_end)) = prefix_to_range(pre) else {
-
panic!("bad range thing");
-
};
-
-
return self.range(pre.to_vec()..range_end.to_vec());
-
}
-
pub fn range(&self, r: std::ops::Range<Vec<u8>>) -> Vec<StorageResult<(Vec<u8>, Vec<u8>)>> {
-
let _guard = self.keyspace_guard.read().unwrap();
-
self.contents
-
.lock()
-
.unwrap()
-
.range(r)
-
.map(|(k, v)| Ok((k.clone(), v.clone())))
-
.collect()
-
}
-
pub fn insert(&self, key: &[u8], val: &[u8]) -> StorageResult<()> {
-
let _guard = self.keyspace_guard.read().unwrap();
-
self.contents
-
.lock()
-
.unwrap()
-
.insert(key.to_vec(), val.to_vec());
-
Ok(())
-
}
-
// pub fn remove(&self, key: &[u8]) -> StorageResult<()> {
-
// let _guard = self.keyspace_guard.read().unwrap();
-
// self.contents
-
// .lock()
-
// .unwrap()
-
// .remove(key);
-
// Ok(())
-
// }
-
pub fn snapshot_at(&self, _instant: u64) -> Self {
-
self.clone()
-
}
-
pub fn snapshot(&self) -> Self {
-
self.clone()
-
}
-
}
-
-
////////////
-
////////////
-
////////////
-
////////////
-
////////////
-
////////////
-
-
impl StorageWhatever<MemReader, MemWriter, MemConfig> for MemStorage {
-
fn init(
-
_path: impl AsRef<Path>,
-
endpoint: String,
-
force_endpoint: bool,
-
_config: MemConfig,
-
) -> StorageResult<(MemReader, MemWriter, Option<Cursor>)> {
-
let keyspace = MemKeyspace::open();
-
-
let global = keyspace.open_partition("global")?;
-
let feeds = keyspace.open_partition("feeds")?;
-
let records = keyspace.open_partition("records")?;
-
let rollups = keyspace.open_partition("rollups")?;
-
let queues = keyspace.open_partition("queues")?;
-
-
let js_cursor = get_static_neu::<JetstreamCursorKey, JetstreamCursorValue>(&global)?;
-
-
if js_cursor.is_some() {
-
let stored_endpoint =
-
get_static_neu::<JetstreamEndpointKey, JetstreamEndpointValue>(&global)?;
-
-
let JetstreamEndpointValue(stored) = stored_endpoint.ok_or(StorageError::InitError(
-
"found cursor but missing js_endpoint, refusing to start.".to_string(),
-
))?;
-
-
if stored != endpoint {
-
if force_endpoint {
-
log::warn!("forcing a jetstream switch from {stored:?} to {endpoint:?}");
-
insert_static_neu::<JetstreamEndpointKey>(
-
&global,
-
JetstreamEndpointValue(endpoint.to_string()),
-
)?;
-
} else {
-
return Err(StorageError::InitError(format!(
-
"stored js_endpoint {stored:?} differs from provided {endpoint:?}, refusing to start.")));
-
}
-
}
-
} else {
-
insert_static_neu::<JetstreamEndpointKey>(
-
&global,
-
JetstreamEndpointValue(endpoint.to_string()),
-
)?;
-
insert_static_neu::<TakeoffKey>(&global, Cursor::at(SystemTime::now()))?;
-
insert_static_neu::<NewRollupCursorKey>(&global, Cursor::from_start())?;
-
}
-
-
let reader = MemReader {
-
keyspace: keyspace.clone(),
-
global: global.clone(),
-
feeds: feeds.clone(),
-
records: records.clone(),
-
rollups: rollups.clone(),
-
};
-
let writer = MemWriter {
-
keyspace,
-
global,
-
feeds,
-
records,
-
rollups,
-
queues,
-
};
-
Ok((reader, writer, js_cursor))
-
}
-
}
-
-
type MemRKV = StorageResult<(Vec<u8>, Vec<u8>)>;
-
-
#[derive(Clone)]
-
pub struct MemReader {
-
keyspace: MemKeyspace,
-
global: MemPartion,
-
feeds: MemPartion,
-
records: MemPartion,
-
rollups: MemPartion,
-
}
-
-
/// An iterator that knows how to skip over deleted/invalidated records
-
struct RecordIterator {
-
db_iter: Box<dyn Iterator<Item = MemRKV>>,
-
records: MemPartion,
-
limit: usize,
-
fetched: usize,
-
}
-
impl RecordIterator {
-
pub fn new(
-
feeds: &MemPartion,
-
records: MemPartion,
-
collection: &Nsid,
-
limit: usize,
-
) -> StorageResult<Self> {
-
let prefix = NsidRecordFeedKey::from_prefix_to_db_bytes(collection)?;
-
let db_iter = feeds.prefix(&prefix).into_iter().rev();
-
Ok(Self {
-
db_iter: Box::new(db_iter),
-
records,
-
limit,
-
fetched: 0,
-
})
-
}
-
fn get_record(&self, db_next: MemRKV) -> StorageResult<Option<UFOsRecord>> {
-
let (key_bytes, val_bytes) = db_next?;
-
let feed_key = db_complete::<NsidRecordFeedKey>(&key_bytes)?;
-
let feed_val = db_complete::<NsidRecordFeedVal>(&val_bytes)?;
-
let location_key: RecordLocationKey = (&feed_key, &feed_val).into();
-
-
let Some(location_val_bytes) = self.records.get(&location_key.to_db_bytes()?)? else {
-
// record was deleted (hopefully)
-
return Ok(None);
-
};
-
-
let (meta, n) = RecordLocationMeta::from_db_bytes(&location_val_bytes)?;
-
-
if meta.cursor() != feed_key.cursor() {
-
// older/different version
-
return Ok(None);
-
}
-
if meta.rev != feed_val.rev() {
-
// weird...
-
log::warn!("record lookup: cursor match but rev did not...? excluding.");
-
return Ok(None);
-
}
-
let Some(raw_value_bytes) = location_val_bytes.get(n..) else {
-
log::warn!(
-
"record lookup: found record but could not get bytes to decode the record??"
-
);
-
return Ok(None);
-
};
-
let rawval = db_complete::<RecordRawValue>(raw_value_bytes)?;
-
Ok(Some(UFOsRecord {
-
collection: feed_key.collection().clone(),
-
cursor: feed_key.cursor(),
-
did: feed_val.did().clone(),
-
rkey: feed_val.rkey().clone(),
-
rev: meta.rev.to_string(),
-
record: rawval.try_into()?,
-
is_update: meta.is_update,
-
}))
-
}
-
}
-
impl Iterator for RecordIterator {
-
type Item = StorageResult<Option<UFOsRecord>>;
-
fn next(&mut self) -> Option<Self::Item> {
-
if self.fetched == self.limit {
-
return Some(Ok(None));
-
}
-
let record = loop {
-
let db_next = self.db_iter.next()?; // None short-circuits here
-
match self.get_record(db_next) {
-
Err(e) => return Some(Err(e)),
-
Ok(Some(record)) => break record,
-
Ok(None) => continue,
-
}
-
};
-
self.fetched += 1;
-
Some(Ok(Some(record)))
-
}
-
}
-
-
impl MemReader {
-
fn get_storage_stats(&self) -> StorageResult<serde_json::Value> {
-
let rollup_cursor =
-
get_static_neu::<NewRollupCursorKey, NewRollupCursorValue>(&self.global)?
-
.map(|c| c.to_raw_u64());
-
-
Ok(serde_json::json!({
-
"rollup_cursor": rollup_cursor,
-
}))
-
}
-
-
fn get_consumer_info(&self) -> StorageResult<ConsumerInfo> {
-
let global = self.global.snapshot();
-
-
let endpoint =
-
get_snapshot_static_neu::<JetstreamEndpointKey, JetstreamEndpointValue>(&global)?
-
.ok_or(StorageError::BadStateError(
-
"Could not find jetstream endpoint".to_string(),
-
))?
-
.0;
-
-
let started_at = get_snapshot_static_neu::<TakeoffKey, TakeoffValue>(&global)?
-
.ok_or(StorageError::BadStateError(
-
"Could not find jetstream takeoff time".to_string(),
-
))?
-
.to_raw_u64();
-
-
let latest_cursor =
-
get_snapshot_static_neu::<JetstreamCursorKey, JetstreamCursorValue>(&global)?
-
.map(|c| c.to_raw_u64());
-
-
Ok(ConsumerInfo::Jetstream {
-
endpoint,
-
started_at,
-
latest_cursor,
-
})
-
}
-
-
fn get_top_collections(&self) -> Result<TopCollections, StorageError> {
-
// TODO: limit nsid traversal depth
-
// TODO: limit nsid traversal breadth
-
// TODO: be serious about anything
-
-
// TODO: probably use a stack of segments to reduce to ~log-n merges
-
-
#[derive(Default)]
-
struct Blah {
-
counts: CountsValue,
-
children: HashMap<String, Blah>,
-
}
-
impl From<&Blah> for TopCollections {
-
fn from(bla: &Blah) -> Self {
-
Self {
-
total_records: bla.counts.records(),
-
dids_estimate: bla.counts.dids().estimate() as u64,
-
nsid_child_segments: HashMap::from_iter(
-
bla.children.iter().map(|(k, v)| (k.to_string(), v.into())),
-
),
-
}
-
}
-
}
-
-
let mut b = Blah::default();
-
let prefix = AllTimeRollupKey::from_prefix_to_db_bytes(&Default::default())?;
-
for kv in self.rollups.prefix(&prefix.to_db_bytes()?) {
-
let (key_bytes, val_bytes) = kv?;
-
let key = db_complete::<AllTimeRollupKey>(&key_bytes)?;
-
let val = db_complete::<CountsValue>(&val_bytes)?;
-
-
let mut node = &mut b;
-
node.counts.merge(&val);
-
for segment in key.collection().split('.') {
-
node = node.children.entry(segment.to_string()).or_default();
-
node.counts.merge(&val);
-
}
-
}
-
-
Ok((&b).into())
-
}
-
-
fn get_counts_by_collection(&self, collection: &Nsid) -> StorageResult<(u64, u64)> {
-
// 0. grab a snapshot in case rollups happen while we're working
-
let instant = self.keyspace.instant();
-
let global = self.global.snapshot_at(instant);
-
let rollups = self.rollups.snapshot_at(instant);
-
-
// 1. all-time counts
-
let all_time_key = AllTimeRollupKey::new(collection).to_db_bytes()?;
-
let mut total_counts = rollups
-
.get(&all_time_key)?
-
.as_deref()
-
.map(db_complete::<CountsValue>)
-
.transpose()?
-
.unwrap_or_default();
-
-
// 2. live counts that haven't been rolled into all-time yet.
-
let rollup_cursor =
-
get_snapshot_static_neu::<NewRollupCursorKey, NewRollupCursorValue>(&global)?.ok_or(
-
StorageError::BadStateError("Could not find current rollup cursor".to_string()),
-
)?;
-
-
let full_range = LiveCountsKey::range_from_cursor(rollup_cursor)?;
-
for kv in rollups.range(full_range) {
-
let (key_bytes, val_bytes) = kv?;
-
let key = db_complete::<LiveCountsKey>(&key_bytes)?;
-
if key.collection() == collection {
-
let counts = db_complete::<CountsValue>(&val_bytes)?;
-
total_counts.merge(&counts);
-
}
-
}
-
Ok((
-
total_counts.records(),
-
total_counts.dids().estimate() as u64,
-
))
-
}
-
-
fn get_records_by_collections(
-
&self,
-
collections: &[Nsid],
-
limit: usize,
-
_expand_each_collection: bool,
-
) -> StorageResult<Vec<UFOsRecord>> {
-
if collections.is_empty() {
-
return Ok(vec![]);
-
}
-
let mut record_iterators = Vec::new();
-
for collection in collections {
-
let iter = RecordIterator::new(&self.feeds, self.records.clone(), collection, limit)?;
-
record_iterators.push(iter.peekable());
-
}
-
let mut merged = Vec::new();
-
loop {
-
let mut latest: Option<(Cursor, usize)> = None; // ugh
-
for (i, iter) in record_iterators.iter_mut().enumerate() {
-
let Some(it) = iter.peek_mut() else {
-
continue;
-
};
-
let it = match it {
-
Ok(v) => v,
-
Err(e) => Err(std::mem::replace(e, StorageError::Stolen))?,
-
};
-
let Some(rec) = it else {
-
break;
-
};
-
if let Some((cursor, _)) = latest {
-
if rec.cursor > cursor {
-
latest = Some((rec.cursor, i))
-
}
-
} else {
-
latest = Some((rec.cursor, i));
-
}
-
}
-
let Some((_, idx)) = latest else {
-
break;
-
};
-
// yeah yeah whateverrrrrrrrrrrrrrrr
-
merged.push(record_iterators[idx].next().unwrap().unwrap().unwrap());
-
}
-
Ok(merged)
-
}
-
}
-
-
#[async_trait]
-
impl StoreReader for MemReader {
-
fn name(&self) -> String {
-
"in-memory store".into()
-
}
-
async fn get_storage_stats(&self) -> StorageResult<serde_json::Value> {
-
let s = self.clone();
-
tokio::task::spawn_blocking(move || MemReader::get_storage_stats(&s)).await?
-
}
-
async fn get_consumer_info(&self) -> StorageResult<ConsumerInfo> {
-
let s = self.clone();
-
tokio::task::spawn_blocking(move || MemReader::get_consumer_info(&s)).await?
-
}
-
async fn get_top_collections(&self) -> Result<TopCollections, StorageError> {
-
let s = self.clone();
-
tokio::task::spawn_blocking(move || MemReader::get_top_collections(&s)).await?
-
}
-
async fn get_counts_by_collection(&self, collection: &Nsid) -> StorageResult<(u64, u64)> {
-
let s = self.clone();
-
let collection = collection.clone();
-
tokio::task::spawn_blocking(move || MemReader::get_counts_by_collection(&s, &collection))
-
.await?
-
}
-
async fn get_records_by_collections(
-
&self,
-
collections: &[Nsid],
-
limit: usize,
-
expand_each_collection: bool,
-
) -> StorageResult<Vec<UFOsRecord>> {
-
let s = self.clone();
-
let collections = collections.to_vec();
-
tokio::task::spawn_blocking(move || {
-
MemReader::get_records_by_collections(&s, &collections, limit, expand_each_collection)
-
})
-
.await?
-
}
-
}
-
-
pub struct MemWriter {
-
keyspace: MemKeyspace,
-
global: MemPartion,
-
feeds: MemPartion,
-
records: MemPartion,
-
rollups: MemPartion,
-
queues: MemPartion,
-
}
-
-
impl MemWriter {
-
fn rollup_delete_account(
-
&mut self,
-
cursor: Cursor,
-
key_bytes: &[u8],
-
val_bytes: &[u8],
-
) -> StorageResult<usize> {
-
let did = db_complete::<DeleteAccountQueueVal>(val_bytes)?;
-
self.delete_account(&did)?;
-
let mut batch = self.keyspace.batch();
-
batch.remove(&self.queues, key_bytes);
-
insert_batch_static_neu::<NewRollupCursorKey>(&mut batch, &self.global, cursor)?;
-
batch.commit()?;
-
Ok(1)
-
}
-
-
fn rollup_live_counts(
-
&mut self,
-
timelies: impl Iterator<Item = Result<(Vec<u8>, Vec<u8>), StorageError>>,
-
cursor_exclusive_limit: Option<Cursor>,
-
rollup_limit: usize,
-
) -> StorageResult<usize> {
-
// current strategy is to buffer counts in mem before writing the rollups
-
// we *could* read+write every single batch to rollup.. but their merge is associative so
-
// ...so save the db some work up front? is this worth it? who knows...
-
-
log::warn!("sup!!!");
-
-
#[derive(Eq, Hash, PartialEq)]
-
enum Rollup {
-
Hourly(HourTruncatedCursor),
-
Weekly(WeekTruncatedCursor),
-
AllTime,
-
}
-
-
let mut batch = self.keyspace.batch();
-
let mut cursors_advanced = 0;
-
let mut last_cursor = Cursor::from_start();
-
let mut counts_by_rollup: HashMap<(Nsid, Rollup), CountsValue> = HashMap::new();
-
-
log::warn!("about to loop....");
-
for (i, kv) in timelies.enumerate() {
-
log::warn!("loop {i} {kv:?}...");
-
if i >= rollup_limit {
-
break;
-
}
-
-
let (key_bytes, val_bytes) = kv?;
-
let key = db_complete::<LiveCountsKey>(&key_bytes)
-
.inspect_err(|e| log::warn!("rlc: key: {e:?}"))?;
-
-
if cursor_exclusive_limit
-
.map(|limit| key.cursor() > limit)
-
.unwrap_or(false)
-
{
-
break;
-
}
-
-
batch.remove(&self.rollups, &key_bytes);
-
let val = db_complete::<CountsValue>(&val_bytes)
-
.inspect_err(|e| log::warn!("rlc: val: {e:?}"))?;
-
counts_by_rollup
-
.entry((
-
key.collection().clone(),
-
Rollup::Hourly(key.cursor().into()),
-
))
-
.or_default()
-
.merge(&val);
-
counts_by_rollup
-
.entry((
-
key.collection().clone(),
-
Rollup::Weekly(key.cursor().into()),
-
))
-
.or_default()
-
.merge(&val);
-
counts_by_rollup
-
.entry((key.collection().clone(), Rollup::AllTime))
-
.or_default()
-
.merge(&val);
-
-
cursors_advanced += 1;
-
last_cursor = key.cursor();
-
}
-
log::warn!("done looping. looping cbr counts(?)..");
-
-
for ((nsid, rollup), counts) in counts_by_rollup {
-
log::warn!(
-
"######################## cbr loop {nsid:?} {counts:?} ########################"
-
);
-
let key_bytes = match rollup {
-
Rollup::Hourly(hourly_cursor) => {
-
let k = HourlyRollupKey::new(hourly_cursor, &nsid);
-
log::info!("hrly k: {k:?}");
-
k.to_db_bytes()?
-
}
-
Rollup::Weekly(weekly_cursor) => {
-
let k = WeeklyRollupKey::new(weekly_cursor, &nsid);
-
log::info!("weekly k: {k:?}");
-
k.to_db_bytes()?
-
}
-
Rollup::AllTime => {
-
let k = AllTimeRollupKey::new(&nsid);
-
log::info!("alltime k: {k:?}");
-
k.to_db_bytes()?
-
}
-
};
-
// log::info!("key bytes: {key_bytes:?}");
-
let mut rolled: CountsValue = self
-
.rollups
-
.get(&key_bytes)?
-
.inspect(|v| {
-
let lax = CountsValue::from_db_bytes(v);
-
log::info!(
-
"val: len={}, lax={lax:?} first32={:?}",
-
v.len(),
-
v.get(..32)
-
);
-
})
-
.as_deref()
-
.map(db_complete::<CountsValue>)
-
.transpose()
-
.inspect_err(|e| log::warn!("oooh did we break on the rolled thing? {e:?}"))?
-
.unwrap_or_default();
-
-
// try to round-trip before inserting, for funsies
-
let tripppin = counts.to_db_bytes()?;
-
let (and_back, n) = CountsValue::from_db_bytes(&tripppin)?;
-
assert_eq!(n, tripppin.len());
-
assert_eq!(counts.prefix, and_back.prefix);
-
assert_eq!(counts.dids().estimate(), and_back.dids().estimate());
-
if counts.records() > 20000000 {
-
panic!("COUNTS maybe wtf? {counts:?}")
-
}
-
// assert_eq!(rolled, and_back);
-
-
rolled.merge(&counts);
-
-
// try to round-trip before inserting, for funsies
-
let tripppin = rolled.to_db_bytes()?;
-
let (and_back, n) = CountsValue::from_db_bytes(&tripppin)?;
-
assert_eq!(n, tripppin.len());
-
assert_eq!(rolled.prefix, and_back.prefix);
-
assert_eq!(rolled.dids().estimate(), and_back.dids().estimate());
-
if rolled.records() > 20000000 {
-
panic!("maybe wtf? {rolled:?}")
-
}
-
// assert_eq!(rolled, and_back);
-
-
batch.insert(&self.rollups, &key_bytes, &rolled.to_db_bytes()?);
-
}
-
-
log::warn!("done cbr loop.");
-
-
insert_batch_static_neu::<NewRollupCursorKey>(&mut batch, &self.global, last_cursor)
-
.inspect_err(|e| log::warn!("insert neu: {e:?}"))?;
-
-
batch.commit()?;
-
-
log::warn!("ok finished rlc stuff. huh.");
-
Ok(cursors_advanced)
-
}
-
}
-
-
impl StoreWriter for MemWriter {
-
fn insert_batch<const LIMIT: usize>(
-
&mut self,
-
event_batch: EventBatch<LIMIT>,
-
) -> StorageResult<()> {
-
if event_batch.is_empty() {
-
return Ok(());
-
}
-
-
let mut batch = self.keyspace.batch();
-
-
// would be nice not to have to iterate everything at once here
-
let latest = event_batch.latest_cursor().unwrap();
-
-
for (nsid, commits) in event_batch.commits_by_nsid {
-
for commit in commits.commits {
-
let location_key: RecordLocationKey = (&commit, &nsid).into();
-
-
match commit.action {
-
CommitAction::Cut => {
-
batch.remove(&self.records, &location_key.to_db_bytes()?);
-
}
-
CommitAction::Put(put_action) => {
-
let feed_key = NsidRecordFeedKey::from_pair(nsid.clone(), commit.cursor);
-
let feed_val: NsidRecordFeedVal =
-
(&commit.did, &commit.rkey, commit.rev.as_str()).into();
-
batch.insert(
-
&self.feeds,
-
&feed_key.to_db_bytes()?,
-
&feed_val.to_db_bytes()?,
-
);
-
-
let location_val: RecordLocationVal =
-
(commit.cursor, commit.rev.as_str(), put_action).into();
-
batch.insert(
-
&self.records,
-
&location_key.to_db_bytes()?,
-
&location_val.to_db_bytes()?,
-
);
-
}
-
}
-
}
-
let live_counts_key: LiveCountsKey = (latest, &nsid).into();
-
let counts_value = CountsValue::new(commits.total_seen as u64, commits.dids_estimate);
-
batch.insert(
-
&self.rollups,
-
&live_counts_key.to_db_bytes()?,
-
&counts_value.to_db_bytes()?,
-
);
-
}
-
-
for remove in event_batch.account_removes {
-
let queue_key = DeleteAccountQueueKey::new(remove.cursor);
-
let queue_val: DeleteAccountQueueVal = remove.did;
-
batch.insert(
-
&self.queues,
-
&queue_key.to_db_bytes()?,
-
&queue_val.to_db_bytes()?,
-
);
-
}
-
-
batch.insert(
-
&self.global,
-
&DbStaticStr::<JetstreamCursorKey>::default().to_db_bytes()?,
-
&latest.to_db_bytes()?,
-
);
-
-
batch.commit()?;
-
Ok(())
-
}
-
-
fn step_rollup(&mut self) -> StorageResult<usize> {
-
let rollup_cursor =
-
get_static_neu::<NewRollupCursorKey, NewRollupCursorValue>(&self.global)?
-
.ok_or(StorageError::BadStateError(
-
"Could not find current rollup cursor".to_string(),
-
))
-
.inspect_err(|e| log::warn!("failed getting rollup cursor: {e:?}"))?;
-
-
// timelies
-
let live_counts_range = LiveCountsKey::range_from_cursor(rollup_cursor)
-
.inspect_err(|e| log::warn!("live counts range: {e:?}"))?;
-
let mut timely_iter = self.rollups.range(live_counts_range).into_iter().peekable();
-
-
let timely_next_cursor = timely_iter
-
.peek_mut()
-
.map(|kv| -> StorageResult<Cursor> {
-
match kv {
-
Err(e) => Err(std::mem::replace(e, StorageError::Stolen))?,
-
Ok((key_bytes, _)) => {
-
let key = db_complete::<LiveCountsKey>(key_bytes).inspect_err(|e| {
-
log::warn!("failed getting key for next timely: {e:?}")
-
})?;
-
Ok(key.cursor())
-
}
-
}
-
})
-
.transpose()
-
.inspect_err(|e| log::warn!("something about timely: {e:?}"))?;
-
-
// delete accounts
-
let delete_accounts_range =
-
DeleteAccountQueueKey::new(rollup_cursor).range_to_prefix_end()?;
-
-
let next_delete = self
-
.queues
-
.range(delete_accounts_range)
-
.into_iter()
-
.next()
-
.transpose()
-
.inspect_err(|e| log::warn!("range for next delete: {e:?}"))?
-
.map(|(key_bytes, val_bytes)| {
-
db_complete::<DeleteAccountQueueKey>(&key_bytes)
-
.inspect_err(|e| log::warn!("failed inside next delete thing????: {e:?}"))
-
.map(|k| (k.suffix, key_bytes, val_bytes))
-
})
-
.transpose()
-
.inspect_err(|e| log::warn!("failed getting next delete: {e:?}"))?;
-
-
let cursors_stepped = match (timely_next_cursor, next_delete) {
-
(
-
Some(timely_next_cursor),
-
Some((delete_cursor, delete_key_bytes, delete_val_bytes)),
-
) => {
-
if timely_next_cursor < delete_cursor {
-
self.rollup_live_counts(
-
timely_iter,
-
Some(delete_cursor),
-
MAX_BATCHED_ROLLUP_COUNTS,
-
)
-
.inspect_err(|e| log::warn!("rolling up live counts: {e:?}"))?
-
} else {
-
self.rollup_delete_account(delete_cursor, &delete_key_bytes, &delete_val_bytes)
-
.inspect_err(|e| log::warn!("deleting acocunt: {e:?}"))?
-
}
-
}
-
(Some(_), None) => self
-
.rollup_live_counts(timely_iter, None, MAX_BATCHED_ROLLUP_COUNTS)
-
.inspect_err(|e| log::warn!("rolling up (lasjdflkajs): {e:?}"))?,
-
(None, Some((delete_cursor, delete_key_bytes, delete_val_bytes))) => self
-
.rollup_delete_account(delete_cursor, &delete_key_bytes, &delete_val_bytes)
-
.inspect_err(|e| log::warn!("deleting acocunt other branch: {e:?}"))?,
-
(None, None) => 0,
-
};
-
-
Ok(cursors_stepped)
-
}
-
-
fn trim_collection(
-
&mut self,
-
collection: &Nsid,
-
limit: usize,
-
// TODO: could add a start cursor limit to avoid iterating deleted stuff at the start (/end)
-
) -> StorageResult<()> {
-
let mut dangling_feed_keys_cleaned = 0;
-
let mut records_deleted = 0;
-
-
let mut batch = self.keyspace.batch();
-
-
let prefix = NsidRecordFeedKey::from_prefix_to_db_bytes(collection)?;
-
let mut found = 0;
-
for kv in self.feeds.prefix(&prefix).into_iter().rev() {
-
let (key_bytes, val_bytes) = kv?;
-
let feed_key = db_complete::<NsidRecordFeedKey>(&key_bytes)?;
-
let feed_val = db_complete::<NsidRecordFeedVal>(&val_bytes)?;
-
let location_key: RecordLocationKey = (&feed_key, &feed_val).into();
-
let location_key_bytes = location_key.to_db_bytes()?;
-
-
let Some(location_val_bytes) = self.records.get(&location_key_bytes)? else {
-
// record was deleted (hopefully)
-
batch.remove(&self.feeds, &location_key_bytes);
-
dangling_feed_keys_cleaned += 1;
-
continue;
-
};
-
-
let (meta, _) = RecordLocationMeta::from_db_bytes(&location_val_bytes)?;
-
-
if meta.cursor() != feed_key.cursor() {
-
// older/different version
-
batch.remove(&self.feeds, &location_key_bytes);
-
dangling_feed_keys_cleaned += 1;
-
continue;
-
}
-
if meta.rev != feed_val.rev() {
-
// weird...
-
log::warn!("record lookup: cursor match but rev did not...? removing.");
-
batch.remove(&self.feeds, &location_key_bytes);
-
dangling_feed_keys_cleaned += 1;
-
continue;
-
}
-
-
if batch.len() >= MAX_BATCHED_CLEANUP_SIZE {
-
batch.commit()?;
-
batch = self.keyspace.batch();
-
}
-
-
found += 1;
-
if found <= limit {
-
continue;
-
}
-
-
batch.remove(&self.feeds, &location_key_bytes);
-
batch.remove(&self.records, &location_key_bytes);
-
records_deleted += 1;
-
}
-
-
batch.commit()?;
-
-
log::info!("trim_collection ({collection:?}) removed {dangling_feed_keys_cleaned} dangling feed entries and {records_deleted} records");
-
Ok(())
-
}
-
-
fn delete_account(&mut self, did: &Did) -> Result<usize, StorageError> {
-
let mut records_deleted = 0;
-
let mut batch = self.keyspace.batch();
-
let prefix = RecordLocationKey::from_prefix_to_db_bytes(did)?;
-
for kv in self.records.prefix(&prefix) {
-
let (key_bytes, _) = kv?;
-
batch.remove(&self.records, &key_bytes);
-
records_deleted += 1;
-
if batch.len() >= MAX_BATCHED_ACCOUNT_DELETE_RECORDS {
-
batch.commit()?;
-
batch = self.keyspace.batch();
-
}
-
}
-
batch.commit()?;
-
Ok(records_deleted)
-
}
-
}
-
-
/// Get a value from a fixed key
-
fn get_static_neu<K: StaticStr, V: DbBytes>(global: &MemPartion) -> StorageResult<Option<V>> {
-
let key_bytes = DbStaticStr::<K>::default().to_db_bytes()?;
-
let value = global
-
.get(&key_bytes)?
-
.map(|value_bytes| db_complete(&value_bytes))
-
.transpose()?;
-
Ok(value)
-
}
-
-
/// Get a value from a fixed key
-
fn get_snapshot_static_neu<K: StaticStr, V: DbBytes>(
-
global: &MemPartion,
-
) -> StorageResult<Option<V>> {
-
let key_bytes = DbStaticStr::<K>::default().to_db_bytes()?;
-
let value = global
-
.get(&key_bytes)?
-
.map(|value_bytes| db_complete(&value_bytes))
-
.transpose()?;
-
Ok(value)
-
}
-
-
/// Set a value to a fixed key
-
fn insert_static_neu<K: StaticStr>(global: &MemPartion, value: impl DbBytes) -> StorageResult<()> {
-
let key_bytes = DbStaticStr::<K>::default().to_db_bytes()?;
-
let value_bytes = value.to_db_bytes()?;
-
global.insert(&key_bytes, &value_bytes)?;
-
Ok(())
-
}
-
-
/// Set a value to a fixed key
-
fn insert_batch_static_neu<K: StaticStr>(
-
batch: &mut MemBatch,
-
global: &MemPartion,
-
value: impl DbBytes,
-
) -> StorageResult<()> {
-
let key_bytes = DbStaticStr::<K>::default().to_db_bytes()?;
-
let value_bytes = value.to_db_bytes()?;
-
batch.insert(global, &key_bytes, &value_bytes);
-
Ok(())
-
}
-
-
#[derive(Debug, serde::Serialize, schemars::JsonSchema)]
-
pub struct StorageInfo {
-
pub keyspace_disk_space: u64,
-
pub keyspace_journal_count: usize,
-
pub keyspace_sequence: u64,
-
pub global_approximate_len: usize,
-
}
-
-
#[cfg(test)]
-
mod tests {
-
use super::*;
-
use crate::{DeleteAccount, RecordKey, UFOsCommit};
-
use jetstream::events::{CommitEvent, CommitOp};
-
use jetstream::exports::Cid;
-
use serde_json::value::RawValue;
-
-
fn fjall_db() -> (MemReader, MemWriter) {
-
let (read, write, _) = MemStorage::init(
-
tempfile::tempdir().unwrap(),
-
"offline test (no real jetstream endpoint)".to_string(),
-
false,
-
MemConfig { temp: true },
-
)
-
.unwrap();
-
(read, write)
-
}
-
-
const TEST_BATCH_LIMIT: usize = 16;
-
-
#[derive(Debug, Default)]
-
struct TestBatch {
-
pub batch: EventBatch<TEST_BATCH_LIMIT>,
-
}
-
-
impl TestBatch {
-
#[allow(clippy::too_many_arguments)]
-
pub fn create(
-
&mut self,
-
did: &str,
-
collection: &str,
-
rkey: &str,
-
record: &str,
-
rev: Option<&str>,
-
cid: Option<Cid>,
-
cursor: u64,
-
) -> Nsid {
-
let did = Did::new(did.to_string()).unwrap();
-
let collection = Nsid::new(collection.to_string()).unwrap();
-
let record = RawValue::from_string(record.to_string()).unwrap();
-
let cid = cid.unwrap_or(
-
"bafyreidofvwoqvd2cnzbun6dkzgfucxh57tirf3ohhde7lsvh4fu3jehgy"
-
.parse()
-
.unwrap(),
-
);
-
-
let event = CommitEvent {
-
collection,
-
rkey: RecordKey::new(rkey.to_string()).unwrap(),
-
rev: rev.unwrap_or("asdf").to_string(),
-
operation: CommitOp::Create,
-
record: Some(record),
-
cid: Some(cid),
-
};
-
-
let (commit, collection) =
-
UFOsCommit::from_commit_info(event, did.clone(), Cursor::from_raw_u64(cursor))
-
.unwrap();
-
-
self.batch
-
.commits_by_nsid
-
.entry(collection.clone())
-
.or_default()
-
.truncating_insert(commit)
-
.unwrap();
-
-
collection
-
}
-
#[allow(clippy::too_many_arguments)]
-
pub fn update(
-
&mut self,
-
did: &str,
-
collection: &str,
-
rkey: &str,
-
record: &str,
-
rev: Option<&str>,
-
cid: Option<Cid>,
-
cursor: u64,
-
) -> Nsid {
-
let did = Did::new(did.to_string()).unwrap();
-
let collection = Nsid::new(collection.to_string()).unwrap();
-
let record = RawValue::from_string(record.to_string()).unwrap();
-
let cid = cid.unwrap_or(
-
"bafyreidofvwoqvd2cnzbun6dkzgfucxh57tirf3ohhde7lsvh4fu3jehgy"
-
.parse()
-
.unwrap(),
-
);
-
-
let event = CommitEvent {
-
collection,
-
rkey: RecordKey::new(rkey.to_string()).unwrap(),
-
rev: rev.unwrap_or("asdf").to_string(),
-
operation: CommitOp::Update,
-
record: Some(record),
-
cid: Some(cid),
-
};
-
-
let (commit, collection) =
-
UFOsCommit::from_commit_info(event, did.clone(), Cursor::from_raw_u64(cursor))
-
.unwrap();
-
-
self.batch
-
.commits_by_nsid
-
.entry(collection.clone())
-
.or_default()
-
.truncating_insert(commit)
-
.unwrap();
-
-
collection
-
}
-
#[allow(clippy::too_many_arguments)]
-
pub fn delete(
-
&mut self,
-
did: &str,
-
collection: &str,
-
rkey: &str,
-
rev: Option<&str>,
-
cursor: u64,
-
) -> Nsid {
-
let did = Did::new(did.to_string()).unwrap();
-
let collection = Nsid::new(collection.to_string()).unwrap();
-
let event = CommitEvent {
-
collection,
-
rkey: RecordKey::new(rkey.to_string()).unwrap(),
-
rev: rev.unwrap_or("asdf").to_string(),
-
operation: CommitOp::Delete,
-
record: None,
-
cid: None,
-
};
-
-
let (commit, collection) =
-
UFOsCommit::from_commit_info(event, did, Cursor::from_raw_u64(cursor)).unwrap();
-
-
self.batch
-
.commits_by_nsid
-
.entry(collection.clone())
-
.or_default()
-
.truncating_insert(commit)
-
.unwrap();
-
-
collection
-
}
-
pub fn delete_account(&mut self, did: &str, cursor: u64) -> Did {
-
let did = Did::new(did.to_string()).unwrap();
-
self.batch.account_removes.push(DeleteAccount {
-
did: did.clone(),
-
cursor: Cursor::from_raw_u64(cursor),
-
});
-
did
-
}
-
}
-
-
#[test]
-
fn test_hello() -> anyhow::Result<()> {
-
let (read, mut write) = fjall_db();
-
write.insert_batch::<TEST_BATCH_LIMIT>(EventBatch::default())?;
-
let (records, dids) =
-
read.get_counts_by_collection(&Nsid::new("a.b.c".to_string()).unwrap())?;
-
assert_eq!(records, 0);
-
assert_eq!(dids, 0);
-
Ok(())
-
}
-
-
#[test]
-
fn test_insert_one() -> anyhow::Result<()> {
-
let (read, mut write) = fjall_db();
-
-
let mut batch = TestBatch::default();
-
let collection = batch.create(
-
"did:plc:inze6wrmsm7pjl7yta3oig77",
-
"a.b.c",
-
"asdf",
-
"{}",
-
Some("rev-z"),
-
None,
-
100,
-
);
-
write.insert_batch(batch.batch)?;
-
-
let (records, dids) = read.get_counts_by_collection(&collection)?;
-
assert_eq!(records, 1);
-
assert_eq!(dids, 1);
-
let (records, dids) =
-
read.get_counts_by_collection(&Nsid::new("d.e.f".to_string()).unwrap())?;
-
assert_eq!(records, 0);
-
assert_eq!(dids, 0);
-
-
let records = read.get_records_by_collections(&[collection], 2, false)?;
-
assert_eq!(records.len(), 1);
-
let rec = &records[0];
-
assert_eq!(rec.record.get(), "{}");
-
assert!(!rec.is_update);
-
-
let records =
-
read.get_records_by_collections(&[Nsid::new("d.e.f".to_string()).unwrap()], 2, false)?;
-
assert_eq!(records.len(), 0);
-
-
Ok(())
-
}
-
-
#[test]
-
fn test_get_multi_collection() -> anyhow::Result<()> {
-
let (read, mut write) = fjall_db();
-
-
let mut batch = TestBatch::default();
-
batch.create(
-
"did:plc:inze6wrmsm7pjl7yta3oig77",
-
"a.a.a",
-
"aaa",
-
r#""earliest""#,
-
Some("rev-a"),
-
None,
-
100,
-
);
-
batch.create(
-
"did:plc:inze6wrmsm7pjl7yta3oig77",
-
"a.a.b",
-
"aab",
-
r#""in between""#,
-
Some("rev-ab"),
-
None,
-
101,
-
);
-
batch.create(
-
"did:plc:inze6wrmsm7pjl7yta3oig77",
-
"a.a.a",
-
"aaa-2",
-
r#""last""#,
-
Some("rev-a-2"),
-
None,
-
102,
-
);
-
write.insert_batch(batch.batch)?;
-
-
let records = read.get_records_by_collections(
-
&[
-
Nsid::new("a.a.a".to_string()).unwrap(),
-
Nsid::new("a.a.b".to_string()).unwrap(),
-
Nsid::new("a.a.c".to_string()).unwrap(),
-
],
-
100,
-
false,
-
)?;
-
assert_eq!(records.len(), 3);
-
assert_eq!(records[0].record.get(), r#""last""#);
-
assert_eq!(
-
records[0].collection,
-
Nsid::new("a.a.a".to_string()).unwrap()
-
);
-
assert_eq!(records[1].record.get(), r#""in between""#);
-
assert_eq!(
-
records[1].collection,
-
Nsid::new("a.a.b".to_string()).unwrap()
-
);
-
assert_eq!(records[2].record.get(), r#""earliest""#);
-
assert_eq!(
-
records[2].collection,
-
Nsid::new("a.a.a".to_string()).unwrap()
-
);
-
-
Ok(())
-
}
-
-
#[test]
-
fn test_update_one() -> anyhow::Result<()> {
-
let (read, mut write) = fjall_db();
-
-
let mut batch = TestBatch::default();
-
let collection = batch.create(
-
"did:plc:inze6wrmsm7pjl7yta3oig77",
-
"a.b.c",
-
"rkey-asdf",
-
"{}",
-
Some("rev-a"),
-
None,
-
100,
-
);
-
write.insert_batch(batch.batch)?;
-
-
let mut batch = TestBatch::default();
-
batch.update(
-
"did:plc:inze6wrmsm7pjl7yta3oig77",
-
"a.b.c",
-
"rkey-asdf",
-
r#"{"ch": "ch-ch-ch-changes"}"#,
-
Some("rev-z"),
-
None,
-
101,
-
);
-
write.insert_batch(batch.batch)?;
-
-
let (records, dids) = read.get_counts_by_collection(&collection)?;
-
assert_eq!(records, 1);
-
assert_eq!(dids, 1);
-
-
let records = read.get_records_by_collections(&[collection], 2, false)?;
-
assert_eq!(records.len(), 1);
-
let rec = &records[0];
-
assert_eq!(rec.record.get(), r#"{"ch": "ch-ch-ch-changes"}"#);
-
assert!(rec.is_update);
-
Ok(())
-
}
-
-
#[test]
-
fn test_delete_one() -> anyhow::Result<()> {
-
let (read, mut write) = fjall_db();
-
-
let mut batch = TestBatch::default();
-
let collection = batch.create(
-
"did:plc:inze6wrmsm7pjl7yta3oig77",
-
"a.b.c",
-
"rkey-asdf",
-
"{}",
-
Some("rev-a"),
-
None,
-
100,
-
);
-
write.insert_batch(batch.batch)?;
-
-
let mut batch = TestBatch::default();
-
batch.delete(
-
"did:plc:inze6wrmsm7pjl7yta3oig77",
-
"a.b.c",
-
"rkey-asdf",
-
Some("rev-z"),
-
101,
-
);
-
write.insert_batch(batch.batch)?;
-
-
let (records, dids) = read.get_counts_by_collection(&collection)?;
-
assert_eq!(records, 1);
-
assert_eq!(dids, 1);
-
-
let records = read.get_records_by_collections(&[collection], 2, false)?;
-
assert_eq!(records.len(), 0);
-
-
Ok(())
-
}
-
-
#[test]
-
fn test_collection_trim() -> anyhow::Result<()> {
-
let (read, mut write) = fjall_db();
-
-
let mut batch = TestBatch::default();
-
batch.create(
-
"did:plc:inze6wrmsm7pjl7yta3oig77",
-
"a.a.a",
-
"rkey-aaa",
-
"{}",
-
Some("rev-aaa"),
-
None,
-
10_000,
-
);
-
let mut last_b_cursor;
-
for i in 1..=10 {
-
last_b_cursor = 11_000 + i;
-
batch.create(
-
&format!("did:plc:inze6wrmsm7pjl7yta3oig7{}", i % 3),
-
"a.a.b",
-
&format!("rkey-bbb-{i}"),
-
&format!(r#"{{"n": {i}}}"#),
-
Some(&format!("rev-bbb-{i}")),
-
None,
-
last_b_cursor,
-
);
-
}
-
batch.create(
-
"did:plc:inze6wrmsm7pjl7yta3oig77",
-
"a.a.c",
-
"rkey-ccc",
-
"{}",
-
Some("rev-ccc"),
-
None,
-
12_000,
-
);
-
-
write.insert_batch(batch.batch)?;
-
-
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.a".to_string()).unwrap()],
-
100,
-
false,
-
)?;
-
assert_eq!(records.len(), 1);
-
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.b".to_string()).unwrap()],
-
100,
-
false,
-
)?;
-
assert_eq!(records.len(), 10);
-
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.c".to_string()).unwrap()],
-
100,
-
false,
-
)?;
-
assert_eq!(records.len(), 1);
-
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.d".to_string()).unwrap()],
-
100,
-
false,
-
)?;
-
assert_eq!(records.len(), 0);
-
-
write.trim_collection(&Nsid::new("a.a.a".to_string()).unwrap(), 6)?;
-
write.trim_collection(&Nsid::new("a.a.b".to_string()).unwrap(), 6)?;
-
write.trim_collection(&Nsid::new("a.a.c".to_string()).unwrap(), 6)?;
-
write.trim_collection(&Nsid::new("a.a.d".to_string()).unwrap(), 6)?;
-
-
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.a".to_string()).unwrap()],
-
100,
-
false,
-
)?;
-
assert_eq!(records.len(), 1);
-
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.b".to_string()).unwrap()],
-
100,
-
false,
-
)?;
-
assert_eq!(records.len(), 6);
-
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.c".to_string()).unwrap()],
-
100,
-
false,
-
)?;
-
assert_eq!(records.len(), 1);
-
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.d".to_string()).unwrap()],
-
100,
-
false,
-
)?;
-
assert_eq!(records.len(), 0);
-
-
Ok(())
-
}
-
-
#[test]
-
fn test_delete_account() -> anyhow::Result<()> {
-
let (read, mut write) = fjall_db();
-
-
let mut batch = TestBatch::default();
-
batch.create(
-
"did:plc:person-a",
-
"a.a.a",
-
"rkey-aaa",
-
"{}",
-
Some("rev-aaa"),
-
None,
-
10_000,
-
);
-
for i in 1..=2 {
-
batch.create(
-
"did:plc:person-b",
-
"a.a.a",
-
&format!("rkey-bbb-{i}"),
-
&format!(r#"{{"n": {i}}}"#),
-
Some(&format!("rev-bbb-{i}")),
-
None,
-
11_000 + i,
-
);
-
}
-
write.insert_batch(batch.batch)?;
-
-
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.a".to_string()).unwrap()],
-
100,
-
false,
-
)?;
-
assert_eq!(records.len(), 3);
-
-
let records_deleted =
-
write.delete_account(&Did::new("did:plc:person-b".to_string()).unwrap())?;
-
assert_eq!(records_deleted, 2);
-
-
let records = read.get_records_by_collections(
-
&[Nsid::new("a.a.a".to_string()).unwrap()],
-
100,
-
false,
-
)?;
-
assert_eq!(records.len(), 1);
-
-
Ok(())
-
}
-
-
#[test]
-
fn rollup_delete_account_removes_record() -> anyhow::Result<()> {
-
let (read, mut write) = fjall_db();
-
-
let mut batch = TestBatch::default();
-
batch.create(
-
"did:plc:person-a",
-
"a.a.a",
-
"rkey-aaa",
-
"{}",
-
Some("rev-aaa"),
-
None,
-
10_000,
-
);
-
write.insert_batch(batch.batch)?;
-
-
let mut batch = TestBatch::default();
-
batch.delete_account("did:plc:person-a", 9_999); // queue it before the rollup
-
write.insert_batch(batch.batch)?;
-
-
write.step_rollup()?;
-
-
let records =
-
read.get_records_by_collections(&[Nsid::new("a.a.a".to_string()).unwrap()], 1, false)?;
-
assert_eq!(records.len(), 0);
-
-
Ok(())
-
}
-
-
#[test]
-
fn rollup_delete_live_count_step() -> anyhow::Result<()> {
-
let (read, mut write) = fjall_db();
-
-
let mut batch = TestBatch::default();
-
batch.create(
-
"did:plc:person-a",
-
"a.a.a",
-
"rkey-aaa",
-
"{}",
-
Some("rev-aaa"),
-
None,
-
10_000,
-
);
-
write.insert_batch(batch.batch)?;
-
-
let n = write.step_rollup()?;
-
assert_eq!(n, 1);
-
-
let mut batch = TestBatch::default();
-
batch.delete_account("did:plc:person-a", 10_001);
-
write.insert_batch(batch.batch)?;
-
-
let records =
-
read.get_records_by_collections(&[Nsid::new("a.a.a".to_string()).unwrap()], 1, false)?;
-
assert_eq!(records.len(), 1);
-
-
let n = write.step_rollup()?;
-
assert_eq!(n, 1);
-
-
let records =
-
read.get_records_by_collections(&[Nsid::new("a.a.a".to_string()).unwrap()], 1, false)?;
-
assert_eq!(records.len(), 0);
-
-
let mut batch = TestBatch::default();
-
batch.delete_account("did:plc:person-a", 9_999);
-
write.insert_batch(batch.batch)?;
-
-
let n = write.step_rollup()?;
-
assert_eq!(n, 0);
-
-
Ok(())
-
}
-
-
#[test]
-
fn rollup_multiple_count_batches() -> anyhow::Result<()> {
-
let (_read, mut write) = fjall_db();
-
-
let mut batch = TestBatch::default();
-
batch.create(
-
"did:plc:person-a",
-
"a.a.a",
-
"rkey-aaa",
-
"{}",
-
Some("rev-aaa"),
-
None,
-
10_000,
-
);
-
write.insert_batch(batch.batch)?;
-
-
let mut batch = TestBatch::default();
-
batch.create(
-
"did:plc:person-a",
-
"a.a.a",
-
"rkey-aab",
-
"{}",
-
Some("rev-aab"),
-
None,
-
10_001,
-
);
-
write.insert_batch(batch.batch)?;
-
-
let n = write.step_rollup()?;
-
assert_eq!(n, 2);
-
-
let n = write.step_rollup()?;
-
assert_eq!(n, 0);
-
-
Ok(())
-
}
-
-
#[test]
-
fn counts_before_and_after_rollup() -> anyhow::Result<()> {
-
let (read, mut write) = fjall_db();
-
-
let mut batch = TestBatch::default();
-
batch.create(
-
"did:plc:person-a",
-
"a.a.a",
-
"rkey-aaa",
-
"{}",
-
Some("rev-aaa"),
-
None,
-
10_000,
-
);
-
batch.create(
-
"did:plc:person-b",
-
"a.a.a",
-
"rkey-bbb",
-
"{}",
-
Some("rev-bbb"),
-
None,
-
10_001,
-
);
-
write.insert_batch(batch.batch)?;
-
-
let mut batch = TestBatch::default();
-
batch.delete_account("did:plc:person-a", 11_000);
-
write.insert_batch(batch.batch)?;
-
-
let mut batch = TestBatch::default();
-
batch.create(
-
"did:plc:person-a",
-
"a.a.a",
-
"rkey-aac",
-
"{}",
-
Some("rev-aac"),
-
None,
-
12_000,
-
);
-
write.insert_batch(batch.batch)?;
-
-
// before any rollup
-
let (records, dids) =
-
read.get_counts_by_collection(&Nsid::new("a.a.a".to_string()).unwrap())?;
-
assert_eq!(records, 3);
-
assert_eq!(dids, 2);
-
-
// first batch rolled up
-
let n = write.step_rollup()?;
-
assert_eq!(n, 1);
-
-
let (records, dids) =
-
read.get_counts_by_collection(&Nsid::new("a.a.a".to_string()).unwrap())?;
-
assert_eq!(records, 3);
-
assert_eq!(dids, 2);
-
-
// delete account rolled up
-
let n = write.step_rollup()?;
-
assert_eq!(n, 1);
-
-
let (records, dids) =
-
read.get_counts_by_collection(&Nsid::new("a.a.a".to_string()).unwrap())?;
-
assert_eq!(records, 3);
-
assert_eq!(dids, 2);
-
-
// second batch rolled up
-
let n = write.step_rollup()?;
-
assert_eq!(n, 1);
-
-
let (records, dids) =
-
read.get_counts_by_collection(&Nsid::new("a.a.a".to_string()).unwrap())?;
-
assert_eq!(records, 3);
-
assert_eq!(dids, 2);
-
-
// no more rollups left
-
let n = write.step_rollup()?;
-
assert_eq!(n, 0);
-
-
Ok(())
-
}
-
-
#[test]
-
fn get_top_collections() -> anyhow::Result<()> {
-
let (read, mut write) = fjall_db();
-
-
let mut batch = TestBatch::default();
-
batch.create(
-
"did:plc:person-a",
-
"a.a.a",
-
"rkey-aaa",
-
"{}",
-
Some("rev-aaa"),
-
None,
-
10_000,
-
);
-
batch.create(
-
"did:plc:person-b",
-
"a.a.b",
-
"rkey-bbb",
-
"{}",
-
Some("rev-bbb"),
-
None,
-
10_001,
-
);
-
batch.create(
-
"did:plc:person-c",
-
"a.b.c",
-
"rkey-ccc",
-
"{}",
-
Some("rev-ccc"),
-
None,
-
10_002,
-
);
-
batch.create(
-
"did:plc:person-a",
-
"a.a.a",
-
"rkey-aaa-2",
-
"{}",
-
Some("rev-aaa-2"),
-
None,
-
10_003,
-
);
-
write.insert_batch(batch.batch)?;
-
-
let n = write.step_rollup()?;
-
assert_eq!(n, 3); // 3 collections
-
-
let tops = read.get_top_collections()?;
-
assert_eq!(
-
tops,
-
TopCollections {
-
total_records: 4,
-
dids_estimate: 3,
-
nsid_child_segments: HashMap::from([(
-
"a".to_string(),
-
TopCollections {
-
total_records: 4,
-
dids_estimate: 3,
-
nsid_child_segments: HashMap::from([
-
(
-
"a".to_string(),
-
TopCollections {
-
total_records: 3,
-
dids_estimate: 2,
-
nsid_child_segments: HashMap::from([
-
(
-
"a".to_string(),
-
TopCollections {
-
total_records: 2,
-
dids_estimate: 1,
-
nsid_child_segments: HashMap::from([]),
-
},
-
),
-
(
-
"b".to_string(),
-
TopCollections {
-
total_records: 1,
-
dids_estimate: 1,
-
nsid_child_segments: HashMap::from([]),
-
}
-
),
-
]),
-
},
-
),
-
(
-
"b".to_string(),
-
TopCollections {
-
total_records: 1,
-
dids_estimate: 1,
-
nsid_child_segments: HashMap::from([(
-
"c".to_string(),
-
TopCollections {
-
total_records: 1,
-
dids_estimate: 1,
-
nsid_child_segments: HashMap::from([]),
-
},
-
),]),
-
},
-
),
-
]),
-
},
-
),]),
-
}
-
);
-
Ok(())
-
}
-
}
···
+492 -106
ufos/src/store_types.rs
···
use crate::db_types::{
-
DbBytes, DbConcat, DbStaticStr, EncodingError, SerdeBytes, StaticStr, UseBincodePlz,
};
-
use crate::{Cursor, Did, Nsid, PutAction, RecordKey, UFOsCommit};
use bincode::{Decode, Encode};
-
use cardinality_estimator::CardinalityEstimator;
-
use std::ops::Range;
-
/// key format: ["js_cursor"]
-
#[derive(Debug, PartialEq)]
-
pub struct JetstreamCursorKey {}
-
impl StaticStr for JetstreamCursorKey {
-
fn static_str() -> &'static str {
-
"js_cursor"
-
}
}
pub type JetstreamCursorValue = Cursor;
-
/// key format: ["rollup_cursor"]
-
#[derive(Debug, PartialEq)]
-
pub struct NewRollupCursorKey {}
-
impl StaticStr for NewRollupCursorKey {
-
fn static_str() -> &'static str {
-
"rollup_cursor"
-
}
-
}
// pub type NewRollupCursorKey = DbStaticStr<_NewRollupCursorKey>;
/// value format: [rollup_cursor(Cursor)|collection(Nsid)]
pub type NewRollupCursorValue = Cursor;
-
/// key format: ["js_endpoint"]
-
#[derive(Debug, PartialEq)]
-
pub struct TakeoffKey {}
-
impl StaticStr for TakeoffKey {
-
fn static_str() -> &'static str {
-
"takeoff"
}
}
pub type TakeoffValue = Cursor;
-
/// key format: ["js_endpoint"]
-
#[derive(Debug, PartialEq)]
-
pub struct JetstreamEndpointKey {}
-
impl StaticStr for JetstreamEndpointKey {
-
fn static_str() -> &'static str {
-
"js_endpoint"
-
}
-
}
#[derive(Debug, PartialEq)]
pub struct JetstreamEndpointValue(pub String);
/// String wrapper for jetstream endpoint value
···
}
}
pub type NsidRecordFeedKey = DbConcat<Nsid, Cursor>;
impl NsidRecordFeedKey {
pub fn collection(&self) -> &Nsid {
···
}
}
-
#[derive(Debug, PartialEq)]
-
pub struct _LiveRecordsStaticStr {}
-
impl StaticStr for _LiveRecordsStaticStr {
-
fn static_str() -> &'static str {
-
"live_counts"
-
}
-
}
type LiveCountsStaticPrefix = DbStaticStr<_LiveRecordsStaticStr>;
type LiveCountsCursorPrefix = DbConcat<LiveCountsStaticPrefix, Cursor>;
···
pub fn cursor(&self) -> Cursor {
self.prefix.suffix
}
-
pub fn collection(&self) -> &Nsid {
&self.suffix
}
}
···
)
}
}
-
#[derive(Debug, PartialEq, Decode, Encode)]
-
pub struct TotalRecordsValue(pub u64);
-
impl UseBincodePlz for TotalRecordsValue {}
-
#[derive(Debug, PartialEq, serde::Serialize, serde::Deserialize)]
-
pub struct EstimatedDidsValue(pub CardinalityEstimator<Did>);
impl SerdeBytes for EstimatedDidsValue {}
impl DbBytes for EstimatedDidsValue {
#[cfg(test)]
···
#[cfg(not(test))]
fn to_db_bytes(&self) -> Result<Vec<u8>, EncodingError> {
-
Ok(vec![1, 2, 3]) // TODO: un-stub when their heap overflow is fixed
}
#[cfg(not(test))]
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError> {
-
if bytes.len() < 3 {
-
return Err(EncodingError::DecodeNotEnoughBytes);
-
}
-
Ok((Self(CardinalityEstimator::new()), 3)) // TODO: un-stub when their heap overflow is fixed
}
}
-
pub type CountsValue = DbConcat<TotalRecordsValue, EstimatedDidsValue>;
impl CountsValue {
-
pub fn new(total: u64, dids: CardinalityEstimator<Did>) -> Self {
Self {
-
prefix: TotalRecordsValue(total),
suffix: EstimatedDidsValue(dids),
}
}
-
pub fn records(&self) -> u64 {
-
self.prefix.0
}
-
pub fn dids(&self) -> &CardinalityEstimator<Did> {
&self.suffix.0
}
pub fn merge(&mut self, other: &Self) {
-
self.prefix.0 += other.records();
-
self.suffix.0.merge(other.dids());
}
}
-
impl Default for CountsValue {
-
fn default() -> Self {
Self {
-
prefix: TotalRecordsValue(0),
-
suffix: EstimatedDidsValue(CardinalityEstimator::new()),
}
}
}
-
#[derive(Debug, PartialEq)]
-
pub struct _DeleteAccountStaticStr {}
-
impl StaticStr for _DeleteAccountStaticStr {
-
fn static_str() -> &'static str {
-
"delete_acount"
-
}
-
}
pub type DeleteAccountStaticPrefix = DbStaticStr<_DeleteAccountStaticStr>;
pub type DeleteAccountQueueKey = DbConcat<DeleteAccountStaticPrefix, Cursor>;
impl DeleteAccountQueueKey {
···
}
pub type DeleteAccountQueueVal = Did;
-
#[derive(Debug, PartialEq)]
-
pub struct _HourlyRollupStaticStr {}
-
impl StaticStr for _HourlyRollupStaticStr {
-
fn static_str() -> &'static str {
-
"hourly_counts"
}
}
pub type HourlyRollupStaticPrefix = DbStaticStr<_HourlyRollupStaticStr>;
-
pub type HourlyRollupKey = DbConcat<DbConcat<HourlyRollupStaticPrefix, HourTruncatedCursor>, Nsid>;
impl HourlyRollupKey {
-
pub fn new(hourly_cursor: HourTruncatedCursor, nsid: &Nsid) -> Self {
Self::from_pair(
-
DbConcat::from_pair(Default::default(), hourly_cursor),
nsid.clone(),
)
}
}
pub type HourlyRollupVal = CountsValue;
-
#[derive(Debug, PartialEq)]
-
pub struct _WeeklyRollupStaticStr {}
-
impl StaticStr for _WeeklyRollupStaticStr {
-
fn static_str() -> &'static str {
-
"weekly_counts"
-
}
-
}
pub type WeeklyRollupStaticPrefix = DbStaticStr<_WeeklyRollupStaticStr>;
-
pub type WeeklyRollupKey = DbConcat<DbConcat<WeeklyRollupStaticPrefix, WeekTruncatedCursor>, Nsid>;
impl WeeklyRollupKey {
-
pub fn new(weekly_cursor: WeekTruncatedCursor, nsid: &Nsid) -> Self {
Self::from_pair(
-
DbConcat::from_pair(Default::default(), weekly_cursor),
nsid.clone(),
)
}
}
pub type WeeklyRollupVal = CountsValue;
-
#[derive(Debug, PartialEq)]
-
pub struct _AllTimeRollupStaticStr {}
-
impl StaticStr for _AllTimeRollupStaticStr {
-
fn static_str() -> &'static str {
-
"ever_counts"
-
}
-
}
pub type AllTimeRollupStaticPrefix = DbStaticStr<_AllTimeRollupStaticStr>;
pub type AllTimeRollupKey = DbConcat<AllTimeRollupStaticPrefix, Nsid>;
impl AllTimeRollupKey {
pub fn new(nsid: &Nsid) -> Self {
Self::from_pair(Default::default(), nsid.clone())
}
-
pub fn collection(&self) -> &Nsid {
&self.suffix
}
}
pub type AllTimeRollupVal = CountsValue;
#[derive(Debug, Copy, Clone, PartialEq, Hash, PartialOrd, Eq)]
pub struct TruncatedCursor<const MOD: u64>(u64);
impl<const MOD: u64> TruncatedCursor<MOD> {
-
fn truncate(raw: u64) -> u64 {
(raw / MOD) * MOD
}
pub fn try_from_raw_u64(time_us: u64) -> Result<Self, EncodingError> {
···
}
pub fn try_from_cursor(cursor: Cursor) -> Result<Self, EncodingError> {
Self::try_from_raw_u64(cursor.to_raw_u64())
}
pub fn truncate_cursor(cursor: Cursor) -> Self {
let raw = cursor.to_raw_u64();
let truncated = Self::truncate(raw);
Self(truncated)
}
}
impl<const MOD: u64> From<TruncatedCursor<MOD>> for Cursor {
fn from(truncated: TruncatedCursor<MOD>) -> Self {
···
}
}
-
const HOUR_IN_MICROS: u64 = 1_000_000 * 3600;
pub type HourTruncatedCursor = TruncatedCursor<HOUR_IN_MICROS>;
-
const WEEK_IN_MICROS: u64 = HOUR_IN_MICROS * 24 * 7;
pub type WeekTruncatedCursor = TruncatedCursor<WEEK_IN_MICROS>;
#[cfg(test)]
mod test {
use super::{
-
CardinalityEstimator, CountsValue, Cursor, Did, EncodingError, HourTruncatedCursor,
-
HourlyRollupKey, Nsid, HOUR_IN_MICROS,
};
use crate::db_types::DbBytes;
#[test]
fn test_by_hourly_rollup_key() -> Result<(), EncodingError> {
···
#[test]
fn test_by_hourly_rollup_value() -> Result<(), EncodingError> {
-
let mut estimator = CardinalityEstimator::new();
for i in 0..10 {
-
estimator.insert(&Did::new(format!("did:plc:inze6wrmsm7pjl7yta3oig7{i}")).unwrap());
}
-
let original = CountsValue::new(123, estimator.clone());
let serialized = original.to_db_bytes()?;
let (restored, bytes_consumed) = CountsValue::from_db_bytes(&serialized)?;
assert_eq!(restored, original);
assert_eq!(bytes_consumed, serialized.len());
for i in 10..1_000 {
-
estimator.insert(&Did::new(format!("did:plc:inze6wrmsm7pjl7yta3oig{i}")).unwrap());
}
-
let original = CountsValue::new(123, estimator);
let serialized = original.to_db_bytes()?;
let (restored, bytes_consumed) = CountsValue::from_db_bytes(&serialized)?;
assert_eq!(restored, original);
···
assert_eq!(back, us);
let diff = us.to_raw_u64() - back.to_raw_u64();
assert_eq!(diff, 0);
}
}
···
use crate::db_types::{
+
DbBytes, DbConcat, DbStaticStr, EncodingError, EncodingResult, SerdeBytes, StaticStr,
+
UseBincodePlz,
};
+
use crate::{Cursor, Did, JustCount, Nsid, PutAction, RecordKey, UFOsCommit};
use bincode::{Decode, Encode};
+
use cardinality_estimator_safe::Sketch;
+
use std::ops::{Bound, Range};
+
macro_rules! static_str {
+
($prefix:expr, $name:ident) => {
+
#[derive(Debug, PartialEq)]
+
pub struct $name {}
+
impl StaticStr for $name {
+
fn static_str() -> &'static str {
+
$prefix
+
}
+
}
+
};
}
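Each static_str! invocation expands to the same shape as the hand-written key structs it replaces (compare the removed JetstreamCursorKey earlier in this diff); for example, static_str!("js_cursor", JetstreamCursorKey) produces:

#[derive(Debug, PartialEq)]
pub struct JetstreamCursorKey {}
impl StaticStr for JetstreamCursorKey {
    fn static_str() -> &'static str {
        "js_cursor"
    }
}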
+
+
// key format: ["js_cursor"]
+
static_str!("js_cursor", JetstreamCursorKey);
pub type JetstreamCursorValue = Cursor;
+
// key format: ["sketch_secret"]
+
static_str!("sketch_secret", SketchSecretKey);
+
pub type SketchSecretPrefix = [u8; 16];
+
+
// key format: ["rollup_cursor"]
+
static_str!("rollup_cursor", NewRollupCursorKey);
// pub type NewRollupCursorKey = DbStaticStr<_NewRollupCursorKey>;
/// value format: [rollup_cursor(Cursor)|collection(Nsid)]
pub type NewRollupCursorValue = Cursor;
+
static_str!("trim_cursor", _TrimCollectionStaticStr);
+
type TrimCollectionCursorPrefix = DbStaticStr<_TrimCollectionStaticStr>;
+
pub type TrimCollectionCursorKey = DbConcat<TrimCollectionCursorPrefix, Nsid>;
+
impl TrimCollectionCursorKey {
+
pub fn new(collection: Nsid) -> Self {
+
Self::from_pair(Default::default(), collection)
}
}
+
pub type TrimCollectionCursorVal = Cursor;
+
+
// key format: ["js_endpoint"]
+
static_str!("takeoff", TakeoffKey);
pub type TakeoffValue = Cursor;
+
// key format: ["js_endpoint"]
+
static_str!("js_endpoint", JetstreamEndpointKey);
#[derive(Debug, PartialEq)]
pub struct JetstreamEndpointValue(pub String);
/// String wrapper for jetstream endpoint value
···
}
}
+
pub trait WithCollection {
+
fn collection(&self) -> &Nsid;
+
}
+
+
pub trait WithRank {
+
fn rank(&self) -> u64;
+
}
+
pub type NsidRecordFeedKey = DbConcat<Nsid, Cursor>;
impl NsidRecordFeedKey {
pub fn collection(&self) -> &Nsid {
···
}
}
+
static_str!("live_counts", _LiveRecordsStaticStr);
type LiveCountsStaticPrefix = DbStaticStr<_LiveRecordsStaticStr>;
type LiveCountsCursorPrefix = DbConcat<LiveCountsStaticPrefix, Cursor>;
···
pub fn cursor(&self) -> Cursor {
self.prefix.suffix
}
+
}
+
impl WithCollection for LiveCountsKey {
+
fn collection(&self) -> &Nsid {
&self.suffix
}
}
···
)
}
}
+
+
#[derive(Debug, Clone, Copy, Default, PartialEq, Decode, Encode)]
+
pub struct CommitCounts {
+
pub creates: u64,
+
pub updates: u64,
+
pub deletes: u64,
+
}
+
impl CommitCounts {
+
pub fn merge(&mut self, other: &Self) {
+
self.creates += other.creates;
+
self.updates += other.updates;
+
self.deletes += other.deletes;
+
}
+
}
+
impl UseBincodePlz for CommitCounts {}
+
#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
+
pub struct EstimatedDidsValue(pub Sketch<14>);
impl SerdeBytes for EstimatedDidsValue {}
impl DbBytes for EstimatedDidsValue {
#[cfg(test)]
···
#[cfg(not(test))]
fn to_db_bytes(&self) -> Result<Vec<u8>, EncodingError> {
+
SerdeBytes::to_bytes(self)
}
#[cfg(not(test))]
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError> {
+
SerdeBytes::from_bytes(bytes)
}
}
+
pub type CountsValue = DbConcat<CommitCounts, EstimatedDidsValue>;
impl CountsValue {
+
pub fn new(counts: CommitCounts, dids: Sketch<14>) -> Self {
Self {
+
prefix: counts,
suffix: EstimatedDidsValue(dids),
}
}
+
pub fn counts(&self) -> CommitCounts {
+
self.prefix
}
+
pub fn dids(&self) -> &Sketch<14> {
&self.suffix.0
}
pub fn merge(&mut self, other: &Self) {
+
self.prefix.merge(&other.prefix);
+
self.suffix.0.merge(&other.suffix.0);
}
}
+
impl From<&CountsValue> for JustCount {
+
fn from(cv: &CountsValue) -> Self {
+
let CommitCounts {
+
creates,
+
updates,
+
deletes,
+
} = cv.counts();
Self {
+
creates,
+
updates,
+
deletes,
+
dids_estimate: cv.dids().estimate() as u64,
}
}
}
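To see how the new counts plumbing fits together, here is a minimal sketch of merging two CountsValue entries the way a rollup step would and flattening the result for output. It assumes it lives inside this crate (e.g. alongside this module's tests, so CountsValue, CommitCounts, and JustCount are in scope with their fields accessible); did_element is a hypothetical helper mirroring the to_element used in the tests below.

use cardinality_estimator_safe::{Element, Sketch};
use sha2::Sha256;

#[test]
fn counts_value_merge_example() {
    // Hash a DID string into a sketch element, as this module's tests do.
    fn did_element(did: &str) -> Element<14> {
        Element::from_digest_oneshot::<Sha256>(did.as_bytes())
    }
    let mut sketch_a = Sketch::<14>::default();
    sketch_a.insert(did_element("did:plc:person-a"));
    let mut sketch_b = Sketch::<14>::default();
    sketch_b.insert(did_element("did:plc:person-b"));

    // Two buckets' worth of counts...
    let mut a = CountsValue::new(CommitCounts { creates: 2, updates: 1, deletes: 0 }, sketch_a);
    let b = CountsValue::new(CommitCounts { creates: 1, updates: 0, deletes: 1 }, sketch_b);

    // ...merged, then flattened for API output.
    a.merge(&b);
    let flat = JustCount::from(&a);
    assert_eq!(flat.creates, 3);
    assert_eq!(flat.deletes, 1);
}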
+
static_str!("delete_acount", _DeleteAccountStaticStr);
pub type DeleteAccountStaticPrefix = DbStaticStr<_DeleteAccountStaticStr>;
pub type DeleteAccountQueueKey = DbConcat<DeleteAccountStaticPrefix, Cursor>;
impl DeleteAccountQueueKey {
···
}
pub type DeleteAccountQueueVal = Did;
+
/// big-endian encoded u64 for an LSM prefix-friendly key
+
#[derive(Debug, Clone, Copy, PartialEq)]
+
pub struct KeyRank(u64);
+
impl DbBytes for KeyRank {
+
fn to_db_bytes(&self) -> Result<Vec<u8>, EncodingError> {
+
Ok(self.0.to_be_bytes().to_vec())
+
}
+
fn from_db_bytes(bytes: &[u8]) -> Result<(Self, usize), EncodingError> {
+
if bytes.len() < 8 {
+
return Err(EncodingError::DecodeNotEnoughBytes);
+
}
+
let bytes8 = TryInto::<[u8; 8]>::try_into(&bytes[..8])?;
+
let rank = KeyRank(u64::from_be_bytes(bytes8));
+
Ok((rank, 8))
+
}
+
}
+
impl From<u64> for KeyRank {
+
fn from(n: u64) -> Self {
+
Self(n)
+
}
+
}
+
impl From<KeyRank> for u64 {
+
fn from(kr: KeyRank) -> Self {
+
kr.0
+
}
+
}
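The ordering property KeyRank relies on can be checked standalone with plain std: big-endian bytes compare (lexicographically) in the same order as the numbers they encode, which is what makes the rank usable inside an LSM key that gets scanned byte-wise.

fn main() {
    // numeric order preserved under byte-wise comparison
    assert!(1u64.to_be_bytes() < 256u64.to_be_bytes());
    // little-endian encoding would invert it
    assert!(1u64.to_le_bytes() > 256u64.to_le_bytes());
}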
+
+
pub type BucketedRankRecordsKey<P, C> =
+
DbConcat<DbConcat<DbStaticStr<P>, C>, DbConcat<KeyRank, Nsid>>;
+
impl<P, C> BucketedRankRecordsKey<P, C>
+
where
+
P: StaticStr + PartialEq + std::fmt::Debug,
+
C: DbBytes + PartialEq + std::fmt::Debug + Clone,
+
{
+
pub fn new(cursor: C, rank: KeyRank, nsid: &Nsid) -> Self {
+
Self::from_pair(
+
DbConcat::from_pair(Default::default(), cursor),
+
DbConcat::from_pair(rank, nsid.clone()),
+
)
+
}
+
pub fn with_rank(&self, new_rank: KeyRank) -> Self {
+
Self::new(self.prefix.suffix.clone(), new_rank, &self.suffix.suffix)
+
}
+
pub fn start(cursor: C) -> EncodingResult<Bound<Vec<u8>>> {
+
let prefix: DbConcat<DbStaticStr<P>, C> = DbConcat::from_pair(Default::default(), cursor);
+
Ok(Bound::Included(Self::from_prefix_to_db_bytes(&prefix)?))
+
}
+
pub fn end(cursor: C) -> EncodingResult<Bound<Vec<u8>>> {
+
let prefix: DbConcat<DbStaticStr<P>, C> = DbConcat::from_pair(Default::default(), cursor);
+
Ok(Bound::Excluded(Self::prefix_range_end(&prefix)?))
}
}
+
impl<P: StaticStr, C: DbBytes> WithCollection for BucketedRankRecordsKey<P, C> {
+
fn collection(&self) -> &Nsid {
+
&self.suffix.suffix
+
}
+
}
+
impl<P: StaticStr, C: DbBytes> WithRank for BucketedRankRecordsKey<P, C> {
+
fn rank(&self) -> u64 {
+
self.suffix.prefix.into()
+
}
+
}
+
+
static_str!("hourly_counts", _HourlyRollupStaticStr);
pub type HourlyRollupStaticPrefix = DbStaticStr<_HourlyRollupStaticStr>;
+
pub type HourlyRollupKeyHourPrefix = DbConcat<HourlyRollupStaticPrefix, HourTruncatedCursor>;
+
pub type HourlyRollupKey = DbConcat<HourlyRollupKeyHourPrefix, Nsid>;
+
pub type HourlyRollupPre = DbConcat<HourlyRollupKeyHourPrefix, Vec<u8>>; // a bit of a hack, but lets range bounds be built from partial NSID prefix bytes
impl HourlyRollupKey {
+
pub fn new(cursor: HourTruncatedCursor, nsid: &Nsid) -> Self {
Self::from_pair(
+
DbConcat::from_pair(Default::default(), cursor),
nsid.clone(),
)
}
+
pub fn new_nsid_prefix(cursor: HourTruncatedCursor, pre: &[u8]) -> HourlyRollupPre {
+
HourlyRollupPre::from_pair(
+
DbConcat::from_pair(Default::default(), cursor),
+
pre.to_vec(),
+
)
+
}
+
pub fn cursor(&self) -> HourTruncatedCursor {
+
self.prefix.suffix
+
}
+
pub fn start(hour: HourTruncatedCursor) -> EncodingResult<Bound<Vec<u8>>> {
+
let prefix = HourlyRollupKeyHourPrefix::from_pair(Default::default(), hour);
+
let prefix_bytes = Self::from_prefix_to_db_bytes(&prefix)?;
+
Ok(Bound::Included(prefix_bytes))
+
}
+
pub fn after_nsid(hour: HourTruncatedCursor, nsid: &Nsid) -> EncodingResult<Bound<Vec<u8>>> {
+
Ok(Bound::Excluded(Self::new(hour, nsid).to_db_bytes()?))
+
}
+
pub fn after_nsid_prefix(
+
hour: HourTruncatedCursor,
+
pre: &[u8],
+
) -> EncodingResult<Bound<Vec<u8>>> {
+
Ok(Bound::Excluded(
+
Self::new_nsid_prefix(hour, pre).to_db_bytes()?,
+
))
+
}
+
pub fn end(hour: HourTruncatedCursor) -> EncodingResult<Bound<Vec<u8>>> {
+
let prefix = HourlyRollupKeyHourPrefix::from_pair(Default::default(), hour);
+
Ok(Bound::Excluded(Self::prefix_range_end(&prefix)?))
+
}
+
pub fn nsid_prefix_end(
+
hour: HourTruncatedCursor,
+
pre: &[u8],
+
) -> EncodingResult<Bound<Vec<u8>>> {
+
Ok(Bound::Excluded(
+
Self::new_nsid_prefix(hour, pre).as_prefix_range_end()?,
+
))
+
}
+
}
+
impl WithCollection for HourlyRollupKey {
+
fn collection(&self) -> &Nsid {
+
&self.suffix
+
}
}
pub type HourlyRollupVal = CountsValue;
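The start/end/after_nsid helpers presumably exist so read paths can walk one hour's rollup partition with plain byte-range scans. A minimal sketch of building such bounds, assuming it runs inside this crate in a function returning EncodingResult with DbBytes in scope:

// Bounds for scanning a single hour bucket's rollup entries,
// resuming after a previously seen NSID (cursor-style pagination).
let hour = HourTruncatedCursor::truncate_raw_u64(1_743_775_200_000_000);
let resume_after = Nsid::new("a.b.c".to_string()).unwrap();
let lower = HourlyRollupKey::after_nsid(hour, &resume_after)?; // Bound::Excluded(key bytes)
let upper = HourlyRollupKey::end(hour)?;                       // Bound::Excluded(prefix range end)
// (lower, upper) are Bound<Vec<u8>> values ready to hand to a byte-range scan.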
+
static_str!("hourly_rank_records", _HourlyRecordsStaticStr);
+
pub type HourlyRecordsKey = BucketedRankRecordsKey<_HourlyRecordsStaticStr, HourTruncatedCursor>;
+
+
static_str!("hourly_rank_dids", _HourlyDidsStaticStr);
+
pub type HourlyDidsKey = BucketedRankRecordsKey<_HourlyDidsStaticStr, HourTruncatedCursor>;
+
+
static_str!("weekly_counts", _WeeklyRollupStaticStr);
pub type WeeklyRollupStaticPrefix = DbStaticStr<_WeeklyRollupStaticStr>;
+
pub type WeeklyRollupKeyWeekPrefix = DbConcat<WeeklyRollupStaticPrefix, WeekTruncatedCursor>;
+
pub type WeeklyRollupKey = DbConcat<WeeklyRollupKeyWeekPrefix, Nsid>;
+
pub type WeeklyRollupPre = DbConcat<WeeklyRollupKeyWeekPrefix, Vec<u8>>;
impl WeeklyRollupKey {
+
pub fn new(cursor: WeekTruncatedCursor, nsid: &Nsid) -> Self {
Self::from_pair(
+
DbConcat::from_pair(Default::default(), cursor),
nsid.clone(),
)
}
+
pub fn new_nsid_prefix(cursor: WeekTruncatedCursor, pre: &[u8]) -> WeeklyRollupPre {
+
WeeklyRollupPre::from_pair(
+
DbConcat::from_pair(Default::default(), cursor),
+
pre.to_vec(),
+
)
+
}
+
pub fn cursor(&self) -> WeekTruncatedCursor {
+
self.prefix.suffix
+
}
+
pub fn start(week: WeekTruncatedCursor) -> EncodingResult<Bound<Vec<u8>>> {
+
let prefix = WeeklyRollupKeyWeekPrefix::from_pair(Default::default(), week);
+
let prefix_bytes = Self::from_prefix_to_db_bytes(&prefix)?;
+
Ok(Bound::Included(prefix_bytes))
+
}
+
pub fn after_nsid(week: WeekTruncatedCursor, nsid: &Nsid) -> EncodingResult<Bound<Vec<u8>>> {
+
Ok(Bound::Excluded(Self::new(week, nsid).to_db_bytes()?))
+
}
+
pub fn after_nsid_prefix(
+
week: WeekTruncatedCursor,
+
prefix: &[u8],
+
) -> EncodingResult<Bound<Vec<u8>>> {
+
Ok(Bound::Excluded(
+
Self::new_nsid_prefix(week, prefix).to_db_bytes()?,
+
))
+
}
+
pub fn end(week: WeekTruncatedCursor) -> EncodingResult<Bound<Vec<u8>>> {
+
let prefix = WeeklyRollupKeyWeekPrefix::from_pair(Default::default(), week);
+
Ok(Bound::Excluded(Self::prefix_range_end(&prefix)?))
+
}
+
pub fn nsid_prefix_end(
+
week: WeekTruncatedCursor,
+
prefix: &[u8],
+
) -> EncodingResult<Bound<Vec<u8>>> {
+
Ok(Bound::Excluded(
+
Self::new_nsid_prefix(week, prefix).as_prefix_range_end()?,
+
))
+
}
+
}
+
impl WithCollection for WeeklyRollupKey {
+
fn collection(&self) -> &Nsid {
+
&self.suffix
+
}
}
pub type WeeklyRollupVal = CountsValue;
+
static_str!("weekly_rank_records", _WeeklyRecordsStaticStr);
+
pub type WeeklyRecordsKey = BucketedRankRecordsKey<_WeeklyRecordsStaticStr, WeekTruncatedCursor>;
+
+
static_str!("weekly_rank_dids", _WeeklyDidsStaticStr);
+
pub type WeeklyDidsKey = BucketedRankRecordsKey<_WeeklyDidsStaticStr, WeekTruncatedCursor>;
+
+
static_str!("ever_counts", _AllTimeRollupStaticStr);
pub type AllTimeRollupStaticPrefix = DbStaticStr<_AllTimeRollupStaticStr>;
pub type AllTimeRollupKey = DbConcat<AllTimeRollupStaticPrefix, Nsid>;
+
pub type AllTimeRollupPre = DbConcat<AllTimeRollupStaticPrefix, Vec<u8>>;
impl AllTimeRollupKey {
pub fn new(nsid: &Nsid) -> Self {
Self::from_pair(Default::default(), nsid.clone())
}
+
pub fn new_nsid_prefix(pre: &[u8]) -> AllTimeRollupPre {
+
AllTimeRollupPre::from_pair(Default::default(), pre.to_vec())
+
}
+
pub fn start() -> EncodingResult<Bound<Vec<u8>>> {
+
Ok(Bound::Included(Self::from_prefix_to_db_bytes(
+
&Default::default(),
+
)?))
+
}
+
pub fn after_nsid(nsid: &Nsid) -> EncodingResult<Bound<Vec<u8>>> {
+
Ok(Bound::Excluded(Self::new(nsid).to_db_bytes()?))
+
}
+
pub fn after_nsid_prefix(prefix: &[u8]) -> EncodingResult<Bound<Vec<u8>>> {
+
Ok(Bound::Excluded(
+
Self::new_nsid_prefix(prefix).to_db_bytes()?,
+
))
+
}
+
pub fn end() -> EncodingResult<Bound<Vec<u8>>> {
+
Ok(Bound::Excluded(
+
Self::prefix_range_end(&Default::default())?,
+
))
+
}
+
pub fn nsid_prefix_end(prefix: &[u8]) -> EncodingResult<Bound<Vec<u8>>> {
+
Ok(Bound::Excluded(
+
Self::new_nsid_prefix(prefix).as_prefix_range_end()?,
+
))
+
}
+
}
+
impl WithCollection for AllTimeRollupKey {
+
fn collection(&self) -> &Nsid {
&self.suffix
}
}
pub type AllTimeRollupVal = CountsValue;
+
pub type AllTimeRankRecordsKey<P> = DbConcat<DbStaticStr<P>, DbConcat<KeyRank, Nsid>>;
+
impl<P> AllTimeRankRecordsKey<P>
+
where
+
P: StaticStr + PartialEq + std::fmt::Debug,
+
{
+
pub fn new(rank: KeyRank, nsid: &Nsid) -> Self {
+
Self::from_pair(Default::default(), DbConcat::from_pair(rank, nsid.clone()))
+
}
+
pub fn with_rank(&self, new_rank: KeyRank) -> Self {
+
Self::new(new_rank, &self.suffix.suffix)
+
}
+
pub fn count(&self) -> u64 {
+
self.suffix.prefix.0
+
}
+
pub fn start() -> EncodingResult<Bound<Vec<u8>>> {
+
Ok(Bound::Included(Self::from_prefix_to_db_bytes(
+
&Default::default(),
+
)?))
+
}
+
pub fn end() -> EncodingResult<Bound<Vec<u8>>> {
+
Ok(Bound::Excluded(
+
Self::prefix_range_end(&Default::default())?,
+
))
+
}
+
}
+
impl<P: StaticStr> WithCollection for AllTimeRankRecordsKey<P> {
+
fn collection(&self) -> &Nsid {
+
&self.suffix.suffix
+
}
+
}
+
impl<P: StaticStr> WithRank for AllTimeRankRecordsKey<P> {
+
fn rank(&self) -> u64 {
+
self.suffix.prefix.into()
+
}
+
}
+
+
static_str!("ever_rank_records", _AllTimeRecordsStaticStr);
+
pub type AllTimeRecordsKey = AllTimeRankRecordsKey<_AllTimeRecordsStaticStr>;
+
+
static_str!("ever_rank_dids", _AllTimeDidsStaticStr);
+
pub type AllTimeDidsKey = AllTimeRankRecordsKey<_AllTimeDidsStaticStr>;
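For orientation, the rank keys compose a KeyRank with an Nsid under a static prefix. A minimal sketch of constructing and inspecting one, assuming it runs inside this crate with the WithRank and WithCollection traits in scope (the asserts just echo the inputs):

let nsid = Nsid::new("a.b.c".to_string()).unwrap();
let key = AllTimeRecordsKey::new(KeyRank::from(42u64), &nsid);
assert_eq!(key.count(), 42);         // rank stored in the key
assert_eq!(key.rank(), 42);          // same value via the WithRank trait
assert_eq!(key.collection(), &nsid); // via the WithCollection trait
let _reranked = key.with_rank(KeyRank::from(43u64)); // same NSID, new rank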
+
#[derive(Debug, Copy, Clone, PartialEq, Hash, PartialOrd, Eq)]
pub struct TruncatedCursor<const MOD: u64>(u64);
impl<const MOD: u64> TruncatedCursor<MOD> {
+
pub fn truncate(raw: u64) -> u64 {
(raw / MOD) * MOD
}
pub fn try_from_raw_u64(time_us: u64) -> Result<Self, EncodingError> {
···
}
pub fn try_from_cursor(cursor: Cursor) -> Result<Self, EncodingError> {
Self::try_from_raw_u64(cursor.to_raw_u64())
+
}
+
pub fn truncate_raw_u64(raw: u64) -> Self {
+
let truncated = Self::truncate(raw);
+
Self(truncated)
}
pub fn truncate_cursor(cursor: Cursor) -> Self {
let raw = cursor.to_raw_u64();
let truncated = Self::truncate(raw);
Self(truncated)
}
+
pub fn to_raw_u64(&self) -> u64 {
+
self.0
+
}
+
pub fn try_as<const MOD_B: u64>(&self) -> Result<TruncatedCursor<MOD_B>, EncodingError> {
+
TruncatedCursor::<MOD_B>::try_from_raw_u64(self.0)
+
}
+
pub fn cycles_until(&self, other: Self) -> u64 {
+
if other < *self {
+
panic!("other must be greater than or equal to self");
+
}
+
(other.0 - self.0) / MOD
+
}
+
pub fn next(&self) -> Self {
+
Self(self.0 + MOD)
+
}
+
pub fn nth_next(&self, n: u64) -> Self {
+
Self(self.0 + (n * MOD))
+
}
+
pub fn prev(&self) -> Self {
+
if self.0 < MOD {
+
panic!("underflow: previous truncation start would be less than zero");
+
}
+
Self(self.0 - MOD)
+
}
}
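A minimal sketch of the truncation arithmetic these helpers implement, assuming it runs inside this crate (the timestamp is the hour-aligned value the tests below use, nudged just past the boundary):

let raw = 1_743_775_200_000_000 + 1_234; // a few microseconds past an hour boundary
let hour = HourTruncatedCursor::truncate_raw_u64(raw);
assert_eq!(hour.to_raw_u64(), (raw / HOUR_IN_MICROS) * HOUR_IN_MICROS);
assert_eq!(hour.next().to_raw_u64(), hour.to_raw_u64() + HOUR_IN_MICROS);
assert_eq!(hour.cycles_until(hour.nth_next(3)), 3);
// try_as::<WEEK_IN_MICROS>() only succeeds when the value is also week-aligned.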
impl<const MOD: u64> From<TruncatedCursor<MOD>> for Cursor {
fn from(truncated: TruncatedCursor<MOD>) -> Self {
···
}
}
+
pub const HOUR_IN_MICROS: u64 = 1_000_000 * 3600;
pub type HourTruncatedCursor = TruncatedCursor<HOUR_IN_MICROS>;
+
pub const WEEK_IN_MICROS: u64 = HOUR_IN_MICROS * 24 * 7;
pub type WeekTruncatedCursor = TruncatedCursor<WEEK_IN_MICROS>;
+
#[derive(Debug, PartialEq)]
+
pub enum CursorBucket {
+
Hour(HourTruncatedCursor),
+
Week(WeekTruncatedCursor),
+
AllTime,
+
}
+
+
impl CursorBucket {
+
pub fn buckets_spanning(
+
since: HourTruncatedCursor,
+
until: HourTruncatedCursor,
+
) -> Vec<CursorBucket> {
+
if until <= since {
+
return vec![];
+
}
+
let mut out = vec![];
+
let mut current_lower = since;
+
while current_lower < until {
+
if current_lower.cycles_until(until) >= (WEEK_IN_MICROS / HOUR_IN_MICROS) {
+
if let Ok(week) = current_lower.try_as::<WEEK_IN_MICROS>() {
+
out.push(CursorBucket::Week(week));
+
current_lower = week.next().try_as().unwrap();
+
continue;
+
}
+
}
+
out.push(CursorBucket::Hour(current_lower));
+
current_lower = current_lower.next();
+
}
+
out
+
}
+
}
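buckets_spanning greedily emits a Week bucket whenever the current lower bound is week-aligned and at least a full week remains, and falls back to Hour buckets otherwise. A small sketch of a case beyond those the tests below cover, assuming it runs inside this crate:

// A window of exactly two weeks starting on a week boundary collapses to two Week buckets.
let start = HourTruncatedCursor::truncate_raw_u64(0);
let end = HourTruncatedCursor::truncate_raw_u64(2 * WEEK_IN_MICROS);
let buckets = CursorBucket::buckets_spanning(start, end);
assert_eq!(buckets.len(), 2);
assert!(buckets.iter().all(|b| matches!(b, CursorBucket::Week(_))));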
+
#[cfg(test)]
mod test {
use super::{
+
CommitCounts, CountsValue, Cursor, CursorBucket, Did, EncodingError, HourTruncatedCursor,
+
HourlyRollupKey, Nsid, Sketch, HOUR_IN_MICROS, WEEK_IN_MICROS,
};
use crate::db_types::DbBytes;
+
use cardinality_estimator_safe::Element;
+
use sha2::Sha256;
#[test]
fn test_by_hourly_rollup_key() -> Result<(), EncodingError> {
···
#[test]
fn test_by_hourly_rollup_value() -> Result<(), EncodingError> {
+
let mut estimator = Sketch::<14>::default();
+
fn to_element(d: Did) -> Element<14> {
+
Element::from_digest_oneshot::<Sha256>(d.to_string().as_bytes())
+
}
for i in 0..10 {
+
estimator.insert(to_element(
+
Did::new(format!("did:plc:inze6wrmsm7pjl7yta3oig7{i}")).unwrap(),
+
));
}
+
let original = CountsValue::new(
+
CommitCounts {
+
creates: 123,
+
..Default::default()
+
},
+
estimator.clone(),
+
);
let serialized = original.to_db_bytes()?;
let (restored, bytes_consumed) = CountsValue::from_db_bytes(&serialized)?;
assert_eq!(restored, original);
assert_eq!(bytes_consumed, serialized.len());
for i in 10..1_000 {
+
estimator.insert(to_element(
+
Did::new(format!("did:plc:inze6wrmsm7pjl7yta3oig{i}")).unwrap(),
+
));
}
+
let original = CountsValue::new(
+
CommitCounts {
+
creates: 123,
+
..Default::default()
+
},
+
estimator,
+
);
let serialized = original.to_db_bytes()?;
let (restored, bytes_consumed) = CountsValue::from_db_bytes(&serialized)?;
assert_eq!(restored, original);
···
assert_eq!(back, us);
let diff = us.to_raw_u64() - back.to_raw_u64();
assert_eq!(diff, 0);
+
}
+
+
#[test]
+
fn test_spanning_nothing() {
+
let from = Cursor::from_raw_u64(1_743_775_200_000_000).into();
+
let until = Cursor::from_raw_u64(1_743_775_200_000_000).into();
+
assert!(CursorBucket::buckets_spanning(from, until).is_empty());
+
let until = Cursor::from_raw_u64(0).into();
+
assert!(CursorBucket::buckets_spanning(from, until).is_empty());
+
}
+
+
#[test]
+
fn test_spanning_low_hours() {
+
let from = HourTruncatedCursor::truncate_cursor(Cursor::from_start());
+
let until = from.next();
+
assert_eq!(
+
CursorBucket::buckets_spanning(from, until),
+
vec![CursorBucket::Hour(from)]
+
);
+
let until2 = until.next();
+
let until3 = until2.next();
+
assert_eq!(
+
CursorBucket::buckets_spanning(from, until3),
+
vec![
+
CursorBucket::Hour(from),
+
CursorBucket::Hour(until),
+
CursorBucket::Hour(until2),
+
]
+
);
+
}
+
+
#[test]
+
fn test_spanning_week_aligned() {
+
let from = HourTruncatedCursor::truncate_cursor(Cursor::from_start());
+
let until = HourTruncatedCursor::truncate_cursor(Cursor::from_raw_u64(WEEK_IN_MICROS));
+
assert_eq!(
+
CursorBucket::buckets_spanning(from, until),
+
vec![CursorBucket::Week(from.try_as().unwrap()),]
+
);
+
let next_hour = until.next();
+
assert_eq!(
+
CursorBucket::buckets_spanning(from, next_hour),
+
vec![
+
CursorBucket::Week(from.try_as().unwrap()),
+
CursorBucket::Hour(until),
+
]
+
);
+
}
+
+
#[test]
+
fn test_spanning_week_unaligned() {
+
let from = HourTruncatedCursor::truncate_cursor(Cursor::from_raw_u64(
+
WEEK_IN_MICROS - HOUR_IN_MICROS,
+
));
+
let until = HourTruncatedCursor::truncate_cursor(Cursor::from_raw_u64(
+
from.to_raw_u64() + WEEK_IN_MICROS,
+
));
+
let span = CursorBucket::buckets_spanning(from, until);
+
assert_eq!(span.len(), 168);
+
for b in &span {
+
let CursorBucket::Hour(_) = b else {
+
panic!("found week bucket in a span that should only have hourlies");
+
};
+
}
+
let until2 = until.next();
+
assert_eq!(
+
CursorBucket::buckets_spanning(from, until2),
+
vec![
+
CursorBucket::Hour(from),
+
CursorBucket::Week(from.next().try_as().unwrap()),
+
]
+
);
}
}