tracks lexicons and how many times they appeared on the jetstream

feat(server): train a zstd dict

ptr.pet 6001b7f8 b63273b4

verified
Changed files
+31
server
+1
server/Cargo.lock
···
"tower-http",
"tracing",
"tracing-subscriber",
+
"zstd",
[[package]]
+1
server/Cargo.toml
···
rayon = "1.10.0"
parking_lot = { version = "0.12", features = ["send_guard", "hardware-lock-elision"] }
rclite = "0.2.7"
+
zstd = "0.13.3"
[target.'cfg(target_env = "msvc")'.dependencies]
snmalloc-rs = "0.3.8"
+19
server/src/db/mod.rs
···
})
}
+
// Train a zstd dictionary from stored values.
//
// For every nsid yielded by `get_nsids` that `get_handle` resolves, the handle
// is iterated in reverse and at most 1000 entries are taken; each entry's value
// is wrapped in a `Cursor` and passed to zstd as one training sample.
// Iteration errors are wrapped in `std::io::Error` because the sample iterator
// must yield `io::Result` items. The second argument to
// `from_sample_iterator` (1024 * 128 bytes = 128 KiB) is the maximum
// dictionary size. Any training error is converted into `AppError`.
//
// NOTE(review): this comment previously said "100 blocks" while the code takes
// 1000 per lexicon -- confirm which number is intended.
+
pub fn train_zstd_dict(&self) -> AppResult<Vec<u8>> {
+
let samples = self
+
.get_nsids()
+
.filter_map(|nsid| self.get_handle(&nsid))
+
.map(|handle| {
+
handle
+
.iter()
+
.rev()
+
.map(|res| {
+
res.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))
+
.map(|(_, value)| Cursor::new(value))
+
})
+
.take(1000)
+
})
+
.flatten();
+
zstd::dict::from_sample_iterator(samples, 1024 * 128).map_err(AppError::from)
+
}
+
pub fn get_hits(
&self,
nsid: &str,
+10
server/src/main.rs
···
debug();
return;
}
+
Some("traindict") => {
+
train_zstd_dict();
+
return;
+
}
Some(x) => {
tracing::error!("unknown command: {}", x);
return;
···
ingest_events.join().expect("failed to join ingest events");
db_task.await.expect("cant join db task");
db.sync(true).expect("cant sync db");
+
}
+
+
// Entry point for the `traindict` command: opens the database with the default
// config and a fresh cancellation token, trains a zstd dictionary from it, and
// writes the resulting bytes to the relative path `zstd_dict`.
//
// Every step uses `expect`, so any failure (opening the db, training, or
// writing the file) panics with the corresponding message.
fn train_zstd_dict() {
+
let db = Db::new(DbConfig::default(), CancellationToken::new()).expect("couldnt create db");
+
let dict_data = db.train_zstd_dict().expect("cant train zstd dict");
+
std::fs::write("zstd_dict", dict_data).expect("cant save zstd dict")
}
fn debug() {