Constellation, Spacedust, Slingshot, UFOs: atproto crates and services for microcosm

Maybe-working repair task?

Changed files
+217 -22
constellation
src
templates
+29 -10
constellation/src/bin/main.rs
···
/// Saved jsonl from jetstream to use instead of a live subscription
#[arg(short, long)]
fixture: Option<PathBuf>,
+
/// run a scan across the target id table and write all key -> ids to id -> keys
+
#[arg(long, action)]
+
repair_target_ids: bool,
}
#[derive(Debug, Clone, ValueEnum)]
···
rocks.start_backup(backup_dir, auto_backup, stay_alive.clone())?;
}
println!("rocks ready.");
-
run(
-
rocks,
-
fixture,
-
args.data,
-
stream,
-
bind,
-
metrics_bind,
-
stay_alive,
-
)
+
std::thread::scope(|s| {
+
if args.repair_target_ids {
+
let rocks = rocks.clone();
+
let stay_alive = stay_alive.clone();
+
s.spawn(move || {
+
let rep = rocks.run_repair(time::Duration::from_millis(1), stay_alive);
+
eprintln!("repair finished: {rep:?}");
+
rep
+
});
+
}
+
s.spawn(|| {
+
let r = run(
+
rocks,
+
fixture,
+
args.data,
+
stream,
+
bind,
+
metrics_bind,
+
stay_alive,
+
);
+
eprintln!("run finished: {r:?}");
+
r
+
});
+
});
+
Ok(())
}
}
}
···
'monitor: loop {
match readable.get_stats() {
-
Ok(StorageStats { dids, targetables, linking_records }) => {
+
Ok(StorageStats { dids, targetables, linking_records, .. }) => {
metrics::gauge!("storage.stats.dids").set(dids as f64);
metrics::gauge!("storage.stats.targetables").set(targetables as f64);
metrics::gauge!("storage.stats.linking_records").set(linking_records as f64);
+6 -7
constellation/src/server/mod.rs
···
DEFAULT_CURSOR_LIMIT
}
-
const INDEX_BEGAN_AT_TS: u64 = 1738083600; // TODO: not this
-
fn to500(e: tokio::task::JoinError) -> http::StatusCode {
eprintln!("handler error: {e}");
http::StatusCode::INTERNAL_SERVER_ERROR
···
#[template(path = "hello.html.j2")]
struct HelloReponse {
help: &'static str,
-
days_indexed: u64,
+
days_indexed: Option<u64>,
stats: StorageStats,
}
fn hello(
···
let stats = store
.get_stats()
.map_err(|_| http::StatusCode::INTERNAL_SERVER_ERROR)?;
-
let days_indexed = (UNIX_EPOCH + Duration::from_secs(INDEX_BEGAN_AT_TS))
-
.elapsed()
+
let days_indexed = stats
+
.started_at
+
.map(|c| (UNIX_EPOCH + Duration::from_micros(c)).elapsed())
+
.transpose()
.map_err(|_| http::StatusCode::INTERNAL_SERVER_ERROR)?
-
.as_secs()
-
/ 86400;
+
.map(|d| d.as_secs() / 86_400);
Ok(acceptable(accept, HelloReponse {
help: "open this URL in a web browser (or request with Accept: text/html) for information about this API.",
days_indexed,
+2
constellation/src/storage/mem_store.rs
···
dids,
targetables,
linking_records,
+
started_at: None,
+
other_data: Default::default(),
})
}
}
+6
constellation/src/storage/mod.rs
···
/// records with multiple links are single-counted.
/// for LSM stores, deleted links don't decrement this, and updated records with any links will likely increment it.
pub linking_records: u64,
+
+
/// first jetstream cursor when this instance first started
+
pub started_at: Option<u64>,
+
+
/// anything else we want to throw in
+
pub other_data: HashMap<String, u64>,
}
pub trait LinkStorage: Send + Sync {
+162 -3
constellation/src/storage/rocks_store.rs
···
Arc,
};
use std::thread;
-
use std::time::{Duration, Instant};
+
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use tokio_util::sync::CancellationToken;
static DID_IDS_CF: &str = "did_ids";
···
static LINK_TARGETS_CF: &str = "link_targets";
static JETSTREAM_CURSOR_KEY: &str = "jetstream_cursor";
+
static STARTED_AT_KEY: &str = "jetstream_first_cursor";
+
// add reverse mappings for targets if this db was running before that was a thing
+
static TARGET_ID_REPAIR_STATE_KEY: &str = "target_id_table_repair_state";
+
+
static COZY_FIRST_CURSOR: u64 = 1_738_083_600_000_000; // constellation.microcosm.blue started
+
+
#[derive(Debug, Clone, Serialize, Deserialize)]
+
struct TargetIdRepairState {
+
/// start time for repair, microseconds timestamp
+
current_us_started_at: u64,
+
/// id table's latest id when repair started
+
id_when_started: u64,
+
/// id table id
+
latest_repaired_i: u64,
+
}
+
impl AsRocksValue for TargetIdRepairState {}
+
impl ValueFromRocks for TargetIdRepairState {}
// todo: actually understand and set these options probably better
fn rocks_opts_base() -> Options {
···
_key_marker: PhantomData,
_val_marker: PhantomData,
name: name.into(),
-
id_seq: Arc::new(AtomicU64::new(0)), // zero is "uninint", first seq num will be 1
+
id_seq: Arc::new(AtomicU64::new(0)), // zero is "uninit", first seq num will be 1
}
}
fn get_id_val(
···
}
}
+
fn now() -> u64 {
+
SystemTime::now()
+
.duration_since(UNIX_EPOCH)
+
.unwrap()
+
.as_micros() as u64
+
}
+
impl RocksStorage {
pub fn new(path: impl AsRef<Path>) -> Result<Self> {
Self::describe_metrics();
-
RocksStorage::open_readmode(path, false)
+
let me = RocksStorage::open_readmode(path, false)?;
+
me.global_init()?;
+
Ok(me)
}
pub fn open_readonly(path: impl AsRef<Path>) -> Result<Self> {
···
let did_id_table = IdTable::setup(DID_IDS_CF);
let target_id_table = IdTable::setup(TARGET_IDS_CF);
+
// note: global stuff like jetstream cursor goes in the default cf
+
// these are bonus extra cfs
let cfs = vec![
// id reference tables
did_id_table.cf_descriptor(),
···
is_writer: !readonly,
backup_task: None.into(),
})
+
}
+
+
fn global_init(&self) -> Result<()> {
+
let first_run = self.db.get(JETSTREAM_CURSOR_KEY)?.is_some();
+
if first_run {
+
self.db.put(STARTED_AT_KEY, _rv(now()))?;
+
+
// hack / temporary: if we're a new db, put in a completed repair
+
// state so we don't run repairs (repairs are for old-code dbs)
+
let completed = TargetIdRepairState {
+
id_when_started: 0,
+
current_us_started_at: 0,
+
latest_repaired_i: 0,
+
};
+
self.db.put(TARGET_ID_REPAIR_STATE_KEY, _rv(completed))?;
+
}
+
Ok(())
+
}
+
+
pub fn run_repair(&self, breather: Duration, stay_alive: CancellationToken) -> Result<bool> {
+
let mut state = match self
+
.db
+
.get(TARGET_ID_REPAIR_STATE_KEY)?
+
.map(|s| _vr(&s))
+
.transpose()?
+
{
+
Some(s) => s,
+
None => TargetIdRepairState {
+
id_when_started: self.did_id_table.priv_id_seq,
+
current_us_started_at: now(),
+
latest_repaired_i: 0,
+
},
+
};
+
+
eprintln!("initial repair state: {state:?}");
+
+
let cf = self.db.cf_handle(TARGET_IDS_CF).unwrap();
+
+
let mut iter = self.db.raw_iterator_cf(&cf);
+
iter.seek_to_first();
+
+
eprintln!("repair iterator sent to first key");
+
+
// skip ahead if we're done some, or take a single first step
+
for _ in 0..state.latest_repaired_i {
+
iter.next();
+
}
+
+
eprintln!(
+
"repair iterator skipped to {}th key",
+
state.latest_repaired_i
+
);
+
+
let mut maybe_done = false;
+
+
while !stay_alive.is_cancelled() && !maybe_done {
+
// let mut batch = WriteBatch::default();
+
+
let mut any_written = false;
+
+
for _ in 0..1000 {
+
if state.latest_repaired_i % 1_000_000 == 0 {
+
eprintln!("target iter at {}", state.latest_repaired_i);
+
}
+
state.latest_repaired_i += 1;
+
+
if !iter.valid() {
+
eprintln!("invalid iter, are we done repairing?");
+
maybe_done = true;
+
break;
+
};
+
+
// eprintln!("iterator seems to be valid! getting the key...");
+
let raw_key = iter.key().unwrap();
+
if raw_key.len() == 8 {
+
// eprintln!("found an 8-byte key, skipping it since it's probably an id...");
+
iter.next();
+
continue;
+
}
+
let target: TargetKey = _kr::<TargetKey>(raw_key)?;
+
let target_id: TargetId = _vr(iter.value().unwrap())?;
+
+
self.db
+
.put_cf(&cf, target_id.id().to_be_bytes(), _rv(&target))?;
+
any_written = true;
+
iter.next();
+
}
+
+
if any_written {
+
self.db
+
.put(TARGET_ID_REPAIR_STATE_KEY, _rv(state.clone()))?;
+
std::thread::sleep(breather);
+
}
+
}
+
+
eprintln!("repair iterator done.");
+
+
Ok(false)
}
pub fn start_backup(
···
.map(|s| s.parse::<u64>())
.transpose()?
.unwrap_or(0);
+
let started_at = self
+
.db
+
.get(STARTED_AT_KEY)?
+
.map(|c| _vr(&c))
+
.transpose()?
+
.unwrap_or(COZY_FIRST_CURSOR);
+
+
let other_data = self
+
.db
+
.get(TARGET_ID_REPAIR_STATE_KEY)?
+
.map(|s| _vr(&s))
+
.transpose()?
+
.map(
+
|TargetIdRepairState {
+
current_us_started_at,
+
id_when_started,
+
latest_repaired_i,
+
}| {
+
HashMap::from([
+
("current_us_started_at".to_string(), current_us_started_at),
+
("id_when_started".to_string(), id_when_started),
+
("latest_repaired_i".to_string(), latest_repaired_i),
+
])
+
},
+
)
+
.unwrap_or(HashMap::default());
+
Ok(StorageStats {
dids,
targetables,
linking_records,
+
started_at: Some(started_at),
+
other_data,
})
···
impl AsRocksValue for &TargetId {}
impl KeyFromRocks for TargetKey {}
impl ValueFromRocks for TargetId {}
+
+
// temp?
+
impl KeyFromRocks for TargetId {}
+
impl AsRocksValue for &TargetKey {}
// target_links table
impl AsRocksKey for &TargetId {}
+12 -2
constellation/templates/hello.html.j2
···
<p>It works by recursively walking <em>all</em> records coming through the firehose, searching for anything that looks like a link. Links are indexed by the target they point at, the collection the record came from, and the JSON path to the link in that record.</p>
<p>
-
This server has indexed <span class="stat">{{ stats.linking_records|human_number }}</span> links between <span class="stat">{{ stats.targetables|human_number }}</span> targets and sources from <span class="stat">{{ stats.dids|human_number }}</span> identities over <span class="stat">{{ days_indexed|human_number }}</span> days.<br/>
+
This server has indexed <span class="stat">{{ stats.linking_records|human_number }}</span> links between <span class="stat">{{ stats.targetables|human_number }}</span> targets and sources from <span class="stat">{{ stats.dids|human_number }}</span> identities over <span class="stat">
+
{%- if let Some(days) = days_indexed %}
+
{{ days|human_number }}
+
{% else %}
+
???
+
{% endif -%}
+
</span> days.<br/>
<small>(indexing new records in real time, backfill coming soon!)</small>
</p>
-
<p>But feel free to use it! If you want to be nice, put your project name and bsky username (or email) in your user-agent header for api requests.</p>
+
{# {% for k, v in stats.other_data.iter() %}
+
<p><strong>{{ k }}</strong>: {{ v }}</p>
+
{% endfor %} #}
+
+
<p>You're welcome to use this public instance! Please do not build the torment nexus. If you want to be nice, put your project name and bsky username (or email) in your user-agent header for api requests.</p>
<h2>API Endpoints</h2>