Constellation, Spacedust, Slingshot, UFOs: atproto crates and services for microcosm

parse dids

"links" is all references, so blocks, follows, etc. which are just direct dids also need to be detected.

Changed files
+166 -14
src
+2 -6
src/at_uri.rs
···
if !s.is_ascii() {
return None;
}
-
// // A-Za-z0-9 . - _ ~
-
// if !s.chars().all(|c| matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' | '~')) {
-
// return None
-
// }
// Maximum overall length is 8 kilobytes (which may be shortened in the future)
if s.len() > (8 * 2_usize.pow(10)) {
···
// The URI scheme is `at`, and an authority part preceded with double slashes is always
// required, so the URI always starts at://
-
// -> the spec doesn't explicitly say, but it seems like uri schemes are case-insensitive
+
// -> the spec doesn't explicitly say, but uri schemes can be case-insensitive?
let (proto, rest) = s.split_at_checked(5)?;
if !proto.eq_ignore_ascii_case("at://") {
return None;
···
(
"at://bad-example.com/a/../b",
Some("at://bad-example.com/b"),
-
"paths have traversals resolved (oof)",
+
"paths have traversals resolved (oof)", // reminder to self: we are normalizing, not sanitizing
),
(
"at://bad-example.com/../",
+152
src/did.rs
···
+
/// see https://atproto.com/specs/did#at-protocol-did-identifier-syntax
+
/// this parser is intentinonally lax: it should accept all valid DIDs, and
+
/// may accept some invalid DIDs.
+
///
+
/// at the moment this implementation might also be quite bad and incomplete
+
pub fn parse_did(s: &str) -> Option<String> {
+
// for now, just working through the rules laid out in the docs in order,
+
// without much regard for efficiency for now.
+
+
// The entire URI is made up of a subset of ASCII, containing letters (A-Z, a-z),
+
// digits (0-9), period, underscore, colon, percent sign, or hyphen (._:%-)
+
if !s
+
.chars()
+
.all(|c| matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '_' | ':' | '%' | '-'))
+
{
+
return None;
+
}
+
+
// The URI is case-sensitive
+
// -> (nothing to check)
+
+
// The URI starts with lowercase `did:`
+
let unprefixed = s.strip_prefix("did:")?;
+
+
// The method segment is one or more lowercase letters (a-z), followed by :
+
let (method, identifier) = unprefixed.split_once(':')?;
+
if !method.chars().all(|c| c.is_ascii_lowercase()) {
+
return None;
+
}
+
+
// The remainder of the URI (the identifier) may contain any of the above-allowed
+
// ASCII characters, except for percent-sign (%)
+
// -> ok, ugh, gotta know our encoding context for this
+
+
// The URI (and thus the remaining identifier) may not end in ':'.
+
if identifier.ends_with(':') {
+
return None;
+
}
+
+
// Percent-sign (%) is used for "percent encoding" in the identifier section, and
+
// must always be followed by two hex characters
+
// -> again incoding context (bleh)
+
+
// Query (?) and fragment (#) sections are allowed in DID URIs, but not in DID
+
// identifiers. In the context of atproto, the query and fragment parts are not
+
// allowed.
+
// -> disallow here -- the uri decoder should already split them out first.
+
+
// DID identifiers do not generally have a maximum length restriction, but in the
+
// context of atproto, there is an initial hard limit of 2 KB.
+
// -> we're in atproto, so sure, let's enforce it. (would be sensible to do this
+
// -> first but we're following doc order)
+
if s.len() > (2 * 2_usize.pow(10)) {
+
return None;
+
}
+
+
// -> it's not actually written in the spec, but by example in the spec, the
+
// -> identifier cannot be empty
+
if identifier.is_empty() {
+
return None;
+
}
+
+
Some(s.to_string())
+
// the only normalization we might want would be percent-decoding, but we
+
// probably leave that to the uri decoder
+
}
+
+
#[cfg(test)]
+
mod tests {
+
use super::*;
+
+
#[test]
+
fn test_did_parse() {
+
for (case, expected, detail) in vec![
+
("", None, "empty str"),
+
(" ", None, "whitespace str"),
+
("z", None, "not a did"),
+
("did:plc", None, "no identifier separator colon"),
+
("did:plc:", None, "missing identifier"),
+
(
+
"did:web:bad-example.com",
+
Some("did:web:bad-example.com"),
+
"web did",
+
),
+
(
+
"did:plc:hdhoaan3xa3jiuq4fg4mefid",
+
Some("did:plc:hdhoaan3xa3jiuq4fg4mefid"),
+
"plc did",
+
),
+
(
+
"DID:plc:hdhoaan3xa3jiuq4fg4mefid",
+
None,
+
"'did:' prefix must be lowercase",
+
),
+
(
+
"did:ok:z",
+
Some("did:ok:z"),
+
"unknown did methods are allowed",
+
),
+
("did:BAD:z", None, "non-lowercase methods are not allowed"),
+
("did:bad:z$z", None, "invalid chars are not allowed"),
+
(
+
"did:ok:z:z",
+
Some("did:ok:z:z"),
+
"colons are allowed in identifier",
+
),
+
("did:bad:z:", None, "colons not are allowed at the end"),
+
("did:bad:z?q=y", None, "queries are not allowed in atproto"),
+
("did:bad:z#a", None, "anchors are not allowed in atproto"),
+
] {
+
assert_eq!(parse_did(case), expected.map(|s| s.to_string()), "{detail}");
+
}
+
}
+
+
#[test]
+
fn test_doc_exmples_atproto() {
+
// https://atproto.com/specs/did#at-protocol-did-identifier-syntax
+
for case in vec!["did:plc:z72i7hdynmk6r22z27h6tvur", "did:web:blueskyweb.xyz"] {
+
assert!(parse_did(case).is_some(), "should pass: {case}")
+
}
+
}
+
+
#[test]
+
fn test_doc_exmples_lexicon() {
+
// https://atproto.com/specs/did#at-protocol-did-identifier-syntax
+
for case in vec![
+
"did:method:val:two",
+
"did:m:v",
+
"did:method::::val",
+
"did:method:-:_:.",
+
"did:key:zQ3shZc2QzApp2oymGvQbzP8eKheVshBHbU4ZYjeXqwSKEn6N",
+
] {
+
assert!(parse_did(case).is_some(), "should pass: {case}")
+
}
+
}
+
+
#[test]
+
fn test_doc_exmples_invalid() {
+
// https://atproto.com/specs/did#at-protocol-did-identifier-syntax
+
for case in vec![
+
"did:METHOD:val",
+
"did:m123:val",
+
"DID:method:val",
+
"did:method:",
+
"did:method:val/two",
+
"did:method:val?two",
+
"did:method:val#two",
+
] {
+
assert!(parse_did(case).is_none(), "should fail: {case}")
+
}
+
}
+
}
+12 -8
src/lib.rs
···
use fluent_uri::Uri;
pub mod at_uri;
+
pub mod did;
#[derive(Debug, PartialEq)]
pub enum Link {
AtUri(String),
Uri(String),
-
}
-
-
// normalizing is a bit opinionated but ehhh
-
pub fn parse_at_uri(s: &str) -> Option<String> {
-
at_uri::parse_at_uri(s)
+
Did(String),
}
// normalizing is a bit opinionated but eh
···
}
pub fn parse_any(s: &str) -> Option<Link> {
-
parse_at_uri(s)
-
.map(Link::AtUri)
-
.or_else(|| parse_uri(s).map(Link::Uri))
+
at_uri::parse_at_uri(s).map(Link::AtUri).or_else(|| {
+
did::parse_did(s)
+
.map(Link::Did)
+
.or_else(|| parse_uri(s).map(Link::Uri))
+
})
}
#[cfg(test)]
···
"at://did:plc:44ybard66vv44zksje25o7dz/app.bsky.feed.post/3jwdwj2ctlk26".into()
)),
);
+
+
assert_eq!(
+
parse_any("did:plc:44ybard66vv44zksje25o7dz"),
+
Some(Link::Did("did:plc:44ybard66vv44zksje25o7dz".into()))
+
)
}
}