Constellation, Spacedust, Slingshot, UFOs: atproto crates and services for microcosm
at main 11 kB view raw
1use fluent_uri::{Uri, UriRef}; 2use std::sync::LazyLock; 3 4static BASE: LazyLock<Uri<&str>> = LazyLock::new(|| Uri::parse("https://example.com").unwrap()); 5 6// normalizing is a bit opinionated but eh 7/// see "Full AT URI Syntax" at https://atproto.com/specs/at-uri-scheme 8/// this parser is intentinonally lax: it should accept all valid at-uris, and 9/// may accept some invalid at-uris. 10/// 11/// at the moment this implementation is quite bad and incomplete 12pub fn parse_at_uri(s: &str) -> Option<String> { 13 // for now, just working through the rules laid out in the docs in order, 14 // without much regard for efficiency for now. 15 16 // The overall URI is restricted to a subset of ASCII characters 17 if !s.is_ascii() { 18 return None; 19 } 20 21 // Maximum overall length is 8 kilobytes (which may be shortened in the future) 22 if s.len() > (8 * 2_usize.pow(10)) { 23 return None; 24 } 25 26 // Hex-encoding of characters is permitted (but in practice not necessary) 27 // -> decode any unreserved characters. from rfc 3986: 28 // -> For consistency, percent-encoded octets in the ranges of ALPHA 29 // -> (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E), 30 // -> underscore (%5F), or tilde (%7E) should not be created by URI 31 // -> producers and, when found in a URI, should be decoded to their 32 // -> corresponding unreserved characters by URI normalizers. 33 let s = if let Some((unencoded_prefix, rest)) = s.split_once('%') { 34 let mut out = String::with_capacity(s.len()); 35 out.push_str(unencoded_prefix); 36 for segment in rest.split('%') { 37 let Some((hex2, unencoded_suffix)) = segment.split_at_checked(2) else { 38 return None; // bail: % must always be followed by 2 hex digits 39 }; 40 let Ok(decoded) = u8::from_str_radix(hex2, 16).map(char::from) else { 41 return None; // bail: % must be followed by decodable hex 42 }; 43 if matches!(decoded, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' | '~') { 44 out.push(decoded); 45 } else { 46 out.push('%'); 47 out.push_str(&hex2.to_ascii_uppercase()); // norm 48 } 49 out.push_str(unencoded_suffix); 50 } 51 out 52 } else { 53 s.to_string() 54 }; 55 56 // The URI scheme is `at`, and an authority part preceded with double slashes is always 57 // required, so the URI always starts at:// 58 // -> the spec doesn't explicitly say, but uri schemes can be case-insensitive? 59 let (proto, rest) = s.split_at_checked(5)?; 60 if !proto.eq_ignore_ascii_case("at://") { 61 return None; 62 } 63 64 // An authority section is required and must be non-empty. the authority can be either an 65 // atproto Handle, or a DID meeting the restrictions for use with atproto. note that the 66 // authority part can not be interpreted as a host:port pair, because of the use of colon 67 // characters (:) in DIDs. Colons and unreserved characters should not be escaped in DIDs, 68 // but other reserved characters (including #, /, $, &, @) must be escaped. 69 // Note that none of the current "blessed" DID methods for atproto allow these 70 // characters in DID identifiers 71 72 // An optional path section may follow the authority. The path may contain multiple segments 73 // separated by a single slash (/). Generic URI path normalization rules may be used. 74 75 // An optional query part is allowed, following generic URI syntax restrictions 76 77 // An optional fragment part is allowed, using JSON Path syntax 78 79 // -> work backwards from fragment, query, path -> authority 80 let mut base = rest; 81 let (mut fragment, mut query, mut path) = (None, None, None); 82 if let Some((pre, f)) = base.split_once('#') { 83 base = pre; 84 fragment = Some(f); 85 } 86 if let Some((pre, q)) = base.split_once('?') { 87 base = pre; 88 query = Some(q); 89 } 90 if let Some((pre, p)) = base.split_once('/') { 91 base = pre; 92 path = Some(p); 93 } 94 let mut authority = base.to_string(); 95 96 if authority.is_empty() { 97 return None; 98 } 99 100 // Normalization: Authority as handle: lowercased 101 if !authority.starts_with("did:") { 102 // lowercase handles 103 authority.make_ascii_lowercase(); 104 } 105 106 // Normalization: No trailing slashes in path part 107 // Normalization: No duplicate slashes or "dot" sections in path part (/./ or /abc/../ for example) 108 // -> be so lazy 109 let path = match path { 110 Some(p) => { 111 let p = p.trim_end_matches('/'); 112 let uri_ref = UriRef::parse(p).ok()?; // fully bail if we can't parse path 113 let resolved = uri_ref.resolve_against(&*BASE).unwrap(); // both fail conditions are specific to BASE 114 let normalized = resolved.normalize().path().to_string(); 115 let without_trailing_slashes = normalized.trim_end_matches('/'); 116 Some(without_trailing_slashes.to_string()) 117 } 118 None => None, 119 }; 120 121 let mut out = format!("at://{authority}"); 122 if let Some(p) = path { 123 // no need for `/` -- it's added by fluent_uri normalization 124 out.push_str(&p); 125 } 126 if let Some(q) = query { 127 out.push('?'); 128 out.push_str(q); 129 } 130 if let Some(f) = fragment { 131 out.push('#'); 132 out.push_str(f); 133 } 134 135 Some(out) 136 137 // there's a more normalization to do still. ugh. 138} 139 140pub fn at_uri_collection(at_uri: &str) -> Option<String> { 141 let (proto, rest) = at_uri.split_at_checked(5)?; 142 if !proto.eq_ignore_ascii_case("at://") { 143 return None; 144 } 145 let (_did, rest) = rest.split_once('/')?; 146 if let Some((collection, _path_rest)) = rest.split_once('/') { 147 return Some(collection.to_string()); 148 } 149 if let Some((collection, _query_rest)) = rest.split_once('?') { 150 return Some(collection.to_string()); 151 } 152 if let Some((collection, _hash_rest)) = rest.split_once('#') { 153 return Some(collection.to_string()); 154 } 155 Some(rest.to_string()) 156} 157 158#[cfg(test)] 159mod tests { 160 use super::*; 161 162 #[test] 163 fn test_at_uri_parse() { 164 for (case, expected, detail) in vec![ 165 ("", None, "empty"), 166 (" ", None, "whitespace"), 167 ("https://bad-example.com", None, "not at scheme"), 168 ("at://µcosm.bad-example.com", None, "not ascii"), 169 ( 170 "at://bad-example.com", 171 Some("at://bad-example.com"), 172 "handle, authority-only", 173 ), 174 ( 175 "at://did:plc:hdhoaan3xa3jiuq4fg4mefid", 176 Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid"), 177 "DID, authority-only", 178 ), 179 ( 180 "at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26", 181 Some("at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26"), 182 "bsky post (handle)", 183 ), 184 ( 185 "at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27", 186 Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27"), 187 "bsky post (DID)", 188 ), 189 ( 190 "AT://bad-example.com", 191 Some("at://bad-example.com"), 192 "scheme case is normalized", 193 ), 194 ( 195 "at://bad-example.com", 196 Some("at://bad-example.com"), 197 "scheme case is normalized", 198 ), 199 ( 200 "at://bad-example.com?q=z", 201 Some("at://bad-example.com?q=z"), 202 "query is allowed", 203 ), 204 ( 205 "at://bad-example.com#a", 206 Some("at://bad-example.com#a"), 207 "fragment is allowed", 208 ), 209 ( 210 "at://bad-example.com/%", 211 None, 212 "invalid percent-encoding: ends with %", 213 ), 214 ( 215 "at://bad-example.com/%2", 216 None, 217 "invalid percent-encoding: ends with only one digit after %", 218 ), 219 ( 220 "at://bad-example.com/%ZZ", 221 None, 222 "invalid percent-encoding: non-hex after %", 223 ), 224 ( 225 "at://bad-example.com/%3A", 226 Some("at://bad-example.com/%3A"), 227 "valid percent-encoding is left", 228 ), 229 ( 230 "at://bad-example.com/%3a", 231 Some("at://bad-example.com/%3A"), 232 "valid percent-encoding is hex-uppercased", 233 ), 234 ( 235 "at://bad-example.com/%61/%62", 236 Some("at://bad-example.com/a/b"), 237 "unreserved characters are percent-decoded", 238 ), 239 ( 240 "at://bad-example.com/a/../b", 241 Some("at://bad-example.com/b"), 242 "paths have traversals resolved (oof)", // reminder to self: we are normalizing, not sanitizing 243 ), 244 ( 245 "at://bad-example.com/../", 246 Some("at://bad-example.com"), 247 "paths always have trailing slashes removed", 248 ), 249 ] { 250 assert_eq!( 251 parse_at_uri(case), 252 expected.map(|s| s.to_string()), 253 "{detail}" 254 ); 255 } 256 } 257 258 #[test] 259 fn test_at_uri_collection() { 260 for (case, expected, detail) in vec![ 261 ("", None, "empty"), 262 ("at://did:plc:vc7f4oafdgxsihk4cry2xpze", None, "did only"), 263 ( 264 "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion", 265 Some("collec.tion"), 266 "no path (weird)", 267 ), 268 ( 269 "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion/path", 270 Some("collec.tion"), 271 "normal at-uri", 272 ), 273 ( 274 "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion?query", 275 Some("collec.tion"), 276 "colleciton with query", 277 ), 278 ( 279 "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion#hash", 280 Some("collec.tion"), 281 "colleciton with hash", 282 ), 283 ( 284 "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion/path?query#hash", 285 Some("collec.tion"), 286 "colleciton with everything", 287 ), 288 ( 289 "at://did:web:example.com/collec.tion/path", 290 Some("collec.tion"), 291 "did:web", 292 ), 293 ( 294 "at://did:web:example.com/col.lec.tio.ns.so.long.going.on.and.on", 295 Some("col.lec.tio.ns.so.long.going.on.and.on"), 296 "long collection", 297 ), 298 ] { 299 assert_eq!( 300 at_uri_collection(case), 301 expected.map(|s| s.to_string()), 302 "{detail}" 303 ); 304 } 305 } 306}