forked from
microcosm.blue/microcosm-rs
Constellation, Spacedust, Slingshot, UFOs: atproto crates and services for microcosm
1use fluent_uri::{Uri, UriRef};
2use std::sync::LazyLock;
3
4static BASE: LazyLock<Uri<&str>> = LazyLock::new(|| Uri::parse("https://example.com").unwrap());
5
6// normalizing is a bit opinionated but eh
7/// see "Full AT URI Syntax" at https://atproto.com/specs/at-uri-scheme
8/// this parser is intentinonally lax: it should accept all valid at-uris, and
9/// may accept some invalid at-uris.
10///
11/// at the moment this implementation is quite bad and incomplete
12pub fn parse_at_uri(s: &str) -> Option<String> {
13 // for now, just working through the rules laid out in the docs in order,
14 // without much regard for efficiency for now.
15
16 // The overall URI is restricted to a subset of ASCII characters
17 if !s.is_ascii() {
18 return None;
19 }
20
21 // Maximum overall length is 8 kilobytes (which may be shortened in the future)
22 if s.len() > (8 * 2_usize.pow(10)) {
23 return None;
24 }
25
26 // Hex-encoding of characters is permitted (but in practice not necessary)
27 // -> decode any unreserved characters. from rfc 3986:
28 // -> For consistency, percent-encoded octets in the ranges of ALPHA
29 // -> (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
30 // -> underscore (%5F), or tilde (%7E) should not be created by URI
31 // -> producers and, when found in a URI, should be decoded to their
32 // -> corresponding unreserved characters by URI normalizers.
33 let s = if let Some((unencoded_prefix, rest)) = s.split_once('%') {
34 let mut out = String::with_capacity(s.len());
35 out.push_str(unencoded_prefix);
36 for segment in rest.split('%') {
37 let Some((hex2, unencoded_suffix)) = segment.split_at_checked(2) else {
38 return None; // bail: % must always be followed by 2 hex digits
39 };
40 let Ok(decoded) = u8::from_str_radix(hex2, 16).map(char::from) else {
41 return None; // bail: % must be followed by decodable hex
42 };
43 if matches!(decoded, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' | '~') {
44 out.push(decoded);
45 } else {
46 out.push('%');
47 out.push_str(&hex2.to_ascii_uppercase()); // norm
48 }
49 out.push_str(unencoded_suffix);
50 }
51 out
52 } else {
53 s.to_string()
54 };
55
56 // The URI scheme is `at`, and an authority part preceded with double slashes is always
57 // required, so the URI always starts at://
58 // -> the spec doesn't explicitly say, but uri schemes can be case-insensitive?
59 let (proto, rest) = s.split_at_checked(5)?;
60 if !proto.eq_ignore_ascii_case("at://") {
61 return None;
62 }
63
64 // An authority section is required and must be non-empty. the authority can be either an
65 // atproto Handle, or a DID meeting the restrictions for use with atproto. note that the
66 // authority part can not be interpreted as a host:port pair, because of the use of colon
67 // characters (:) in DIDs. Colons and unreserved characters should not be escaped in DIDs,
68 // but other reserved characters (including #, /, $, &, @) must be escaped.
69 // Note that none of the current "blessed" DID methods for atproto allow these
70 // characters in DID identifiers
71
72 // An optional path section may follow the authority. The path may contain multiple segments
73 // separated by a single slash (/). Generic URI path normalization rules may be used.
74
75 // An optional query part is allowed, following generic URI syntax restrictions
76
77 // An optional fragment part is allowed, using JSON Path syntax
78
79 // -> work backwards from fragment, query, path -> authority
80 let mut base = rest;
81 let (mut fragment, mut query, mut path) = (None, None, None);
82 if let Some((pre, f)) = base.split_once('#') {
83 base = pre;
84 fragment = Some(f);
85 }
86 if let Some((pre, q)) = base.split_once('?') {
87 base = pre;
88 query = Some(q);
89 }
90 if let Some((pre, p)) = base.split_once('/') {
91 base = pre;
92 path = Some(p);
93 }
94 let mut authority = base.to_string();
95
96 if authority.is_empty() {
97 return None;
98 }
99
100 // Normalization: Authority as handle: lowercased
101 if !authority.starts_with("did:") {
102 // lowercase handles
103 authority.make_ascii_lowercase();
104 }
105
106 // Normalization: No trailing slashes in path part
107 // Normalization: No duplicate slashes or "dot" sections in path part (/./ or /abc/../ for example)
108 // -> be so lazy
109 let path = match path {
110 Some(p) => {
111 let p = p.trim_end_matches('/');
112 let uri_ref = UriRef::parse(p).ok()?; // fully bail if we can't parse path
113 let resolved = uri_ref.resolve_against(&*BASE).unwrap(); // both fail conditions are specific to BASE
114 let normalized = resolved.normalize().path().to_string();
115 let without_trailing_slashes = normalized.trim_end_matches('/');
116 Some(without_trailing_slashes.to_string())
117 }
118 None => None,
119 };
120
121 let mut out = format!("at://{authority}");
122 if let Some(p) = path {
123 // no need for `/` -- it's added by fluent_uri normalization
124 out.push_str(&p);
125 }
126 if let Some(q) = query {
127 out.push('?');
128 out.push_str(q);
129 }
130 if let Some(f) = fragment {
131 out.push('#');
132 out.push_str(f);
133 }
134
135 Some(out)
136
137 // there's a more normalization to do still. ugh.
138}
139
/// Extract the collection (the first path segment) from an at-uri.
///
/// Returns `None` when the input does not start with `at://` (compared
/// case-insensitively) or when there is no `/` after the authority, i.e. the
/// uri has no path at all. The input is not validated or normalized beyond
/// that; an empty first segment (as in `at://did/`) is returned as `Some("")`.
///
/// Any query (`?...`) or fragment (`#...`) is cut off before the path is
/// inspected, so slashes inside them are never mistaken for path separators.
pub fn at_uri_collection(at_uri: &str) -> Option<String> {
    let (proto, rest) = at_uri.split_at_checked(5)?;
    if !proto.eq_ignore_ascii_case("at://") {
        return None;
    }
    // The authority + path part ends at the first `?` or `#`, whichever comes first.
    // `split` always yields at least one item, so `?` never bails here.
    let hier = rest.split(['?', '#']).next()?;
    let (_authority, path) = hier.split_once('/')?;
    // The collection is everything up to the next path separator, if any.
    let collection = path.split_once('/').map_or(path, |(first, _rest)| first);
    Some(collection.to_string())
}
157
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of `parse_at_uri`: each row is (input, expected output, label).
    #[test]
    fn test_at_uri_parse() {
        for (case, expected, detail) in [
            ("", None, "empty"),
            (" ", None, "whitespace"),
            ("https://bad-example.com", None, "not at scheme"),
            ("at://µcosm.bad-example.com", None, "not ascii"),
            (
                "at://bad-example.com",
                Some("at://bad-example.com"),
                "handle, authority-only",
            ),
            (
                "at://did:plc:hdhoaan3xa3jiuq4fg4mefid",
                Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid"),
                "DID, authority-only",
            ),
            (
                "at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26",
                Some("at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26"),
                "bsky post (handle)",
            ),
            (
                "at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27",
                Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27"),
                "bsky post (DID)",
            ),
            (
                "AT://bad-example.com",
                Some("at://bad-example.com"),
                "scheme case is normalized",
            ),
            (
                "at://Bad-Example.COM",
                Some("at://bad-example.com"),
                "handle case is normalized",
            ),
            (
                "at://bad-example.com?q=z",
                Some("at://bad-example.com?q=z"),
                "query is allowed",
            ),
            (
                "at://bad-example.com#a",
                Some("at://bad-example.com#a"),
                "fragment is allowed",
            ),
            (
                "at://bad-example.com/%",
                None,
                "invalid percent-encoding: ends with %",
            ),
            (
                "at://bad-example.com/%2",
                None,
                "invalid percent-encoding: ends with only one digit after %",
            ),
            (
                "at://bad-example.com/%ZZ",
                None,
                "invalid percent-encoding: non-hex after %",
            ),
            (
                "at://bad-example.com/%3A",
                Some("at://bad-example.com/%3A"),
                "valid percent-encoding is left",
            ),
            (
                "at://bad-example.com/%3a",
                Some("at://bad-example.com/%3A"),
                "valid percent-encoding is hex-uppercased",
            ),
            (
                "at://bad-example.com/%61/%62",
                Some("at://bad-example.com/a/b"),
                "unreserved characters are percent-decoded",
            ),
            (
                "at://bad-example.com/a/../b",
                Some("at://bad-example.com/b"),
                "paths have traversals resolved (oof)", // reminder to self: we are normalizing, not sanitizing
            ),
            (
                "at://bad-example.com/../",
                Some("at://bad-example.com"),
                "paths always have trailing slashes removed",
            ),
        ] {
            assert_eq!(
                parse_at_uri(case),
                expected.map(|s| s.to_string()),
                "{detail}"
            );
        }
    }

    /// Table-driven check of `at_uri_collection`: each row is (input, expected output, label).
    #[test]
    fn test_at_uri_collection() {
        for (case, expected, detail) in [
            ("", None, "empty"),
            ("at://did:plc:vc7f4oafdgxsihk4cry2xpze", None, "did only"),
            (
                "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion",
                Some("collec.tion"),
                "no path (weird)",
            ),
            (
                "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion/path",
                Some("collec.tion"),
                "normal at-uri",
            ),
            (
                "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion?query",
                Some("collec.tion"),
                "colleciton with query",
            ),
            (
                "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion#hash",
                Some("collec.tion"),
                "colleciton with hash",
            ),
            (
                "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion/path?query#hash",
                Some("collec.tion"),
                "colleciton with everything",
            ),
            (
                "at://did:web:example.com/collec.tion/path",
                Some("collec.tion"),
                "did:web",
            ),
            (
                "at://did:web:example.com/col.lec.tio.ns.so.long.going.on.and.on",
                Some("col.lec.tio.ns.so.long.going.on.and.on"),
                "long collection",
            ),
        ] {
            assert_eq!(
                at_uri_collection(case),
                expected.map(|s| s.to_string()),
                "{detail}"
            );
        }
    }
}