links/src/at_uri.rs at main · nekomimi.pet/microcosm-rs

Constellation, Spacedust, Slingshot, UFOs: atproto crates and services for microcosm
microcosm-rs / links / src / at_uri.rs
at main 11 kB view raw
  1use fluent_uri::{Uri, UriRef};
  2use std::sync::LazyLock;
  3
  4static BASE: LazyLock<Uri<&str>> = LazyLock::new(|| Uri::parse("https://example.com").unwrap());
  5
  6// normalizing is a bit opinionated but eh
  7/// see "Full AT URI Syntax" at https://atproto.com/specs/at-uri-scheme
  8/// this parser is intentinonally lax: it should accept all valid at-uris, and
  9/// may accept some invalid at-uris.
 10///
 11/// at the moment this implementation is quite bad and incomplete
 12pub fn parse_at_uri(s: &str) -> Option<String> {
 13    // for now, just working through the rules laid out in the docs in order,
 14    // without much regard for efficiency for now.
 15
 16    // The overall URI is restricted to a subset of ASCII characters
 17    if !s.is_ascii() {
 18        return None;
 19    }
 20
 21    // Maximum overall length is 8 kilobytes (which may be shortened in the future)
 22    if s.len() > (8 * 2_usize.pow(10)) {
 23        return None;
 24    }
 25
 26    // Hex-encoding of characters is permitted (but in practice not necessary)
 27    // -> decode any unreserved characters. from rfc 3986:
 28    // ->   For consistency, percent-encoded octets in the ranges of ALPHA
 29    // ->   (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
 30    // ->   underscore (%5F), or tilde (%7E) should not be created by URI
 31    // ->   producers and, when found in a URI, should be decoded to their
 32    // ->   corresponding unreserved characters by URI normalizers.
 33    let s = if let Some((unencoded_prefix, rest)) = s.split_once('%') {
 34        let mut out = String::with_capacity(s.len());
 35        out.push_str(unencoded_prefix);
 36        for segment in rest.split('%') {
 37            let Some((hex2, unencoded_suffix)) = segment.split_at_checked(2) else {
 38                return None; // bail: % must always be followed by 2 hex digits
 39            };
 40            let Ok(decoded) = u8::from_str_radix(hex2, 16).map(char::from) else {
 41                return None; // bail: % must be followed by decodable hex
 42            };
 43            if matches!(decoded, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' | '~') {
 44                out.push(decoded);
 45            } else {
 46                out.push('%');
 47                out.push_str(&hex2.to_ascii_uppercase()); // norm
 48            }
 49            out.push_str(unencoded_suffix);
 50        }
 51        out
 52    } else {
 53        s.to_string()
 54    };
 55
 56    // The URI scheme is `at`, and an authority part preceded with double slashes is always
 57    // required, so the URI always starts at://
 58    // -> the spec doesn't explicitly say, but uri schemes can be case-insensitive?
 59    let (proto, rest) = s.split_at_checked(5)?;
 60    if !proto.eq_ignore_ascii_case("at://") {
 61        return None;
 62    }
 63
 64    // An authority section is required and must be non-empty. the authority can be either an
 65    // atproto Handle, or a DID meeting the restrictions for use with atproto. note that the
 66    // authority part can not be interpreted as a host:port pair, because of the use of colon
 67    // characters (:) in DIDs. Colons and unreserved characters should not be escaped in DIDs,
 68    // but other reserved characters (including #, /, $, &, @) must be escaped.
 69    //      Note that none of the current "blessed" DID methods for atproto allow these
 70    //      characters in DID identifiers
 71
 72    // An optional path section may follow the authority. The path may contain multiple segments
 73    // separated by a single slash (/). Generic URI path normalization rules may be used.
 74
 75    // An optional query part is allowed, following generic URI syntax restrictions
 76
 77    // An optional fragment part is allowed, using JSON Path syntax
 78
 79    // -> work backwards from fragment, query, path -> authority
 80    let mut base = rest;
 81    let (mut fragment, mut query, mut path) = (None, None, None);
 82    if let Some((pre, f)) = base.split_once('#') {
 83        base = pre;
 84        fragment = Some(f);
 85    }
 86    if let Some((pre, q)) = base.split_once('?') {
 87        base = pre;
 88        query = Some(q);
 89    }
 90    if let Some((pre, p)) = base.split_once('/') {
 91        base = pre;
 92        path = Some(p);
 93    }
 94    let mut authority = base.to_string();
 95
 96    if authority.is_empty() {
 97        return None;
 98    }
 99
100    // Normalization: Authority as handle: lowercased
101    if !authority.starts_with("did:") {
102        // lowercase handles
103        authority.make_ascii_lowercase();
104    }
105
106    // Normalization: No trailing slashes in path part
107    // Normalization: No duplicate slashes or "dot" sections in path part (/./ or /abc/../ for example)
108    // -> be so lazy
109    let path = match path {
110        Some(p) => {
111            let p = p.trim_end_matches('/');
112            let uri_ref = UriRef::parse(p).ok()?; // fully bail if we can't parse path
113            let resolved = uri_ref.resolve_against(&*BASE).unwrap(); // both fail conditions are specific to BASE
114            let normalized = resolved.normalize().path().to_string();
115            let without_trailing_slashes = normalized.trim_end_matches('/');
116            Some(without_trailing_slashes.to_string())
117        }
118        None => None,
119    };
120
121    let mut out = format!("at://{authority}");
122    if let Some(p) = path {
123        // no need for `/` -- it's added by fluent_uri normalization
124        out.push_str(&p);
125    }
126    if let Some(q) = query {
127        out.push('?');
128        out.push_str(q);
129    }
130    if let Some(f) = fragment {
131        out.push('#');
132        out.push_str(f);
133    }
134
135    Some(out)
136
137    // there's a more normalization to do still. ugh.
138}
139
/// Extract the collection (the first path segment after the authority) from
/// an at-uri.
///
/// The input is not validated or normalized beyond checking for the `at://`
/// prefix (case-insensitive).
///
/// Returns `None` when the input does not start with `at://` or when there is
/// no `/` after the authority. A `/` with nothing after it yields `Some("")`.
///
/// The query (`?...`) and fragment (`#...`) are cut off before looking for
/// path separators, so a `/` or `?` inside them can neither be mistaken for
/// the start of the path nor leak into the returned collection.
pub fn at_uri_collection(at_uri: &str) -> Option<String> {
    let (proto, rest) = at_uri.split_at_checked(5)?;
    if !proto.eq_ignore_ascii_case("at://") {
        return None;
    }

    // keep only authority + path: everything from the first `?` or `#` onwards
    // is query/fragment and may itself contain `/`, `?` or `#`
    let hier_part = rest
        .split(|c: char| matches!(c, '?' | '#'))
        .next()
        .unwrap_or(rest);

    let (_authority, path) = hier_part.split_once('/')?;

    // the collection is the first path segment
    let collection = path.split('/').next().unwrap_or(path);
    Some(collection.to_string())
}
157
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of `parse_at_uri`: each row is
    /// `(input, expected normalized output, description)`, where the
    /// description is shown when the assertion fails.
    #[test]
    fn test_at_uri_parse() {
        for (case, expected, detail) in [
            ("", None, "empty"),
            (" ", None, "whitespace"),
            ("https://bad-example.com", None, "not at scheme"),
            ("at://µcosm.bad-example.com", None, "not ascii"),
            (
                "at://bad-example.com",
                Some("at://bad-example.com"),
                "handle, authority-only",
            ),
            (
                "at://did:plc:hdhoaan3xa3jiuq4fg4mefid",
                Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid"),
                "DID, authority-only",
            ),
            (
                "at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26",
                Some("at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26"),
                "bsky post (handle)",
            ),
            (
                "at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27",
                Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27"),
                "bsky post (DID)",
            ),
            (
                "AT://bad-example.com",
                Some("at://bad-example.com"),
                "scheme case is normalized",
            ),
            (
                "at://bad-example.com",
                Some("at://bad-example.com"),
                "lowercase scheme is left unchanged",
            ),
            (
                "at://Bad-Example.COM",
                Some("at://bad-example.com"),
                "handle authority is lowercased",
            ),
            (
                "at://did:web:Example.COM",
                Some("at://did:web:Example.COM"),
                "DID authority case is left unchanged",
            ),
            (
                "at://bad-example.com?q=z",
                Some("at://bad-example.com?q=z"),
                "query is allowed",
            ),
            (
                "at://bad-example.com#a",
                Some("at://bad-example.com#a"),
                "fragment is allowed",
            ),
            (
                "at://bad-example.com/%",
                None,
                "invalid percent-encoding: ends with %",
            ),
            (
                "at://bad-example.com/%2",
                None,
                "invalid percent-encoding: ends with only one digit after %",
            ),
            (
                "at://bad-example.com/%ZZ",
                None,
                "invalid percent-encoding: non-hex after %",
            ),
            (
                "at://bad-example.com/%3A",
                Some("at://bad-example.com/%3A"),
                "valid percent-encoding is left",
            ),
            (
                "at://bad-example.com/%3a",
                Some("at://bad-example.com/%3A"),
                "valid percent-encoding is hex-uppercased",
            ),
            (
                "at://bad-example.com/%61/%62",
                Some("at://bad-example.com/a/b"),
                "unreserved characters are percent-decoded",
            ),
            (
                "at://bad-example.com/a/../b",
                Some("at://bad-example.com/b"),
                "paths have traversals resolved (oof)", // reminder to self: we are normalizing, not sanitizing
            ),
            (
                "at://bad-example.com/../",
                Some("at://bad-example.com"),
                "paths always have trailing slashes removed",
            ),
        ] {
            assert_eq!(
                parse_at_uri(case),
                expected.map(|s: &str| s.to_string()),
                "{detail}"
            );
        }
    }

    /// Table-driven check of `at_uri_collection`: each row is
    /// `(input, expected collection, description)`.
    #[test]
    fn test_at_uri_collection() {
        for (case, expected, detail) in [
            ("", None, "empty"),
            ("at://did:plc:vc7f4oafdgxsihk4cry2xpze", None, "did only"),
            (
                "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion",
                Some("collec.tion"),
                "no path (weird)",
            ),
            (
                "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion/path",
                Some("collec.tion"),
                "normal at-uri",
            ),
            (
                "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion?query",
                Some("collec.tion"),
                "collection with query",
            ),
            (
                "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion#hash",
                Some("collec.tion"),
                "collection with hash",
            ),
            (
                "at://did:plc:vc7f4oafdgxsihk4cry2xpze/collec.tion/path?query#hash",
                Some("collec.tion"),
                "collection with everything",
            ),
            (
                "at://did:web:example.com/collec.tion/path",
                Some("collec.tion"),
                "did:web",
            ),
            (
                "at://did:web:example.com/col.lec.tio.ns.so.long.going.on.and.on",
                Some("col.lec.tio.ns.so.long.going.on.and.on"),
                "long collection",
            ),
        ] {
            assert_eq!(
                at_uri_collection(case),
                expected.map(|s: &str| s.to_string()),
                "{detail}"
            );
        }
    }
}