···
1
+
use fluent_uri::{Uri, UriRef};
2
+
use std::sync::LazyLock;
4
+
/// Placeholder absolute base URI, parsed lazily on first access.
///
/// `parse_at_uri` resolves the path part of an at-uri against this base so the
/// path can be normalized; only the path of the resolved result is read back.
/// The `unwrap` can only fire if this fixed literal fails to parse.
static BASE: LazyLock<Uri<&str>> = LazyLock::new(|| Uri::parse("https://example.com").unwrap());
6
+
// normalizing is a bit opinionated but eh
7
+
/// see "Full AT URI Syntax" at https://atproto.com/specs/at-uri-scheme
8
+
/// this parser is intentionally lax: it should accept all valid at-uris, and
9
+
/// may accept some invalid at-uris.
11
+
/// at the moment this implementation is quite bad and incomplete
12
+
pub fn parse_at_uri(s: &str) -> Option<String> {
13
+
// for now, just working through the rules laid out in the docs in order,
14
+
// without much regard for efficiency.
16
+
// The overall URI is restricted to a subset of ASCII characters
20
+
// // A-Za-z0-9 . - _ ~
21
+
// if !s.chars().all(|c| matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' | '~')) {
25
+
// Maximum overall length is 8 kilobytes (which may be shortened in the future)
26
+
if s.len() > (8 * 2_usize.pow(10)) {
30
+
// Hex-encoding of characters is permitted (but in practice not necessary)
31
+
// -> decode any unreserved characters. from rfc 3986:
32
+
// -> For consistency, percent-encoded octets in the ranges of ALPHA
33
+
// -> (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
34
+
// -> underscore (%5F), or tilde (%7E) should not be created by URI
35
+
// -> producers and, when found in a URI, should be decoded to their
36
+
// -> corresponding unreserved characters by URI normalizers.
37
+
let s = if let Some((unencoded_prefix, rest)) = s.split_once('%') {
38
+
let mut out = String::with_capacity(s.len());
39
+
out.push_str(unencoded_prefix);
40
+
for segment in rest.split('%') {
41
+
let Some((hex2, unencoded_suffix)) = segment.split_at_checked(2) else {
42
+
return None; // bail: % must always be followed by 2 hex digits
44
+
let Ok(decoded) = u8::from_str_radix(hex2, 16).map(char::from) else {
45
+
return None; // bail: % must be followed by decodable hex
47
+
if matches!(decoded, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' | '~') {
51
+
out.push_str(&hex2.to_ascii_uppercase()); // norm
53
+
out.push_str(unencoded_suffix);
60
+
// The URI scheme is `at`, and an authority part preceded with double slashes is always
61
+
// required, so the URI always starts at://
62
+
// -> the spec doesn't explicitly say, but it seems like uri schemes are case-insensitive
63
+
let (proto, rest) = s.split_at_checked(5)?;
64
+
if !proto.eq_ignore_ascii_case("at://") {
68
+
// An authority section is required and must be non-empty. the authority can be either an
69
+
// atproto Handle, or a DID meeting the restrictions for use with atproto. note that the
70
+
// authority part can not be interpreted as a host:port pair, because of the use of colon
71
+
// characters (:) in DIDs. Colons and unreserved characters should not be escaped in DIDs,
72
+
// but other reserved characters (including #, /, $, &, @) must be escaped.
73
+
// Note that none of the current "blessed" DID methods for atproto allow these
74
+
// characters in DID identifiers
76
+
// An optional path section may follow the authority. The path may contain multiple segments
77
+
// separated by a single slash (/). Generic URI path normalization rules may be used.
79
+
// An optional query part is allowed, following generic URI syntax restrictions
81
+
// An optional fragment part is allowed, using JSON Path syntax
83
+
// -> work backwards from fragment, query, path -> authority
84
+
let mut base = rest;
85
+
let (mut fragment, mut query, mut path) = (None, None, None);
86
+
if let Some((pre, f)) = base.split_once('#') {
90
+
if let Some((pre, q)) = base.split_once('?') {
94
+
if let Some((pre, p)) = base.split_once('/') {
98
+
let mut authority = base.to_string();
100
+
if authority.is_empty() {
104
+
// Normalization: Authority as handle: lowercased
105
+
if !authority.starts_with("did:") {
106
+
// lowercase handles
107
+
authority.make_ascii_lowercase();
110
+
// Normalization: No trailing slashes in path part
111
+
// Normalization: No duplicate slashes or "dot" sections in path part (/./ or /abc/../ for example)
113
+
let path = match path {
115
+
let p = p.trim_end_matches('/');
116
+
let uri_ref = UriRef::parse(p).ok()?; // fully bail if we can't parse path
117
+
let resolved = uri_ref.resolve_against(&*BASE).unwrap(); // both fail conditions are specific to BASE
118
+
let normalized = resolved.normalize().path().to_string();
119
+
let without_trailing_slashes = normalized.trim_end_matches('/');
120
+
Some(without_trailing_slashes.to_string())
125
+
let mut out = format!("at://{authority}");
126
+
if let Some(p) = path {
127
+
// no need for `/` -- it's added by fluent_uri normalization
130
+
if let Some(q) = query {
134
+
if let Some(f) = fragment {
141
+
// there's more normalization to do still. ugh.
149
+
fn test_at_uri_parse() {
150
+
for (case, expected, detail) in vec![
151
+
("", None, "empty"),
152
+
(" ", None, "whitespace"),
153
+
("https://bad-example.com", None, "not at scheme"),
154
+
("at://µcosm.bad-example.com", None, "not ascii"),
156
+
"at://bad-example.com",
157
+
Some("at://bad-example.com"),
158
+
"handle, authority-only",
161
+
"at://did:plc:hdhoaan3xa3jiuq4fg4mefid",
162
+
Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid"),
163
+
"DID, authority-only",
166
+
"at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26",
167
+
Some("at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26"),
168
+
"bsky post (handle)",
171
+
"at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27",
172
+
Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27"),
176
+
"AT://bad-example.com",
177
+
Some("at://bad-example.com"),
178
+
"scheme case is normalized",
181
+
"at://bad-example.com",
182
+
Some("at://bad-example.com"),
183
+
"scheme case is normalized",
186
+
"at://bad-example.com?q=z",
187
+
Some("at://bad-example.com?q=z"),
188
+
"query is allowed",
191
+
"at://bad-example.com#a",
192
+
Some("at://bad-example.com#a"),
193
+
"fragment is allowed",
196
+
"at://bad-example.com/%",
198
+
"invalid percent-encoding: ends with %",
201
+
"at://bad-example.com/%2",
203
+
"invalid percent-encoding: ends with only one digit after %",
206
+
"at://bad-example.com/%ZZ",
208
+
"invalid percent-encoding: non-hex after %",
211
+
"at://bad-example.com/%3A",
212
+
Some("at://bad-example.com/%3A"),
213
+
"valid percent-encoding is left",
216
+
"at://bad-example.com/%3a",
217
+
Some("at://bad-example.com/%3A"),
218
+
"valid percent-encoding is hex-uppercased",
221
+
"at://bad-example.com/%61/%62",
222
+
Some("at://bad-example.com/a/b"),
223
+
"unreserved characters are percent-decoded",
226
+
"at://bad-example.com/a/../b",
227
+
Some("at://bad-example.com/b"),
228
+
"paths have traversals resolved (oof)",
231
+
"at://bad-example.com/../",
232
+
Some("at://bad-example.com"),
233
+
"paths always have trailing slashes removed",
237
+
parse_at_uri(case),
238
+
expected.map(|s| s.to_string()),