commit 283754e2266a3e36d648ee87899e2d8888438cfb · nekomimi.pet/microcosm-rs

+243

src/at_uri.rs

···

       1
       1
       +
       use fluent_uri::{Uri, UriRef};

     

       2
       2
       +
       use std::sync::LazyLock;

     

       3
       3
       +
       

     

       4
       4
       +
       // Placeholder absolute URI, parsed once on first use. `parse_at_uri` resolves the
       // path part of an at-uri against it so that generic URI normalization can be applied
       // to the path. The `unwrap` only ever sees this fixed literal.
       static BASE: LazyLock<Uri<&str>> = LazyLock::new(|| Uri::parse("https://example.com").unwrap());

     

       5
       5
       +
       

     

       6
       6
       +
       // normalizing is a bit opinionated but eh

     

       7
       7
       +
       /// see "Full AT URI Syntax" at https://atproto.com/specs/at-uri-scheme

     

       8
       8
       +
       /// this parser is intentinonally lax: it should accept all valid at-uris, and

     

       9
       9
       +
       /// may accept some invalid at-uris.

     

       10
       10
       +
       ///

     

       11
       11
       +
       /// at the moment this implementation is quite bad and incomplete

     

       12
       12
       +
       pub fn parse_at_uri(s: &str) -> Option<String> {

     

       13
       13
       +
           // for now, just working through the rules laid out in the docs in order,

     

       14
       14
       +
           // without much regard for efficiency for now.

     

       15
       15
       +
       

     

       16
       16
       +
           // The overall URI is restricted to a subset of ASCII characters

     

       17
       17
       +
           if !s.is_ascii() {

     

       18
       18
       +
               return None;

     

       19
       19
       +
           }

     

       20
       20
       +
           // // A-Za-z0-9 . - _ ~

     

       21
       21
       +
           // if !s.chars().all(|c| matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' | '~')) {

     

       22
       22
       +
           //     return None

     

       23
       23
       +
           // }

     

       24
       24
       +
       

     

       25
       25
       +
           // Maximum overall length is 8 kilobytes (which may be shortened in the future)

     

       26
       26
       +
           if s.len() > (8 * 2_usize.pow(10)) {

     

       27
       27
       +
               return None;

     

       28
       28
       +
           }

     

       29
       29
       +
       

     

       30
       30
       +
           // Hex-encoding of characters is permitted (but in practice not necessary)

     

       31
       31
       +
           // -> decode any unreserved characters. from rfc 3986:

     

       32
       32
       +
           // ->   For consistency, percent-encoded octets in the ranges of ALPHA

     

       33
       33
       +
           // ->   (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),

     

       34
       34
       +
           // ->   underscore (%5F), or tilde (%7E) should not be created by URI

     

       35
       35
       +
           // ->   producers and, when found in a URI, should be decoded to their

     

       36
       36
       +
           // ->   corresponding unreserved characters by URI normalizers.

     

       37
       37
       +
           let s = if let Some((unencoded_prefix, rest)) = s.split_once('%') {

     

       38
       38
       +
               let mut out = String::with_capacity(s.len());

     

       39
       39
       +
               out.push_str(unencoded_prefix);

     

       40
       40
       +
               for segment in rest.split('%') {

     

       41
       41
       +
                   let Some((hex2, unencoded_suffix)) = segment.split_at_checked(2) else {

     

       42
       42
       +
                       return None; // bail: % must always be followed by 2 hex digits

     

       43
       43
       +
                   };

     

       44
       44
       +
                   let Ok(decoded) = u8::from_str_radix(hex2, 16).map(char::from) else {

     

       45
       45
       +
                       return None; // bail: % must be followed by decodable hex

     

       46
       46
       +
                   };

     

       47
       47
       +
                   if matches!(decoded, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' | '~') {

     

       48
       48
       +
                       out.push(decoded);

     

       49
       49
       +
                   } else {

     

       50
       50
       +
                       out.push('%');

     

       51
       51
       +
                       out.push_str(&hex2.to_ascii_uppercase()); // norm

     

       52
       52
       +
                   }

     

       53
       53
       +
                   out.push_str(unencoded_suffix);

     

       54
       54
       +
               }

     

       55
       55
       +
               out

     

       56
       56
       +
           } else {

     

       57
       57
       +
               s.to_string()

     

       58
       58
       +
           };

     

       59
       59
       +
       

     

       60
       60
       +
           // The URI scheme is `at`, and an authority part preceded with double slashes is always

     

       61
       61
       +
           // required, so the URI always starts at://

     

       62
       62
       +
           // -> the spec doesn't explicitly say, but it seems like uri schemes are case-insensitive

     

       63
       63
       +
           let (proto, rest) = s.split_at_checked(5)?;

     

       64
       64
       +
           if !proto.eq_ignore_ascii_case("at://") {

     

       65
       65
       +
               return None;

     

       66
       66
       +
           }

     

       67
       67
       +
       

     

       68
       68
       +
           // An authority section is required and must be non-empty. the authority can be either an

     

       69
       69
       +
           // atproto Handle, or a DID meeting the restrictions for use with atproto. note that the

     

       70
       70
       +
           // authority part can not be interpreted as a host:port pair, because of the use of colon

     

       71
       71
       +
           // characters (:) in DIDs. Colons and unreserved characters should not be escaped in DIDs,

     

       72
       72
       +
           // but other reserved characters (including #, /, $, &, @) must be escaped.

     

       73
       73
       +
           //      Note that none of the current "blessed" DID methods for atproto allow these

     

       74
       74
       +
           //      characters in DID identifiers

     

       75
       75
       +
       

     

       76
       76
       +
           // An optional path section may follow the authority. The path may contain multiple segments

     

       77
       77
       +
           // separated by a single slash (/). Generic URI path normalization rules may be used.

     

       78
       78
       +
       

     

       79
       79
       +
           // An optional query part is allowed, following generic URI syntax restrictions

     

       80
       80
       +
       

     

       81
       81
       +
           // An optional fragment part is allowed, using JSON Path syntax

     

       82
       82
       +
       

     

       83
       83
       +
           // -> work backwards from fragment, query, path -> authority

     

       84
       84
       +
           let mut base = rest;

     

       85
       85
       +
           let (mut fragment, mut query, mut path) = (None, None, None);

     

       86
       86
       +
           if let Some((pre, f)) = base.split_once('#') {

     

       87
       87
       +
               base = pre;

     

       88
       88
       +
               fragment = Some(f);

     

       89
       89
       +
           }

     

       90
       90
       +
           if let Some((pre, q)) = base.split_once('?') {

     

       91
       91
       +
               base = pre;

     

       92
       92
       +
               query = Some(q);

     

       93
       93
       +
           }

     

       94
       94
       +
           if let Some((pre, p)) = base.split_once('/') {

     

       95
       95
       +
               base = pre;

     

       96
       96
       +
               path = Some(p);

     

       97
       97
       +
           }

     

       98
       98
       +
           let mut authority = base.to_string();

     

       99
       99
       +
       

     

       100
       100
       +
           if authority.is_empty() {

     

       101
       101
       +
               return None;

     

       102
       102
       +
           }

     

       103
       103
       +
       

     

       104
       104
       +
           // Normalization: Authority as handle: lowercased

     

       105
       105
       +
           if !authority.starts_with("did:") {

     

       106
       106
       +
               // lowercase handles

     

       107
       107
       +
               authority.make_ascii_lowercase();

     

       108
       108
       +
           }

     

       109
       109
       +
       

     

       110
       110
       +
           // Normalization: No trailing slashes in path part

     

       111
       111
       +
           // Normalization: No duplicate slashes or "dot" sections in path part (/./ or /abc/../ for example)

     

       112
       112
       +
           // -> be so lazy

     

       113
       113
       +
           let path = match path {

     

       114
       114
       +
               Some(p) => {

     

       115
       115
       +
                   let p = p.trim_end_matches('/');

     

       116
       116
       +
                   let uri_ref = UriRef::parse(p).ok()?; // fully bail if we can't parse path

     

       117
       117
       +
                   let resolved = uri_ref.resolve_against(&*BASE).unwrap(); // both fail conditions are specific to BASE

     

       118
       118
       +
                   let normalized = resolved.normalize().path().to_string();

     

       119
       119
       +
                   let without_trailing_slashes = normalized.trim_end_matches('/');

     

       120
       120
       +
                   Some(without_trailing_slashes.to_string())

     

       121
       121
       +
               }

     

       122
       122
       +
               None => None,

     

       123
       123
       +
           };

     

       124
       124
       +
       

     

       125
       125
       +
           let mut out = format!("at://{authority}");

     

       126
       126
       +
           if let Some(p) = path {

     

       127
       127
       +
               // no need for `/` -- it's added by fluent_uri normalization

     

       128
       128
       +
               out.push_str(&p);

     

       129
       129
       +
           }

     

       130
       130
       +
           if let Some(q) = query {

     

       131
       131
       +
               out.push('?');

     

       132
       132
       +
               out.push_str(q);

     

       133
       133
       +
           }

     

       134
       134
       +
           if let Some(f) = fragment {

     

       135
       135
       +
               out.push('#');

     

       136
       136
       +
               out.push_str(f);

     

       137
       137
       +
           }

     

       138
       138
       +
       

     

       139
       139
       +
           Some(out)

     

       140
       140
       +
       

     

       141
       141
       +
           // there's a more normalization to do still. ugh.

     

       142
       142
       +
       }

     

       143
       143
       +
       

     

       144
       144
       +
       #[cfg(test)]
       mod tests {
           use super::*;

           /// Table-driven check of `parse_at_uri`.
           ///
           /// Each row is `(input, expected output, description)`; the description is used as
           /// the assertion message so a failing row can be identified.
           #[test]
           fn test_at_uri_parse() {
               let cases: &[(&str, Option<&str>, &str)] = &[
                   ("", None, "empty"),
                   (" ", None, "whitespace"),
                   ("https://bad-example.com", None, "not at scheme"),
                   ("at://µcosm.bad-example.com", None, "not ascii"),
                   (
                       "at://bad-example.com",
                       Some("at://bad-example.com"),
                       "handle, authority-only",
                   ),
                   (
                       "at://did:plc:hdhoaan3xa3jiuq4fg4mefid",
                       Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid"),
                       "DID, authority-only",
                   ),
                   (
                       "at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26",
                       Some("at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26"),
                       "bsky post (handle)",
                   ),
                   (
                       "at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27",
                       Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27"),
                       "bsky post (DID)",
                   ),
                   (
                       "AT://bad-example.com",
                       Some("at://bad-example.com"),
                       "scheme case is normalized",
                   ),
                   (
                       "at://bad-example.com?q=z",
                       Some("at://bad-example.com?q=z"),
                       "query is allowed",
                   ),
                   (
                       "at://bad-example.com#a",
                       Some("at://bad-example.com#a"),
                       "fragment is allowed",
                   ),
                   (
                       "at://bad-example.com/%",
                       None,
                       "invalid percent-encoding: ends with %",
                   ),
                   (
                       "at://bad-example.com/%2",
                       None,
                       "invalid percent-encoding: ends with only one digit after %",
                   ),
                   (
                       "at://bad-example.com/%ZZ",
                       None,
                       "invalid percent-encoding: non-hex after %",
                   ),
                   (
                       "at://bad-example.com/%3A",
                       Some("at://bad-example.com/%3A"),
                       "valid percent-encoding is left",
                   ),
                   (
                       "at://bad-example.com/%3a",
                       Some("at://bad-example.com/%3A"),
                       "valid percent-encoding is hex-uppercased",
                   ),
                   (
                       "at://bad-example.com/%61/%62",
                       Some("at://bad-example.com/a/b"),
                       "unreserved characters are percent-decoded",
                   ),
                   (
                       "at://bad-example.com/a/../b",
                       Some("at://bad-example.com/b"),
                       "paths have traversals resolved (oof)",
                   ),
                   (
                       "at://bad-example.com/../",
                       Some("at://bad-example.com"),
                       "paths always have trailing slashes removed",
                   ),
               ];

               for &(case, expected, detail) in cases {
                   assert_eq!(parse_at_uri(case).as_deref(), expected, "{detail}");
               }
           }
       }

+6 -5

src/lib.rs

···

       1
       1
        
       use fluent_uri::Uri;

     

       2
       2
        
       

     

       3
       3
       +
       pub mod at_uri;

     

       4
       4
       +
       

     

       3
       5
        
       /// A link, tagged by which kind of URI it is.
       ///
       /// Each variant carries the link text as an owned `String`.
       // NOTE(review): presumably the payload is the normalized form returned by
       // `parse_at_uri` / `parse_uri` -- confirm against the code that constructs `Link`.
       #[derive(PartialEq, Debug)]
       pub enum Link {
           /// An `at://` URI.
           AtUri(String),
           /// Any other URI.
           Uri(String),
       }

     

       8
       10
        
       

     

       9
       9
       -
       // normalizing is a bit opinionated

     

       10
       10
       -
       pub fn parse_at_uri(_s: &str) -> Option<String> {

     

       11
       11
       -
           // TODO

     

       12
       12
       -
           None

     

       11
       11
       +
       // normalizing is a bit opinionated but ehhh

     

       12
       12
       +
       /// Crate-root entry point for at-uri parsing and normalization.
       ///
       /// Forwards `s` unchanged to `at_uri::parse_at_uri` and returns its result.
       pub fn parse_at_uri(s: &str) -> Option<String> {

     

       13
       13
       +
           at_uri::parse_at_uri(s)

     

       13
       14
        
       }

     

       14
       15
        
       

     

       15
       15
       -
       // normalizing is a bit opinionated

     

       16
       16
       +
       // normalizing is a bit opinionated but eh

     

       16
       17
        
       pub fn parse_uri(s: &str) -> Option<String> {

     

       17
       18
        
           Uri::parse(s).map(|u| u.normalize().into_string()).ok()

     

       18
       19
        
       }