1use crate::corpus::LexiconCorpus;
2use crate::lexicon::{
3 LexArrayItem, LexObjectProperty, LexUserType, LexXrpcBodySchema,
4 LexXrpcSubscriptionMessageSchema,
5};
6use jacquard_common::smol_str::{SmolStr, ToSmolStr};
7use jacquard_common::{CowStr, smol_str};
8use std::collections::{BTreeMap, BTreeSet};
9
10/// Information about a single union type found in the corpus
11#[derive(Debug, Clone)]
12pub struct UnionInfo {
13 /// NSID of the lexicon containing this union
14 pub lexicon_nsid: SmolStr,
15 /// Name of the def containing this union (e.g., "main", "replyRef")
16 pub def_name: SmolStr,
17 /// Field path within the def (e.g., "embed", "properties.embed")
18 pub field_path: CowStr<'static>,
19 /// Refs that exist in the corpus
20 pub known_refs: Vec<CowStr<'static>>,
21 /// Refs that don't exist in the corpus
22 pub unknown_refs: Vec<CowStr<'static>>,
23 /// Whether the union is closed (default true if not specified)
24 pub closed: bool,
25}
26
27impl UnionInfo {
28 /// Get the source text for this union's lexicon from the corpus
29 pub fn get_source<'c>(&self, corpus: &'c LexiconCorpus) -> Option<&'c str> {
30 corpus.get_source(&self.lexicon_nsid)
31 }
32
33 /// Check if this union has any unknown refs
34 pub fn has_unknown_refs(&self) -> bool {
35 !self.unknown_refs.is_empty()
36 }
37
38 /// Get all refs (known + unknown)
39 pub fn all_refs(&self) -> impl Iterator<Item = &CowStr<'static>> {
40 self.known_refs.iter().chain(self.unknown_refs.iter())
41 }
42}
43
44/// Registry of all union types found in the corpus
45#[derive(Debug, Clone)]
46pub struct UnionRegistry {
47 /// Map from union identifier to union info
48 /// Key is "{lexicon_nsid}#{def_name}:{field_path}"
49 unions: BTreeMap<SmolStr, UnionInfo>,
50}
51
52impl UnionRegistry {
53 /// Create a new empty union registry
54 pub fn new() -> Self {
55 Self {
56 unions: BTreeMap::new(),
57 }
58 }
59
60 /// Build a union registry from a corpus
61 pub fn from_corpus(corpus: &LexiconCorpus) -> Self {
62 let mut registry = Self::new();
63
64 for (nsid, doc) in corpus.iter() {
65 for (def_name, def) in &doc.defs {
66 registry.collect_unions_from_def(corpus, nsid, def_name, def);
67 }
68 }
69
70 registry
71 }
72
73 /// Collect unions from a single def
74 fn collect_unions_from_def(
75 &mut self,
76 corpus: &LexiconCorpus,
77 nsid: &SmolStr,
78 def_name: &SmolStr,
79 def: &LexUserType<'static>,
80 ) {
81 match def {
82 LexUserType::Record(record) => match &record.record {
83 crate::lexicon::LexRecordRecord::Object(obj) => {
84 self.collect_unions_from_object(corpus, nsid, def_name, "", obj);
85 }
86 },
87 LexUserType::Object(obj) => {
88 self.collect_unions_from_object(corpus, nsid, def_name, "", obj);
89 }
90 LexUserType::XrpcQuery(query) => {
91 if let Some(output) = &query.output {
92 if let Some(schema) = &output.schema {
93 self.collect_unions_from_xrpc_body_schema(
94 corpus, nsid, def_name, "output", schema,
95 );
96 }
97 }
98 }
99 LexUserType::XrpcProcedure(proc) => {
100 if let Some(input) = &proc.input {
101 if let Some(schema) = &input.schema {
102 self.collect_unions_from_xrpc_body_schema(
103 corpus, nsid, def_name, "input", schema,
104 );
105 }
106 }
107 if let Some(output) = &proc.output {
108 if let Some(schema) = &output.schema {
109 self.collect_unions_from_xrpc_body_schema(
110 corpus, nsid, def_name, "output", schema,
111 );
112 }
113 }
114 }
115 LexUserType::XrpcSubscription(sub) => {
116 if let Some(message) = &sub.message {
117 if let Some(schema) = &message.schema {
118 self.collect_unions_from_subscription_message_schema(
119 corpus, nsid, def_name, "message", schema,
120 );
121 }
122 }
123 }
124 _ => {}
125 }
126 }
127
128 /// Collect unions from an object's properties
129 fn collect_unions_from_object(
130 &mut self,
131 corpus: &LexiconCorpus,
132 nsid: &SmolStr,
133 def_name: &SmolStr,
134 path_prefix: &str,
135 obj: &crate::lexicon::LexObject<'static>,
136 ) {
137 for (prop_name, prop) in &obj.properties {
138 let prop_path = if path_prefix.is_empty() {
139 prop_name.to_smolstr()
140 } else {
141 smol_str::format_smolstr!("{}.{}", path_prefix, prop_name)
142 };
143
144 match prop {
145 LexObjectProperty::Union(union) => {
146 self.register_union(
147 corpus,
148 nsid,
149 def_name,
150 &prop_path,
151 &union.refs,
152 union.closed,
153 );
154 }
155 LexObjectProperty::Array(array) => {
156 if let LexArrayItem::Union(union) = &array.items {
157 let array_path = format!("{}[]", prop_path);
158 self.register_union(
159 corpus,
160 nsid,
161 def_name,
162 &array_path,
163 &union.refs,
164 union.closed,
165 );
166 }
167 }
168 LexObjectProperty::Ref(ref_type) => {
169 // Check if ref points to a union
170 if let Some((_, ref_def)) = corpus.resolve_ref(ref_type.r#ref.as_ref()) {
171 if matches!(ref_def, LexUserType::Object(_)) {
172 // Recursively check the referenced object
173 // (we'll handle this in a future iteration if needed)
174 }
175 }
176 }
177 _ => {}
178 }
179 }
180 }
181
182 /// Collect unions from XRPC body schema
183 fn collect_unions_from_xrpc_body_schema(
184 &mut self,
185 corpus: &LexiconCorpus,
186 nsid: &SmolStr,
187 def_name: &SmolStr,
188 path: &str,
189 schema: &LexXrpcBodySchema<'static>,
190 ) {
191 match schema {
192 LexXrpcBodySchema::Union(union) => {
193 self.register_union(corpus, nsid, def_name, path, &union.refs, union.closed);
194 }
195 LexXrpcBodySchema::Object(obj) => {
196 self.collect_unions_from_object(corpus, nsid, def_name, path, obj);
197 }
198 _ => {}
199 }
200 }
201
202 /// Collect unions from subscription message schema
203 fn collect_unions_from_subscription_message_schema(
204 &mut self,
205 corpus: &LexiconCorpus,
206 nsid: &SmolStr,
207 def_name: &SmolStr,
208 path: &str,
209 schema: &LexXrpcSubscriptionMessageSchema<'static>,
210 ) {
211 match schema {
212 LexXrpcSubscriptionMessageSchema::Union(union) => {
213 self.register_union(corpus, nsid, def_name, path, &union.refs, union.closed);
214 }
215 LexXrpcSubscriptionMessageSchema::Object(obj) => {
216 self.collect_unions_from_object(corpus, nsid, def_name, path, obj);
217 }
218 _ => {}
219 }
220 }
221
222 /// Register a union with the registry
223 fn register_union(
224 &mut self,
225 corpus: &LexiconCorpus,
226 nsid: &SmolStr,
227 def_name: &SmolStr,
228 field_path: &str,
229 refs: &[jacquard_common::CowStr<'static>],
230 closed: Option<bool>,
231 ) {
232 let mut known_refs = Vec::new();
233 let mut unknown_refs = Vec::new();
234
235 for ref_str in refs {
236 if corpus.ref_exists(&ref_str) {
237 known_refs.push(ref_str.clone());
238 } else {
239 unknown_refs.push(ref_str.clone());
240 }
241 }
242
243 let key = smol_str::format_smolstr!("{}#{}:{}", nsid, def_name, field_path);
244 self.unions.insert(
245 key,
246 UnionInfo {
247 lexicon_nsid: nsid.clone(),
248 def_name: def_name.clone(),
249 field_path: CowStr::Owned(field_path.to_smolstr()),
250 known_refs,
251 unknown_refs,
252 closed: closed.unwrap_or(true),
253 },
254 );
255 }
256
257 /// Get all unions
258 pub fn iter(&self) -> impl Iterator<Item = (&SmolStr, &UnionInfo)> {
259 self.unions.iter()
260 }
261
262 /// Get a specific union
263 pub fn get(&self, key: &str) -> Option<&UnionInfo> {
264 self.unions.get(key)
265 }
266
267 /// Number of unions in registry
268 pub fn len(&self) -> usize {
269 self.unions.len()
270 }
271
272 /// Check if registry is empty
273 pub fn is_empty(&self) -> bool {
274 self.unions.is_empty()
275 }
276
277 /// Get all unique refs across all unions
278 pub fn all_refs(&self) -> BTreeSet<CowStr<'static>> {
279 let mut refs = BTreeSet::new();
280 for union in self.unions.values() {
281 refs.extend(union.known_refs.iter().cloned());
282 refs.extend(union.unknown_refs.iter().cloned());
283 }
284 refs
285 }
286}
287
288impl Default for UnionRegistry {
289 fn default() -> Self {
290 Self::new()
291 }
292}
293
#[cfg(test)]
mod tests {
    use super::*;

    /// Load the fixture corpus shared by the tests in this module.
    fn fixture_corpus() -> LexiconCorpus {
        LexiconCorpus::load_from_dir("tests/fixtures/test_lexicons")
            .expect("failed to load lexicons")
    }

    #[test]
    fn test_union_registry_from_corpus() {
        let corpus = fixture_corpus();
        let registry = UnionRegistry::from_corpus(&corpus);

        assert!(!registry.is_empty());

        // The post record's main def must expose its embed union.
        let (_, info) = registry
            .iter()
            .find(|(_, candidate)| {
                candidate.lexicon_nsid == "app.bsky.feed.post"
                    && candidate.def_name == "main"
                    && candidate.field_path.contains("embed")
            })
            .expect("should find post embed union");

        for expected in [
            "app.bsky.embed.images",
            "app.bsky.embed.video",
            "app.bsky.embed.external",
        ] {
            assert!(info.known_refs.contains(&expected.into()));
        }
    }

    #[test]
    fn test_union_registry_tracks_unknown_refs() {
        let corpus = fixture_corpus();
        let registry = UnionRegistry::from_corpus(&corpus);

        // Anything reported as unknown must really be absent from the corpus.
        for unknown in registry.iter().flat_map(|(_, info)| &info.unknown_refs) {
            assert!(!corpus.ref_exists(unknown));
        }
    }
}