alpha
Login
or
Join now
nonbinary.computer
/
jacquard
Star
0
Fork
7
Atom
Configure Feed
Issues
Pull Requests
Commits
Tags
Feed URL
Select the types of activity you want to include in your feed.
A better Rust ATProto crate
Star
0
Fork
7
Atom
Configure Feed
Issues
Pull Requests
Commits
Tags
Feed URL
Select the types of activity you want to include in your feed.
Overview
Issues
Pulls
Pipelines
better lexicon parsing errors
author
Orual
date
5 months ago
(Jan 7, 2026, 11:05 AM -0500)
commit
2c6880ce
2c6880ce509249db6015f3b4cf186b2fbe0ca492
parent
7e5406c7
7e5406c73b666c0f6cbaffd6b0399cceac5bbb77
change-id
sptuyzov
sptuyzovlxvvmprnvzpuoszquyopuvru
+369
-27
6 changed files
Expand all
Collapse all
Unified
Split
Cargo.lock
crates
jacquard-lexicon
Cargo.toml
src
corpus.rs
error.rs
tests
fixtures
error_cases
broken_lexicon.json
test_lexicons
not_a_lexicon.json
+1
Cargo.lock
Reviewed
···
2506
2506
"serde",
2507
2507
"serde_ipld_dagcbor",
2508
2508
"serde_json",
2509
2509
+
"serde_path_to_error",
2509
2510
"serde_repr",
2510
2511
"serde_with",
2511
2512
"sha2",
+1
crates/jacquard-lexicon/Cargo.toml
Reviewed
···
27
27
proc-macro2 = { workspace = true, optional = true }
28
28
quote = { workspace = true, optional = true }
29
29
serde.workspace = true
30
30
+
serde_path_to_error = "0.1"
30
31
serde_ipld_dagcbor.workspace = true
31
32
serde_json.workspace = true
32
33
serde_repr.workspace = true
+321
-8
crates/jacquard-lexicon/src/corpus.rs
Reviewed
···
1
1
-
use crate::ref_utils::RefPath;
2
2
-
use crate::error::Result;
1
1
+
use crate::error::{CodegenError, Result};
3
2
use crate::lexicon::{LexUserType, LexiconDoc};
3
3
+
use crate::ref_utils::RefPath;
4
4
use jacquard_common::{into_static::IntoStatic, smol_str::SmolStr};
5
5
use std::collections::BTreeMap;
6
6
use std::fs;
7
7
use std::path::Path;
8
8
9
9
+
/// Check if content looks like a lexicon file.
10
10
+
///
11
11
+
/// A file is considered a lexicon if it contains a `"lexicon"` key at the top level
12
12
+
/// or one level down (for some wrapper formats). This allows us to distinguish
13
13
+
/// "not a lexicon at all" (skip silently) from "broken lexicon" (report error).
14
14
+
fn is_lexicon_content(content: &str) -> bool {
15
15
+
// Quick string scan first (fast path for non-JSON or unrelated JSON)
16
16
+
if !content.contains("\"lexicon\"") {
17
17
+
return false;
18
18
+
}
19
19
+
20
20
+
// Parse to Value and check structure
21
21
+
if let Ok(value) = serde_json::from_str::<serde_json::Value>(content) {
22
22
+
// Top-level lexicon field
23
23
+
if value.get("lexicon").is_some() {
24
24
+
return true;
25
25
+
}
26
26
+
// One level down (some wrapper formats)
27
27
+
if let Some(obj) = value.as_object() {
28
28
+
for v in obj.values() {
29
29
+
if v.get("lexicon").is_some() {
30
30
+
return true;
31
31
+
}
32
32
+
}
33
33
+
}
34
34
+
}
35
35
+
false
36
36
+
}
37
37
+
38
38
+
/// Raw lexicon doc for two-phase parsing - defs are kept as raw JSON Values
39
39
+
/// so we can deserialize each separately with better error tracking.
40
40
+
#[derive(Debug, serde::Deserialize)]
41
41
+
struct RawLexiconDoc<'s> {
42
42
+
pub lexicon: crate::lexicon::Lexicon,
43
43
+
#[serde(borrow)]
44
44
+
pub id: jacquard_common::CowStr<'s>,
45
45
+
pub revision: Option<u32>,
46
46
+
#[serde(borrow)]
47
47
+
pub description: Option<jacquard_common::CowStr<'s>>,
48
48
+
pub defs: BTreeMap<SmolStr, serde_json::Value>,
49
49
+
}
50
50
+
51
51
+
/// Helper to create a parse error with path context.
52
52
+
fn make_parse_error(
53
53
+
file_path: &Path,
54
54
+
json_path: &str,
55
55
+
message: String,
56
56
+
content: &str,
57
57
+
) -> CodegenError {
58
58
+
CodegenError::ParseError {
59
59
+
path: file_path.to_path_buf(),
60
60
+
json_path: Some(json_path.to_string()),
61
61
+
message,
62
62
+
src: Some(content.to_string()),
63
63
+
span: None,
64
64
+
}
65
65
+
}
66
66
+
67
67
+
/// Recursively parse properties with path tracking.
68
68
+
/// Returns parsed properties or an error with the full path.
69
69
+
fn parse_properties_deep(
70
70
+
props_value: &serde_json::Value,
71
71
+
base_path: &str,
72
72
+
file_path: &Path,
73
73
+
content: &str,
74
74
+
) -> std::result::Result<BTreeMap<SmolStr, crate::lexicon::LexObjectProperty<'static>>, CodegenError>
75
75
+
{
76
76
+
let props_obj = props_value.as_object().ok_or_else(|| {
77
77
+
make_parse_error(
78
78
+
file_path,
79
79
+
base_path,
80
80
+
"expected object for properties".to_string(),
81
81
+
content,
82
82
+
)
83
83
+
})?;
84
84
+
85
85
+
let mut parsed_props = BTreeMap::new();
86
86
+
for (prop_name, prop_value) in props_obj {
87
87
+
let prop_path = format!("{}.{}", base_path, prop_name);
88
88
+
89
89
+
// Try to parse this property
90
90
+
let parsed: crate::lexicon::LexObjectProperty =
91
91
+
serde_path_to_error::deserialize(prop_value).map_err(|e| {
92
92
+
let inner_path = e.path().to_string();
93
93
+
let full_path = if inner_path.is_empty() {
94
94
+
prop_path.clone()
95
95
+
} else {
96
96
+
format!("{}.{}", prop_path, inner_path)
97
97
+
};
98
98
+
make_parse_error(file_path, &full_path, e.inner().to_string(), content)
99
99
+
})?;
100
100
+
101
101
+
parsed_props.insert(SmolStr::new(prop_name), parsed.into_static());
102
102
+
}
103
103
+
104
104
+
Ok(parsed_props)
105
105
+
}
106
106
+
107
107
+
/// Parse an object-like def with deep property tracking.
108
108
+
fn parse_object_deep(
109
109
+
value: &serde_json::Value,
110
110
+
base_path: &str,
111
111
+
file_path: &Path,
112
112
+
content: &str,
113
113
+
) -> std::result::Result<crate::lexicon::LexObject<'static>, CodegenError> {
114
114
+
use crate::lexicon::LexObject;
115
115
+
116
116
+
let obj = value.as_object().ok_or_else(|| {
117
117
+
make_parse_error(file_path, base_path, "expected object".to_string(), content)
118
118
+
})?;
119
119
+
120
120
+
// Parse properties deeply if present
121
121
+
let properties = if let Some(props) = obj.get("properties") {
122
122
+
let props_path = format!("{}.properties", base_path);
123
123
+
parse_properties_deep(props, &props_path, file_path, content)?
124
124
+
} else {
125
125
+
BTreeMap::new()
126
126
+
};
127
127
+
128
128
+
// Parse the rest of the object normally
129
129
+
let description = obj
130
130
+
.get("description")
131
131
+
.and_then(|v| v.as_str())
132
132
+
.map(|s| jacquard_common::CowStr::copy_from_str(s));
133
133
+
let required: Option<Vec<SmolStr>> = obj
134
134
+
.get("required")
135
135
+
.map(|v| serde_json::from_value(v.clone()))
136
136
+
.transpose()
137
137
+
.map_err(|e| make_parse_error(file_path, &format!("{}.required", base_path), e.to_string(), content))?;
138
138
+
let nullable: Option<Vec<SmolStr>> = obj
139
139
+
.get("nullable")
140
140
+
.map(|v| serde_json::from_value(v.clone()))
141
141
+
.transpose()
142
142
+
.map_err(|e| make_parse_error(file_path, &format!("{}.nullable", base_path), e.to_string(), content))?;
143
143
+
144
144
+
Ok(LexObject {
145
145
+
description,
146
146
+
required,
147
147
+
nullable,
148
148
+
properties,
149
149
+
})
150
150
+
}
151
151
+
152
152
+
/// Parse a def with deep path tracking for nested structures.
153
153
+
fn parse_def_deep(
154
154
+
def_name: &str,
155
155
+
value: &serde_json::Value,
156
156
+
file_path: &Path,
157
157
+
content: &str,
158
158
+
) -> std::result::Result<LexUserType<'static>, CodegenError> {
159
159
+
let base_path = format!("defs.{}", def_name);
160
160
+
161
161
+
// Check the type field to determine how to parse
162
162
+
let type_str = value
163
163
+
.get("type")
164
164
+
.and_then(|v| v.as_str())
165
165
+
.unwrap_or("object");
166
166
+
167
167
+
match type_str {
168
168
+
"object" => {
169
169
+
let obj = parse_object_deep(value, &base_path, file_path, content)?;
170
170
+
Ok(LexUserType::Object(obj))
171
171
+
}
172
172
+
"record" => {
173
173
+
// Records have a nested record.properties structure
174
174
+
if let Some(record_value) = value.get("record") {
175
175
+
let record_path = format!("{}.record", base_path);
176
176
+
let inner_obj = parse_object_deep(record_value, &record_path, file_path, content)?;
177
177
+
178
178
+
// Parse the rest of the record
179
179
+
let obj = value.as_object().ok_or_else(|| {
180
180
+
make_parse_error(file_path, &base_path, "expected object".to_string(), content)
181
181
+
})?;
182
182
+
183
183
+
let description = obj
184
184
+
.get("description")
185
185
+
.and_then(|v| v.as_str())
186
186
+
.map(|s| jacquard_common::CowStr::copy_from_str(s));
187
187
+
let key: Option<jacquard_common::CowStr<'static>> = obj
188
188
+
.get("key")
189
189
+
.and_then(|v| v.as_str())
190
190
+
.map(|s| jacquard_common::CowStr::copy_from_str(s));
191
191
+
192
192
+
Ok(LexUserType::Record(crate::lexicon::LexRecord {
193
193
+
description,
194
194
+
key,
195
195
+
record: crate::lexicon::LexRecordRecord::Object(inner_obj),
196
196
+
}))
197
197
+
} else {
198
198
+
// Fallback to normal parsing if no record field
199
199
+
serde_path_to_error::deserialize(value)
200
200
+
.map(|v: LexUserType| v.into_static())
201
201
+
.map_err(|e| make_parse_error(file_path, &base_path, e.inner().to_string(), content))
202
202
+
}
203
203
+
}
204
204
+
// For other types (query, procedure, etc.), use the simpler approach for now
205
205
+
// Could be extended later
206
206
+
_ => serde_path_to_error::deserialize(value)
207
207
+
.map(|v: LexUserType| v.into_static())
208
208
+
.map_err(|e| {
209
209
+
let inner_path = e.path().to_string();
210
210
+
let full_path = if inner_path.is_empty() {
211
211
+
base_path
212
212
+
} else {
213
213
+
format!("{}.{}", base_path, inner_path)
214
214
+
};
215
215
+
make_parse_error(file_path, &full_path, e.inner().to_string(), content)
216
216
+
}),
217
217
+
}
218
218
+
}
219
219
+
220
220
+
/// Parse a lexicon with rich error context using deep recursive parsing.
221
221
+
///
222
222
+
/// This parses the document structure recursively, tracking paths through:
223
223
+
/// - defs → def_name → properties → prop_name → nested fields
224
224
+
///
225
225
+
/// This gives us detailed error paths like "defs.main.properties.count.default"
226
226
+
fn parse_lexicon_with_context(
227
227
+
content: &str,
228
228
+
path: &Path,
229
229
+
) -> std::result::Result<LexiconDoc<'static>, CodegenError> {
230
230
+
// Phase 1: Parse the top-level structure with defs as raw Values
231
231
+
let raw_doc: RawLexiconDoc = serde_json::from_str(content).map_err(|e| {
232
232
+
CodegenError::ParseError {
233
233
+
path: path.to_path_buf(),
234
234
+
json_path: None,
235
235
+
message: e.to_string(),
236
236
+
src: Some(content.to_string()),
237
237
+
span: None,
238
238
+
}
239
239
+
})?;
240
240
+
241
241
+
// Phase 2: Parse each def with deep path tracking
242
242
+
let mut parsed_defs = BTreeMap::new();
243
243
+
for (def_name, def_value) in raw_doc.defs {
244
244
+
let parsed_def = parse_def_deep(&def_name, &def_value, path, content)?;
245
245
+
parsed_defs.insert(def_name, parsed_def);
246
246
+
}
247
247
+
248
248
+
// Reconstruct the full LexiconDoc
249
249
+
Ok(LexiconDoc {
250
250
+
lexicon: raw_doc.lexicon,
251
251
+
id: raw_doc.id.into_static(),
252
252
+
revision: raw_doc.revision,
253
253
+
description: raw_doc.description.map(|d| d.into_static()),
254
254
+
defs: parsed_defs,
255
255
+
})
256
256
+
}
257
257
+
9
258
/// Registry of all loaded lexicons for reference resolution
10
259
#[derive(Debug, Clone)]
11
260
pub struct LexiconCorpus {
···
32
281
for schema_path in schemas {
33
282
let content = fs::read_to_string(schema_path.as_ref())?;
34
283
35
35
-
// Try to parse as lexicon doc - skip files that aren't lexicon schemas
36
36
-
let doc: LexiconDoc = match serde_json::from_str(&content) {
37
37
-
Ok(doc) => doc,
38
38
-
Err(_) => continue, // Skip non-lexicon JSON files
39
39
-
};
284
284
+
// Check if this file is trying to be a lexicon
285
285
+
if !is_lexicon_content(&content) {
286
286
+
// Not a lexicon, skip silently
287
287
+
continue;
288
288
+
}
289
289
+
290
290
+
// This IS a lexicon - parse with good error reporting
291
291
+
let doc = parse_lexicon_with_context(&content, schema_path.as_ref())?;
40
292
41
293
let nsid = SmolStr::from(doc.id.to_string());
42
42
-
corpus.docs.insert(nsid.clone(), doc.into_static());
294
294
+
corpus.docs.insert(nsid.clone(), doc);
43
295
corpus.sources.insert(nsid, content);
44
296
}
45
297
···
166
418
// Non-existing refs
167
419
assert!(!corpus.ref_exists("com.example.fake"));
168
420
assert!(!corpus.ref_exists("app.bsky.feed.post#nonexistent"));
421
421
+
}
422
422
+
423
423
+
#[test]
fn test_non_lexicon_json_skipped_silently() {
    // tests/fixtures/test_lexicons also holds not_a_lexicon.json; loading must not fail on it.
    let loaded = LexiconCorpus::load_from_dir("tests/fixtures/test_lexicons")
        .expect("should succeed even with non-lexicon JSON files");

    // Nothing may be registered under the config file's `name` value.
    let from_config = loaded.get("some random config");
    assert!(from_config.is_none());

    // Real lexicons in the same directory are still picked up.
    let post = loaded.get("app.bsky.feed.post");
    assert!(post.is_some());
}
435
435
+
436
436
+
#[test]
fn test_is_lexicon_content_detection() {
    // Not a lexicon - no "lexicon" key
    assert!(!is_lexicon_content(r#"{"name": "test", "version": "1.0"}"#));

    // Not a lexicon - invalid JSON
    assert!(!is_lexicon_content("not json at all"));

    // The cases below contain the text `"lexicon"`, so they get past the quick substring
    // scan and exercise the structural check.

    // Not a lexicon - "lexicon" appears only as a value
    assert!(!is_lexicon_content(r#"{"name": "lexicon"}"#));

    // Not a lexicon - key is two levels down, deeper than wrapper formats are searched
    assert!(!is_lexicon_content(r#"{"a": {"b": {"lexicon": 1}}}"#));

    // Not a lexicon - top level is an array, not an object
    assert!(!is_lexicon_content(r#"[{"lexicon": 1}]"#));

    // Is a lexicon - has "lexicon" at top level
    assert!(is_lexicon_content(r#"{"lexicon": 1, "id": "test.foo"}"#));

    // Is a lexicon - has "lexicon" one level down
    assert!(is_lexicon_content(
        r#"{"wrapper": {"lexicon": 1, "id": "test.foo"}}"#
    ));
}
452
452
+
453
453
+
#[test]
fn test_broken_lexicon_returns_error_with_path() {
    // broken_lexicon.json has a "lexicon" key, so it counts as a lexicon, but its
    // structure is invalid: loading the directory has to fail rather than skip it.
    let rendered = LexiconCorpus::load_from_dir("tests/fixtures/error_cases")
        .expect_err("should fail on broken lexicon")
        .to_string();

    // Where: the JSON path down to the offending property.
    assert!(
        rendered.contains("defs.main.properties.count"),
        "error should contain path to the broken property, got: {}",
        rendered
    );

    // What: the underlying deserialization message.
    assert!(
        rendered.contains("expected i64"),
        "error should describe the type mismatch, got: {}",
        rendered
    );

    // Which file.
    assert!(
        rendered.contains("broken_lexicon.json"),
        "error should mention the file, got: {}",
        rendered
    );
}
170
483
}
+24
-19
crates/jacquard-lexicon/src/error.rs
Reviewed
···
3
3
use std::path::PathBuf;
4
4
use thiserror::Error;
5
5
6
6
+
/// Render the display message for a lexicon parse failure.
///
/// The message always names the file and ends with `message`. An `at <json_path>` segment
/// is inserted between them only when `json_path` is present and non-empty.
///
/// Takes `&Path` rather than `&PathBuf`; existing `&PathBuf` arguments coerce automatically.
fn format_parse_error(path: &std::path::Path, json_path: Option<&str>, message: &str) -> String {
    match json_path.filter(|jp| !jp.is_empty()) {
        Some(jp) => format!(
            "failed to parse lexicon {}: at {}: {}",
            path.display(),
            jp,
            message
        ),
        None => format!("failed to parse lexicon {}: {}", path.display(), message),
    }
}
14
14
+
6
15
/// Errors that can occur during lexicon code generation
7
16
#[derive(Debug, Error, Diagnostic)]
8
17
#[non_exhaustive]
···
12
21
Io(#[from] io::Error),
13
22
14
23
/// Failed to parse lexicon JSON
15
15
-
#[error("Failed to parse lexicon JSON in {}", path.display())]
24
24
+
#[error("{}", format_parse_error(path, json_path.as_deref(), message))]
16
25
#[diagnostic(
17
26
code(lexicon::parse_error),
18
27
help("Check that the lexicon file is valid JSON and follows the lexicon schema")
19
28
)]
20
29
ParseError {
21
21
-
#[source]
22
22
-
source: serde_json::Error,
23
30
/// Path to the file that failed to parse
24
31
path: PathBuf,
32
32
+
/// JSON path where the error occurred (from serde_path_to_error)
33
33
+
json_path: Option<String>,
34
34
+
/// The underlying error message
35
35
+
message: String,
25
36
/// Source text that failed to parse
26
37
#[source_code]
27
38
src: Option<String>,
···
90
101
91
102
impl CodegenError {
92
103
/// Create a parse error with context
93
93
-
pub fn parse_error(source: serde_json::Error, path: impl Into<PathBuf>) -> Self {
104
104
+
pub fn parse_error(message: impl Into<String>, path: impl Into<PathBuf>) -> Self {
94
105
Self::ParseError {
95
95
-
source,
96
106
path: path.into(),
107
107
+
json_path: None,
108
108
+
message: message.into(),
97
109
src: None,
98
110
span: None,
99
111
}
100
112
}
101
113
102
102
-
/// Create a parse error with source text
103
103
-
pub fn parse_error_with_source(
104
104
-
source: serde_json::Error,
114
114
+
/// Create a parse error with source text and JSON path
115
115
+
pub fn parse_error_with_context(
116
116
+
message: impl Into<String>,
105
117
path: impl Into<PathBuf>,
118
118
+
json_path: Option<String>,
106
119
src: String,
107
120
) -> Self {
108
108
-
// Try to extract error location from serde_json error
109
109
-
let span = if let Some(line) = source.line().checked_sub(1) {
110
110
-
let col = source.column().saturating_sub(1);
111
111
-
// Approximate byte offset (not perfect but good enough for display)
112
112
-
Some((line * 80 + col, 1).into())
113
113
-
} else {
114
114
-
None
115
115
-
};
116
116
-
117
121
Self::ParseError {
118
118
-
source,
119
122
path: path.into(),
123
123
+
json_path,
124
124
+
message: message.into(),
120
125
src: Some(src),
121
121
-
span,
126
126
+
span: None,
122
127
}
123
128
}
124
129
+15
crates/jacquard-lexicon/tests/fixtures/error_cases/broken_lexicon.json
Reviewed
···
1
1
+
{
2
2
+
"lexicon": 1,
3
3
+
"id": "test.broken.lexicon",
4
4
+
"defs": {
5
5
+
"main": {
6
6
+
"type": "object",
7
7
+
"properties": {
8
8
+
"count": {
9
9
+
"type": "integer",
10
10
+
"default": "not_a_number"
11
11
+
}
12
12
+
}
13
13
+
}
14
14
+
}
15
15
+
}
+7
crates/jacquard-lexicon/tests/fixtures/test_lexicons/not_a_lexicon.json
Reviewed
···
1
1
+
{
2
2
+
"name": "some random config",
3
3
+
"version": "1.0.0",
4
4
+
"settings": {
5
5
+
"enabled": true
6
6
+
}
7
7
+
}