lang_jsonld/lang/
tokenizer.rs1use chumsky::{chain::Chain as _, prelude::*};
2use lsp_core::prelude::{spanned, Spanned, StringStyle, Token};
3
4pub fn tokenize(st: &str) -> (Vec<Spanned<Token>>, Vec<Simple<char>>) {
5 let parser = parser()
6 .then_ignore(end().recover_with(skip_then_retry_until([])))
7 .padded();
8
9 let (json, errs) = parser.parse_recovery(st);
10
11 (json.unwrap_or_default(), errs)
12}
13
14fn parser() -> impl Parser<char, Vec<Spanned<Token>>, Error = Simple<char>> {
15 let tok = just("true")
16 .to(Token::True)
17 .or(just("false").to(Token::False))
18 .or(just("null").to(Token::Null))
19 .or(just(']').to(Token::SqClose))
20 .or(just('{').to(Token::CurlOpen))
21 .or(just('}').to(Token::CurlClose))
22 .or(just(':').to(Token::Colon))
23 .or(just(',').to(Token::Comma))
24 .or(just('[').to(Token::SqOpen));
25
26 let items = tok
27 .or(parse_num())
28 .or(parse_string().map(|st| Token::Str(st, StringStyle::Double)));
29
30 items.map_with_span(spanned).padded().repeated()
31}
32
33fn exponent() -> impl Parser<char, Vec<char>, Error = Simple<char>> {
34 one_of("eE")
35 .then(one_of("+-").or_not())
36 .then(filter(|c: &char| c.is_numeric()).repeated().at_least(1))
37 .map(|((x, y), z)| {
38 let mut o = Vec::with_capacity(1 + y.is_some() as usize + z.len());
39 o.push(x);
40 y.append_to(&mut o);
41 z.append_to(&mut o);
42 o
43 })
44}
45
46fn parse_num() -> impl Parser<char, Token, Error = Simple<char>> {
47 let before_dot = || {
48 one_of("+-")
49 .or_not()
50 .then(filter(|c: &char| c.is_numeric()).repeated().at_least(1))
51 .map(|(x, y)| {
52 let mut o: Vec<char> = Vec::with_capacity(x.is_some() as usize + y.len());
53 x.append_to(&mut o);
54 y.append_to(&mut o);
55 o
56 })
57 };
58
59 let no_dot = || {
60 filter(|c: &char| c.is_numeric())
61 .repeated()
62 .at_least(1)
63 .then(exponent())
64 .map(|(mut x, y)| {
65 y.append_to(&mut x);
66 x
67 })
68 };
69
70 let with_dot = || {
71 just('.').then(no_dot()).map(|(x, y)| {
72 let mut o = Vec::with_capacity(1 + y.len());
73 o.push(x);
74 y.append_to(&mut o);
75 o
76 })
77 };
78
79 with_dot()
80 .or(before_dot().then(with_dot()).map(|(mut x, y)| {
81 y.append_to(&mut x);
82 x
83 }))
84 .or(no_dot())
85 .or(before_dot())
86 .collect()
87 .map(|x| Token::Number(x))
88}
89
90fn parse_string() -> impl Parser<char, String, Error = Simple<char>> {
91 let escape = just('\\').ignore_then(
92 just('\\')
93 .or(just('/'))
94 .or(just('"'))
95 .or(just('b').to('\x08'))
96 .or(just('f').to('\x0C'))
97 .or(just('n').to('\n'))
98 .or(just('r').to('\r'))
99 .or(just('t').to('\t'))
100 .or(just('u').ignore_then(
101 filter(|c: &char| c.is_digit(16))
102 .repeated()
103 .exactly(4)
104 .collect::<String>()
105 .validate(|digits, span, emit| {
106 char::from_u32(u32::from_str_radix(&digits, 16).unwrap()).unwrap_or_else(
107 || {
108 emit(Simple::custom(span, "invalid unicode character"));
109 '\u{FFFD}' },
111 )
112 }),
113 )),
114 );
115
116 just('"')
117 .ignore_then(filter(|c| *c != '\\' && *c != '"').or(escape).repeated())
118 .then_ignore(just('"'))
119 .collect::<String>()
120 .labelled("string")
121}
122
#[cfg(test)]
mod tests {
    use lsp_core::prelude::Token::*;

    use super::*;

    /// Tokenize `input` and strip the spans, leaving just token values.
    fn values(input: &str) -> (Vec<Token>, Vec<Simple<char>>) {
        let (tokens, errs) = tokenize(input);
        let values = tokens.into_iter().map(|t| t.into_value()).collect();
        (values, errs)
    }

    #[test]
    fn parse_simple() {
        let (tokens, errs) = values("");
        assert!(tokens.is_empty());
        assert!(errs.is_empty());

        let (tokens, errs) = values(", [ ] { } null true false");
        assert_eq!(
            tokens,
            vec![Comma, SqOpen, SqClose, CurlOpen, CurlClose, Null, True, False]
        );
        assert!(errs.is_empty());
    }

    #[test]
    fn parse_string() {
        let (tokens, errs) = values(" \"Epic string!!\"");
        assert_eq!(
            tokens,
            vec![Str("Epic string!!".into(), StringStyle::Double)]
        );
        assert!(errs.is_empty());

        // An unterminated string yields no token and exactly one error.
        let (tokens, errs) = values(" \"Epic string!!");
        assert!(tokens.is_empty());
        assert_eq!(errs.len(), 1);
    }

    #[test]
    fn parse_num() {
        let (tokens, errs) = values(" 423");
        assert_eq!(tokens, vec![Number("423".to_string())]);
        assert!(errs.is_empty());
    }
}