lang_jsonld/lang/
tokenizer.rs

1use chumsky::{chain::Chain as _, prelude::*};
2use lsp_core::prelude::{spanned, Spanned, StringStyle, Token};
3
4pub fn tokenize(st: &str) -> (Vec<Spanned<Token>>, Vec<Simple<char>>) {
5    let parser = parser()
6        .then_ignore(end().recover_with(skip_then_retry_until([])))
7        .padded();
8
9    let (json, errs) = parser.parse_recovery(st);
10
11    (json.unwrap_or_default(), errs)
12}
13
14fn parser() -> impl Parser<char, Vec<Spanned<Token>>, Error = Simple<char>> {
15    let tok = just("true")
16        .to(Token::True)
17        .or(just("false").to(Token::False))
18        .or(just("null").to(Token::Null))
19        .or(just(']').to(Token::SqClose))
20        .or(just('{').to(Token::CurlOpen))
21        .or(just('}').to(Token::CurlClose))
22        .or(just(':').to(Token::Colon))
23        .or(just(',').to(Token::Comma))
24        .or(just('[').to(Token::SqOpen));
25
26    let items = tok
27        .or(parse_num())
28        .or(parse_string().map(|st| Token::Str(st, StringStyle::Double)));
29
30    items.map_with_span(spanned).padded().repeated()
31}
32
33fn exponent() -> impl Parser<char, Vec<char>, Error = Simple<char>> {
34    one_of("eE")
35        .then(one_of("+-").or_not())
36        .then(filter(|c: &char| c.is_numeric()).repeated().at_least(1))
37        .map(|((x, y), z)| {
38            let mut o = Vec::with_capacity(1 + y.is_some() as usize + z.len());
39            o.push(x);
40            y.append_to(&mut o);
41            z.append_to(&mut o);
42            o
43        })
44}
45
46fn parse_num() -> impl Parser<char, Token, Error = Simple<char>> {
47    let before_dot = || {
48        one_of("+-")
49            .or_not()
50            .then(filter(|c: &char| c.is_numeric()).repeated().at_least(1))
51            .map(|(x, y)| {
52                let mut o: Vec<char> = Vec::with_capacity(x.is_some() as usize + y.len());
53                x.append_to(&mut o);
54                y.append_to(&mut o);
55                o
56            })
57    };
58
59    let no_dot = || {
60        filter(|c: &char| c.is_numeric())
61            .repeated()
62            .at_least(1)
63            .then(exponent())
64            .map(|(mut x, y)| {
65                y.append_to(&mut x);
66                x
67            })
68    };
69
70    let with_dot = || {
71        just('.').then(no_dot()).map(|(x, y)| {
72            let mut o = Vec::with_capacity(1 + y.len());
73            o.push(x);
74            y.append_to(&mut o);
75            o
76        })
77    };
78
79    with_dot()
80        .or(before_dot().then(with_dot()).map(|(mut x, y)| {
81            y.append_to(&mut x);
82            x
83        }))
84        .or(no_dot())
85        .or(before_dot())
86        .collect()
87        .map(|x| Token::Number(x))
88}
89
90fn parse_string() -> impl Parser<char, String, Error = Simple<char>> {
91    let escape = just('\\').ignore_then(
92        just('\\')
93            .or(just('/'))
94            .or(just('"'))
95            .or(just('b').to('\x08'))
96            .or(just('f').to('\x0C'))
97            .or(just('n').to('\n'))
98            .or(just('r').to('\r'))
99            .or(just('t').to('\t'))
100            .or(just('u').ignore_then(
101                filter(|c: &char| c.is_digit(16))
102                    .repeated()
103                    .exactly(4)
104                    .collect::<String>()
105                    .validate(|digits, span, emit| {
106                        char::from_u32(u32::from_str_radix(&digits, 16).unwrap()).unwrap_or_else(
107                            || {
108                                emit(Simple::custom(span, "invalid unicode character"));
109                                '\u{FFFD}' // unicode replacement character
110                            },
111                        )
112                    }),
113            )),
114    );
115
116    just('"')
117        .ignore_then(filter(|c| *c != '\\' && *c != '"').or(escape).repeated())
118        .then_ignore(just('"'))
119        .collect::<String>()
120        .labelled("string")
121}
122
#[cfg(test)]
mod tests {
    use lsp_core::prelude::Token::*;

    use super::*;

    /// Tokenizes `input` and returns the token values without their spans,
    /// plus the number of reported errors.
    fn lex(input: &str) -> (Vec<lsp_core::prelude::Token>, usize) {
        let (tokens, errs) = tokenize(input);
        let values = tokens.into_iter().map(|tok| tok.into_value()).collect();
        (values, errs.len())
    }

    #[test]
    fn parse_simple() {
        // Empty input yields neither tokens nor errors.
        let (tokens, err_count) = lex("");
        assert!(tokens.is_empty());
        assert_eq!(err_count, 0);

        // Punctuation and keyword literals, separated by whitespace.
        let (tokens, err_count) = lex(", [ ] { } null true false");
        assert_eq!(
            tokens,
            vec![Comma, SqOpen, SqClose, CurlOpen, CurlClose, Null, True, False]
        );
        assert_eq!(err_count, 0);
    }

    #[test]
    fn parse_string() {
        // A well-formed double-quoted string.
        let (tokens, err_count) = lex(" \"Epic string!!\"");
        assert_eq!(
            tokens,
            vec![Str("Epic string!!".into(), StringStyle::Double)]
        );
        assert_eq!(err_count, 0);

        // A missing closing quote produces no token and exactly one error.
        let (tokens, err_count) = lex(" \"Epic string!!");
        assert!(tokens.is_empty());
        assert_eq!(err_count, 1);
    }

    #[test]
    fn parse_num() {
        // A plain integer keeps its source text.
        let (tokens, err_count) = lex(" 423");
        assert_eq!(tokens, vec![Number(String::from("423"))]);
        assert_eq!(err_count, 0);
    }
}