token_helpers/
helpers.rs

1use chumsky::{chain::Chain, prelude::*, Parser};
2use lsp_core::util::token::{StringStyle, Token};
3
/// Shorthand for the opaque parser type returned by every builder in this
/// module: a parser over `char` input that yields the given output type and
/// reports errors as `Simple<char>`.
///
/// `Parser` and `Simple` must be in scope wherever the macro is expanded.
#[macro_export]
macro_rules! t {
    ($output:ty) => {
        impl Parser<char, $output, Error = Simple<char>>
    };
}
10
11pub fn tok(st: &'static str, tok: Token) -> t!(Token) {
12    just::<char, &str, Simple<char>>(st).to(tok)
13}
14
15pub fn tokens_ext() -> t!(Token) {
16    choice((tok("{", Token::CurlOpen), tok("}", Token::CurlClose)))
17}
18
19pub fn keywords() -> t!(Token) {
20    choice((
21        just("PREFIX").to(Token::PrefixTag),
22        just("BASE").to(Token::BaseTag),
23    ))
24    .or(just('@').ignore_then(choice((
25        just("prefix").to(Token::PrefixTag),
26        just("base").to(Token::BaseTag),
27    ))))
28}
29
30pub fn tokens() -> t!(Token) {
31    choice((
32        // tok("@prefix", Token::PrefixTag),
33        // tok("@base", Token::BaseTag),
34        tok("PREFIX", Token::SparqlPrefix),
35        tok("BASE", Token::SparqlBase),
36        tok("[", Token::SqOpen),
37        tok("]", Token::SqClose),
38        tok("(", Token::BracketOpen),
39        tok(")", Token::BracketClose),
40        tok("^^", Token::DataTypeDelim),
41        tok(".", Token::Stop),
42        tok(",", Token::Comma),
43        tok(";", Token::PredicateSplit),
44        tok("a", Token::PredType),
45        tok("true", Token::True),
46        tok("false", Token::False),
47    ))
48}
49
50pub fn comment() -> t!(Token) {
51    just('#')
52        .ignore_then(none_of("\n\r").repeated().collect())
53        .map(|x| Token::Comment(x))
54}
55
56pub fn invalid() -> t!(Token) {
57    none_of(" \n\r.,;[]")
58        .repeated()
59        .at_least(1)
60        .collect()
61        .map(Token::Invalid)
62}
63
64pub fn iri_ref() -> t!(Token) {
65    let letter = none_of("<>\"{}|^`\\").repeated().at_least(1).or(uchar());
66
67    letter
68        .repeated()
69        .flatten()
70        .collect()
71        .delimited_by(just('<'), just('>'))
72        .map(|x| Token::IRIRef(x))
73}
74
75pub fn pname_ns() -> t!(Token) {
76    pn_prefix()
77        .collect()
78        .or_not()
79        .then_ignore(just(':'))
80        .then(pn_local().collect().or_not())
81        .map(|(x, local)| {
82            if let Some(local) = local {
83                Token::PNameLN(x, local)
84            } else {
85                Token::PNameLN(x, String::new())
86            }
87        })
88}
89
90pub fn label_post() -> t!(Vec<char>) {
91    just('.')
92        .repeated()
93        .chain(pn_chars().repeated().at_least(1))
94}
95
96pub fn blank_node_label() -> t!(Token) {
97    let label = pn_chars()
98        .or(filter(|c: &char| c.is_numeric()))
99        .repeated()
100        .then(label_post().repeated().flatten())
101        .map(|(mut x, y)| {
102            x.extend(y);
103            x
104        });
105
106    just('_')
107        .then(just(':'))
108        .ignore_then(label.collect())
109        .map(|x| Token::BlankNodeLabel(x))
110}
111
112pub fn lang_tag() -> t!(Token) {
113    let rep = just('-').chain(filter(|c: &char| c.is_alphanumeric()).repeated());
114    just('@')
115        .ignore_then(filter(|c: &char| c.is_alphabetic()).repeated())
116        .then(rep.repeated().flatten())
117        .map(|(mut x, y)| {
118            y.append_to(&mut x);
119            x
120        })
121        .collect()
122        .map(|string| Token::LangTag(string))
123}
124
125pub fn integer() -> t!(Token) {
126    let before_dot = || {
127        one_of("+-")
128            .or_not()
129            .then(filter(|c: &char| c.is_numeric()).repeated().at_least(1))
130            .map(|(x, y)| {
131                let mut o: Vec<char> = Vec::with_capacity(x.is_some() as usize + y.len());
132                x.append_to(&mut o);
133                y.append_to(&mut o);
134                o
135            })
136    };
137
138    let no_dot = || {
139        filter(|c: &char| c.is_numeric())
140            .repeated()
141            .at_least(1)
142            .then(exponent().or_not())
143            .map(|(mut x, y)| {
144                if let Some(exp) = y {
145                    exp.append_to(&mut x);
146                }
147                x
148            })
149    };
150
151    let with_dot = || {
152        just('.').then(no_dot()).map(|(x, y)| {
153            let mut o = Vec::with_capacity(1 + y.len());
154            o.push(x);
155            y.append_to(&mut o);
156            o
157        })
158    };
159
160    with_dot()
161        .or(before_dot().then(with_dot()).map(|(mut x, y)| {
162            y.append_to(&mut x);
163            x
164        }))
165        .or(no_dot())
166        .or(before_dot())
167        .collect()
168        .map(|x| Token::Number(x))
169}
170
171pub fn exponent() -> t!(Vec<char>) {
172    one_of("eE")
173        .then(one_of("+-").or_not())
174        .then(filter(|c: &char| c.is_numeric()).repeated().at_least(1))
175        .map(|((x, y), z)| {
176            let mut o = Vec::with_capacity(1 + y.is_some() as usize + z.len());
177            o.push(x);
178            y.append_to(&mut o);
179            z.append_to(&mut o);
180            o
181        })
182}
183
184pub fn parse_string<const C: char>() -> t!(String) {
185    let letter = e_char().or(uchar()).or(filter(|c: &char| {
186        *c != '\\' && *c != '\n' && *c != '\r' && *c != C
187    })
188    .repeated()
189    .at_least(1));
190
191    letter
192        .repeated()
193        .flatten()
194        .collect()
195        .delimited_by(just(C), just(C))
196}
197
198pub fn parse_long_string<const C: char>() -> t!(String) {
199    let si = || just::<char, char, Simple<char>>(C);
200    let delim = si().repeated().exactly(3);
201
202    let letter = delim.not();
203
204    delim
205        .ignore_then(letter.repeated().collect().map(|x| {
206            // println!("Found {:?}", x);
207            x
208        }))
209        .then_ignore(delim)
210    // letter
211    //     .repeated()
212    //     .flatten()
213    //     .collect()
214    //     .delimited_by(delim, delim)
215    // delim
216    //     .ignore_then(
217    //         si().repeated()
218    //             .at_most(2)
219    //             .then(letter.repeated().flatten())
220    //             .map(|(mut x, y)| {
221    //                 y.append_to(&mut x);
222    //                 x
223    //             }),
224    //     )
225    //     .then_ignore(delim)
226    //     .collect()
227}
228
229pub fn strings() -> t!(Token) {
230    long_string_double()
231        .or(long_string_single())
232        .or(string_single())
233        .or(string_double())
234}
235
236pub fn string_single() -> t!(Token) {
237    parse_string::<'\''>().map(|x| Token::Str(x, StringStyle::Single))
238}
239pub fn string_double() -> t!(Token) {
240    parse_string::<'"'>().map(|x| Token::Str(x, StringStyle::Double))
241}
242
243pub fn long_string_single() -> t!(Token) {
244    parse_long_string::<'\''>().map(|x| Token::Str(x, StringStyle::SingleLong))
245}
246
247pub fn long_string_double() -> t!(Token) {
248    parse_long_string::<'"'>().map(|x| Token::Str(x, StringStyle::DoubleLong))
249}
250
251pub fn uchar() -> t!(Vec<char>) {
252    let small = just('\\')
253        .chain(just('u'))
254        .chain(hex())
255        .chain(hex())
256        .chain(hex())
257        .chain(hex());
258
259    let big = just('\\')
260        .chain(just('U'))
261        .chain(hex())
262        .chain(hex())
263        .chain(hex())
264        .chain(hex())
265        .chain(hex())
266        .chain(hex())
267        .chain(hex())
268        .chain(hex());
269
270    small.or(big)
271}
272
273pub fn e_char() -> t!(Vec<char>) {
274    just('\\')
275        .then(one_of("tbnrf\"'\\"))
276        .map(|(x, y)| vec![x, y])
277}
278
279pub fn pn_chars_base() -> t!(char) {
280    filter(|c: &char| c.is_alphabetic())
281}
282
283pub fn pn_chars_u() -> t!(char) {
284    pn_chars_base().or(just('_'))
285}
286pub fn varname() -> t!(char) {
287    pn_chars_u().or(filter(|c: &char| c.is_numeric()))
288}
289pub fn pn_chars() -> t!(char) {
290    pn_chars_u()
291        .or(just('-'))
292        .or(filter(|c: &char| c.is_numeric()))
293}
294pub fn pn_prefix() -> t!(Vec<char>) {
295    let ne = just('.')
296        .repeated()
297        .then(pn_chars().repeated().at_least(1))
298        .map(|(x, y)| {
299            let mut o: Vec<char> = Vec::with_capacity(x.len() + y.len());
300            x.append_to(&mut o);
301            y.append_to(&mut o);
302            o
303        })
304        .repeated()
305        .flatten();
306
307    pn_chars_base().then(ne.or_not()).map(|(x, y)| {
308        if let Some(y) = y {
309            let mut o = Vec::with_capacity(y.len() + 1);
310            o.push(x);
311            o.extend(y);
312            o
313        } else {
314            vec![x]
315        }
316    })
317}
318
319pub fn pn_local() -> t!(Vec<char>) {
320    let first_char = pn_chars_u()
321        .or(filter(|c: &char| *c == ':' || c.is_numeric()))
322        .repeated()
323        .at_least(1)
324        .or(plx());
325
326    let other = || pn_chars().or(just(':')).or(just('%'));
327
328    let rest = just('.')
329        .repeated()
330        .then(other().repeated().at_least(1))
331        .map(|(x, y)| {
332            let mut o: Vec<char> = Vec::with_capacity(x.len() + y.len());
333            x.append_to(&mut o);
334            y.append_to(&mut o);
335            o
336        })
337        .repeated()
338        .flatten();
339
340    first_char.then(rest.or_not()).map(|(mut x, y)| {
341        if let Some(y) = y {
342            y.append_to(&mut x);
343        }
344        x
345    })
346}
347
348pub fn plx() -> t!(Vec<char>) {
349    percent().or(pn_local_esc())
350}
351
352pub fn percent() -> t!(Vec<char>) {
353    just('%')
354        .ignore_then(hex().then(hex()))
355        .map(|(x, y)| vec![x, y])
356}
357
358pub fn hex() -> t!(char) {
359    filter(|c: &char| c.is_ascii_hexdigit())
360}
361
362pub fn pn_local_esc() -> t!(Vec<char>) {
363    just('\\')
364        .then(one_of("_~.-!$&'()*+,;=/?#@%"))
365        .map(|(x, y)| vec![x, y])
366}
367
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `parser` over `input`, requiring that the whole input is consumed,
    /// and returns the parser's output.
    ///
    /// Without the end-of-input check a parser that matches nothing at all
    /// (for example `repeated()` with zero iterations) would still report
    /// success, which makes the assertion meaningless.
    fn parse_all<O>(parser: impl Parser<char, O, Error = Simple<char>>, input: &str) -> O {
        match parser.then_ignore(end()).parse(input) {
            Ok(output) => output,
            Err(errors) => panic!("failed to fully parse {:?}: {:?}", input, errors),
        }
    }

    #[test]
    fn parse_keywords() {
        parse_all(keywords(), "@prefix");
        parse_all(tokens(), ".");
        parse_all(iri_ref(), "<testing>");
        parse_all(pname_ns(), ":");
        parse_all(pname_ns(), "testing:");
        parse_all(pname_ns(), "testing:test");
        parse_all(blank_node_label(), "_:test");
        parse_all(lang_tag(), "@en");
        parse_all(integer(), "14");
        parse_all(integer(), "14.0");
        parse_all(strings(), "'testing'");
        parse_all(strings(), "\"testing\"");
        parse_all(strings(), "\"\"\"testing\"\"\"");
        parse_all(comment(), "# This is a nice comment");
    }

    #[test]
    fn parse_multiple_kws() {
        // `@prefix` / `@base` are only recognised by `keywords()`, so both
        // parsers are combined to cover the whole input.
        let kws = parse_all(
            keywords().or(tokens()).padded().repeated(),
            "@prefix @base . .",
        );
        assert_eq!(kws.len(), 4);

        let iris = parse_all(iri_ref().padded().repeated(), "<testing> <testing>");
        assert_eq!(iris.len(), 2);

        let names = parse_all(
            pname_ns().padded().repeated(),
            ": testing: testing:test",
        );
        assert_eq!(names.len(), 3);

        let blanks = parse_all(blank_node_label().padded().repeated(), "_:b1 _:b0");
        assert_eq!(blanks.len(), 2);

        let tags = parse_all(lang_tag().padded().repeated(), "@en @en-nl");
        assert_eq!(tags.len(), 2);

        let numbers = parse_all(integer().padded().repeated(), "14 14");
        assert_eq!(numbers.len(), 2);

        let texts = parse_all(strings().padded().repeated(), "\"testing\" 'testing'");
        assert_eq!(texts.len(), 2);
    }
}