// lang_turtle/lang/tokenizer.rs

1use chumsky::prelude::*;
2use logos::Logos;
3use lsp_core::prelude::{spanned, Spanned, StringStyle, Token};
4use token_helpers::*;
5
/// Raw lexemes of the Turtle language, recognised by the [`logos`] lexer.
///
/// The SCREAMING_SNAKE_CASE variants mirror the terminal productions of the
/// W3C Turtle grammar (`BLANK_NODE_LABEL`, `IRIREF`, `PNAME_NS`, ...); the
/// remaining variants are keywords and punctuation. `parse_tokens_str` maps
/// these raw lexemes onto `lsp_core`'s richer [`Token`] type.
#[allow(non_camel_case_types)]
#[derive(Logos, Debug, PartialEq)]
#[logos(skip r"[ \t\n\f\r]+")] // Ignore this regex pattern between tokens
enum TurtleToken {
    /// Turtle prefix directive `@prefix`.
    #[token("@prefix")]
    Prefix,

    /// SPARQL-style `PREFIX` keyword (case-insensitive, no leading `@`).
    #[token("prefix", ignore(case))]
    SqPrefix,

    /// Turtle base directive `@base`.
    #[token("@base")]
    Base,

    /// SPARQL-style `BASE` keyword (case-insensitive, no leading `@`).
    #[token("base", ignore(case))]
    SqBase,

    /// `[` — opens a blank node property list.
    #[token("[")]
    SqOpen,

    /// `]` — closes a blank node property list.
    #[token("]")]
    SqClose,

    /// `(` — opens an RDF collection.
    #[token("(")]
    BraceOpen,

    /// `)` — closes an RDF collection.
    #[token(")")]
    BraceClose,

    /// `a` — predicate shorthand for `rdf:type`.
    #[token("a")]
    TypeTag,

    /// `;` — separates predicate-object pairs sharing a subject.
    #[token(";")]
    Semi,

    /// `,` — separates objects sharing a subject and predicate.
    #[token(",")]
    Comma,
    /// `.` — terminates a statement or directive.
    #[token(".")]
    Stop,

    /// `^^` — introduces a literal's datatype IRI.
    #[token("^^")]
    DataTag,

    /// Boolean literal `true`.
    #[token("true")]
    True,

    /// Boolean literal `false`.
    #[token("false")]
    False,

    /// Blank node label, e.g. `_:b0` (Turtle `BLANK_NODE_LABEL` terminal).
    #[regex(r#"(_:((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|[0-9])((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])*(\.*((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])*)*)"#)]
    BLANK_NODE_LABEL,

    /// Floating-point literal with an exponent (Turtle `DOUBLE` terminal).
    #[regex(r#"([+-]?(([0-9]+\.[0-9]*([eE][+-]?[0-9]+))|(\.([0-9])+([eE][+-]?[0-9]+))|(([0-9])+([eE][+-]?[0-9]+))))"#)]
    DOUBLE,

    /// Decimal literal such as `3.14` (Turtle `DECIMAL` terminal).
    #[regex(r#"([+-]?([0-9])*\.([0-9])+)"#)]
    DECIMAL,

    /// Integer literal (Turtle `INTEGER` terminal).
    #[regex(r#"([+-]?[0-9]+)"#)]
    INTEGER,

    /// Integer immediately followed by `.`, e.g. `42.` — the dot is really
    /// the statement terminator, so `parse_tokens_str` splits this lexeme
    /// into a Number token plus a Stop token.
    #[regex(r#"([+-]?[0-9]+\.)"#)]
    INTEGER_WITH_DOT,

    /// Language tag such as `@en` or `@en-GB` (Turtle `LANGTAG` terminal).
    #[regex(r#"(@[a-zA-Z][a-zA-Z]*(\-[a-zA-Z0-9][a-zA-Z0-9]*)*)"#)]
    LANGTAG,

    /// `"..."` string literal with `\`-escapes and `\u`/`\U` code points.
    #[regex(r#"("([^\x22\x5C\x0A\x0D]|(\\[tbnrf\"'\\])|((\\u([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\U([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))))*")"#)]
    STRING_LITERAL_QUOTE,

    /// `'...'` string literal with the same escape forms.
    #[regex(r#"('([^\x27\x5C\x0A\x0D]|(\\[tbnrf\"'\\])|((\\u([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\U([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))))*')"#)]
    STRING_LITERAL_SINGLE_QUOTE,

    /// `'''...'''` long string — the char class admits newlines and
    /// embedded single quotes.
    #[regex(r#"('''(('|'')?([^'\\]|(\\[tbnrf\"'\\])|((\\u([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\U([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])))))*''')"#)]
    STRING_LITERAL_LONG_SINGLE_QUOTE,

    /// `"""..."""` long string — the char class admits newlines and
    /// embedded double quotes.
    #[regex(r#"("""(("|"")?([^"\\]|(\\[tbnrf\"'\\])|((\\u([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\U([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])))))*""")"#)]
    STRING_LITERAL_LONG_QUOTE,

    /// `<...>` IRI reference (Turtle `IRIREF` terminal).
    #[regex(r#"(<([^\x00-\x20<>"{}|^`\\]|((\\u([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\U([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))))*>)"#)]
    IRIREF,

    /// Namespace prefix ending in `:`, e.g. `foaf:` or bare `:`
    /// (Turtle `PNAME_NS` terminal).
    #[regex(r#"((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])((((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])|\.)*((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040]))?)?:)"#)]
    PNAME_NS,

    /// Full prefixed name `prefix:local` (Turtle `PNAME_LN` terminal),
    /// including percent- and backslash-escapes in the local part.
    #[regex(r#"(((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])((((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])|\.)*((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040]))?)?:)(((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|:|[0-9]|((%([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\(_|\~|\.|\-|!|\$|\&|\\"|\(|\)|\*|\+|"|'|;|=|,|/|\?|\#|@|%))))(\.|(((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])|:|((%([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\(_|\~|\.|\-|!|\$|\&|\\"|\(|\)|\*|\+|"|'|;|=|,|/|\?|\#|@|%)))))*))"#)]
    PNAME_LN,

    /// `#` comment running to the end of the line.
    #[regex(r#"#[^\u000D\u000A]*"#)]
    Comment,
}
96
97pub fn parse_tokens_str<'a>(text: &'a str) -> (Vec<Spanned<Token>>, Vec<Simple<char>>) {
98    let mut tokens = Vec::new();
99    let mut errors = Vec::new();
100    let mut lex = TurtleToken::lexer(text);
101    while let Some(x) = lex.next() {
102        let t = || text[lex.span()].to_string();
103        let t2 = |d_start, d_end| {
104            let span = lex.span();
105            let (start, end) = (span.start, span.end);
106            text[start + d_start..end - d_end].to_string()
107        };
108
109        match x {
110            Ok(token) => {
111                let t = match token {
112                    TurtleToken::Comment => Token::Comment(t()),
113                    TurtleToken::Prefix => Token::PrefixTag,
114                    TurtleToken::Base => Token::BaseTag,
115                    TurtleToken::SqPrefix => Token::SparqlPrefix,
116                    TurtleToken::SqBase => Token::SparqlBase,
117                    TurtleToken::SqOpen => Token::SqOpen,
118                    TurtleToken::SqClose => Token::SqClose,
119                    TurtleToken::BraceOpen => Token::BracketOpen,
120                    TurtleToken::BraceClose => Token::BracketClose,
121                    TurtleToken::TypeTag => Token::PredType,
122                    TurtleToken::Semi => Token::PredicateSplit,
123                    TurtleToken::Comma => Token::Comma,
124                    TurtleToken::Stop => Token::Stop,
125                    TurtleToken::DataTag => Token::DataTypeDelim,
126                    TurtleToken::True => Token::True,
127                    TurtleToken::False => Token::False,
128                    TurtleToken::BLANK_NODE_LABEL => Token::BlankNodeLabel(t2(2, 0)),
129                    TurtleToken::DOUBLE => Token::Number(t()),
130                    TurtleToken::DECIMAL => Token::Number(t()),
131                    TurtleToken::INTEGER => Token::Number(t()),
132                    TurtleToken::INTEGER_WITH_DOT => {
133                        let span = lex.span();
134                        let end = span.end - 1;
135                        let start = span.start;
136                        tokens.push(spanned(
137                            Token::Number(text[start..end].to_string()),
138                            start..end,
139                        ));
140                        tokens.push(spanned(Token::Stop, end..end + 1));
141
142                        continue;
143                    }
144                    TurtleToken::LANGTAG => Token::LangTag(t2(1, 0)),
145                    TurtleToken::STRING_LITERAL_LONG_SINGLE_QUOTE => {
146                        Token::Str(t2(3, 3), StringStyle::SingleLong)
147                    }
148                    TurtleToken::STRING_LITERAL_QUOTE => Token::Str(t2(1, 1), StringStyle::Double),
149                    TurtleToken::STRING_LITERAL_LONG_QUOTE => {
150                        Token::Str(t2(3, 3), StringStyle::DoubleLong)
151                    }
152                    TurtleToken::STRING_LITERAL_SINGLE_QUOTE => {
153                        Token::Str(t2(1, 1), StringStyle::Single)
154                    }
155                    TurtleToken::IRIREF => Token::IRIRef(t2(1, 1)),
156                    TurtleToken::PNAME_LN | TurtleToken::PNAME_NS => {
157                        let st = &text[lex.span()];
158                        let ends_with_stop = st.ends_with('.');
159
160                        if ends_with_stop {
161                            let span = lex.span();
162                            let end = span.end - 1;
163                            let start = span.start;
164                            if let Some((first, second)) = text[start..end].split_once(":") {
165                                tokens.push(spanned(
166                                    Token::PNameLN(Some(first.to_string()), second.to_string()),
167                                    start..end,
168                                ));
169                                tokens.push(spanned(Token::Stop, end..end + 1));
170                            } else {
171                                tokens.push(spanned(
172                                    Token::Invalid(text[start..end].to_string()),
173                                    start..end,
174                                ));
175                                tokens.push(spanned(Token::Stop, end..end + 1));
176                            }
177                            continue;
178                        } else {
179                            if let Some((first, second)) = text[lex.span()].split_once(":") {
180                                Token::PNameLN(Some(first.to_string()), second.to_string())
181                            } else {
182                                Token::Invalid(t())
183                            }
184                        }
185                    }
186                };
187                tokens.push(spanned(t, lex.span()));
188            }
189            Err(_) => {
190                tokens.push(spanned(Token::Invalid(t()), lex.span()));
191                errors.push(Simple::custom(
192                    lex.span(),
193                    format!("Unexpected token '{}'", &text[lex.span()]),
194                ))
195            }
196        }
197    }
198
199    (tokens, errors)
200}
201pub fn parse_tokens_str_safe(text: &str) -> Result<Vec<Spanned<Token>>, Vec<Simple<char>>> {
202    let (t, e) = parse_tokens_str(text);
203    if e.is_empty() {
204        Ok(t)
205    } else {
206        Err(e)
207    }
208}
209
/// Chumsky parser recognising a single [`Token`].
///
/// Tries each sub-parser from `token_helpers` in the listed order, so the
/// more specific parsers (keywords, comments, IRIs, prefixed names, ...)
/// take precedence over the generic punctuation parser `tokens()`.
/// Unparsable input is recovered into a token via `skip_parser(invalid())`
/// rather than aborting the parse.
pub fn parse_token() -> t!(Token) {
    choice((
        keywords(),
        comment(),
        iri_ref(),
        pname_ns(),
        blank_node_label(),
        lang_tag(),
        integer(),
        strings(),
        tokens(),
    ))
    .recover_with(skip_parser(invalid()))
}
224
225pub fn parse_tokens() -> t!(Vec<Spanned<Token>>) {
226    parse_token()
227        .map_with_span(spanned)
228        .padded()
229        .repeated()
230        .then_ignore(end().recover_with(skip_then_retry_until([])))
231}
232
#[cfg(test)]
mod tests {
    use std::ops::Range;

    use super::*;

    /// Debug helper: print each token to stdout and each error to stderr.
    #[allow(unused)]
    fn log_parse_results(tokens: Vec<(Token, Range<usize>)>, err: &Vec<Simple<char>>) {
        tokens.iter().for_each(|tk| {
            println!("{:?}", tk);
        });

        err.iter().for_each(|er| eprintln!("{:?}", er));
    }

    /// `@prefix`/`@base` directives and a `#` comment should tokenize cleanly.
    #[test]
    fn parse_directives() {
        let input = "
            @prefix elm: <http://elm.com/types#> .
            @prefix : <http://elm.com/types#> .
            @base <http://example.com/#> . 
            # Test comment!
        ";

        assert!(parse_tokens().parse(input).is_ok());
    }

    /// Long (triple-quoted) double-quoted strings, including one with an
    /// embedded `"` character, should parse without errors.
    #[test]
    fn parse_strings() {
        for input in ["\"\"\"test\"\"\"", "\"\"\"t\"est\"\"\""] {
            println!("Input {}", input);
            let (tok, err) = long_string_double().parse_recovery(input);
            println!("Found tokens {:?} {:?}", tok, err);
            assert!(tok.is_some());
            assert!(err.is_empty());
        }
    }

    /// IRI refs, prefixed names, and names with an empty prefix (`:testing`)
    /// should all be accepted.
    #[test]
    fn parse_named_node() {
        let input = "
            <http://localhost/elmBeta> 
            elm:Beta

            :testing
            ";

        let (tok, err) = parse_tokens().parse_recovery(input);
        assert!(tok.is_some());
        assert!(err.is_empty());
    }

    /// A small document exercising directives, language tags, datatype
    /// delimiters, and a blank node property list.
    #[test]
    fn simple_test() {
        let input = "
            @prefix elm: <http://elm.com/types#> .
            @base <http://example.com/#> . 
            
            elm:Beta foaf:string \"cookie\"@en ;
                     foaf:astring \"jar\"^^xsd:string .

            elm:Bnode a [ foaf:name \"Kachan\" ; 
                          foaf:lastName \"Bruh\" ; 
                          foaf:email \"kb@kbc.be\", \"notkb@notkbc.be\" ].
            ";

        let (tok, err) = parse_tokens().parse_recovery(input);
        assert!(tok.is_some());
        assert!(err.is_empty());
    }

    /// A realistic RML/R2RML mapping document with nested blank node lists
    /// should tokenize without errors.
    #[test]
    fn complex_test() {
        let input = "
            @prefix rr: <http://www.w3.org/ns/r2rml#> .
            @prefix foaf: <http://xmlns.com/foaf/0.1/> .
            @prefix ex: <http://example.com/> .
            @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
            @prefix rml: <http://semweb.mmlab.be/ns/rml#> .
            @prefix ql: <http://semweb.mmlab.be/ns/ql#> .

            @base <http://example.com/base/> .

            <TriplesMap1>
              a rr:TriplesMap;
                    
              rml:logicalSource [ 
                rml:source \"student.csv\";
                rml:referenceFormulation ql:CSV
              ] ;
                
              rr:subjectMap [ 
                rr:template \"http://example.com/{Name}\" 
              ]; 
                
              rr:predicateObjectMap [ 
                rr:predicate foaf:name ; 
                rr:objectMap [ 
                  rml:reference \"Name\" 
                ]
              ].
            ";

        let (tok, err) = parse_tokens().parse_recovery(input);
        assert!(tok.is_some());
        assert!(err.is_empty());
    }

    /// `http` is not a valid IRI/prefixed name here: the parser should
    /// recover, still yield 4 tokens, and report exactly one error.
    #[test]
    fn parse_invalid() {
        let input = "
            @prefix elm: http .
            ";

        let (tok, err) = parse_tokens().parse_recovery(input);
        assert!(tok.is_some());

        println!("tokens {:?}", tok);
        assert_eq!(tok.unwrap().len(), 4);
        assert_eq!(err.len(), 1);
    }
}
354}