use chumsky::prelude::*;
use logos::Logos;
use lsp_core::prelude::{spanned, Spanned, StringStyle, Token};
use token_helpers::*;

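/// Raw Turtle terminals recognized by the [`logos`] lexer.
///
/// The `#[regex]` patterns mirror the terminal productions of the W3C Turtle
/// grammar, hence the SCREAMING_SNAKE_CASE variant names (`IRIREF`,
/// `PNAME_LN`, ...) that the `allow` below exists for.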
#[allow(non_camel_case_types)]
#[derive(Logos, Debug, PartialEq)]
#[logos(skip r"[ \t\n\f\r]+")]
enum TurtleToken {
    #[token("@prefix")]
    Prefix,

    #[token("prefix", ignore(case))]
    SqPrefix,

    #[token("@base")]
    Base,

    #[token("base", ignore(case))]
    SqBase,

    #[token("[")]
    SqOpen,

    #[token("]")]
    SqClose,

    #[token("(")]
    BraceOpen,

    #[token(")")]
    BraceClose,

    #[token("a")]
    TypeTag,

    #[token(";")]
    Semi,

    #[token(",")]
    Comma,

    #[token(".")]
    Stop,

    #[token("^^")]
    DataTag,

    #[token("true")]
    True,

    #[token("false")]
    False,

    #[regex(r#"(_:((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|[0-9])((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])*(\.*((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])*)*)"#)]
    BLANK_NODE_LABEL,

    #[regex(r#"([+-]?(([0-9]+\.[0-9]*([eE][+-]?[0-9]+))|(\.([0-9])+([eE][+-]?[0-9]+))|(([0-9])+([eE][+-]?[0-9]+))))"#)]
    DOUBLE,

    #[regex(r#"([+-]?([0-9])*\.([0-9])+)"#)]
    DECIMAL,

    #[regex(r#"([+-]?[0-9]+)"#)]
    INTEGER,

    #[regex(r#"([+-]?[0-9]+\.)"#)]
    INTEGER_WITH_DOT,

    #[regex(r#"(@[a-zA-Z][a-zA-Z]*(\-[a-zA-Z0-9][a-zA-Z0-9]*)*)"#)]
    LANGTAG,

    #[regex(r#"("([^\x22\x5C\x0A\x0D]|(\\[tbnrf\"'\\])|((\\u([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\U([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))))*")"#)]
    STRING_LITERAL_QUOTE,

    #[regex(r#"('([^\x27\x5C\x0A\x0D]|(\\[tbnrf\"'\\])|((\\u([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\U([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))))*')"#)]
    STRING_LITERAL_SINGLE_QUOTE,

    #[regex(r#"('''(('|'')?([^'\\]|(\\[tbnrf\"'\\])|((\\u([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\U([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])))))*''')"#)]
    STRING_LITERAL_LONG_SINGLE_QUOTE,

    #[regex(r#"("""(("|"")?([^"\\]|(\\[tbnrf\"'\\])|((\\u([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\U([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])))))*""")"#)]
    STRING_LITERAL_LONG_QUOTE,

    #[regex(r#"(<([^\x00-\x20<>"{}|^`\\]|((\\u([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\U([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))))*>)"#)]
    IRIREF,

    #[regex(r#"((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])((((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])|\.)*((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040]))?)?:)"#)]
    PNAME_NS,

    #[regex(r#"(((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])((((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])|\.)*((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040]))?)?:)(((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|:|[0-9]|((%([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\(_|\~|\.|\-|!|\$|\&|\\"|\(|\)|\*|\+|"|'|;|=|,|/|\?|\#|@|%))))(\.|(((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])|:|((%([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\(_|\~|\.|\-|!|\$|\&|\\"|\(|\)|\*|\+|"|'|;|=|,|/|\?|\#|@|%)))))*))"#)]
    PNAME_LN,

    #[regex(r#"#[^\u000D\u000A]*"#)]
    Comment,
}

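/// Lexes `text` into spanned [`Token`]s using the [`logos`]-based
/// [`TurtleToken`] lexer.
///
/// Lexing never aborts: slices that fail to lex are emitted as
/// [`Token::Invalid`] and a matching error is collected, so the caller always
/// receives the full token stream alongside any errors.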
pub fn parse_tokens_str(text: &str) -> (Vec<Spanned<Token>>, Vec<Simple<char>>) {
    let mut tokens = Vec::new();
    let mut errors = Vec::new();
    let mut lex = TurtleToken::lexer(text);
    while let Some(x) = lex.next() {
        // Full text of the current lexeme.
        let t = || text[lex.span()].to_string();
        // Lexeme text with `d_start` bytes trimmed from the front and `d_end`
        // bytes trimmed from the back (used to drop quotes and delimiters).
        let t2 = |d_start, d_end| {
            let span = lex.span();
            let (start, end) = (span.start, span.end);
            text[start + d_start..end - d_end].to_string()
        };

        match x {
            Ok(token) => {
                let t = match token {
                    TurtleToken::Comment => Token::Comment(t()),
                    TurtleToken::Prefix => Token::PrefixTag,
                    TurtleToken::Base => Token::BaseTag,
                    TurtleToken::SqPrefix => Token::SparqlPrefix,
                    TurtleToken::SqBase => Token::SparqlBase,
                    TurtleToken::SqOpen => Token::SqOpen,
                    TurtleToken::SqClose => Token::SqClose,
                    TurtleToken::BraceOpen => Token::BracketOpen,
                    TurtleToken::BraceClose => Token::BracketClose,
                    TurtleToken::TypeTag => Token::PredType,
                    TurtleToken::Semi => Token::PredicateSplit,
                    TurtleToken::Comma => Token::Comma,
                    TurtleToken::Stop => Token::Stop,
                    TurtleToken::DataTag => Token::DataTypeDelim,
                    TurtleToken::True => Token::True,
                    TurtleToken::False => Token::False,
                    TurtleToken::BLANK_NODE_LABEL => Token::BlankNodeLabel(t2(2, 0)),
                    TurtleToken::DOUBLE => Token::Number(t()),
                    TurtleToken::DECIMAL => Token::Number(t()),
                    TurtleToken::INTEGER => Token::Number(t()),
                    // `123.` lexes as a single INTEGER_WITH_DOT; re-split it
                    // into the number and the statement-terminating `.`.
                    TurtleToken::INTEGER_WITH_DOT => {
                        let span = lex.span();
                        let end = span.end - 1;
                        let start = span.start;
                        tokens.push(spanned(
                            Token::Number(text[start..end].to_string()),
                            start..end,
                        ));
                        tokens.push(spanned(Token::Stop, end..end + 1));

                        continue;
                    }
                    TurtleToken::LANGTAG => Token::LangTag(t2(1, 0)),
                    TurtleToken::STRING_LITERAL_LONG_SINGLE_QUOTE => {
                        Token::Str(t2(3, 3), StringStyle::SingleLong)
                    }
                    TurtleToken::STRING_LITERAL_QUOTE => Token::Str(t2(1, 1), StringStyle::Double),
                    TurtleToken::STRING_LITERAL_LONG_QUOTE => {
                        Token::Str(t2(3, 3), StringStyle::DoubleLong)
                    }
                    TurtleToken::STRING_LITERAL_SINGLE_QUOTE => {
                        Token::Str(t2(1, 1), StringStyle::Single)
                    }
                    TurtleToken::IRIREF => Token::IRIRef(t2(1, 1)),
                    TurtleToken::PNAME_LN | TurtleToken::PNAME_NS => {
                        // A prefixed name may greedily consume a trailing `.`
                        // that is really the statement terminator; peel it
                        // off and emit a separate Stop token.
                        let span = lex.span();
                        let st = &text[span.clone()];
                        if st.ends_with('.') {
                            let (start, end) = (span.start, span.end - 1);
                            let token = match text[start..end].split_once(":") {
                                Some((first, second)) => {
                                    Token::PNameLN(Some(first.to_string()), second.to_string())
                                }
                                None => Token::Invalid(text[start..end].to_string()),
                            };
                            tokens.push(spanned(token, start..end));
                            tokens.push(spanned(Token::Stop, end..end + 1));
                            continue;
                        } else if let Some((first, second)) = st.split_once(":") {
                            Token::PNameLN(Some(first.to_string()), second.to_string())
                        } else {
                            Token::Invalid(t())
                        }
                    }
                };
                tokens.push(spanned(t, lex.span()));
            }
            Err(_) => {
                // Emit the unrecognized slice as an Invalid token and record
                // a lexing error at the same span.
                tokens.push(spanned(Token::Invalid(t()), lex.span()));
                errors.push(Simple::custom(
                    lex.span(),
                    format!("Unexpected token '{}'", &text[lex.span()]),
                ));
            }
        }
    }

    (tokens, errors)
}
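
/// Like [`parse_tokens_str`], but returns `Err` with the collected lexing
/// errors if any slice of the input failed to lex.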
pub fn parse_tokens_str_safe(text: &str) -> Result<Vec<Spanned<Token>>, Vec<Simple<char>>> {
    let (t, e) = parse_tokens_str(text);
    if e.is_empty() {
        Ok(t)
    } else {
        Err(e)
    }
}

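/// Chumsky parser for a single token, built from the `token_helpers`
/// combinators; input that matches no alternative is recovered via
/// `invalid()`.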
pub fn parse_token() -> t!(Token) {
    choice((
        keywords(),
        comment(),
        iri_ref(),
        pname_ns(),
        blank_node_label(),
        lang_tag(),
        integer(),
        strings(),
        tokens(),
    ))
    .recover_with(skip_parser(invalid()))
}

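/// Chumsky parser for a whole document: repeatedly parses padded tokens and
/// recovers by skipping past trailing garbage until end of input.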
pub fn parse_tokens() -> t!(Vec<Spanned<Token>>) {
    parse_token()
        .map_with_span(spanned)
        .padded()
        .repeated()
        .then_ignore(end().recover_with(skip_then_retry_until([])))
}

#[cfg(test)]
mod tests {
    use std::ops::Range;

    use super::*;

    #[allow(unused)]
    fn log_parse_results(tokens: Vec<(Token, Range<usize>)>, err: &[Simple<char>]) {
        tokens.iter().for_each(|tk| {
            println!("{:?}", tk);
        });

        err.iter().for_each(|er| eprintln!("{:?}", er));
    }

    #[test]
    fn parse_directives() {
        let input = "
            @prefix elm: <http://elm.com/types#> .
            @prefix : <http://elm.com/types#> .
            @base <http://example.com/#> .
            # Test comment!
            ";

        assert!(parse_tokens().parse(input).is_ok());
    }

    #[test]
    fn parse_strings() {
        for input in ["\"\"\"test\"\"\"", "\"\"\"t\"est\"\"\""] {
            println!("Input {}", input);
            let (tok, err) = long_string_double().parse_recovery(input);
            println!("Found tokens {:?} {:?}", tok, err);
            assert!(tok.is_some());
            assert!(err.is_empty());
        }
    }

    #[test]
    fn parse_named_node() {
        let input = "
            <http://localhost/elmBeta>
            elm:Beta

            :testing
            ";

        let (tok, err) = parse_tokens().parse_recovery(input);
        assert!(tok.is_some());
        assert!(err.is_empty());
    }

    #[test]
    fn simple_test() {
        let input = "
            @prefix elm: <http://elm.com/types#> .
            @base <http://example.com/#> .

            elm:Beta foaf:string \"cookie\"@en ;
                foaf:astring \"jar\"^^xsd:string .

            elm:Bnode a [ foaf:name \"Kachan\" ;
                foaf:lastName \"Bruh\" ;
                foaf:email \"kb@kbc.be\", \"notkb@notkbc.be\" ].
            ";

        let (tok, err) = parse_tokens().parse_recovery(input);
        assert!(tok.is_some());
        assert!(err.is_empty());
    }

    #[test]
    fn complex_test() {
        let input = "
            @prefix rr: <http://www.w3.org/ns/r2rml#> .
            @prefix foaf: <http://xmlns.com/foaf/0.1/> .
            @prefix ex: <http://example.com/> .
            @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
            @prefix rml: <http://semweb.mmlab.be/ns/rml#> .
            @prefix ql: <http://semweb.mmlab.be/ns/ql#> .

            @base <http://example.com/base/> .

            <TriplesMap1>
                a rr:TriplesMap;

                rml:logicalSource [
                    rml:source \"student.csv\";
                    rml:referenceFormulation ql:CSV
                ] ;

                rr:subjectMap [
                    rr:template \"http://example.com/{Name}\"
                ];

                rr:predicateObjectMap [
                    rr:predicate foaf:name ;
                    rr:objectMap [
                        rml:reference \"Name\"
                    ]
                ].
            ";

        let (tok, err) = parse_tokens().parse_recovery(input);
        assert!(tok.is_some());
        assert!(err.is_empty());
    }

    #[test]
    fn parse_invalid() {
        let input = "
            @prefix elm: http .
            ";

        let (tok, err) = parse_tokens().parse_recovery(input);
        assert!(tok.is_some());

        println!("tokens {:?}", tok);
        assert_eq!(tok.unwrap().len(), 4);
        assert_eq!(err.len(), 1);
    }
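
    // A minimal sketch exercising the logos-based lexer path directly
    // (`parse_tokens_str`), which the chumsky-based tests above do not cover.
    // The expected count assumes the token mapping defined in this file:
    // PNameLN, PredType (`a`), PNameLN, Stop.
    #[test]
    fn lex_simple_statement() {
        let (tokens, errors) = parse_tokens_str("ex:a a ex:B .");
        assert!(errors.is_empty());
        assert_eq!(tokens.len(), 4);
    }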
}