lang_sparql/lang/
tokenizer.rs

1use chumsky::prelude::*;
2use logos::Logos;
3use lsp_core::prelude::{
4    spanned, Spanned, SparqlAggregate, SparqlCall, SparqlExpr, SparqlKeyword, StringStyle, Token,
5};
6
7#[allow(non_camel_case_types)]
8#[derive(Logos, Debug, PartialEq)]
9#[logos(skip r"[ \t\n\f\r]+")] // Ignore this regex pattern between tokens
10enum SparqlToken {
11    #[token("REGEX", |_| SparqlKeyword::Regex, ignore(case))]
12    #[token("SUBSTR", |_| SparqlKeyword::Substr, ignore(case))]
13    #[token("REPLACE", |_| SparqlKeyword::Replace, ignore(case))]
14    #[token("EXISTS", |_| SparqlKeyword::Exists, ignore(case))]
15    #[token("SELECT", |_| SparqlKeyword::Select, ignore(case))]
16    #[token("DISTINCT", |_| SparqlKeyword::Distinct, ignore(case))]
17    #[token("REDUCED", |_| SparqlKeyword::Reduced, ignore(case))]
18    #[token("OPTIONAL", |_| SparqlKeyword::Optional, ignore(case))]
19    #[token("UNION", |_| SparqlKeyword::Union, ignore(case))]
20    #[token("AS", |_| SparqlKeyword::As, ignore(case))]
21    #[token("CONSTRUCT", |_| SparqlKeyword::Construct, ignore(case))]
22    #[token("WHERE", |_| SparqlKeyword::Where, ignore(case))]
23    #[token("DESCRIBE", |_| SparqlKeyword::Describe, ignore(case))]
24    #[token("ASK", |_| SparqlKeyword::Ask, ignore(case))]
25    #[token("FROM", |_| SparqlKeyword::From, ignore(case))]
26    #[token("NAMED", |_| SparqlKeyword::Named, ignore(case))]
27    #[token("GROUP", |_| SparqlKeyword::Group, ignore(case))]
28    #[token("BY", |_| SparqlKeyword::By, ignore(case))]
29    #[token("HAVING", |_| SparqlKeyword::Having, ignore(case))]
30    #[token("ORDER", |_| SparqlKeyword::Order, ignore(case))]
31    #[token("ASC", |_| SparqlKeyword::Asc, ignore(case))]
32    #[token("DESC", |_| SparqlKeyword::Desc, ignore(case))]
33    #[token("LIMIT", |_| SparqlKeyword::Limit, ignore(case))]
34    #[token("OFFSET", |_| SparqlKeyword::Offset, ignore(case))]
35    #[token("VALUES", |_| SparqlKeyword::Values, ignore(case))]
36    #[token("LOAD", |_| SparqlKeyword::Load, ignore(case))]
37    #[token("SILENT", |_| SparqlKeyword::Silent, ignore(case))]
38    #[token("CLEAR", |_| SparqlKeyword::Clear, ignore(case))]
39    #[token("DROP", |_| SparqlKeyword::Drop, ignore(case))]
40    #[token("CREATE", |_| SparqlKeyword::Create, ignore(case))]
41    #[token("ADD", |_| SparqlKeyword::Add, ignore(case))]
42    #[token("MOVE", |_| SparqlKeyword::Move, ignore(case))]
43    #[token("COPY", |_| SparqlKeyword::Copy, ignore(case))]
44    #[token("INSERT", |_| SparqlKeyword::Insert, ignore(case))]
45    #[token("DATA", |_| SparqlKeyword::Data, ignore(case))]
46    #[token("DELETE", |_| SparqlKeyword::Delete, ignore(case))]
47    #[token("WITH", |_| SparqlKeyword::With, ignore(case))]
48    #[token("USING", |_| SparqlKeyword::Using, ignore(case))]
49    #[token("DEFAULT", |_| SparqlKeyword::Default, ignore(case))]
50    #[token("ALL", |_| SparqlKeyword::All, ignore(case))]
51    #[token("GRAPH", |_| SparqlKeyword::Graph, ignore(case))]
52    #[token("SERVICE", |_| SparqlKeyword::Service, ignore(case))]
53    #[token("BIND", |_| SparqlKeyword::Bind, ignore(case))]
54    #[token("UNDEF", |_| SparqlKeyword::Undef, ignore(case))]
55    #[token("MINUS", |_| SparqlKeyword::Minus, ignore(case))]
56    #[token("FILTER", |_| SparqlKeyword::Filter, ignore(case))]
57    Kwd(SparqlKeyword),
58
59    #[token("COUNT", |_| SparqlAggregate::Count, ignore(case))]
60    #[token("SUM", |_| SparqlAggregate::Sum, ignore(case))]
61    #[token("MIN", |_| SparqlAggregate::Min, ignore(case))]
62    #[token("MAX", |_| SparqlAggregate::Max, ignore(case))]
63    #[token("AVG", |_| SparqlAggregate::Avg, ignore(case))]
64    #[token("SAMPLE", |_| SparqlAggregate::Sample, ignore(case))]
65    #[token("GROUP_CONCAT", |_| SparqlAggregate::GroupConcat, ignore(case))]
66    Agg(SparqlAggregate),
67
68    #[token("STR", |_| SparqlCall::Str, ignore(case))]
69    #[token("LANG", |_| SparqlCall::Lang, ignore(case))]
70    #[token("langMatches", |_| SparqlCall::LangMatches, ignore(case))]
71    #[token("LANGDIR", |_| SparqlCall::LangDir, ignore(case))]
72    #[token("datatype", |_| SparqlCall::Datatype, ignore(case))]
73    #[token("BOUND", |_| SparqlCall::Bound, ignore(case))]
74    #[token("IRI", |_| SparqlCall::Iri, ignore(case))]
75    #[token("URI", |_| SparqlCall::Uri, ignore(case))]
76    #[token("BNODE", |_| SparqlCall::Bnode, ignore(case))]
77    #[token("RAND", |_| SparqlCall::Rand, ignore(case))]
78    #[token("ABS", |_| SparqlCall::Abs, ignore(case))]
79    #[token("CEIL", |_| SparqlCall::Ceil, ignore(case))]
80    #[token("FLOOR", |_| SparqlCall::Floor, ignore(case))]
81    #[token("ROUND", |_| SparqlCall::Round, ignore(case))]
82    #[token("CONCAT", |_| SparqlCall::Concat, ignore(case))]
83    #[token("STRLEN", |_| SparqlCall::StrLen, ignore(case))]
84    #[token("UCASE", |_| SparqlCall::Ucase, ignore(case))]
85    #[token("lcase", |_| SparqlCall::Lcase, ignore(case))]
86    #[token("ENCODE_FOR_URI", |_| SparqlCall::EncodeForUri, ignore(case))]
87    #[token("CONTAINS", |_| SparqlCall::Contains, ignore(case))]
88    #[token("STRSTARTS", |_| SparqlCall::StrStarts, ignore(case))]
89    #[token("STRENDS", |_| SparqlCall::StrEnds, ignore(case))]
90    #[token("STRBEFORE", |_| SparqlCall::StrBefore, ignore(case))]
91    #[token("STRAFTER", |_| SparqlCall::StrAfter, ignore(case))]
92    #[token("YEAR", |_| SparqlCall::Year, ignore(case))]
93    #[token("MONTH", |_| SparqlCall::Month, ignore(case))]
94    #[token("DAY", |_| SparqlCall::Day, ignore(case))]
95    #[token("HOURS", |_| SparqlCall::Hours, ignore(case))]
96    #[token("MINUTES", |_| SparqlCall::Minutes, ignore(case))]
97    #[token("SECONDS", |_| SparqlCall::Seconds, ignore(case))]
98    #[token("TIMEZONE", |_| SparqlCall::Timezone, ignore(case))]
99    #[token("TZ", |_| SparqlCall::Tz, ignore(case))]
100    #[token("NOW", |_| SparqlCall::Now, ignore(case))]
101    #[token("UUID", |_| SparqlCall::Uuid, ignore(case))]
102    #[token("STRUUID", |_| SparqlCall::StrUuid, ignore(case))]
103    #[token("MD5", |_| SparqlCall::Md5, ignore(case))]
104    #[token("SHA1", |_| SparqlCall::Sha1, ignore(case))]
105    #[token("SHA256", |_| SparqlCall::Sha256, ignore(case))]
106    #[token("SHA384", |_| SparqlCall::Sha384, ignore(case))]
107    #[token("SHA512", |_| SparqlCall::Sha512, ignore(case))]
108    #[token("COALESCE", |_| SparqlCall::Coalesce, ignore(case))]
109    #[token("IF", |_| SparqlCall::If, ignore(case))]
110    #[token("STRLANG", |_| SparqlCall::StrLang, ignore(case))]
111    #[token("STRLANGDIR", |_| SparqlCall::StrLangDir, ignore(case))]
112    #[token("STRDT", |_| SparqlCall::StrDt, ignore(case))]
113    #[token("sameTerm", |_| SparqlCall::SameTerm, ignore(case))]
114    #[token("isIRI", |_| SparqlCall::IsIri, ignore(case))]
115    #[token("isURI", |_| SparqlCall::IsUri, ignore(case))]
116    #[token("isBLANK", |_| SparqlCall::IsBlank, ignore(case))]
117    #[token("isLITERAL", |_| SparqlCall::IsLiteral, ignore(case))]
118    #[token("isNUMBERIC", |_| SparqlCall::IsNumeric, ignore(case))]
119    #[token("hasLANG", |_| SparqlCall::HasLang, ignore(case))]
120    #[token("hasLANGDIR", |_| SparqlCall::HasLangDir, ignore(case))]
121    #[token("isTRIPLE", |_| SparqlCall::IsTriple, ignore(case))]
122    #[token("TRIPLE", |_| SparqlCall::Triple, ignore(case))]
123    #[token("SUBJECT", |_| SparqlCall::Subject, ignore(case))]
124    #[token("PREDICATE", |_| SparqlCall::Predicate, ignore(case))]
125    #[token("OBJECT", |_| SparqlCall::Object, ignore(case))]
126    Call(SparqlCall),
127
128    #[token("in", |_| SparqlExpr::In, ignore(case))]
129    #[token("not", |_| SparqlExpr::Not, ignore(case))]
130    #[token("||", |_| SparqlExpr::Or, ignore(case))]
131    #[token("&&", |_| SparqlExpr::And, ignore(case))]
132    #[token("=", |_| SparqlExpr::Equal, ignore(case))]
133    #[token("!=", |_| SparqlExpr::NotEqual, ignore(case))]
134    #[token("<", |_| SparqlExpr::Lt, ignore(case))]
135    #[token(">", |_| SparqlExpr::Gt, ignore(case))]
136    #[token("<=", |_| SparqlExpr::Lte, ignore(case))]
137    #[token(">=", |_| SparqlExpr::Gte, ignore(case))]
138    #[token("+", |_| SparqlExpr::Plus, ignore(case))]
139    #[token("-", |_| SparqlExpr::Minus, ignore(case))]
140    #[token("*", |_| SparqlExpr::Times, ignore(case))]
141    #[token("/", |_| SparqlExpr::Divide, ignore(case))]
142    #[token("!", |_| SparqlExpr::Exclamation, ignore(case))]
143    Expr(SparqlExpr),
144
145    #[token("prefix", ignore(case))]
146    SqPrefix,
147
148    #[token("base", ignore(case))]
149    SqBase,
150
151    #[token("[")]
152    SqOpen,
153
154    #[token("]")]
155    SqClose,
156
157    #[token("(")]
158    BraceOpen,
159
160    #[token(")")]
161    BraceClose,
162
163    #[token("a")]
164    TypeTag,
165
166    #[token(";")]
167    Semi,
168
169    #[token(",")]
170    Comma,
171    #[token(".")]
172    Stop,
173
174    #[token("^^")]
175    DataTag,
176
177    #[token("true")]
178    True,
179
180    #[token("false")]
181    False,
182
183    #[token("{")]
184    CurlOpen,
185
186    #[token("}")]
187    CurlClose,
188
189    #[regex(r#"(_:((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|[0-9])((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])*(\.*((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])*)*)"#)]
190    BLANK_NODE_LABEL,
191
192    #[regex(r#"([+-]?(([0-9]+\.[0-9]*([eE][+-]?[0-9]+))|(\.([0-9])+([eE][+-]?[0-9]+))|(([0-9])+([eE][+-]?[0-9]+))))"#)]
193    DOUBLE,
194
195    #[regex(r#"([+-]?([0-9])*\.([0-9])+)"#)]
196    DECIMAL,
197
198    #[regex(r#"([+-]?[0-9]+)"#)]
199    INTEGER,
200
201    #[regex(r#"([+-]?[0-9]+\.)"#)]
202    INTEGER_WITH_DOT,
203
204    #[regex(r#"(@[a-zA-Z][a-zA-Z]*(\-[a-zA-Z0-9][a-zA-Z0-9]*)*)"#)]
205    LANGTAG,
206
207    #[regex(r#"("([^\x22\x5C\x0A\x0D]|(\\[tbnrf\"'\\])|((\\u([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\U([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))))*")"#)]
208    STRING_LITERAL_QUOTE,
209
210    #[regex(r#"('([^\x27\x5C\x0A\x0D]|(\\[tbnrf\"'\\])|((\\u([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\U([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))))*')"#)]
211    STRING_LITERAL_SINGLE_QUOTE,
212
213    #[regex(r#"('''(('|'')?([^'\\]|(\\[tbnrf\"'\\])|((\\u([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\U([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])))))*''')"#)]
214    STRING_LITERAL_LONG_SINGLE_QUOTE,
215
216    #[regex(r#"("""(("|"")?([^"\\]|(\\[tbnrf\"'\\])|((\\u([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\U([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])))))*""")"#)]
217    STRING_LITERAL_LONG_QUOTE,
218
219    #[regex(r#"(<([^\x00-\x20<>"{}|^`\\]|((\\u([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\U([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))))*>)"#)]
220    IRIREF,
221
222    #[regex(r#"((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])((((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])|\.)*((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040]))?)?:)"#)]
223    PNAME_NS,
224
225    #[regex(r#"(((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])((((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])|\.)*((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040]))?)?:)(((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|:|[0-9]|((%([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\(_|\~|\.|\-|!|\$|\&|\\"|\(|\)|\*|\+|"|'|;|=|,|/|\?|\#|@|%))))(\.|(((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|\-|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])|:|((%([0-9]|[A-F]|[a-f])([0-9]|[A-F]|[a-f]))|(\\(_|\~|\.|\-|!|\$|\&|\\"|\(|\)|\*|\+|"|'|;|=|,|/|\?|\#|@|%)))))*))"#)]
226    PNAME_LN,
227
228    #[regex(r#"#[^\u000D\u000A]*"#)]
229    Comment,
230
231    #[regex(r#"((\?|\$)((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|[0-9])((([A-Z]|[a-z]|[\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF])|_)|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040])*)"#)]
232    Variable,
233}
234
235pub fn parse_tokens_str<'a>(text: &'a str) -> (Vec<Spanned<Token>>, Vec<Simple<char>>) {
236    let mut tokens = Vec::new();
237    let mut errors = Vec::new();
238    let mut lex = SparqlToken::lexer(text);
239    while let Some(x) = lex.next() {
240        let t = || text[lex.span()].to_string();
241        let t2 = |d_start, d_end| {
242            let span = lex.span();
243            let (start, end) = (span.start, span.end);
244            text[start + d_start..end - d_end].to_string()
245        };
246
247        match x {
248            Ok(token) => {
249                let t = match token {
250                    SparqlToken::Comment => Token::Comment(t()),
251                    SparqlToken::SqPrefix => Token::SparqlPrefix,
252                    SparqlToken::SqBase => Token::SparqlBase,
253                    SparqlToken::SqOpen => Token::SqOpen,
254                    SparqlToken::SqClose => Token::SqClose,
255                    SparqlToken::BraceOpen => Token::BracketOpen,
256                    SparqlToken::BraceClose => Token::BracketClose,
257                    SparqlToken::TypeTag => Token::PredType,
258                    SparqlToken::CurlOpen => Token::CurlOpen,
259                    SparqlToken::CurlClose => Token::CurlClose,
260                    SparqlToken::Semi => Token::PredicateSplit,
261                    SparqlToken::Comma => Token::Comma,
262                    SparqlToken::Stop => Token::Stop,
263                    SparqlToken::DataTag => Token::DataTypeDelim,
264                    SparqlToken::True => Token::True,
265                    SparqlToken::False => Token::False,
266                    SparqlToken::BLANK_NODE_LABEL => Token::BlankNodeLabel(t2(2, 0)),
267                    SparqlToken::DOUBLE => Token::Number(t()),
268                    SparqlToken::DECIMAL => Token::Number(t()),
269                    SparqlToken::INTEGER => Token::Number(t()),
270                    SparqlToken::INTEGER_WITH_DOT => {
271                        let span = lex.span();
272                        let end = span.end - 1;
273                        let start = span.start;
274                        tokens.push(spanned(
275                            Token::Number(text[start..end].to_string()),
276                            start..end,
277                        ));
278                        tokens.push(spanned(Token::Stop, end..end + 1));
279
280                        continue;
281                    }
282                    SparqlToken::LANGTAG => Token::LangTag(t2(1, 0)),
283                    SparqlToken::STRING_LITERAL_LONG_SINGLE_QUOTE => {
284                        Token::Str(t2(3, 3), StringStyle::SingleLong)
285                    }
286                    SparqlToken::STRING_LITERAL_QUOTE => Token::Str(t2(1, 1), StringStyle::Double),
287                    SparqlToken::STRING_LITERAL_LONG_QUOTE => {
288                        Token::Str(t2(3, 3), StringStyle::DoubleLong)
289                    }
290                    SparqlToken::STRING_LITERAL_SINGLE_QUOTE => {
291                        Token::Str(t2(1, 1), StringStyle::Single)
292                    }
293                    SparqlToken::IRIREF => Token::IRIRef(t2(1, 1)),
294                    SparqlToken::PNAME_LN | SparqlToken::PNAME_NS => {
295                        let st = &text[lex.span()];
296                        let ends_with_stop = st.ends_with('.');
297
298                        if ends_with_stop {
299                            let span = lex.span();
300                            let end = span.end - 1;
301                            let start = span.start;
302                            if let Some((first, second)) = text[start..end].split_once(":") {
303                                tokens.push(spanned(
304                                    Token::PNameLN(Some(first.to_string()), second.to_string()),
305                                    start..end,
306                                ));
307                                tokens.push(spanned(Token::Stop, end..end + 1));
308                            } else {
309                                tokens.push(spanned(
310                                    Token::Invalid(text[start..end].to_string()),
311                                    start..end,
312                                ));
313                                tokens.push(spanned(Token::Stop, end..end + 1));
314                            }
315                            continue;
316                        } else {
317                            if let Some((first, second)) = text[lex.span()].split_once(":") {
318                                Token::PNameLN(Some(first.to_string()), second.to_string())
319                            } else {
320                                Token::Invalid(t())
321                            }
322                        }
323                    }
324                    SparqlToken::Kwd(sparql_keyword) => Token::SparqlKeyword(sparql_keyword),
325                    SparqlToken::Agg(sparql_aggregate) => Token::SparqlAggregate(sparql_aggregate),
326                    SparqlToken::Call(sparql_call) => Token::SparqlCall(sparql_call),
327                    SparqlToken::Expr(sparql_expr) => Token::SparqlExpr(sparql_expr),
328                    SparqlToken::Variable => Token::Variable(t()),
329                };
330                tokens.push(spanned(t, lex.span()));
331            }
332            Err(_) => {
333                tokens.push(spanned(Token::Invalid(t()), lex.span()));
334                errors.push(Simple::custom(
335                    lex.span(),
336                    format!("Unexpected token '{}'", &text[lex.span()]),
337                ))
338            }
339        }
340    }
341
342    (tokens, errors)
343}
344
345pub fn parse_tokens_str_safe(text: &str) -> Result<Vec<Spanned<Token>>, Vec<Simple<char>>> {
346    let (t, e) = parse_tokens_str(text);
347    if e.is_empty() {
348        Ok(t)
349    } else {
350        Err(e)
351    }
352}
353
#[cfg(test)]
mod tests {
    use super::parse_tokens_str;

    /// Lexes `query` and asserts it yields exactly `expected` tokens and no errors.
    fn assert_token_count(query: &str, expected: usize) {
        let (tokens, errors) = parse_tokens_str(query);
        assert_eq!(tokens.len(), expected);
        assert_eq!(errors, vec![]);
    }

    #[test]
    fn parse_random_tokens_1() {
        let query = r#"
PREFIX ent:  <http://org.example.com/employees#>
DESCRIBE ?x WHERE { ?x ent:employeeId "1234" }
        "#;

        assert_token_count(query, 11);
    }

    #[test]
    fn parse_random_tokens_2() {
        let query = r#"
PREFIX  dc:  <http://purl.org/dc/elements/1.1/>
SELECT  ?title
WHERE   { 
    ?x dc:title ?title
    FILTER regex(?title, "^SPARQL") 
}
        "#;

        assert_token_count(query, 18);
    }

    #[test]
    fn parse_random_tokens_3() {
        let query = r#"
PREFIX  dc:  <http://purl.org/dc/elements/1.1/>
PREFIX  ns:  <http://example.org/ns#>

SELECT  ?title ?price
WHERE   {
    ?x ns:price ?price .
    FILTER (?price < 30.5)
    ?x dc:title ?title . 
}
        "#;

        assert_token_count(query, 26);
    }

    #[test]
    fn parse_random_tokens_4() {
        let query = r#"
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?name ?mbox
WHERE  {
    ?x foaf:name  ?name .
    OPTIONAL { ?x  foaf:mbox  ?mbox }
}
        "#;

        assert_token_count(query, 19);
    }

    #[test]
    fn parse_random_tokens_5() {
        let query = r#"
PREFIX foaf:    <http://xmlns.com/foaf/0.1/>
ASK  {
   ?x foaf:name  "Alice" ;
      foaf:mbox  <mailto:alice@work.example>
}
        "#;

        assert_token_count(query, 12);
    }

    #[test]
    fn parse_random_tokens_6() {
        let query = r#"
PREFIX  dc: <http://purl.org/dc/elements/1.1/>
PREFIX app: <http://example.org/ns#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

CONSTRUCT { ?s ?p ?o } WHERE
{
    GRAPH ?g { ?s ?p ?o } .
    ?g dc:publisher <http://www.w3.org/> .
    ?g dc:date ?date .
    FILTER ( app:customDate(?date) > "2005-02-28T00:00:00Z"^^xsd:dateTime ) .
}
        "#;

        assert_token_count(query, 46);
    }

    #[test]
    fn parse_random_tokens_7() {
        let query = r#"
PREFIX foaf:    <http://xmlns.com/foaf/0.1/>
PREFIX vcard:   <http://www.w3.org/2001/vcard-rdf/3.0#>

CONSTRUCT {
     ?x  vcard:N _:v .
    _:v vcard:givenName ?gname .
    _:v vcard:familyName ?fname
} WHERE {
    { ?x foaf:firstname ?gname } UNION  { ?x foaf:givenname   ?gname } .
    { ?x foaf:surname   ?fname } UNION  { ?x foaf:family_name ?fname } .
}
        "#;

        assert_token_count(query, 47);
    }

    #[test]
    fn parse_random_tokens_8() {
        let query = r#"
PREFIX  dc:  <http://purl.org/dc/elements/1.1/>
PREFIX  ns:  <http://example.org/ns#>
SELECT  ?title (?p*(1-?discount) AS ?price)
{ ?x ns:price ?p .
  ?x dc:title ?title . 
  [] ns:discount ?discount 
}
        "#;

        let (tokens, errors) = parse_tokens_str(query);
        // Dump the token stream to stdout to make a failing count easier to diagnose.
        tokens.iter().for_each(|t| println!("t {:?}", t));
        assert_eq!(tokens.len(), 33);
        assert_eq!(errors, vec![]);
    }
}
497}