oxttl/
lexer.rs

1#![allow(clippy::range_plus_one)]
2
3use crate::toolkit::{TokenRecognizer, TokenRecognizerError};
4use memchr::{memchr, memchr2};
5use oxilangtag::LanguageTag;
6use oxiri::Iri;
7use oxrdf::NamedNode;
8use std::borrow::Cow;
9use std::cmp::min;
10use std::collections::HashMap;
11use std::ops::Range;
12use std::str;
13
/// A lexical token recognized by [`N3Lexer`].
///
/// Variants carrying `&'a str` or `Cow<'a, str>` can borrow directly from the
/// input buffer; variants carrying `String` always own their content.
#[derive(Debug, Eq, PartialEq)]
pub enum N3Token<'a> {
    /// An IRI reference written between `<` and `>`, as an owned string.
    IriRef(String),
    /// A prefixed name such as `prefix:local`.
    PrefixedName {
        /// The text before the `:` (possibly empty).
        prefix: &'a str,
        /// The local part after the `:`, owned only if escapes had to be decoded.
        local: Cow<'a, str>,
        /// Set by the lexer when the local part contains characters that need IRI validation.
        might_be_invalid_iri: bool,
    },
    /// A `?name` variable; holds the name without the leading `?`.
    Variable(Cow<'a, str>),
    /// A `_:label` blank node; holds the label without the leading `_:`.
    BlankNodeLabel(&'a str),
    /// The decoded content of a quoted string literal.
    String(String),
    /// The lexical form of an integer.
    Integer(&'a str),
    /// The lexical form of a decimal.
    Decimal(&'a str),
    /// The lexical form of a double.
    Double(&'a str),
    /// A language tag; holds the tag without the leading `@`.
    LangTag(&'a str),
    /// A punctuation mark such as `.`, `;` or `^^`.
    Punctuation(&'a str),
    /// A bare word that is not followed by `:`.
    PlainKeyword(&'a str),
}
32
/// The syntax family the lexer is tokenizing.
///
/// The mode switches some token forms on or off: for example long and
/// single-quoted strings are not recognized in [`N3LexerMode::NTriples`], and
/// the `<=` / `<-` punctuation is only recognized in [`N3LexerMode::N3`].
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum N3LexerMode {
    /// N-Triples flavoured input.
    NTriples,
    /// Turtle flavoured input.
    Turtle,
    /// Notation3 input.
    N3,
}
39
40#[derive(Default)]
41pub struct N3LexerOptions {
42    pub base_iri: Option<Iri<String>>,
43}
44
45pub struct N3Lexer {
46    mode: N3LexerMode,
47    unchecked: bool,
48}
49
// TODO: many recognizers still return `None` (i.e. "missing data") even when the stream is ending!
// TODO: simplify by not passing `is_ending` and failing with an "unexpected EOF" if `None` is returned when `is_ending` is true?
52
53impl TokenRecognizer for N3Lexer {
54    type Token<'a> = N3Token<'a>;
55    type Options = N3LexerOptions;
56
57    fn recognize_next_token<'a>(
58        &mut self,
59        data: &'a [u8],
60        is_ending: bool,
61        options: &N3LexerOptions,
62    ) -> Option<(usize, Result<N3Token<'a>, TokenRecognizerError>)> {
63        match *data.first()? {
64            b'<' => match *data.get(1)? {
65                b'<' => Some((2, Ok(N3Token::Punctuation("<<")))),
66                b'=' if self.mode == N3LexerMode::N3 => {
67                    if let Some((consumed, result)) = self.recognize_iri(data, options) {
68                        Some(if let Ok(result) = result {
69                            (consumed, Ok(result))
70                        } else {
71                            (2, Ok(N3Token::Punctuation("<=")))
72                        })
73                    } else if is_ending {
74                        Some((2, Ok(N3Token::Punctuation("<="))))
75                    } else {
76                        None
77                    }
78                }
79                b'-' if self.mode == N3LexerMode::N3 => {
80                    if let Some((consumed, result)) = self.recognize_iri(data, options) {
81                        Some(if let Ok(result) = result {
82                            (consumed, Ok(result))
83                        } else {
84                            (2, Ok(N3Token::Punctuation("<-")))
85                        })
86                    } else if is_ending {
87                        Some((2, Ok(N3Token::Punctuation("<-"))))
88                    } else {
89                        None
90                    }
91                }
92                _ => self.recognize_iri(data, options),
93            },
94            b'>' => {
95                if *data.get(1)? == b'>' {
96                    Some((2, Ok(N3Token::Punctuation(">>"))))
97                } else {
98                    Some((1, Ok(N3Token::Punctuation(">"))))
99                }
100            }
101            b'_' => match data.get(1)? {
102                b':' => Self::recognize_blank_node_label(data),
103                c => Some((
104                    1,
105                    Err((0, format!("Unexpected character '{}'", char::from(*c))).into()),
106                )),
107            },
108            b'"' => {
109                if self.mode != N3LexerMode::NTriples
110                    && *data.get(1)? == b'"'
111                    && *data.get(2)? == b'"'
112                {
113                    self.recognize_long_string(data, b'"')
114                } else {
115                    self.recognize_string(data, b'"')
116                }
117            }
118            b'\'' if self.mode != N3LexerMode::NTriples => {
119                if *data.get(1)? == b'\'' && *data.get(2)? == b'\'' {
120                    self.recognize_long_string(data, b'\'')
121                } else {
122                    self.recognize_string(data, b'\'')
123                }
124            }
125            b'@' => self.recognize_lang_tag(data),
126            b'.' => match data.get(1) {
127                Some(b'0'..=b'9') => Self::recognize_number(data, is_ending),
128                Some(_) => Some((1, Ok(N3Token::Punctuation(".")))),
129                None => is_ending.then_some((1, Ok(N3Token::Punctuation(".")))),
130            },
131            b'^' => {
132                if *data.get(1)? == b'^' {
133                    Some((2, Ok(N3Token::Punctuation("^^"))))
134                } else {
135                    Some((1, Ok(N3Token::Punctuation("^"))))
136                }
137            }
138            b'(' => Some((1, Ok(N3Token::Punctuation("(")))),
139            b')' => Some((1, Ok(N3Token::Punctuation(")")))),
140            b'[' => Some((1, Ok(N3Token::Punctuation("[")))),
141            b']' => Some((1, Ok(N3Token::Punctuation("]")))),
142            b'{' => {
143                if *data.get(1)? == b'|' {
144                    Some((2, Ok(N3Token::Punctuation("{|"))))
145                } else {
146                    Some((1, Ok(N3Token::Punctuation("{"))))
147                }
148            }
149            b'}' => Some((1, Ok(N3Token::Punctuation("}")))),
150            b',' => Some((1, Ok(N3Token::Punctuation(",")))),
151            b';' => Some((1, Ok(N3Token::Punctuation(";")))),
152            b'!' => Some((1, Ok(N3Token::Punctuation("!")))),
153            b'|' => {
154                if *data.get(1)? == b'}' {
155                    Some((2, Ok(N3Token::Punctuation("|}"))))
156                } else {
157                    Some((1, Ok(N3Token::Punctuation("|"))))
158                }
159            }
160            b'=' => {
161                if *data.get(1)? == b'>' {
162                    Some((2, Ok(N3Token::Punctuation("=>"))))
163                } else {
164                    Some((1, Ok(N3Token::Punctuation("="))))
165                }
166            }
167            b'0'..=b'9' | b'+' | b'-' => Self::recognize_number(data, is_ending),
168            b'?' => self.recognize_variable(data, is_ending),
169            _ => self.recognize_pname_or_keyword(data, is_ending),
170        }
171    }
172}
173
174impl N3Lexer {
175    pub fn new(mode: N3LexerMode, unchecked: bool) -> Self {
176        Self { mode, unchecked }
177    }
178
179    fn recognize_iri(
180        &self,
181        data: &[u8],
182        options: &N3LexerOptions,
183    ) -> Option<(usize, Result<N3Token<'static>, TokenRecognizerError>)> {
184        // [18] IRIREF  ::=  '<' ([^#x00-#x20<>"{}|^`\] | UCHAR)* '>' /* #x00=NULL #01-#x1F=control codes #x20=space */
185        let mut string = Vec::new();
186        let mut i = 1;
187        loop {
188            let end = memchr2(b'>', b'\\', &data[i..])?;
189            string.extend_from_slice(&data[i..i + end]);
190            i += end;
191            match data[i] {
192                b'>' => {
193                    return Some((i + 1, self.parse_iri(string, 0..i + 1, options)));
194                }
195                b'\\' => {
196                    let (additional, c) = self.recognize_escape(&data[i..], i, false)?;
197                    i += additional + 1;
198                    match c {
199                        Ok(c) => {
200                            let mut buf = [0; 4];
201                            string.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
202                        }
203                        Err(e) => return Some((i, Err(e))),
204                    }
205                }
206                _ => unreachable!(),
207            }
208        }
209    }
210
211    fn parse_iri(
212        &self,
213        iri: Vec<u8>,
214        position: Range<usize>,
215        options: &N3LexerOptions,
216    ) -> Result<N3Token<'static>, TokenRecognizerError> {
217        let iri = string_from_utf8(iri, position.clone())?;
218        Ok(N3Token::IriRef(
219            if let Some(base_iri) = options.base_iri.as_ref() {
220                if self.unchecked {
221                    base_iri.resolve_unchecked(&iri)
222                } else {
223                    base_iri
224                        .resolve(&iri)
225                        .map_err(|e| (position, e.to_string()))?
226                }
227                .into_inner()
228            } else if self.unchecked {
229                iri
230            } else {
231                Iri::parse(iri)
232                    .map_err(|e| (position, e.to_string()))?
233                    .into_inner()
234            },
235        ))
236    }
237
238    fn recognize_pname_or_keyword<'a>(
239        &self,
240        data: &'a [u8],
241        is_ending: bool,
242    ) -> Option<(usize, Result<N3Token<'a>, TokenRecognizerError>)> {
243        // [139s]  PNAME_NS   ::=  PN_PREFIX? ':'
244        // [140s]  PNAME_LN   ::=  PNAME_NS PN_LOCAL
245        // [167s]  PN_PREFIX  ::=  PN_CHARS_BASE ((PN_CHARS | '.')* PN_CHARS)?
246        let mut i = 0;
247        loop {
248            if let Some(r) = Self::recognize_unicode_char(&data[i..], i) {
249                match r {
250                    Ok((c, consumed)) => {
251                        if c == ':' {
252                            i += consumed;
253                            break;
254                        } else if i == 0 {
255                            if !Self::is_possible_pn_chars_base(c) {
256                                return Some((
257                                    consumed,
258                                    Err((
259                                        0..consumed,
260                                        format!(
261                                            "'{c}' is not allowed at the beginning of a prefix name"
262                                        ),
263                                    )
264                                        .into()),
265                                ));
266                            }
267                            i += consumed;
268                        } else if Self::is_possible_pn_chars(c) || c == '.' {
269                            i += consumed;
270                        } else {
271                            while data[..i].ends_with(b".") {
272                                i -= 1;
273                            }
274                            return Some((
275                                i,
276                                str_from_utf8(&data[..i], 0..i).map(N3Token::PlainKeyword),
277                            ));
278                        }
279                    }
280                    Err(e) => return Some((e.location.end, Err(e))),
281                }
282            } else if is_ending {
283                while data[..i].ends_with(b".") {
284                    i -= 1;
285                }
286                return Some(if i == 0 {
287                    (
288                        1,
289                        Err((0..1, format!("Unexpected byte {}", data[0])).into()),
290                    )
291                } else {
292                    (
293                        i,
294                        str_from_utf8(&data[..i], 0..i).map(N3Token::PlainKeyword),
295                    )
296                });
297            } else {
298                return None;
299            }
300        }
301        let pn_prefix = match str_from_utf8(&data[..i - 1], 0..i - 1) {
302            Ok(pn_prefix) => pn_prefix,
303            Err(e) => return Some((i, Err(e))),
304        };
305        if pn_prefix.ends_with('.') {
306            return Some((
307                i,
308                Err((
309                    0..i,
310                    format!(
311                        "'{pn_prefix}' is not a valid prefix: prefixes are not allowed to end with '.'"),
312                )
313                    .into()),
314            ));
315        }
316
317        let (consumed, pn_local_result) =
318            self.recognize_optional_pn_local(&data[i..], is_ending)?;
319        Some((
320            consumed + i,
321            pn_local_result.map(|(local, might_be_invalid_iri)| N3Token::PrefixedName {
322                prefix: pn_prefix,
323                local,
324                might_be_invalid_iri,
325            }),
326        ))
327    }
328
329    fn recognize_variable<'a>(
330        &self,
331        data: &'a [u8],
332        is_ending: bool,
333    ) -> Option<(usize, Result<N3Token<'a>, TokenRecognizerError>)> {
334        // [36]  QUICK_VAR_NAME  ::=  "?" PN_LOCAL
335        let (consumed, result) = self.recognize_optional_pn_local(&data[1..], is_ending)?;
336        Some((
337            consumed + 1,
338            result.and_then(|(name, _)| {
339                if name.is_empty() {
340                    Err((0..consumed, "A variable name is not allowed to be empty").into())
341                } else {
342                    Ok(N3Token::Variable(name))
343                }
344            }),
345        ))
346    }
347
348    fn recognize_optional_pn_local<'a>(
349        &self,
350        data: &'a [u8],
351        is_ending: bool,
352    ) -> Option<(usize, Result<(Cow<'a, str>, bool), TokenRecognizerError>)> {
353        // [168s]  PN_LOCAL  ::=  (PN_CHARS_U | ':' | [0-9] | PLX) ((PN_CHARS | '.' | ':' | PLX)* (PN_CHARS | ':' | PLX))?
354        let mut i = 0;
355        let mut buffer = None; // Buffer if there are some escaped characters
356        let mut position_that_is_already_in_buffer = 0;
357        let mut might_be_invalid_iri = false;
358        let mut ends_with_unescaped_dot = 0;
359        loop {
360            if let Some(r) = Self::recognize_unicode_char(&data[i..], i) {
361                match r {
362                    Ok((c, consumed)) => {
363                        if c == '%' {
364                            i += 1;
365                            let a = char::from(*data.get(i)?);
366                            i += 1;
367                            let b = char::from(*data.get(i)?);
368                            if !a.is_ascii_hexdigit() || !b.is_ascii_hexdigit() {
369                                return Some((i + 1, Err((
370                                    i - 2..=i, format!("escapes in IRIs should be % followed by two hexadecimal characters, found '%{a}{b}'")
371                                ).into())));
372                            }
373                            i += 1;
374                            ends_with_unescaped_dot = 0;
375                        } else if c == '\\' {
376                            i += 1;
377                            let a = char::from(*data.get(i)?);
378                            if self.unchecked
379                                || matches!(
380                                    a,
381                                    '_' | '~'
382                                        | '.'
383                                        | '-'
384                                        | '!'
385                                        | '$'
386                                        | '&'
387                                        | '\''
388                                        | '('
389                                        | ')'
390                                        | '*'
391                                        | '+'
392                                        | ','
393                                        | ';'
394                                        | '='
395                                )
396                            {
397                                // ok to escape
398                            } else if matches!(a, '/' | '?' | '#' | '@' | '%') {
399                                // ok to escape but requires IRI validation
400                                might_be_invalid_iri = true;
401                            } else {
402                                return Some((i + 1, Err((
403                                    i..=i, format!("The character that are allowed to be escaped in IRIs are _~.-!$&'()*+,;=/?#@%, found '{a}'")
404                                ).into())));
405                            }
406                            let buffer = buffer.get_or_insert_with(String::new);
407                            // We add the missing bytes
408                            if i - position_that_is_already_in_buffer > 1 {
409                                buffer.push_str(
410                                    match str_from_utf8(
411                                        &data[position_that_is_already_in_buffer..i - 1],
412                                        position_that_is_already_in_buffer..i - 1,
413                                    ) {
414                                        Ok(data) => data,
415                                        Err(e) => return Some((i, Err(e))),
416                                    },
417                                )
418                            }
419                            buffer.push(a);
420                            i += 1;
421                            position_that_is_already_in_buffer = i;
422                            ends_with_unescaped_dot = 0;
423                        } else if i == 0 {
424                            if !(Self::is_possible_pn_chars_u(c) || c == ':' || c.is_ascii_digit())
425                            {
426                                return Some((0, Ok((Cow::Borrowed(""), false))));
427                            }
428                            if !self.unchecked {
429                                might_be_invalid_iri |=
430                                    Self::is_possible_pn_chars_base_but_not_valid_iri(c)
431                                        || c == ':';
432                            }
433                            i += consumed;
434                        } else if Self::is_possible_pn_chars(c) || c == ':' {
435                            if !self.unchecked {
436                                might_be_invalid_iri |=
437                                    Self::is_possible_pn_chars_base_but_not_valid_iri(c)
438                                        || c == ':';
439                            }
440                            i += consumed;
441                            ends_with_unescaped_dot = 0;
442                        } else if c == '.' {
443                            i += consumed;
444                            ends_with_unescaped_dot += 1;
445                        } else {
446                            let buffer = if let Some(mut buffer) = buffer {
447                                buffer.push_str(
448                                    match str_from_utf8(
449                                        &data[position_that_is_already_in_buffer..i],
450                                        position_that_is_already_in_buffer..i,
451                                    ) {
452                                        Ok(data) => data,
453                                        Err(e) => return Some((i, Err(e))),
454                                    },
455                                );
456                                // We do not include the last dots
457                                for _ in 0..ends_with_unescaped_dot {
458                                    buffer.pop();
459                                }
460                                i -= ends_with_unescaped_dot;
461                                Cow::Owned(buffer)
462                            } else {
463                                let mut data = match str_from_utf8(&data[..i], 0..i) {
464                                    Ok(data) => data,
465                                    Err(e) => return Some((i, Err(e))),
466                                };
467                                // We do not include the last dots
468                                data = &data[..data.len() - ends_with_unescaped_dot];
469                                i -= ends_with_unescaped_dot;
470                                Cow::Borrowed(data)
471                            };
472                            return Some((i, Ok((buffer, might_be_invalid_iri))));
473                        }
474                    }
475                    Err(e) => return Some((e.location.end, Err(e))),
476                }
477            } else if is_ending {
478                let buffer = if let Some(mut buffer) = buffer {
479                    // We do not include the last dot
480                    while buffer.ends_with('.') {
481                        buffer.pop();
482                        i -= 1;
483                    }
484                    Cow::Owned(buffer)
485                } else {
486                    let mut data = match str_from_utf8(&data[..i], 0..i) {
487                        Ok(data) => data,
488                        Err(e) => return Some((i, Err(e))),
489                    };
490                    // We do not include the last dot
491                    while let Some(d) = data.strip_suffix('.') {
492                        data = d;
493                        i -= 1;
494                    }
495                    Cow::Borrowed(data)
496                };
497                return Some((i, Ok((buffer, might_be_invalid_iri))));
498            } else {
499                return None;
500            }
501        }
502    }
503
504    fn recognize_blank_node_label(
505        data: &[u8],
506    ) -> Option<(usize, Result<N3Token<'_>, TokenRecognizerError>)> {
507        // [141s]  BLANK_NODE_LABEL  ::=  '_:' (PN_CHARS_U | [0-9]) ((PN_CHARS | '.')* PN_CHARS)?
508        let mut i = 2;
509        loop {
510            match Self::recognize_unicode_char(&data[i..], i)? {
511                Ok((c, consumed)) => {
512                    if (i == 2 && (Self::is_possible_pn_chars_u(c) || c.is_ascii_digit()))
513                        || (i > 2 && Self::is_possible_pn_chars(c))
514                    {
515                        // Ok
516                    } else if i > 2 && c == '.' {
517                        if data[i - 1] == b'.' {
518                            i -= 1;
519                            return Some((
520                                i,
521                                str_from_utf8(&data[2..i], 2..i).map(N3Token::BlankNodeLabel),
522                            ));
523                        }
524                    } else if i == 2 {
525                        return Some((
526                            i,
527                            Err((0..i, "A blank node ID should not be empty").into()),
528                        ));
529                    } else if data[i - 1] == b'.' {
530                        i -= 1;
531                        return Some((
532                            i,
533                            str_from_utf8(&data[2..i], 2..i).map(N3Token::BlankNodeLabel),
534                        ));
535                    } else {
536                        return Some((
537                            i,
538                            str_from_utf8(&data[2..i], 2..i).map(N3Token::BlankNodeLabel),
539                        ));
540                    }
541                    i += consumed;
542                }
543                Err(e) => return Some((e.location.end, Err(e))),
544            }
545        }
546    }
547
548    fn recognize_lang_tag<'a>(
549        &self,
550        data: &'a [u8],
551    ) -> Option<(usize, Result<N3Token<'a>, TokenRecognizerError>)> {
552        // [144s]  LANGTAG  ::=  '@' [a-zA-Z]+ ('-' [a-zA-Z0-9]+)*
553        let mut is_last_block_empty = true;
554        for (i, c) in data[1..].iter().enumerate() {
555            if c.is_ascii_alphabetic() {
556                is_last_block_empty = false;
557            } else if i == 0 {
558                return Some((
559                    1,
560                    Err((1..2, "A language code should always start with a letter").into()),
561                ));
562            } else if is_last_block_empty {
563                return Some((i, self.parse_lang_tag(&data[1..i], 1..i - 1)));
564            } else if *c == b'-' {
565                is_last_block_empty = true;
566            } else {
567                return Some((i + 1, self.parse_lang_tag(&data[1..=i], 1..i)));
568            }
569        }
570        None
571    }
572
573    fn parse_lang_tag<'a>(
574        &self,
575        lang_tag: &'a [u8],
576        position: Range<usize>,
577    ) -> Result<N3Token<'a>, TokenRecognizerError> {
578        let lang_tag = str_from_utf8(lang_tag, position.clone())?;
579        Ok(N3Token::LangTag(if self.unchecked {
580            lang_tag
581        } else {
582            LanguageTag::parse(lang_tag)
583                .map_err(|e| (position.clone(), e.to_string()))?
584                .into_inner()
585        }))
586    }
587    fn recognize_string(
588        &self,
589        data: &[u8],
590        delimiter: u8,
591    ) -> Option<(usize, Result<N3Token<'static>, TokenRecognizerError>)> {
592        // [22]  STRING_LITERAL_QUOTE         ::=  '"' ([^#x22#x5C#xA#xD] | ECHAR | UCHAR)* '"' /* #x22=" #x5C=\ #xA=new line #xD=carriage return */
593        // [23]  STRING_LITERAL_SINGLE_QUOTE  ::=  "'" ([^#x27#x5C#xA#xD] | ECHAR | UCHAR)* "'" /* #x27=' #x5C=\ #xA=new line #xD=carriage return */
594        let mut string = String::new();
595        let mut i = 1;
596        loop {
597            let mut end = memchr2(delimiter, b'\\', &data[i..])?;
598            if !self.unchecked {
599                // We check also line jumps
600                if let Some(line_jump_end) = memchr2(b'\n', b'\r', &data[i..i + end]) {
601                    end = line_jump_end;
602                }
603            }
604            match str_from_utf8(&data[i..i + end], i..i + end) {
605                Ok(s) => string.push_str(s),
606                Err(e) => return Some((end, Err(e))),
607            };
608            i += end;
609            match data[i] {
610                c if c == delimiter => {
611                    return Some((i + 1, Ok(N3Token::String(string))));
612                }
613                b'\\' => {
614                    let (additional, c) = self.recognize_escape(&data[i..], i, true)?;
615                    i += additional + 1;
616                    match c {
617                        Ok(c) => {
618                            string.push(c);
619                        }
620                        Err(e) => {
621                            // We read until the end of string char
622                            let end = memchr(delimiter, &data[i..])?;
623                            return Some((i + end + 1, Err(e)));
624                        }
625                    }
626                }
627                b'\n' | b'\r' => {
628                    // We read until the end of string char
629                    let end = memchr(delimiter, &data[i..])?;
630                    return Some((
631                        i + end + 1,
632                        Err((
633                            i..i + 1,
634                            "Line jumps are not allowed in string literals, use \\n",
635                        )
636                            .into()),
637                    ));
638                }
639                _ => unreachable!(),
640            }
641        }
642    }
643
644    fn recognize_long_string(
645        &self,
646        data: &[u8],
647        delimiter: u8,
648    ) -> Option<(usize, Result<N3Token<'static>, TokenRecognizerError>)> {
649        // [24]  STRING_LITERAL_LONG_SINGLE_QUOTE  ::=  "'''" (("'" | "''")? ([^'\] | ECHAR | UCHAR))* "'''"
650        // [25]  STRING_LITERAL_LONG_QUOTE         ::=  '"""' (('"' | '""')? ([^"\] | ECHAR | UCHAR))* '"""'
651        let mut string = String::new();
652        let mut i = 3;
653        loop {
654            let end = memchr2(delimiter, b'\\', &data[i..])?;
655            match str_from_utf8(&data[i..i + end], i..i + end) {
656                Ok(s) => string.push_str(s),
657                Err(e) => return Some((end, Err(e))),
658            };
659            i += end;
660            match data[i] {
661                c if c == delimiter => {
662                    if *data.get(i + 1)? == delimiter && *data.get(i + 2)? == delimiter {
663                        return Some((i + 3, Ok(N3Token::String(string))));
664                    }
665                    i += 1;
666                    string.push(char::from(delimiter));
667                }
668                b'\\' => {
669                    let (additional, c) = self.recognize_escape(&data[i..], i, true)?;
670                    i += additional + 1;
671                    match c {
672                        Ok(c) => {
673                            string.push(c);
674                        }
675                        Err(e) => return Some((i, Err(e))),
676                    }
677                }
678                _ => unreachable!(),
679            }
680        }
681    }
682
683    fn recognize_number(
684        data: &[u8],
685        is_ending: bool,
686    ) -> Option<(usize, Result<N3Token<'_>, TokenRecognizerError>)> {
687        // [19]  INTEGER    ::=  [+-]? [0-9]+
688        // [20]  DECIMAL    ::=  [+-]? [0-9]* '.' [0-9]+
689        // [21]  DOUBLE     ::=  [+-]? ([0-9]+ '.' [0-9]* EXPONENT | '.' [0-9]+ EXPONENT | [0-9]+ EXPONENT)
690        // [154s] EXPONENT  ::=  [eE] [+-]? [0-9]+
691        let mut i = 0;
692        let c = *data.first()?;
693        if matches!(c, b'+' | b'-') {
694            i += 1;
695        }
696        // We read the digits before .
697        let count_before = Self::recognize_digits(&data[i..], is_ending)?;
698        i += count_before;
699
700        // We read the digits after .
701        let c = if let Some(c) = data.get(i) {
702            Some(c)
703        } else if is_ending {
704            None
705        } else {
706            return None;
707        };
708        let count_after = if c == Some(&b'.') {
709            i += 1;
710            let count_after = Self::recognize_digits(&data[i..], is_ending)?;
711            i += count_after;
712            Some(count_after)
713        } else {
714            None
715        };
716
717        // End
718        let c = if let Some(c) = data.get(i) {
719            Some(c)
720        } else if is_ending {
721            None
722        } else {
723            return None;
724        };
725        if matches!(c, Some(b'e' | b'E')) {
726            i += 1;
727
728            let c = if let Some(c) = data.get(i) {
729                Some(c)
730            } else if is_ending {
731                None
732            } else {
733                return None;
734            };
735            if matches!(c, Some(b'+' | b'-')) {
736                i += 1;
737            }
738
739            let count_exp = Self::recognize_digits(&data[i..], is_ending)?;
740            i += count_exp;
741            Some((
742                i,
743                if count_exp == 0 {
744                    Err((0..i, "A double exponent cannot be empty").into())
745                } else if count_before == 0 && count_after.unwrap_or(0) == 0 {
746                    Err((0..i, "A double should not be empty").into())
747                } else {
748                    str_from_utf8(&data[..i], 0..i).map(N3Token::Double)
749                },
750            ))
751        } else if let Some(count_after) = count_after {
752            if count_after == 0 {
753                // We do not consume the '.' after all
754                i -= 1;
755                Some((
756                    i,
757                    if count_before == 0 {
758                        Err((0..i, "An integer should not be empty").into())
759                    } else {
760                        str_from_utf8(&data[..i], 0..i).map(N3Token::Integer)
761                    },
762                ))
763            } else {
764                Some((i, str_from_utf8(&data[..i], 0..i).map(N3Token::Decimal)))
765            }
766        } else {
767            Some((
768                i,
769                if count_before == 0 {
770                    Err((0..i, "An integer should not be empty").into())
771                } else {
772                    str_from_utf8(&data[..i], 0..i).map(N3Token::Integer)
773                },
774            ))
775        }
776    }
777
778    fn recognize_digits(data: &[u8], is_ending: bool) -> Option<usize> {
779        for (i, c) in data.iter().enumerate() {
780            if !c.is_ascii_digit() {
781                return Some(i);
782            }
783        }
784        is_ending.then_some(data.len())
785    }
786
787    fn recognize_escape(
788        &self,
789        data: &[u8],
790        position: usize,
791        with_echar: bool,
792    ) -> Option<(usize, Result<char, TokenRecognizerError>)> {
793        // [26]   UCHAR  ::=  '\u' HEX HEX HEX HEX | '\U' HEX HEX HEX HEX HEX HEX HEX HEX
794        // [159s] ECHAR  ::=  '\' [tbnrf"'\]
795        match *data.get(1)? {
796            b'u' => match Self::recognize_hex_char(&data[2..], 4, 'u', position) {
797                Ok(c) => Some((5, Ok(c?))),
798                Err(e) => {
799                    if self.unchecked {
800                        match Self::recognize_utf16_surrogate_pair(&data[2..], position) {
801                            Ok(c) => Some((11, Ok(c?))),
802                            Err(e) => Some((5, Err(e))),
803                        }
804                    } else {
805                        Some((5, Err(e)))
806                    }
807                }
808            },
809            b'U' => match Self::recognize_hex_char(&data[2..], 8, 'U', position) {
810                Ok(c) => Some((9, Ok(c?))),
811                Err(e) => Some((9, Err(e))),
812            },
813            b't' if with_echar => Some((1, Ok('\t'))),
814            b'b' if with_echar => Some((1, Ok('\x08'))),
815            b'n' if with_echar => Some((1, Ok('\n'))),
816            b'r' if with_echar => Some((1, Ok('\r'))),
817            b'f' if with_echar => Some((1, Ok('\x0C'))),
818            b'"' if with_echar => Some((1, Ok('"'))),
819            b'\'' if with_echar => Some((1, Ok('\''))),
820            b'\\' if with_echar => Some((1, Ok('\\'))),
821            c => Some((
822                1,
823                Err((
824                    position..position + 2,
825                    format!("Unexpected escape character '\\{}'", char::from(c)),
826                )
827                    .into()),
828            )), // TODO: read until end of string
829        }
830    }
831
832    fn recognize_hex_char(
833        data: &[u8],
834        len: usize,
835        escape_char: char,
836        position: usize,
837    ) -> Result<Option<char>, TokenRecognizerError> {
838        if data.len() < len {
839            return Ok(None);
840        };
841        let mut codepoint = 0;
842        for i in 0..len {
843            let c = data[i];
844            codepoint = codepoint * 16
845                + u32::from(match c {
846                    b'0'..=b'9' => c - b'0',
847                    b'a'..=b'f' => c - b'a' + 10,
848                    b'A'..=b'F' => c - b'A' + 10,
849                    _ => {
850                        let val = str::from_utf8(&data[..len]).unwrap_or_default();
851                        return Err((
852                        position + i + 2..position + i + 3,
853                        format!(
854                            "The escape sequence '\\{escape_char}{val}' is not a valid hexadecimal string"
855                        ),
856                    ).into());
857                    }
858                });
859        }
860        let c = char::from_u32(codepoint).ok_or_else(|| {
861            let val = str::from_utf8(&data[..len]).unwrap_or_default();
862            (
863                position..position + len +2,
864                format!(
865                    "The escape sequence '\\{escape_char}{val}' is encoding {codepoint:X} that is not a valid unicode character",
866                ),
867            )
868        })?;
869        Ok(Some(c))
870    }
871
872    fn recognize_utf16_surrogate_pair(
873        data: &[u8],
874        position: usize,
875    ) -> Result<Option<char>, TokenRecognizerError> {
876        let Some(val_high_slice) = data.get(..4) else {
877            return Ok(None);
878        };
879        let val_high = str_from_utf8(val_high_slice, position..position + 6)?;
880        let surrogate_high = u16::from_str_radix(val_high, 16).map_err(|e| {
881            (
882                position..position + 6,
883                format!(
884                    "The escape sequence '\\u{val_high}' is not a valid hexadecimal string: {e}"
885                ),
886            )
887        })?;
888
889        // TODO: replace with [`u16::is_utf16_surrogate`] when #94919 is stable
890        if !matches!(surrogate_high, 0xD800..=0xDFFF) {
891            return Err((
892                position..position + 6,
893                format!("The escape sequence '\\u{val_high}' is not a UTF-16 surrogate"),
894            )
895                .into());
896        }
897        let Some(&d4) = data.get(4) else {
898            return Ok(None);
899        };
900        let Some(&d5) = data.get(5) else {
901            return Ok(None);
902        };
903        if d4 != b'\\' || d5 != b'u' {
904            return Err((
905                position..position + 6,
906                format!(
907                    "UTF-16 surrogate escape sequence '\\u{val_high}' must be followed by another surrogate escape sequence"),
908            )
909                .into());
910        }
911
912        let Some(val_low_slice) = data.get(6..10) else {
913            return Ok(None);
914        };
915        let val_low = str_from_utf8(val_low_slice, position + 6..position + 12)?;
916        let surrogate_low = u16::from_str_radix(val_low, 16).map_err(|e| {
917            (
918                position + 6..position + 12,
919                format!(
920                    "The escape sequence '\\u{val_low}' is not a valid hexadecimal string: {e}"
921                ),
922            )
923        })?;
924
925        let mut chars = char::decode_utf16([surrogate_high, surrogate_low]);
926
927        let c = chars.next()
928            .and_then(Result::ok)
929            .ok_or_else(|| {
930                (
931                    position..position + 12,
932                    format!(
933                        "Escape sequences '\\u{val_high}\\u{val_low}' do not form a valid UTF-16 surrogate pair"
934                    ),
935                )
936            })?;
937
938        debug_assert_eq!(
939            chars.next(),
940            None,
941            "Surrogate pair should combine to exactly one character"
942        );
943
944        Ok(Some(c))
945    }
946
947    fn recognize_unicode_char(
948        data: &[u8],
949        position: usize,
950    ) -> Option<Result<(char, usize), TokenRecognizerError>> {
951        let mut code_point: u32;
952        let bytes_needed: usize;
953        let mut lower_boundary = 0x80;
954        let mut upper_boundary = 0xBF;
955
956        let byte = *data.first()?;
957        match byte {
958            0x00..=0x7F => return Some(Ok((char::from(byte), 1))),
959            0xC2..=0xDF => {
960                bytes_needed = 1;
961                code_point = u32::from(byte) & 0x1F;
962            }
963            0xE0..=0xEF => {
964                if byte == 0xE0 {
965                    lower_boundary = 0xA0;
966                }
967                if byte == 0xED {
968                    upper_boundary = 0x9F;
969                }
970                bytes_needed = 2;
971                code_point = u32::from(byte) & 0xF;
972            }
973            0xF0..=0xF4 => {
974                if byte == 0xF0 {
975                    lower_boundary = 0x90;
976                }
977                if byte == 0xF4 {
978                    upper_boundary = 0x8F;
979                }
980                bytes_needed = 3;
981                code_point = u32::from(byte) & 0x7;
982            }
983            _ => {
984                return Some(Err((
985                    position..=position,
986                    "Invalid UTF-8 character encoding",
987                )
988                    .into()))
989            }
990        }
991
992        for i in 1..=bytes_needed {
993            let byte = *data.get(i)?;
994            if byte < lower_boundary || upper_boundary < byte {
995                return Some(Err((
996                    position..=position + i,
997                    "Invalid UTF-8 character encoding",
998                )
999                    .into()));
1000            }
1001            lower_boundary = 0x80;
1002            upper_boundary = 0xBF;
1003            code_point = (code_point << 6) | (u32::from(byte) & 0x3F);
1004        }
1005
1006        Some(
1007            char::from_u32(code_point)
1008                .map(|c| (c, bytes_needed + 1))
1009                .ok_or_else(|| {
1010                    (
1011                        position..=position + bytes_needed,
1012                        format!("The codepoint {code_point:X} is not a valid unicode character"),
1013                    )
1014                        .into()
1015                }),
1016        )
1017    }
1018
1019    // [157s]  PN_CHARS_BASE  ::=  [A-Z] | [a-z] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
1020    fn is_possible_pn_chars_base(c: char) -> bool {
1021        matches!(c,
1022        'A'..='Z'
1023        | 'a'..='z'
1024        | '\u{00C0}'..='\u{00D6}'
1025        | '\u{00D8}'..='\u{00F6}'
1026        | '\u{00F8}'..='\u{02FF}'
1027        | '\u{0370}'..='\u{037D}'
1028        | '\u{037F}'..='\u{1FFF}'
1029        | '\u{200C}'..='\u{200D}'
1030        | '\u{2070}'..='\u{218F}'
1031        | '\u{2C00}'..='\u{2FEF}'
1032        | '\u{3001}'..='\u{D7FF}'
1033        | '\u{F900}'..='\u{FDCF}'
1034        | '\u{FDF0}'..='\u{FFFD}'
1035        | '\u{10000}'..='\u{EFFFF}')
1036    }
1037
1038    // [158s]  PN_CHARS_U  ::=  PN_CHARS_BASE | '_'
1039    pub(super) fn is_possible_pn_chars_u(c: char) -> bool {
1040        Self::is_possible_pn_chars_base(c) || c == '_'
1041    }
1042
1043    // [160s]  PN_CHARS  ::=  PN_CHARS_U | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040]
1044    pub(crate) fn is_possible_pn_chars(c: char) -> bool {
1045        Self::is_possible_pn_chars_u(c)
1046            || matches!(c,
1047        '-' | '0'..='9' | '\u{00B7}' | '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}')
1048    }
1049
1050    fn is_possible_pn_chars_base_but_not_valid_iri(c: char) -> bool {
1051        matches!(c, '\u{FFF0}'..='\u{FFFD}')
1052            || u32::from(c) % u32::from('\u{FFFE}') == 0
1053            || u32::from(c) % u32::from('\u{FFFF}') == 0
1054    }
1055}
1056
1057pub fn resolve_local_name(
1058    prefix: &str,
1059    local: &str,
1060    might_be_invalid_iri: bool,
1061    prefixes: &HashMap<String, Iri<String>>,
1062) -> Result<NamedNode, String> {
1063    if let Some(start) = prefixes.get(prefix) {
1064        let iri = format!("{start}{local}");
1065        if might_be_invalid_iri || start.path().is_empty() {
1066            // We validate again. We always validate if the local part might be the IRI authority.
1067            if let Err(e) = Iri::parse(iri.as_str()) {
1068                return Err(format!(
1069                    "The prefixed name {prefix}:{local} builds IRI {iri} that is invalid: {e}"
1070                ));
1071            }
1072        }
1073        Ok(NamedNode::new_unchecked(iri))
1074    } else {
1075        Err(format!("The prefix {prefix}: has not been declared"))
1076    }
1077}
1078
1079fn str_from_utf8(data: &[u8], range: Range<usize>) -> Result<&str, TokenRecognizerError> {
1080    str::from_utf8(data).map_err(|e| {
1081        (
1082            range.start + e.valid_up_to()..min(range.end, range.start + e.valid_up_to() + 4),
1083            format!("Invalid UTF-8: {e}"),
1084        )
1085            .into()
1086    })
1087}
1088
1089fn string_from_utf8(data: Vec<u8>, range: Range<usize>) -> Result<String, TokenRecognizerError> {
1090    String::from_utf8(data).map_err(|e| {
1091        (
1092            range.start + e.utf8_error().valid_up_to()
1093                ..min(range.end, range.start + e.utf8_error().valid_up_to() + 4),
1094            format!("Invalid UTF-8: {e}"),
1095        )
1096            .into()
1097    })
1098}