1#![allow(clippy::range_plus_one)]
2
3use crate::toolkit::{TokenRecognizer, TokenRecognizerError};
4use memchr::{memchr, memchr2};
5use oxilangtag::LanguageTag;
6use oxiri::Iri;
7use oxrdf::NamedNode;
8use std::borrow::Cow;
9use std::cmp::min;
10use std::collections::HashMap;
11use std::ops::Range;
12use std::str;
13
/// A token produced by [`N3Lexer`].
///
/// Variants carrying `&'a str` or `Cow<'a, str>` may borrow directly from the byte buffer given to the lexer.
#[derive(Debug, PartialEq, Eq)]
pub enum N3Token<'a> {
    /// An IRI written between `<` and `>`, with escapes decoded and, when a base IRI is configured, already resolved against it.
    IriRef(String),
    /// A prefixed name (`prefix:local`).
    PrefixedName {
        /// The text before the `:` (the `:` itself is not included).
        prefix: &'a str,
        /// The text after the `:`, with `\` escapes decoded.
        local: Cow<'a, str>,
        /// Set when the local part contains something that could make the expanded IRI invalid, so the IRI has to be validated after expansion.
        might_be_invalid_iri: bool,
    },
    /// A variable name, without its leading `?`.
    Variable(Cow<'a, str>),
    /// A blank node label, without its leading `_:`.
    BlankNodeLabel(&'a str),
    /// The content of a string literal, delimiters removed and escapes decoded.
    String(String),
    /// The lexical form of an integer, including an optional sign.
    Integer(&'a str),
    /// The lexical form of a decimal, including an optional sign.
    Decimal(&'a str),
    /// The lexical form of a double (a number with an exponent), including an optional sign.
    Double(&'a str),
    /// A language tag, without its leading `@`.
    LangTag(&'a str),
    /// A punctuation mark or operator such as `.`, `^^` or `=>`.
    Punctuation(&'a str),
    /// A bare word that is not followed by `:`.
    PlainKeyword(&'a str),
}
32
/// The syntax the lexer is tokenizing; it enables or disables some token kinds.
#[derive(Eq, PartialEq)]
pub enum N3LexerMode {
    /// Long (triple-quoted) strings and `'`-delimited strings are not recognized in this mode.
    NTriples,
    /// Long and `'`-delimited strings are recognized; the `<=` and `<-` operators are not.
    Turtle,
    /// Like `Turtle`, and `<=` / `<-` are additionally recognized as punctuation when the input is not a valid IRI.
    N3,
}
39
/// Per-call options given to the lexer.
#[derive(Default)]
pub struct N3LexerOptions {
    /// When set, every IRI reference is resolved against this base IRI.
    pub base_iri: Option<Iri<String>>,
}
44
/// Lexer for the N-Triples / Turtle / N3 family of syntaxes.
pub struct N3Lexer {
    /// Which syntax variant is being tokenized.
    mode: N3LexerMode,
    /// When `true`, validation is relaxed: IRIs and language tags are not validated,
    /// any character may be `\`-escaped in local names, line jumps are tolerated in
    /// short strings and `\u` UTF-16 surrogate pairs are accepted.
    unchecked: bool,
}
49
50impl TokenRecognizer for N3Lexer {
54 type Token<'a> = N3Token<'a>;
55 type Options = N3LexerOptions;
56
57 fn recognize_next_token<'a>(
58 &mut self,
59 data: &'a [u8],
60 is_ending: bool,
61 options: &N3LexerOptions,
62 ) -> Option<(usize, Result<N3Token<'a>, TokenRecognizerError>)> {
63 match *data.first()? {
64 b'<' => match *data.get(1)? {
65 b'<' => Some((2, Ok(N3Token::Punctuation("<<")))),
66 b'=' if self.mode == N3LexerMode::N3 => {
67 if let Some((consumed, result)) = self.recognize_iri(data, options) {
68 Some(if let Ok(result) = result {
69 (consumed, Ok(result))
70 } else {
71 (2, Ok(N3Token::Punctuation("<=")))
72 })
73 } else if is_ending {
74 Some((2, Ok(N3Token::Punctuation("<="))))
75 } else {
76 None
77 }
78 }
79 b'-' if self.mode == N3LexerMode::N3 => {
80 if let Some((consumed, result)) = self.recognize_iri(data, options) {
81 Some(if let Ok(result) = result {
82 (consumed, Ok(result))
83 } else {
84 (2, Ok(N3Token::Punctuation("<-")))
85 })
86 } else if is_ending {
87 Some((2, Ok(N3Token::Punctuation("<-"))))
88 } else {
89 None
90 }
91 }
92 _ => self.recognize_iri(data, options),
93 },
94 b'>' => {
95 if *data.get(1)? == b'>' {
96 Some((2, Ok(N3Token::Punctuation(">>"))))
97 } else {
98 Some((1, Ok(N3Token::Punctuation(">"))))
99 }
100 }
101 b'_' => match data.get(1)? {
102 b':' => Self::recognize_blank_node_label(data),
103 c => Some((
104 1,
105 Err((0, format!("Unexpected character '{}'", char::from(*c))).into()),
106 )),
107 },
108 b'"' => {
109 if self.mode != N3LexerMode::NTriples
110 && *data.get(1)? == b'"'
111 && *data.get(2)? == b'"'
112 {
113 self.recognize_long_string(data, b'"')
114 } else {
115 self.recognize_string(data, b'"')
116 }
117 }
118 b'\'' if self.mode != N3LexerMode::NTriples => {
119 if *data.get(1)? == b'\'' && *data.get(2)? == b'\'' {
120 self.recognize_long_string(data, b'\'')
121 } else {
122 self.recognize_string(data, b'\'')
123 }
124 }
125 b'@' => self.recognize_lang_tag(data),
126 b'.' => match data.get(1) {
127 Some(b'0'..=b'9') => Self::recognize_number(data, is_ending),
128 Some(_) => Some((1, Ok(N3Token::Punctuation(".")))),
129 None => is_ending.then_some((1, Ok(N3Token::Punctuation(".")))),
130 },
131 b'^' => {
132 if *data.get(1)? == b'^' {
133 Some((2, Ok(N3Token::Punctuation("^^"))))
134 } else {
135 Some((1, Ok(N3Token::Punctuation("^"))))
136 }
137 }
138 b'(' => Some((1, Ok(N3Token::Punctuation("(")))),
139 b')' => Some((1, Ok(N3Token::Punctuation(")")))),
140 b'[' => Some((1, Ok(N3Token::Punctuation("[")))),
141 b']' => Some((1, Ok(N3Token::Punctuation("]")))),
142 b'{' => {
143 if *data.get(1)? == b'|' {
144 Some((2, Ok(N3Token::Punctuation("{|"))))
145 } else {
146 Some((1, Ok(N3Token::Punctuation("{"))))
147 }
148 }
149 b'}' => Some((1, Ok(N3Token::Punctuation("}")))),
150 b',' => Some((1, Ok(N3Token::Punctuation(",")))),
151 b';' => Some((1, Ok(N3Token::Punctuation(";")))),
152 b'!' => Some((1, Ok(N3Token::Punctuation("!")))),
153 b'|' => {
154 if *data.get(1)? == b'}' {
155 Some((2, Ok(N3Token::Punctuation("|}"))))
156 } else {
157 Some((1, Ok(N3Token::Punctuation("|"))))
158 }
159 }
160 b'=' => {
161 if *data.get(1)? == b'>' {
162 Some((2, Ok(N3Token::Punctuation("=>"))))
163 } else {
164 Some((1, Ok(N3Token::Punctuation("="))))
165 }
166 }
167 b'0'..=b'9' | b'+' | b'-' => Self::recognize_number(data, is_ending),
168 b'?' => self.recognize_variable(data, is_ending),
169 _ => self.recognize_pname_or_keyword(data, is_ending),
170 }
171 }
172}
173
174impl N3Lexer {
175 pub fn new(mode: N3LexerMode, unchecked: bool) -> Self {
176 Self { mode, unchecked }
177 }
178
179 fn recognize_iri(
180 &self,
181 data: &[u8],
182 options: &N3LexerOptions,
183 ) -> Option<(usize, Result<N3Token<'static>, TokenRecognizerError>)> {
184 let mut string = Vec::new();
186 let mut i = 1;
187 loop {
188 let end = memchr2(b'>', b'\\', &data[i..])?;
189 string.extend_from_slice(&data[i..i + end]);
190 i += end;
191 match data[i] {
192 b'>' => {
193 return Some((i + 1, self.parse_iri(string, 0..i + 1, options)));
194 }
195 b'\\' => {
196 let (additional, c) = self.recognize_escape(&data[i..], i, false)?;
197 i += additional + 1;
198 match c {
199 Ok(c) => {
200 let mut buf = [0; 4];
201 string.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
202 }
203 Err(e) => return Some((i, Err(e))),
204 }
205 }
206 _ => unreachable!(),
207 }
208 }
209 }
210
211 fn parse_iri(
212 &self,
213 iri: Vec<u8>,
214 position: Range<usize>,
215 options: &N3LexerOptions,
216 ) -> Result<N3Token<'static>, TokenRecognizerError> {
217 let iri = string_from_utf8(iri, position.clone())?;
218 Ok(N3Token::IriRef(
219 if let Some(base_iri) = options.base_iri.as_ref() {
220 if self.unchecked {
221 base_iri.resolve_unchecked(&iri)
222 } else {
223 base_iri
224 .resolve(&iri)
225 .map_err(|e| (position, e.to_string()))?
226 }
227 .into_inner()
228 } else if self.unchecked {
229 iri
230 } else {
231 Iri::parse(iri)
232 .map_err(|e| (position, e.to_string()))?
233 .into_inner()
234 },
235 ))
236 }
237
238 fn recognize_pname_or_keyword<'a>(
239 &self,
240 data: &'a [u8],
241 is_ending: bool,
242 ) -> Option<(usize, Result<N3Token<'a>, TokenRecognizerError>)> {
243 let mut i = 0;
247 loop {
248 if let Some(r) = Self::recognize_unicode_char(&data[i..], i) {
249 match r {
250 Ok((c, consumed)) => {
251 if c == ':' {
252 i += consumed;
253 break;
254 } else if i == 0 {
255 if !Self::is_possible_pn_chars_base(c) {
256 return Some((
257 consumed,
258 Err((
259 0..consumed,
260 format!(
261 "'{c}' is not allowed at the beginning of a prefix name"
262 ),
263 )
264 .into()),
265 ));
266 }
267 i += consumed;
268 } else if Self::is_possible_pn_chars(c) || c == '.' {
269 i += consumed;
270 } else {
271 while data[..i].ends_with(b".") {
272 i -= 1;
273 }
274 return Some((
275 i,
276 str_from_utf8(&data[..i], 0..i).map(N3Token::PlainKeyword),
277 ));
278 }
279 }
280 Err(e) => return Some((e.location.end, Err(e))),
281 }
282 } else if is_ending {
283 while data[..i].ends_with(b".") {
284 i -= 1;
285 }
286 return Some(if i == 0 {
287 (
288 1,
289 Err((0..1, format!("Unexpected byte {}", data[0])).into()),
290 )
291 } else {
292 (
293 i,
294 str_from_utf8(&data[..i], 0..i).map(N3Token::PlainKeyword),
295 )
296 });
297 } else {
298 return None;
299 }
300 }
301 let pn_prefix = match str_from_utf8(&data[..i - 1], 0..i - 1) {
302 Ok(pn_prefix) => pn_prefix,
303 Err(e) => return Some((i, Err(e))),
304 };
305 if pn_prefix.ends_with('.') {
306 return Some((
307 i,
308 Err((
309 0..i,
310 format!(
311 "'{pn_prefix}' is not a valid prefix: prefixes are not allowed to end with '.'"),
312 )
313 .into()),
314 ));
315 }
316
317 let (consumed, pn_local_result) =
318 self.recognize_optional_pn_local(&data[i..], is_ending)?;
319 Some((
320 consumed + i,
321 pn_local_result.map(|(local, might_be_invalid_iri)| N3Token::PrefixedName {
322 prefix: pn_prefix,
323 local,
324 might_be_invalid_iri,
325 }),
326 ))
327 }
328
329 fn recognize_variable<'a>(
330 &self,
331 data: &'a [u8],
332 is_ending: bool,
333 ) -> Option<(usize, Result<N3Token<'a>, TokenRecognizerError>)> {
334 let (consumed, result) = self.recognize_optional_pn_local(&data[1..], is_ending)?;
336 Some((
337 consumed + 1,
338 result.and_then(|(name, _)| {
339 if name.is_empty() {
340 Err((0..consumed, "A variable name is not allowed to be empty").into())
341 } else {
342 Ok(N3Token::Variable(name))
343 }
344 }),
345 ))
346 }
347
348 fn recognize_optional_pn_local<'a>(
349 &self,
350 data: &'a [u8],
351 is_ending: bool,
352 ) -> Option<(usize, Result<(Cow<'a, str>, bool), TokenRecognizerError>)> {
353 let mut i = 0;
355 let mut buffer = None; let mut position_that_is_already_in_buffer = 0;
357 let mut might_be_invalid_iri = false;
358 let mut ends_with_unescaped_dot = 0;
359 loop {
360 if let Some(r) = Self::recognize_unicode_char(&data[i..], i) {
361 match r {
362 Ok((c, consumed)) => {
363 if c == '%' {
364 i += 1;
365 let a = char::from(*data.get(i)?);
366 i += 1;
367 let b = char::from(*data.get(i)?);
368 if !a.is_ascii_hexdigit() || !b.is_ascii_hexdigit() {
369 return Some((i + 1, Err((
370 i - 2..=i, format!("escapes in IRIs should be % followed by two hexadecimal characters, found '%{a}{b}'")
371 ).into())));
372 }
373 i += 1;
374 ends_with_unescaped_dot = 0;
375 } else if c == '\\' {
376 i += 1;
377 let a = char::from(*data.get(i)?);
378 if self.unchecked
379 || matches!(
380 a,
381 '_' | '~'
382 | '.'
383 | '-'
384 | '!'
385 | '$'
386 | '&'
387 | '\''
388 | '('
389 | ')'
390 | '*'
391 | '+'
392 | ','
393 | ';'
394 | '='
395 )
396 {
397 } else if matches!(a, '/' | '?' | '#' | '@' | '%') {
399 might_be_invalid_iri = true;
401 } else {
402 return Some((i + 1, Err((
403 i..=i, format!("The character that are allowed to be escaped in IRIs are _~.-!$&'()*+,;=/?#@%, found '{a}'")
404 ).into())));
405 }
406 let buffer = buffer.get_or_insert_with(String::new);
407 if i - position_that_is_already_in_buffer > 1 {
409 buffer.push_str(
410 match str_from_utf8(
411 &data[position_that_is_already_in_buffer..i - 1],
412 position_that_is_already_in_buffer..i - 1,
413 ) {
414 Ok(data) => data,
415 Err(e) => return Some((i, Err(e))),
416 },
417 )
418 }
419 buffer.push(a);
420 i += 1;
421 position_that_is_already_in_buffer = i;
422 ends_with_unescaped_dot = 0;
423 } else if i == 0 {
424 if !(Self::is_possible_pn_chars_u(c) || c == ':' || c.is_ascii_digit())
425 {
426 return Some((0, Ok((Cow::Borrowed(""), false))));
427 }
428 if !self.unchecked {
429 might_be_invalid_iri |=
430 Self::is_possible_pn_chars_base_but_not_valid_iri(c)
431 || c == ':';
432 }
433 i += consumed;
434 } else if Self::is_possible_pn_chars(c) || c == ':' {
435 if !self.unchecked {
436 might_be_invalid_iri |=
437 Self::is_possible_pn_chars_base_but_not_valid_iri(c)
438 || c == ':';
439 }
440 i += consumed;
441 ends_with_unescaped_dot = 0;
442 } else if c == '.' {
443 i += consumed;
444 ends_with_unescaped_dot += 1;
445 } else {
446 let buffer = if let Some(mut buffer) = buffer {
447 buffer.push_str(
448 match str_from_utf8(
449 &data[position_that_is_already_in_buffer..i],
450 position_that_is_already_in_buffer..i,
451 ) {
452 Ok(data) => data,
453 Err(e) => return Some((i, Err(e))),
454 },
455 );
456 for _ in 0..ends_with_unescaped_dot {
458 buffer.pop();
459 }
460 i -= ends_with_unescaped_dot;
461 Cow::Owned(buffer)
462 } else {
463 let mut data = match str_from_utf8(&data[..i], 0..i) {
464 Ok(data) => data,
465 Err(e) => return Some((i, Err(e))),
466 };
467 data = &data[..data.len() - ends_with_unescaped_dot];
469 i -= ends_with_unescaped_dot;
470 Cow::Borrowed(data)
471 };
472 return Some((i, Ok((buffer, might_be_invalid_iri))));
473 }
474 }
475 Err(e) => return Some((e.location.end, Err(e))),
476 }
477 } else if is_ending {
478 let buffer = if let Some(mut buffer) = buffer {
479 while buffer.ends_with('.') {
481 buffer.pop();
482 i -= 1;
483 }
484 Cow::Owned(buffer)
485 } else {
486 let mut data = match str_from_utf8(&data[..i], 0..i) {
487 Ok(data) => data,
488 Err(e) => return Some((i, Err(e))),
489 };
490 while let Some(d) = data.strip_suffix('.') {
492 data = d;
493 i -= 1;
494 }
495 Cow::Borrowed(data)
496 };
497 return Some((i, Ok((buffer, might_be_invalid_iri))));
498 } else {
499 return None;
500 }
501 }
502 }
503
504 fn recognize_blank_node_label(
505 data: &[u8],
506 ) -> Option<(usize, Result<N3Token<'_>, TokenRecognizerError>)> {
507 let mut i = 2;
509 loop {
510 match Self::recognize_unicode_char(&data[i..], i)? {
511 Ok((c, consumed)) => {
512 if (i == 2 && (Self::is_possible_pn_chars_u(c) || c.is_ascii_digit()))
513 || (i > 2 && Self::is_possible_pn_chars(c))
514 {
515 } else if i > 2 && c == '.' {
517 if data[i - 1] == b'.' {
518 i -= 1;
519 return Some((
520 i,
521 str_from_utf8(&data[2..i], 2..i).map(N3Token::BlankNodeLabel),
522 ));
523 }
524 } else if i == 2 {
525 return Some((
526 i,
527 Err((0..i, "A blank node ID should not be empty").into()),
528 ));
529 } else if data[i - 1] == b'.' {
530 i -= 1;
531 return Some((
532 i,
533 str_from_utf8(&data[2..i], 2..i).map(N3Token::BlankNodeLabel),
534 ));
535 } else {
536 return Some((
537 i,
538 str_from_utf8(&data[2..i], 2..i).map(N3Token::BlankNodeLabel),
539 ));
540 }
541 i += consumed;
542 }
543 Err(e) => return Some((e.location.end, Err(e))),
544 }
545 }
546 }
547
548 fn recognize_lang_tag<'a>(
549 &self,
550 data: &'a [u8],
551 ) -> Option<(usize, Result<N3Token<'a>, TokenRecognizerError>)> {
552 let mut is_last_block_empty = true;
554 for (i, c) in data[1..].iter().enumerate() {
555 if c.is_ascii_alphabetic() {
556 is_last_block_empty = false;
557 } else if i == 0 {
558 return Some((
559 1,
560 Err((1..2, "A language code should always start with a letter").into()),
561 ));
562 } else if is_last_block_empty {
563 return Some((i, self.parse_lang_tag(&data[1..i], 1..i - 1)));
564 } else if *c == b'-' {
565 is_last_block_empty = true;
566 } else {
567 return Some((i + 1, self.parse_lang_tag(&data[1..=i], 1..i)));
568 }
569 }
570 None
571 }
572
573 fn parse_lang_tag<'a>(
574 &self,
575 lang_tag: &'a [u8],
576 position: Range<usize>,
577 ) -> Result<N3Token<'a>, TokenRecognizerError> {
578 let lang_tag = str_from_utf8(lang_tag, position.clone())?;
579 Ok(N3Token::LangTag(if self.unchecked {
580 lang_tag
581 } else {
582 LanguageTag::parse(lang_tag)
583 .map_err(|e| (position.clone(), e.to_string()))?
584 .into_inner()
585 }))
586 }
587 fn recognize_string(
588 &self,
589 data: &[u8],
590 delimiter: u8,
591 ) -> Option<(usize, Result<N3Token<'static>, TokenRecognizerError>)> {
592 let mut string = String::new();
595 let mut i = 1;
596 loop {
597 let mut end = memchr2(delimiter, b'\\', &data[i..])?;
598 if !self.unchecked {
599 if let Some(line_jump_end) = memchr2(b'\n', b'\r', &data[i..i + end]) {
601 end = line_jump_end;
602 }
603 }
604 match str_from_utf8(&data[i..i + end], i..i + end) {
605 Ok(s) => string.push_str(s),
606 Err(e) => return Some((end, Err(e))),
607 };
608 i += end;
609 match data[i] {
610 c if c == delimiter => {
611 return Some((i + 1, Ok(N3Token::String(string))));
612 }
613 b'\\' => {
614 let (additional, c) = self.recognize_escape(&data[i..], i, true)?;
615 i += additional + 1;
616 match c {
617 Ok(c) => {
618 string.push(c);
619 }
620 Err(e) => {
621 let end = memchr(delimiter, &data[i..])?;
623 return Some((i + end + 1, Err(e)));
624 }
625 }
626 }
627 b'\n' | b'\r' => {
628 let end = memchr(delimiter, &data[i..])?;
630 return Some((
631 i + end + 1,
632 Err((
633 i..i + 1,
634 "Line jumps are not allowed in string literals, use \\n",
635 )
636 .into()),
637 ));
638 }
639 _ => unreachable!(),
640 }
641 }
642 }
643
644 fn recognize_long_string(
645 &self,
646 data: &[u8],
647 delimiter: u8,
648 ) -> Option<(usize, Result<N3Token<'static>, TokenRecognizerError>)> {
649 let mut string = String::new();
652 let mut i = 3;
653 loop {
654 let end = memchr2(delimiter, b'\\', &data[i..])?;
655 match str_from_utf8(&data[i..i + end], i..i + end) {
656 Ok(s) => string.push_str(s),
657 Err(e) => return Some((end, Err(e))),
658 };
659 i += end;
660 match data[i] {
661 c if c == delimiter => {
662 if *data.get(i + 1)? == delimiter && *data.get(i + 2)? == delimiter {
663 return Some((i + 3, Ok(N3Token::String(string))));
664 }
665 i += 1;
666 string.push(char::from(delimiter));
667 }
668 b'\\' => {
669 let (additional, c) = self.recognize_escape(&data[i..], i, true)?;
670 i += additional + 1;
671 match c {
672 Ok(c) => {
673 string.push(c);
674 }
675 Err(e) => return Some((i, Err(e))),
676 }
677 }
678 _ => unreachable!(),
679 }
680 }
681 }
682
683 fn recognize_number(
684 data: &[u8],
685 is_ending: bool,
686 ) -> Option<(usize, Result<N3Token<'_>, TokenRecognizerError>)> {
687 let mut i = 0;
692 let c = *data.first()?;
693 if matches!(c, b'+' | b'-') {
694 i += 1;
695 }
696 let count_before = Self::recognize_digits(&data[i..], is_ending)?;
698 i += count_before;
699
700 let c = if let Some(c) = data.get(i) {
702 Some(c)
703 } else if is_ending {
704 None
705 } else {
706 return None;
707 };
708 let count_after = if c == Some(&b'.') {
709 i += 1;
710 let count_after = Self::recognize_digits(&data[i..], is_ending)?;
711 i += count_after;
712 Some(count_after)
713 } else {
714 None
715 };
716
717 let c = if let Some(c) = data.get(i) {
719 Some(c)
720 } else if is_ending {
721 None
722 } else {
723 return None;
724 };
725 if matches!(c, Some(b'e' | b'E')) {
726 i += 1;
727
728 let c = if let Some(c) = data.get(i) {
729 Some(c)
730 } else if is_ending {
731 None
732 } else {
733 return None;
734 };
735 if matches!(c, Some(b'+' | b'-')) {
736 i += 1;
737 }
738
739 let count_exp = Self::recognize_digits(&data[i..], is_ending)?;
740 i += count_exp;
741 Some((
742 i,
743 if count_exp == 0 {
744 Err((0..i, "A double exponent cannot be empty").into())
745 } else if count_before == 0 && count_after.unwrap_or(0) == 0 {
746 Err((0..i, "A double should not be empty").into())
747 } else {
748 str_from_utf8(&data[..i], 0..i).map(N3Token::Double)
749 },
750 ))
751 } else if let Some(count_after) = count_after {
752 if count_after == 0 {
753 i -= 1;
755 Some((
756 i,
757 if count_before == 0 {
758 Err((0..i, "An integer should not be empty").into())
759 } else {
760 str_from_utf8(&data[..i], 0..i).map(N3Token::Integer)
761 },
762 ))
763 } else {
764 Some((i, str_from_utf8(&data[..i], 0..i).map(N3Token::Decimal)))
765 }
766 } else {
767 Some((
768 i,
769 if count_before == 0 {
770 Err((0..i, "An integer should not be empty").into())
771 } else {
772 str_from_utf8(&data[..i], 0..i).map(N3Token::Integer)
773 },
774 ))
775 }
776 }
777
778 fn recognize_digits(data: &[u8], is_ending: bool) -> Option<usize> {
779 for (i, c) in data.iter().enumerate() {
780 if !c.is_ascii_digit() {
781 return Some(i);
782 }
783 }
784 is_ending.then_some(data.len())
785 }
786
787 fn recognize_escape(
788 &self,
789 data: &[u8],
790 position: usize,
791 with_echar: bool,
792 ) -> Option<(usize, Result<char, TokenRecognizerError>)> {
793 match *data.get(1)? {
796 b'u' => match Self::recognize_hex_char(&data[2..], 4, 'u', position) {
797 Ok(c) => Some((5, Ok(c?))),
798 Err(e) => {
799 if self.unchecked {
800 match Self::recognize_utf16_surrogate_pair(&data[2..], position) {
801 Ok(c) => Some((11, Ok(c?))),
802 Err(e) => Some((5, Err(e))),
803 }
804 } else {
805 Some((5, Err(e)))
806 }
807 }
808 },
809 b'U' => match Self::recognize_hex_char(&data[2..], 8, 'U', position) {
810 Ok(c) => Some((9, Ok(c?))),
811 Err(e) => Some((9, Err(e))),
812 },
813 b't' if with_echar => Some((1, Ok('\t'))),
814 b'b' if with_echar => Some((1, Ok('\x08'))),
815 b'n' if with_echar => Some((1, Ok('\n'))),
816 b'r' if with_echar => Some((1, Ok('\r'))),
817 b'f' if with_echar => Some((1, Ok('\x0C'))),
818 b'"' if with_echar => Some((1, Ok('"'))),
819 b'\'' if with_echar => Some((1, Ok('\''))),
820 b'\\' if with_echar => Some((1, Ok('\\'))),
821 c => Some((
822 1,
823 Err((
824 position..position + 2,
825 format!("Unexpected escape character '\\{}'", char::from(c)),
826 )
827 .into()),
828 )), }
830 }
831
832 fn recognize_hex_char(
833 data: &[u8],
834 len: usize,
835 escape_char: char,
836 position: usize,
837 ) -> Result<Option<char>, TokenRecognizerError> {
838 if data.len() < len {
839 return Ok(None);
840 };
841 let mut codepoint = 0;
842 for i in 0..len {
843 let c = data[i];
844 codepoint = codepoint * 16
845 + u32::from(match c {
846 b'0'..=b'9' => c - b'0',
847 b'a'..=b'f' => c - b'a' + 10,
848 b'A'..=b'F' => c - b'A' + 10,
849 _ => {
850 let val = str::from_utf8(&data[..len]).unwrap_or_default();
851 return Err((
852 position + i + 2..position + i + 3,
853 format!(
854 "The escape sequence '\\{escape_char}{val}' is not a valid hexadecimal string"
855 ),
856 ).into());
857 }
858 });
859 }
860 let c = char::from_u32(codepoint).ok_or_else(|| {
861 let val = str::from_utf8(&data[..len]).unwrap_or_default();
862 (
863 position..position + len +2,
864 format!(
865 "The escape sequence '\\{escape_char}{val}' is encoding {codepoint:X} that is not a valid unicode character",
866 ),
867 )
868 })?;
869 Ok(Some(c))
870 }
871
872 fn recognize_utf16_surrogate_pair(
873 data: &[u8],
874 position: usize,
875 ) -> Result<Option<char>, TokenRecognizerError> {
876 let Some(val_high_slice) = data.get(..4) else {
877 return Ok(None);
878 };
879 let val_high = str_from_utf8(val_high_slice, position..position + 6)?;
880 let surrogate_high = u16::from_str_radix(val_high, 16).map_err(|e| {
881 (
882 position..position + 6,
883 format!(
884 "The escape sequence '\\u{val_high}' is not a valid hexadecimal string: {e}"
885 ),
886 )
887 })?;
888
889 if !matches!(surrogate_high, 0xD800..=0xDFFF) {
891 return Err((
892 position..position + 6,
893 format!("The escape sequence '\\u{val_high}' is not a UTF-16 surrogate"),
894 )
895 .into());
896 }
897 let Some(&d4) = data.get(4) else {
898 return Ok(None);
899 };
900 let Some(&d5) = data.get(5) else {
901 return Ok(None);
902 };
903 if d4 != b'\\' || d5 != b'u' {
904 return Err((
905 position..position + 6,
906 format!(
907 "UTF-16 surrogate escape sequence '\\u{val_high}' must be followed by another surrogate escape sequence"),
908 )
909 .into());
910 }
911
912 let Some(val_low_slice) = data.get(6..10) else {
913 return Ok(None);
914 };
915 let val_low = str_from_utf8(val_low_slice, position + 6..position + 12)?;
916 let surrogate_low = u16::from_str_radix(val_low, 16).map_err(|e| {
917 (
918 position + 6..position + 12,
919 format!(
920 "The escape sequence '\\u{val_low}' is not a valid hexadecimal string: {e}"
921 ),
922 )
923 })?;
924
925 let mut chars = char::decode_utf16([surrogate_high, surrogate_low]);
926
927 let c = chars.next()
928 .and_then(Result::ok)
929 .ok_or_else(|| {
930 (
931 position..position + 12,
932 format!(
933 "Escape sequences '\\u{val_high}\\u{val_low}' do not form a valid UTF-16 surrogate pair"
934 ),
935 )
936 })?;
937
938 debug_assert_eq!(
939 chars.next(),
940 None,
941 "Surrogate pair should combine to exactly one character"
942 );
943
944 Ok(Some(c))
945 }
946
947 fn recognize_unicode_char(
948 data: &[u8],
949 position: usize,
950 ) -> Option<Result<(char, usize), TokenRecognizerError>> {
951 let mut code_point: u32;
952 let bytes_needed: usize;
953 let mut lower_boundary = 0x80;
954 let mut upper_boundary = 0xBF;
955
956 let byte = *data.first()?;
957 match byte {
958 0x00..=0x7F => return Some(Ok((char::from(byte), 1))),
959 0xC2..=0xDF => {
960 bytes_needed = 1;
961 code_point = u32::from(byte) & 0x1F;
962 }
963 0xE0..=0xEF => {
964 if byte == 0xE0 {
965 lower_boundary = 0xA0;
966 }
967 if byte == 0xED {
968 upper_boundary = 0x9F;
969 }
970 bytes_needed = 2;
971 code_point = u32::from(byte) & 0xF;
972 }
973 0xF0..=0xF4 => {
974 if byte == 0xF0 {
975 lower_boundary = 0x90;
976 }
977 if byte == 0xF4 {
978 upper_boundary = 0x8F;
979 }
980 bytes_needed = 3;
981 code_point = u32::from(byte) & 0x7;
982 }
983 _ => {
984 return Some(Err((
985 position..=position,
986 "Invalid UTF-8 character encoding",
987 )
988 .into()))
989 }
990 }
991
992 for i in 1..=bytes_needed {
993 let byte = *data.get(i)?;
994 if byte < lower_boundary || upper_boundary < byte {
995 return Some(Err((
996 position..=position + i,
997 "Invalid UTF-8 character encoding",
998 )
999 .into()));
1000 }
1001 lower_boundary = 0x80;
1002 upper_boundary = 0xBF;
1003 code_point = (code_point << 6) | (u32::from(byte) & 0x3F);
1004 }
1005
1006 Some(
1007 char::from_u32(code_point)
1008 .map(|c| (c, bytes_needed + 1))
1009 .ok_or_else(|| {
1010 (
1011 position..=position + bytes_needed,
1012 format!("The codepoint {code_point:X} is not a valid unicode character"),
1013 )
1014 .into()
1015 }),
1016 )
1017 }
1018
1019 fn is_possible_pn_chars_base(c: char) -> bool {
1021 matches!(c,
1022 'A'..='Z'
1023 | 'a'..='z'
1024 | '\u{00C0}'..='\u{00D6}'
1025 | '\u{00D8}'..='\u{00F6}'
1026 | '\u{00F8}'..='\u{02FF}'
1027 | '\u{0370}'..='\u{037D}'
1028 | '\u{037F}'..='\u{1FFF}'
1029 | '\u{200C}'..='\u{200D}'
1030 | '\u{2070}'..='\u{218F}'
1031 | '\u{2C00}'..='\u{2FEF}'
1032 | '\u{3001}'..='\u{D7FF}'
1033 | '\u{F900}'..='\u{FDCF}'
1034 | '\u{FDF0}'..='\u{FFFD}'
1035 | '\u{10000}'..='\u{EFFFF}')
1036 }
1037
1038 pub(super) fn is_possible_pn_chars_u(c: char) -> bool {
1040 Self::is_possible_pn_chars_base(c) || c == '_'
1041 }
1042
1043 pub(crate) fn is_possible_pn_chars(c: char) -> bool {
1045 Self::is_possible_pn_chars_u(c)
1046 || matches!(c,
1047 '-' | '0'..='9' | '\u{00B7}' | '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}')
1048 }
1049
1050 fn is_possible_pn_chars_base_but_not_valid_iri(c: char) -> bool {
1051 matches!(c, '\u{FFF0}'..='\u{FFFD}')
1052 || u32::from(c) % u32::from('\u{FFFE}') == 0
1053 || u32::from(c) % u32::from('\u{FFFF}') == 0
1054 }
1055}
1056
1057pub fn resolve_local_name(
1058 prefix: &str,
1059 local: &str,
1060 might_be_invalid_iri: bool,
1061 prefixes: &HashMap<String, Iri<String>>,
1062) -> Result<NamedNode, String> {
1063 if let Some(start) = prefixes.get(prefix) {
1064 let iri = format!("{start}{local}");
1065 if might_be_invalid_iri || start.path().is_empty() {
1066 if let Err(e) = Iri::parse(iri.as_str()) {
1068 return Err(format!(
1069 "The prefixed name {prefix}:{local} builds IRI {iri} that is invalid: {e}"
1070 ));
1071 }
1072 }
1073 Ok(NamedNode::new_unchecked(iri))
1074 } else {
1075 Err(format!("The prefix {prefix}: has not been declared"))
1076 }
1077}
1078
1079fn str_from_utf8(data: &[u8], range: Range<usize>) -> Result<&str, TokenRecognizerError> {
1080 str::from_utf8(data).map_err(|e| {
1081 (
1082 range.start + e.valid_up_to()..min(range.end, range.start + e.valid_up_to() + 4),
1083 format!("Invalid UTF-8: {e}"),
1084 )
1085 .into()
1086 })
1087}
1088
1089fn string_from_utf8(data: Vec<u8>, range: Range<usize>) -> Result<String, TokenRecognizerError> {
1090 String::from_utf8(data).map_err(|e| {
1091 (
1092 range.start + e.utf8_error().valid_up_to()
1093 ..min(range.end, range.start + e.utf8_error().valid_up_to() + 4),
1094 format!("Invalid UTF-8: {e}"),
1095 )
1096 .into()
1097 })
1098}