rio_turtle/
shared.rs

1use crate::error::*;
2use crate::utils::*;
3use oxilangtag::LanguageTag;
4use oxiri::Iri;
5use rio_api::model::*;
6use std::char;
7use std::io::BufRead;
8
/// Largest byte value that is an ASCII character (`0x7F`).
///
/// The parsers in this file compare the current byte against this constant to take a fast path
/// that converts the byte with `char::from` instead of running the UTF-8 decoder.
pub const MAX_ASCII: u8 = 0x7F;
10
11pub fn parse_iriref_absolute<'a>(
12    read: &mut LookAheadByteReader<impl BufRead>,
13    buffer: &'a mut String,
14) -> Result<NamedNode<'a>, TurtleError> {
15    parse_iriref(read, buffer)?;
16    Iri::parse(buffer.as_str()).map_err(|error| {
17        read.parse_error(TurtleErrorKind::InvalidIri {
18            iri: buffer.to_owned(),
19            error,
20        })
21    })?;
22    Ok(NamedNode { iri: buffer })
23}
24
25pub fn parse_iriref_relative<'a>(
26    read: &mut LookAheadByteReader<impl BufRead>,
27    buffer: &'a mut String,
28    temp_buffer: &mut String,
29    base_iri: &Option<Iri<String>>,
30) -> Result<NamedNode<'a>, TurtleError> {
31    if let Some(base_iri) = base_iri {
32        parse_iriref(read, temp_buffer)?;
33        let result = base_iri.resolve_into(temp_buffer, buffer).map_err(|error| {
34            read.parse_error(TurtleErrorKind::InvalidIri {
35                iri: temp_buffer.to_owned(),
36                error,
37            })
38        });
39        temp_buffer.clear();
40        result.map(move |_| NamedNode { iri: buffer })
41    } else {
42        parse_iriref_absolute(read, buffer)
43    }
44}
45
46pub fn parse_iriref(
47    read: &mut LookAheadByteReader<impl BufRead>,
48    buffer: &mut String,
49) -> Result<(), TurtleError> {
50    // [18] 	IRIREF 	::= 	'<' ([^#x00-#x20<>"{}|^`\] | UCHAR)* '>' /* #x00=NULL #01-#x1F=control codes #x20=space */
51    // Most of the validation is done by the IRI parser
52    read.check_is_current(b'<')?;
53    loop {
54        read.consume()?;
55        match read.current() {
56            None | Some(b'\n') | Some(b'\r') => read.unexpected_char_error()?,
57            Some(b'>') => {
58                read.consume()?;
59                return Ok(());
60            }
61            Some(b'\\') => {
62                read.consume()?;
63                buffer.push(match read.current() {
64                    Some(b'u') => read_hexa_char(read, 4)?,
65                    Some(b'U') => read_hexa_char(read, 8)?,
66                    _ => read.unexpected_char_error()?,
67                });
68            }
69            Some(c) => buffer.push(if c <= MAX_ASCII {
70                char::from(c) //optimization to avoid UTF-8 decoding
71            } else {
72                read_utf8_char(read)?
73            }),
74        }
75    }
76}
77
78pub fn parse_blank_node_label<'a>(
79    read: &mut LookAheadByteReader<impl BufRead>,
80    buffer: &'a mut String,
81) -> Result<BlankNode<'a>, TurtleError> {
82    // [141s] 	BLANK_NODE_LABEL 	::= 	'_:' (PN_CHARS_U | [0-9]) ((PN_CHARS | '.')* PN_CHARS)?
83    read.check_is_current(b'_')?;
84    read.consume()?;
85    read.check_is_current(b':')?;
86    read.consume()?;
87
88    let c = read.required_current()?;
89    if c <= MAX_ASCII && (is_possible_pn_chars_u_ascii(c) || c.is_ascii_digit()) {
90        buffer.push(char::from(c))
91    } else {
92        let c = read_utf8_char(read)?;
93        if is_possible_pn_chars_u_unicode(c) {
94            buffer.push(c);
95        } else {
96            read.unexpected_char_error()?
97        }
98    }
99
100    loop {
101        read.consume()?;
102        match read.current() {
103            Some(b'.') => match read.next()? {
104                Some(c) if is_possible_pn_chars_ascii(c) || c > MAX_ASCII => buffer.push('.'),
105                _ => {
106                    return Ok(BlankNode { id: buffer });
107                }
108            },
109            Some(c) if c < MAX_ASCII && is_possible_pn_chars_ascii(c) => buffer.push(char::from(c)),
110            _ => {
111                let c = read_utf8_char(read)?;
112                if is_possible_pn_chars_unicode(c) {
113                    buffer.push(c);
114                } else {
115                    return Ok(BlankNode { id: buffer });
116                }
117            }
118        }
119    }
120}
121
122pub fn parse_langtag(
123    read: &mut LookAheadByteReader<impl BufRead>,
124    buffer: &mut String,
125) -> Result<(), TurtleError> {
126    // [144s] 	LANGTAG 	::= 	'@' [a-zA-Z]+ ('-' [a-zA-Z0-9]+)*
127    read.check_is_current(b'@')?;
128    read.consume()?;
129
130    while let Some(c) = read.current() {
131        match c {
132            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'-' => {
133                buffer.push(char::from(c).to_ascii_lowercase());
134                read.consume()?;
135            }
136            _ => {
137                break;
138            }
139        }
140    }
141    LanguageTag::parse(buffer.as_str()).map_err(|error| {
142        read.parse_error(TurtleErrorKind::InvalidLanguageTag {
143            tag: buffer.to_owned(),
144            error,
145        })
146    })?;
147    Ok(())
148}
149
150pub fn parse_string_literal_quote(
151    read: &mut LookAheadByteReader<impl BufRead>,
152    buffer: &mut String,
153) -> Result<(), TurtleError> {
154    // [22] 	STRING_LITERAL_QUOTE 	::= 	'"' ([^#x22#x5C#xA#xD] | ECHAR | UCHAR)* '"' /* #x22=" #x5C=\ #xA=new line #xD=carriage return */
155    parse_string_literal_quote_inner(read, buffer, b'"')
156}
157
158pub fn parse_string_literal_quote_inner(
159    read: &mut LookAheadByteReader<impl BufRead>,
160    buffer: &mut String,
161    quote: u8,
162) -> Result<(), TurtleError> {
163    read.check_is_current(quote)?;
164    loop {
165        read.consume()?;
166        match read.required_current()? {
167            c if c == quote => {
168                read.consume()?;
169                return Ok(());
170            }
171            b'\\' => parse_echar_or_uchar(read, buffer)?,
172            b'\n' | b'\r' => read.unexpected_char_error()?,
173            c => buffer.push(if c <= MAX_ASCII {
174                char::from(c) //optimization to avoid UTF-8 decoding
175            } else {
176                read_utf8_char(read)?
177            }),
178        }
179    }
180}
181
182pub fn parse_echar_or_uchar(
183    read: &mut LookAheadByteReader<impl BufRead>,
184    buffer: &mut String,
185) -> Result<(), TurtleError> {
186    read.check_is_current(b'\\')?;
187    read.consume()?;
188    match read.required_current()? {
189        b't' => buffer.push('\t'),
190        b'b' => buffer.push('\u{8}'),
191        b'n' => buffer.push('\n'),
192        b'r' => buffer.push('\r'),
193        b'f' => buffer.push('\u{C}'),
194        b'"' => buffer.push('"'),
195        b'\'' => buffer.push('\''),
196        b'\\' => buffer.push('\\'),
197        b'u' => buffer.push(read_hexa_char(read, 4)?),
198        b'U' => buffer.push(read_hexa_char(read, 8)?),
199        _ => read.unexpected_char_error()?,
200    }
201    Ok(())
202}
203
204pub(crate) fn read_hexa_char(
205    read: &mut LookAheadByteReader<impl BufRead>,
206    len: usize,
207) -> Result<char, TurtleError> {
208    let point = read_hexa_u32(read, len)?;
209    char::from_u32(point)
210        .ok_or_else(|| read.parse_error(TurtleErrorKind::InvalidUnicodeCodePoint(point)))
211}
212
213fn read_hexa_u32(
214    read: &mut LookAheadByteReader<impl BufRead>,
215    len: usize,
216) -> Result<u32, TurtleError> {
217    let mut value = 0;
218    for _ in 0..len {
219        read.consume()?;
220        if let Some(d) = convert_hexa_byte(read.required_current()?) {
221            value = value * 16 + u32::from(d);
222        } else {
223            read.unexpected_char_error()?;
224        };
225    }
226    Ok(value)
227}
228
/// Returns the numeric value (0-15) of an ASCII hexadecimal digit, or `None` if `c` is not one.
///
/// Both lower-case and upper-case letters are accepted.
fn convert_hexa_byte(c: u8) -> Option<u8> {
    // `offset` is the byte that maps to value 0 within each family of digits.
    let offset = match c {
        b'0'..=b'9' => b'0',
        b'a'..=b'f' => b'a' - 10,
        b'A'..=b'F' => b'A' - 10,
        _ => return None,
    };
    Some(c - offset)
}
237
// [157s] PN_CHARS_BASE ::= [A-Z] | [a-z] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
/// ASCII part of `PN_CHARS_BASE`: returns `true` exactly for the bytes `A`-`Z` and `a`-`z`.
pub fn is_possible_pn_chars_base_ascii(c: u8) -> bool {
    matches!(c, b'A'..=b'Z' | b'a'..=b'z')
}
242
// [157s] PN_CHARS_BASE ::= [A-Z] | [a-z] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
/// Returns `true` when `c` falls in one of the `PN_CHARS_BASE` ranges listed above.
///
/// The ranges are written as numeric code points so they can be compared one-to-one with the
/// hexadecimal values of the grammar production.
pub fn is_possible_pn_chars_base_unicode(c: char) -> bool {
    match u32::from(c) {
        0x0041..=0x005A // [A-Z]
        | 0x0061..=0x007A // [a-z]
        | 0x00C0..=0x00D6
        | 0x00D8..=0x00F6
        | 0x00F8..=0x02FF
        | 0x0370..=0x037D
        | 0x037F..=0x1FFF
        | 0x200C..=0x200D
        | 0x2070..=0x218F
        | 0x2C00..=0x2FEF
        | 0x3001..=0xD7FF
        | 0xF900..=0xFDCF
        | 0xFDF0..=0xFFFD
        | 0x1_0000..=0xE_FFFF => true,
        _ => false,
    }
}
261
262// [158s] 	PN_CHARS_U 	::= 	PN_CHARS_BASE | '_' | ':'
263pub fn is_possible_pn_chars_u_ascii(c: u8) -> bool {
264    is_possible_pn_chars_base_ascii(c) || c == b'_'
265}
266
267// [158s] 	PN_CHARS_U 	::= 	PN_CHARS_BASE | '_' | ':'
268pub fn is_possible_pn_chars_u_unicode(c: char) -> bool {
269    is_possible_pn_chars_base_unicode(c) || c == '_'
270}
271
272// [160s] 	PN_CHARS 	::= 	PN_CHARS_U | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040]
273pub fn is_possible_pn_chars_ascii(c: u8) -> bool {
274    is_possible_pn_chars_u_ascii(c) || matches!(c, b'-' | b'0'..=b'9' | 0x00B7)
275}
276
277// [160s] 	PN_CHARS 	::= 	PN_CHARS_U | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040]
278pub fn is_possible_pn_chars_unicode(c: char) -> bool {
279    is_possible_pn_chars_u_unicode(c)
280        || matches!(c, 
281        '-' | '0'..='9' | '\u{00B7}' | '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}')
282}
283
284/// Algorithm from https://encoding.spec.whatwg.org/#utf-8-decoder
285pub fn read_utf8_char(read: &mut LookAheadByteReader<impl BufRead>) -> Result<char, TurtleError> {
286    let mut code_point: u32;
287    let bytes_needed: usize;
288    let mut lower_boundary = 0x80;
289    let mut upper_boundary = 0xBF;
290
291    let byte = read.required_current()?;
292    match byte {
293        0x00..=0x7F => return Ok(char::from(byte)),
294        0xC2..=0xDF => {
295            bytes_needed = 1;
296            code_point = u32::from(byte) & 0x1F;
297        }
298        0xE0..=0xEF => {
299            if byte == 0xE0 {
300                lower_boundary = 0xA0;
301            }
302            if byte == 0xED {
303                upper_boundary = 0x9F;
304            }
305            bytes_needed = 2;
306            code_point = u32::from(byte) & 0xF;
307        }
308        0xF0..=0xF4 => {
309            if byte == 0xF0 {
310                lower_boundary = 0x90;
311            }
312            if byte == 0xF4 {
313                upper_boundary = 0x8F;
314            }
315            bytes_needed = 3;
316            code_point = u32::from(byte) & 0x7;
317        }
318        _ => return read.unexpected_char_error(),
319    }
320
321    for _ in 0..bytes_needed {
322        read.consume()?;
323        let byte = read.required_current()?;
324        if byte < lower_boundary || upper_boundary < byte {
325            return read.unexpected_char_error();
326        }
327        lower_boundary = 0x80;
328        upper_boundary = 0xBF;
329        code_point = (code_point << 6) | (u32::from(byte) & 0x3F);
330    }
331
332    char::from_u32(code_point)
333        .ok_or_else(|| read.parse_error(TurtleErrorKind::InvalidUnicodeCodePoint(code_point)))
334}