1use crate::error::*;
2use crate::utils::*;
3use oxilangtag::LanguageTag;
4use oxiri::Iri;
5use rio_api::model::*;
6use std::char;
7use std::io::BufRead;
8
/// Highest byte value that is still a single ASCII character (`0x7F`).
///
/// The parsers in this file compare the current byte against this bound to
/// decide whether it can be pushed directly as a `char` or has to go through
/// [`read_utf8_char`] as the start of a multi-byte UTF-8 sequence.
pub const MAX_ASCII: u8 = 0x7F;
10
11pub fn parse_iriref_absolute<'a>(
12 read: &mut LookAheadByteReader<impl BufRead>,
13 buffer: &'a mut String,
14) -> Result<NamedNode<'a>, TurtleError> {
15 parse_iriref(read, buffer)?;
16 Iri::parse(buffer.as_str()).map_err(|error| {
17 read.parse_error(TurtleErrorKind::InvalidIri {
18 iri: buffer.to_owned(),
19 error,
20 })
21 })?;
22 Ok(NamedNode { iri: buffer })
23}
24
25pub fn parse_iriref_relative<'a>(
26 read: &mut LookAheadByteReader<impl BufRead>,
27 buffer: &'a mut String,
28 temp_buffer: &mut String,
29 base_iri: &Option<Iri<String>>,
30) -> Result<NamedNode<'a>, TurtleError> {
31 if let Some(base_iri) = base_iri {
32 parse_iriref(read, temp_buffer)?;
33 let result = base_iri.resolve_into(temp_buffer, buffer).map_err(|error| {
34 read.parse_error(TurtleErrorKind::InvalidIri {
35 iri: temp_buffer.to_owned(),
36 error,
37 })
38 });
39 temp_buffer.clear();
40 result.map(move |_| NamedNode { iri: buffer })
41 } else {
42 parse_iriref_absolute(read, buffer)
43 }
44}
45
46pub fn parse_iriref(
47 read: &mut LookAheadByteReader<impl BufRead>,
48 buffer: &mut String,
49) -> Result<(), TurtleError> {
50 read.check_is_current(b'<')?;
53 loop {
54 read.consume()?;
55 match read.current() {
56 None | Some(b'\n') | Some(b'\r') => read.unexpected_char_error()?,
57 Some(b'>') => {
58 read.consume()?;
59 return Ok(());
60 }
61 Some(b'\\') => {
62 read.consume()?;
63 buffer.push(match read.current() {
64 Some(b'u') => read_hexa_char(read, 4)?,
65 Some(b'U') => read_hexa_char(read, 8)?,
66 _ => read.unexpected_char_error()?,
67 });
68 }
69 Some(c) => buffer.push(if c <= MAX_ASCII {
70 char::from(c) } else {
72 read_utf8_char(read)?
73 }),
74 }
75 }
76}
77
78pub fn parse_blank_node_label<'a>(
79 read: &mut LookAheadByteReader<impl BufRead>,
80 buffer: &'a mut String,
81) -> Result<BlankNode<'a>, TurtleError> {
82 read.check_is_current(b'_')?;
84 read.consume()?;
85 read.check_is_current(b':')?;
86 read.consume()?;
87
88 let c = read.required_current()?;
89 if c <= MAX_ASCII && (is_possible_pn_chars_u_ascii(c) || c.is_ascii_digit()) {
90 buffer.push(char::from(c))
91 } else {
92 let c = read_utf8_char(read)?;
93 if is_possible_pn_chars_u_unicode(c) {
94 buffer.push(c);
95 } else {
96 read.unexpected_char_error()?
97 }
98 }
99
100 loop {
101 read.consume()?;
102 match read.current() {
103 Some(b'.') => match read.next()? {
104 Some(c) if is_possible_pn_chars_ascii(c) || c > MAX_ASCII => buffer.push('.'),
105 _ => {
106 return Ok(BlankNode { id: buffer });
107 }
108 },
109 Some(c) if c < MAX_ASCII && is_possible_pn_chars_ascii(c) => buffer.push(char::from(c)),
110 _ => {
111 let c = read_utf8_char(read)?;
112 if is_possible_pn_chars_unicode(c) {
113 buffer.push(c);
114 } else {
115 return Ok(BlankNode { id: buffer });
116 }
117 }
118 }
119 }
120}
121
122pub fn parse_langtag(
123 read: &mut LookAheadByteReader<impl BufRead>,
124 buffer: &mut String,
125) -> Result<(), TurtleError> {
126 read.check_is_current(b'@')?;
128 read.consume()?;
129
130 while let Some(c) = read.current() {
131 match c {
132 b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'-' => {
133 buffer.push(char::from(c).to_ascii_lowercase());
134 read.consume()?;
135 }
136 _ => {
137 break;
138 }
139 }
140 }
141 LanguageTag::parse(buffer.as_str()).map_err(|error| {
142 read.parse_error(TurtleErrorKind::InvalidLanguageTag {
143 tag: buffer.to_owned(),
144 error,
145 })
146 })?;
147 Ok(())
148}
149
150pub fn parse_string_literal_quote(
151 read: &mut LookAheadByteReader<impl BufRead>,
152 buffer: &mut String,
153) -> Result<(), TurtleError> {
154 parse_string_literal_quote_inner(read, buffer, b'"')
156}
157
158pub fn parse_string_literal_quote_inner(
159 read: &mut LookAheadByteReader<impl BufRead>,
160 buffer: &mut String,
161 quote: u8,
162) -> Result<(), TurtleError> {
163 read.check_is_current(quote)?;
164 loop {
165 read.consume()?;
166 match read.required_current()? {
167 c if c == quote => {
168 read.consume()?;
169 return Ok(());
170 }
171 b'\\' => parse_echar_or_uchar(read, buffer)?,
172 b'\n' | b'\r' => read.unexpected_char_error()?,
173 c => buffer.push(if c <= MAX_ASCII {
174 char::from(c) } else {
176 read_utf8_char(read)?
177 }),
178 }
179 }
180}
181
182pub fn parse_echar_or_uchar(
183 read: &mut LookAheadByteReader<impl BufRead>,
184 buffer: &mut String,
185) -> Result<(), TurtleError> {
186 read.check_is_current(b'\\')?;
187 read.consume()?;
188 match read.required_current()? {
189 b't' => buffer.push('\t'),
190 b'b' => buffer.push('\u{8}'),
191 b'n' => buffer.push('\n'),
192 b'r' => buffer.push('\r'),
193 b'f' => buffer.push('\u{C}'),
194 b'"' => buffer.push('"'),
195 b'\'' => buffer.push('\''),
196 b'\\' => buffer.push('\\'),
197 b'u' => buffer.push(read_hexa_char(read, 4)?),
198 b'U' => buffer.push(read_hexa_char(read, 8)?),
199 _ => read.unexpected_char_error()?,
200 }
201 Ok(())
202}
203
204pub(crate) fn read_hexa_char(
205 read: &mut LookAheadByteReader<impl BufRead>,
206 len: usize,
207) -> Result<char, TurtleError> {
208 let point = read_hexa_u32(read, len)?;
209 char::from_u32(point)
210 .ok_or_else(|| read.parse_error(TurtleErrorKind::InvalidUnicodeCodePoint(point)))
211}
212
213fn read_hexa_u32(
214 read: &mut LookAheadByteReader<impl BufRead>,
215 len: usize,
216) -> Result<u32, TurtleError> {
217 let mut value = 0;
218 for _ in 0..len {
219 read.consume()?;
220 if let Some(d) = convert_hexa_byte(read.required_current()?) {
221 value = value * 16 + u32::from(d);
222 } else {
223 read.unexpected_char_error()?;
224 };
225 }
226 Ok(value)
227}
228
/// Returns the numeric value (0–15) of an ASCII hexadecimal digit, or `None`
/// if `c` is not one of `0-9`, `a-f`, `A-F`.
fn convert_hexa_byte(c: u8) -> Option<u8> {
    // `to_digit(16)` yields values below 16, so narrowing to `u8` is lossless.
    char::from(c).to_digit(16).map(|digit| digit as u8)
}
237
/// Returns `true` if the byte is an ASCII letter (`A-Z` or `a-z`), i.e. the
/// ASCII part of the `PN_CHARS_BASE` character class.
pub fn is_possible_pn_chars_base_ascii(c: u8) -> bool {
    matches!(c, b'A'..=b'Z' | b'a'..=b'z')
}
242
/// Returns `true` if `c` belongs to the `PN_CHARS_BASE` character class:
/// ASCII letters plus the Unicode ranges listed below.
pub fn is_possible_pn_chars_base_unicode(c: char) -> bool {
    // Matching on the numeric code point keeps the table aligned with the
    // hexadecimal ranges as they are usually written in grammars.
    match u32::from(c) {
        0x0041..=0x005A // 'A'..='Z'
        | 0x0061..=0x007A // 'a'..='z'
        | 0x00C0..=0x00D6
        | 0x00D8..=0x00F6
        | 0x00F8..=0x02FF
        | 0x0370..=0x037D
        | 0x037F..=0x1FFF
        | 0x200C..=0x200D
        | 0x2070..=0x218F
        | 0x2C00..=0x2FEF
        | 0x3001..=0xD7FF
        | 0xF900..=0xFDCF
        | 0xFDF0..=0xFFFD
        | 0x1_0000..=0xE_FFFF => true,
        _ => false,
    }
}
261
262pub fn is_possible_pn_chars_u_ascii(c: u8) -> bool {
264 is_possible_pn_chars_base_ascii(c) || c == b'_'
265}
266
267pub fn is_possible_pn_chars_u_unicode(c: char) -> bool {
269 is_possible_pn_chars_base_unicode(c) || c == '_'
270}
271
/// Returns `true` if the byte is an ASCII member of the `PN_CHARS` character
/// class: an ASCII letter, an ASCII digit, `_` or `-`.
///
/// Only genuine ASCII bytes are accepted. U+00B7 (`·`) is part of `PN_CHARS`
/// but is encoded in UTF-8 as the two bytes `C2 B7`; a lone `0xB7` byte is a
/// continuation byte, not that character, so it is rejected here and left to
/// the UTF-8 path ([`is_possible_pn_chars_unicode`]).
pub fn is_possible_pn_chars_ascii(c: u8) -> bool {
    c.is_ascii_alphanumeric() || matches!(c, b'_' | b'-')
}
276
277pub fn is_possible_pn_chars_unicode(c: char) -> bool {
279 is_possible_pn_chars_u_unicode(c)
280 || matches!(c,
281 '-' | '0'..='9' | '\u{00B7}' | '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}')
282}
283
284pub fn read_utf8_char(read: &mut LookAheadByteReader<impl BufRead>) -> Result<char, TurtleError> {
286 let mut code_point: u32;
287 let bytes_needed: usize;
288 let mut lower_boundary = 0x80;
289 let mut upper_boundary = 0xBF;
290
291 let byte = read.required_current()?;
292 match byte {
293 0x00..=0x7F => return Ok(char::from(byte)),
294 0xC2..=0xDF => {
295 bytes_needed = 1;
296 code_point = u32::from(byte) & 0x1F;
297 }
298 0xE0..=0xEF => {
299 if byte == 0xE0 {
300 lower_boundary = 0xA0;
301 }
302 if byte == 0xED {
303 upper_boundary = 0x9F;
304 }
305 bytes_needed = 2;
306 code_point = u32::from(byte) & 0xF;
307 }
308 0xF0..=0xF4 => {
309 if byte == 0xF0 {
310 lower_boundary = 0x90;
311 }
312 if byte == 0xF4 {
313 upper_boundary = 0x8F;
314 }
315 bytes_needed = 3;
316 code_point = u32::from(byte) & 0x7;
317 }
318 _ => return read.unexpected_char_error(),
319 }
320
321 for _ in 0..bytes_needed {
322 read.consume()?;
323 let byte = read.required_current()?;
324 if byte < lower_boundary || upper_boundary < byte {
325 return read.unexpected_char_error();
326 }
327 lower_boundary = 0x80;
328 upper_boundary = 0xBF;
329 code_point = (code_point << 6) | (u32::from(byte) & 0x3F);
330 }
331
332 char::from_u32(code_point)
333 .ok_or_else(|| read.parse_error(TurtleErrorKind::InvalidUnicodeCodePoint(code_point)))
334}