rio_turtle/
ntriples.rs

1//! Implementation of N-Triples and N-Quads RDF syntax
2
3use crate::error::*;
4use crate::shared::*;
5use crate::triple_allocator::TripleAllocator;
6use crate::utils::*;
7use rio_api::model::*;
8use rio_api::parser::*;
9use std::io::BufRead;
10
11/// A [N-Triples](https://www.w3.org/TR/n-triples/) and [N-Triples-star](https://w3c.github.io/rdf-star/cg-spec/#n-triples-star) streaming parser.
12///
13/// It implements the [`TriplesParser`] trait.
14///
15/// Its memory consumption is linear in the size of the longest line of the file.
16/// It does not do any allocation during parsing except buffer resizing
17/// if a line significantly longer than the previous is encountered,
18/// or if a line uses deeply nested triples.
19///
20///
21/// Count the number of people using the [`TriplesParser`] API:
22/// ```
23/// use rio_turtle::{NTriplesParser, TurtleError};
24/// use rio_api::parser::TriplesParser;
25/// use rio_api::model::NamedNode;
26///
27/// let file = b"<http://example.com/foo> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .
28/// <http://example.com/foo> <http://schema.org/name> \"Foo\" .
29/// <http://example.com/bar> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .
30/// <http://example.com/bar> <http://schema.org/name> \"Bar\" .";
31///
32/// let rdf_type = NamedNode { iri: "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" };
33/// let schema_person = NamedNode { iri: "http://schema.org/Person" };
34/// let mut count = 0;
35/// NTriplesParser::new(file.as_ref()).parse_all(&mut |t| {
36///     if t.predicate == rdf_type && t.object == schema_person.into() {
37///         count += 1;
38///     }
39///     Ok(()) as Result<(), TurtleError>
40/// })?;
41/// assert_eq!(2, count);
42/// # Result::<_,rio_turtle::TurtleError>::Ok(())
43/// ```
44pub struct NTriplesParser<R: BufRead> {
45    read: LookAheadByteReader<R>,
46    triple_alloc: TripleAllocator,
47}
48
49impl<R: BufRead> NTriplesParser<R> {
50    pub fn new(reader: R) -> Self {
51        Self {
52            read: LookAheadByteReader::new(reader),
53            triple_alloc: TripleAllocator::new(),
54        }
55    }
56}
57
58impl<R: BufRead> TriplesParser for NTriplesParser<R> {
59    type Error = TurtleError;
60
61    fn parse_step<E: From<TurtleError>>(
62        &mut self,
63        on_triple: &mut impl FnMut(Triple<'_>) -> Result<(), E>,
64    ) -> Result<(), E> {
65        match parse_triple_line(&mut self.read, &mut self.triple_alloc) {
66            Ok(true) => match on_triple(*self.triple_alloc.top()) {
67                Ok(()) => {
68                    self.triple_alloc.pop_top_triple();
69                    debug_assert_eq!(self.triple_alloc.complete_len(), 0);
70                    debug_assert_eq!(self.triple_alloc.incomplete_len(), 0);
71                    Ok(())
72                }
73                Err(err) => {
74                    self.triple_alloc.clear();
75                    Err(err)
76                }
77            },
78            Ok(false) => Ok(()),
79            Err(error) => {
80                self.read.consume_line_end()?;
81                self.triple_alloc.clear();
82                Err(E::from(error))
83            }
84        }
85    }
86
87    fn is_end(&self) -> bool {
88        self.read.current().is_none()
89    }
90}
91
92/// A [N-Quads](https://www.w3.org/TR/n-quads/) and [N-Quads-star](https://w3c.github.io/rdf-star/cg-spec/#n-quads-star) streaming parser.
93///
94/// It implements the `QuadsParser` trait.
95///
96/// Its memory consumption is linear in the size of the longest line of the file.
97/// It does not do any allocation during parsing except buffer resizing
98/// if a line significantly longer than the previous is encountered,
99/// or if a line uses deeply nested triples.
100///
101/// Count the number of people using the `QuadsParser` API:
102/// ```
103/// use rio_turtle::{NQuadsParser, TurtleError};
104/// use rio_api::parser::QuadsParser;
105/// use rio_api::model::NamedNode;
106///
107/// let file = b"<http://example.com/foo> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> <http://example.com/> .
108/// <http://example.com/foo> <http://schema.org/name> \"Foo\" <http://example.com/> .
109/// <http://example.com/bar> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .
110/// <http://example.com/bar> <http://schema.org/name> \"Bar\" .";
111///
112/// let rdf_type = NamedNode { iri: "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" };
113/// let schema_person = NamedNode { iri: "http://schema.org/Person" };
114/// let mut count = 0;
115/// NQuadsParser::new(file.as_ref()).parse_all(&mut |t| {
116///     if t.predicate == rdf_type && t.object == schema_person.into() {
117///         count += 1;
118///     }
119///     Ok(()) as Result<(), TurtleError>
120/// })?;
121/// assert_eq!(2, count);
122/// # Result::<_,rio_turtle::TurtleError>::Ok(())
123/// ```
124pub struct NQuadsParser<R: BufRead> {
125    read: LookAheadByteReader<R>,
126    triple_alloc: TripleAllocator,
127    graph_name_buf: String,
128}
129
130impl<R: BufRead> NQuadsParser<R> {
131    pub fn new(reader: R) -> Self {
132        Self {
133            read: LookAheadByteReader::new(reader),
134            triple_alloc: TripleAllocator::new(),
135            graph_name_buf: String::default(),
136        }
137    }
138}
139
140impl<R: BufRead> QuadsParser for NQuadsParser<R> {
141    type Error = TurtleError;
142
143    fn parse_step<E: From<TurtleError>>(
144        &mut self,
145        on_quad: &mut impl FnMut(Quad<'_>) -> Result<(), E>,
146    ) -> Result<(), E> {
147        match parse_quad_line(
148            &mut self.read,
149            &mut self.triple_alloc,
150            &mut self.graph_name_buf,
151        ) {
152            Ok(Some(opt_graph_name)) => match on_quad(self.triple_alloc.top_quad(opt_graph_name)) {
153                Ok(()) => {
154                    self.triple_alloc.pop_top_triple();
155                    debug_assert_eq!(self.triple_alloc.complete_len(), 0);
156                    debug_assert_eq!(self.triple_alloc.incomplete_len(), 0);
157                    Ok(())
158                }
159                Err(err) => {
160                    self.triple_alloc.clear();
161                    Err(err)
162                }
163            },
164            Ok(None) => Ok(()),
165            Err(error) => {
166                self.read.consume_line_end()?;
167                self.triple_alloc.clear();
168                Err(E::from(error))
169            }
170        }
171    }
172
173    fn is_end(&self) -> bool {
174        self.read.current().is_none()
175    }
176}
177
178fn parse_triple_line(
179    read: &mut LookAheadByteReader<impl BufRead>,
180    triple_alloc: &mut TripleAllocator,
181) -> Result<bool, TurtleError> {
182    skip_whitespace(read)?;
183
184    if matches!(
185        read.current(),
186        None | Some(b'#') | Some(b'\r') | Some(b'\n')
187    ) {
188        skip_until_eol(read)?;
189        return Ok(false);
190    }
191
192    parse_triple(read, triple_alloc)?;
193
194    read.check_is_current(b'.')?;
195    read.consume()?;
196    skip_whitespace(read)?;
197
198    match read.current() {
199        None | Some(b'#') | Some(b'\r') | Some(b'\n') => skip_until_eol(read)?,
200        _ => read.unexpected_char_error()?,
201    }
202
203    Ok(true)
204}
205
206fn parse_triple(
207    read: &mut LookAheadByteReader<impl BufRead>,
208    triple_alloc: &mut TripleAllocator,
209) -> Result<(), TurtleError> {
210    triple_alloc.push_triple_start();
211
212    parse_subject(read, triple_alloc)?;
213    skip_whitespace(read)?;
214
215    triple_alloc.try_push_predicate(|b| parse_iriref(read, b))?;
216    skip_whitespace(read)?;
217
218    parse_object(read, triple_alloc)?;
219    skip_whitespace(read)?;
220
221    Ok(())
222}
223
224fn parse_quad_line<'a>(
225    read: &mut LookAheadByteReader<impl BufRead>,
226    triple_alloc: &mut TripleAllocator,
227    graph_name_buf: &'a mut String,
228) -> Result<Option<Option<GraphName<'a>>>, TurtleError> {
229    skip_whitespace(read)?;
230
231    if matches!(
232        read.current(),
233        None | Some(b'#') | Some(b'\r') | Some(b'\n')
234    ) {
235        skip_until_eol(read)?;
236        return Ok(None);
237    }
238
239    parse_triple(read, triple_alloc)?;
240    let opt_graph_name = match read.current() {
241        Some(b'<') | Some(b'_') => {
242            graph_name_buf.clear();
243            Some(parse_graph_name(read, graph_name_buf)?)
244        }
245        _ => None,
246    };
247    skip_whitespace(read)?;
248
249    read.check_is_current(b'.')?;
250    read.consume()?;
251    skip_whitespace(read)?;
252
253    match read.current() {
254        None | Some(b'#') | Some(b'\r') | Some(b'\n') => skip_until_eol(read)?,
255        _ => read.unexpected_char_error()?,
256    }
257
258    Ok(Some(opt_graph_name))
259}
260
261fn parse_subject(
262    read: &mut LookAheadByteReader<impl BufRead>,
263    triple_alloc: &mut TripleAllocator,
264) -> Result<(), TurtleError> {
265    match read.required_current()? {
266        b'<' => match read.required_next()? {
267            b'<' => {
268                parse_quoted_triple(read, triple_alloc)?;
269                triple_alloc.push_subject_triple();
270                Ok(())
271            }
272            _ => triple_alloc.try_push_subject(|b| parse_iriref(read, b).map(Subject::from)),
273        },
274        b'_' => {
275            triple_alloc.try_push_subject(|b| parse_blank_node_label(read, b).map(Subject::from))
276        }
277        _ => read.unexpected_char_error(),
278    }
279}
280
281fn parse_object(
282    read: &mut LookAheadByteReader<impl BufRead>,
283    triple_alloc: &mut TripleAllocator,
284) -> Result<(), TurtleError> {
285    match read.required_current()? {
286        b'<' => match read.required_next()? {
287            b'<' => {
288                parse_quoted_triple(read, triple_alloc)?;
289                triple_alloc.push_object_triple();
290                Ok(())
291            }
292            _ => triple_alloc.try_push_object(|b, _| parse_iriref(read, b).map(Term::from)),
293        },
294        b'_' => {
295            triple_alloc.try_push_object(|b, _| parse_blank_node_label(read, b).map(Term::from))
296        }
297        b'"' => triple_alloc.try_push_object(|b1, b2| parse_literal(read, b1, b2).map(Term::from)),
298        _ => read.unexpected_char_error(),
299    }
300}
301
302fn parse_quoted_triple(
303    read: &mut LookAheadByteReader<impl BufRead>,
304    triple_alloc: &mut TripleAllocator,
305) -> Result<(), TurtleError> {
306    debug_assert_eq!(read.current(), Some(b'<'));
307    debug_assert_eq!(read.next()?, Some(b'<'));
308    read.increment_stack_size()?;
309    read.consume_many(2)?;
310
311    skip_whitespace(read)?;
312
313    parse_triple(read, triple_alloc)?;
314
315    read.check_is_current(b'>')?;
316    read.consume()?;
317    read.check_is_current(b'>')?;
318    read.consume()?;
319    read.decrement_stack_size();
320    skip_whitespace(read)
321}
322
323fn parse_graph_name<'a>(
324    read: &mut LookAheadByteReader<impl BufRead>,
325    buffer: &'a mut String,
326) -> Result<GraphName<'a>, TurtleError> {
327    match read.required_current()? {
328        b'<' => Ok(parse_iriref(read, buffer)?.into()),
329        b'_' => Ok(parse_blank_node_label(read, buffer)?.into()),
330        _ => read.unexpected_char_error(),
331    }
332}
333
334pub(crate) fn parse_literal<'a>(
335    read: &mut LookAheadByteReader<impl BufRead>,
336    buffer: &'a mut String,
337    annotation_buffer: &'a mut String,
338) -> Result<Literal<'a>, TurtleError> {
339    parse_string_literal_quote(read, buffer)?;
340    skip_whitespace(read)?;
341
342    match read.current() {
343        Some(b'@') => {
344            parse_langtag(read, annotation_buffer)?;
345            Ok(Literal::LanguageTaggedString {
346                value: buffer,
347                language: annotation_buffer,
348            })
349        }
350        Some(b'^') => {
351            read.consume()?;
352            read.check_is_current(b'^')?;
353            read.consume()?;
354            skip_whitespace(read)?;
355            Ok(Literal::Typed {
356                value: buffer,
357                datatype: parse_iriref(read, annotation_buffer)?,
358            })
359        }
360        _ => Ok(Literal::Simple { value: buffer }),
361    }
362}
363
364pub(crate) fn skip_whitespace(
365    read: &mut LookAheadByteReader<impl BufRead>,
366) -> Result<(), TurtleError> {
367    loop {
368        match read.current() {
369            Some(b' ') | Some(b'\t') => read.consume()?,
370            _ => return Ok(()),
371        }
372    }
373}
374
375pub(crate) fn skip_until_eol(
376    read: &mut LookAheadByteReader<impl BufRead>,
377) -> Result<(), TurtleError> {
378    loop {
379        match read.current() {
380            None => return Ok(()),
381            Some(b'\n') => {
382                read.consume()?;
383                return Ok(());
384            }
385            _ => (),
386        }
387        read.consume()?;
388    }
389}
390
391pub(crate) fn parse_iriref<'a>(
392    read: &mut LookAheadByteReader<impl BufRead>,
393    buffer: &'a mut String,
394) -> Result<NamedNode<'a>, TurtleError> {
395    parse_iriref_absolute(read, buffer)?;
396    Ok(NamedNode { iri: buffer })
397}
398
#[cfg(test)]
mod test {
    // These tests exist because there is currently no testsuite specific to N-Quads star.
    use crate::{NQuadsParser, TurtleError};
    use rio_api::parser::QuadsParser;

    /// A quad whose subject and object are quoted triples, followed by a graph name,
    /// is parsed as exactly one quad.
    #[test]
    fn nquads_star_valid_quad() -> Result<(), Box<dyn std::error::Error>> {
        let input: &[u8] =
            b"<< <tag:a> <tag:b> <tag:c> >> <tag:d> << <tag:e> <tag:f> <tag:g> >> <tag:h>.";
        let mut seen = 0;
        NQuadsParser::new(input).parse_all(&mut |_| -> Result<(), TurtleError> {
            seen += 1;
            Ok(())
        })?;
        assert_eq!(1, seen);
        Ok(())
    }

    /// A quoted triple in graph name position makes parsing fail.
    #[test]
    fn nquads_star_invalid_graph_name() {
        let input: &[u8] = b"<tag:s> <tag:p> <tag:o> << <tag:a> <tag:b> <tag:c> >> .";
        let mut seen = 0;
        let outcome = NQuadsParser::new(input).parse_all(&mut |_| -> Result<(), TurtleError> {
            seen += 1;
            Ok(())
        });
        assert!(outcome.is_err());
    }
}