oxrdfxml/
parser.rs

1use crate::error::{RdfXmlParseError, RdfXmlSyntaxError};
2use crate::utils::*;
3use oxilangtag::LanguageTag;
4use oxiri::{Iri, IriParseError};
5use oxrdf::vocab::rdf;
6use oxrdf::{BlankNode, Literal, NamedNode, Subject, Term, Triple};
7use quick_xml::escape::{resolve_xml_entity, unescape_with};
8use quick_xml::events::attributes::Attribute;
9use quick_xml::events::*;
10use quick_xml::name::{LocalName, PrefixDeclaration, PrefixIter, QName, ResolveResult};
11use quick_xml::{Decoder, Error, NsReader, Writer};
12use std::borrow::Cow;
13use std::collections::{HashMap, HashSet};
14use std::io::{BufReader, Read};
15use std::str;
16#[cfg(feature = "async-tokio")]
17use tokio::io::{AsyncRead, BufReader as AsyncBufReader};
18
19/// A [RDF/XML](https://www.w3.org/TR/rdf-syntax-grammar/) streaming parser.
20///
21/// It reads the file in streaming.
22/// It does not keep data in memory except a stack for handling nested XML tags, and a set of all
23/// seen `rdf:ID`s to detect duplicate ids and fail according to the specification.
24///
/// Its performance is not optimized yet and could hopefully be significantly improved by reducing the
/// number of allocations and copies done by the parser.
27///
28/// Count the number of people:
29/// ```
30/// use oxrdf::vocab::rdf;
31/// use oxrdf::NamedNodeRef;
32/// use oxrdfxml::RdfXmlParser;
33///
34/// let file = br#"<?xml version="1.0"?>
35/// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
36///  <rdf:Description rdf:about="http://example.com/foo">
37///    <rdf:type rdf:resource="http://schema.org/Person" />
38///    <schema:name>Foo</schema:name>
39///  </rdf:Description>
40///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
41/// </rdf:RDF>"#;
42///
43/// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
44/// let mut count = 0;
45/// for triple in RdfXmlParser::new().for_reader(file.as_ref()) {
46///     let triple = triple?;
47///     if triple.predicate == rdf::TYPE && triple.object == schema_person.into() {
48///         count += 1;
49///     }
50/// }
51/// assert_eq!(2, count);
52/// # Result::<_, Box<dyn std::error::Error>>::Ok(())
53/// ```
#[derive(Default, Clone)]
#[must_use]
pub struct RdfXmlParser {
    /// When `true`, some validations are skipped (enabled by [`RdfXmlParser::unchecked`]).
    unchecked: bool,
    /// Base IRI set with [`RdfXmlParser::with_base_iri`], if any; it seeds the initial document state.
    base: Option<Iri<String>>,
}
60
61impl RdfXmlParser {
62    /// Builds a new [`RdfXmlParser`].
63    #[inline]
64    pub fn new() -> Self {
65        Self::default()
66    }
67
68    /// Assumes the file is valid to make parsing faster.
69    ///
70    /// It will skip some validations.
71    ///
72    /// Note that if the file is actually not valid, broken RDF might be emitted by the parser.
73    #[inline]
74    pub fn unchecked(mut self) -> Self {
75        self.unchecked = true;
76        self
77    }
78
79    #[inline]
80    pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> {
81        self.base = Some(Iri::parse(base_iri.into())?);
82        Ok(self)
83    }
84
85    /// Parses a RDF/XML file from a [`Read`] implementation.
86    ///
87    /// Count the number of people:
88    /// ```
89    /// use oxrdf::vocab::rdf;
90    /// use oxrdf::NamedNodeRef;
91    /// use oxrdfxml::RdfXmlParser;
92    ///
93    /// let file = br#"<?xml version="1.0"?>
94    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
95    ///  <rdf:Description rdf:about="http://example.com/foo">
96    ///    <rdf:type rdf:resource="http://schema.org/Person" />
97    ///    <schema:name>Foo</schema:name>
98    ///  </rdf:Description>
99    ///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
100    /// </rdf:RDF>"#;
101    ///
102    /// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
103    /// let mut count = 0;
104    /// for triple in RdfXmlParser::new().for_reader(file.as_ref()) {
105    ///     let triple = triple?;
106    ///     if triple.predicate == rdf::TYPE && triple.object == schema_person.into() {
107    ///         count += 1;
108    ///     }
109    /// }
110    /// assert_eq!(2, count);
111    /// # Result::<_, Box<dyn std::error::Error>>::Ok(())
112    /// ```
113    pub fn for_reader<R: Read>(self, reader: R) -> ReaderRdfXmlParser<R> {
114        ReaderRdfXmlParser {
115            results: Vec::new(),
116            parser: self.into_internal(BufReader::new(reader)),
117            reader_buffer: Vec::default(),
118        }
119    }
120
121    /// Parses a RDF/XML file from a [`AsyncRead`] implementation.
122    ///
123    /// Count the number of people:
124    /// ```
125    /// # #[tokio::main(flavor = "current_thread")]
126    /// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
127    /// use oxrdf::vocab::rdf;
128    /// use oxrdf::NamedNodeRef;
129    /// use oxrdfxml::RdfXmlParser;
130    ///
131    /// let file = br#"<?xml version="1.0"?>
132    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
133    ///   <rdf:Description rdf:about="http://example.com/foo">
134    ///     <rdf:type rdf:resource="http://schema.org/Person" />
135    ///     <schema:name>Foo</schema:name>
136    ///   </rdf:Description>
137    ///   <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
138    /// </rdf:RDF>"#;
139    ///
140    /// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
141    /// let mut count = 0;
142    /// let mut parser = RdfXmlParser::new().for_tokio_async_reader(file.as_ref());
143    /// while let Some(triple) = parser.next().await {
144    ///     let triple = triple?;
145    ///     if triple.predicate == rdf::TYPE && triple.object == schema_person.into() {
146    ///         count += 1;
147    ///     }
148    /// }
149    /// assert_eq!(2, count);
150    /// # Ok(())
151    /// # }
152    /// ```
153    #[cfg(feature = "async-tokio")]
154    pub fn for_tokio_async_reader<R: AsyncRead + Unpin>(
155        self,
156        reader: R,
157    ) -> TokioAsyncReaderRdfXmlParser<R> {
158        TokioAsyncReaderRdfXmlParser {
159            results: Vec::new(),
160            parser: self.into_internal(AsyncBufReader::new(reader)),
161            reader_buffer: Vec::default(),
162        }
163    }
164
165    /// Parses a RDF/XML file from a byte slice.
166    ///
167    /// Count the number of people:
168    /// ```
169    /// use oxrdf::vocab::rdf;
170    /// use oxrdf::NamedNodeRef;
171    /// use oxrdfxml::RdfXmlParser;
172    ///
173    /// let file = br#"<?xml version="1.0"?>
174    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
175    ///  <rdf:Description rdf:about="http://example.com/foo">
176    ///    <rdf:type rdf:resource="http://schema.org/Person" />
177    ///    <schema:name>Foo</schema:name>
178    ///  </rdf:Description>
179    ///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
180    /// </rdf:RDF>"#;
181    ///
182    /// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
183    /// let mut count = 0;
184    /// for triple in RdfXmlParser::new().for_slice(file) {
185    ///     let triple = triple?;
186    ///     if triple.predicate == rdf::TYPE && triple.object == schema_person.into() {
187    ///         count += 1;
188    ///     }
189    /// }
190    /// assert_eq!(2, count);
191    /// # Result::<_, Box<dyn std::error::Error>>::Ok(())
192    /// ```
193    pub fn for_slice(self, slice: &[u8]) -> SliceRdfXmlParser<'_> {
194        SliceRdfXmlParser {
195            results: Vec::new(),
196            parser: self.into_internal(slice),
197            reader_buffer: Vec::default(),
198        }
199    }
200
201    fn into_internal<T>(self, reader: T) -> InternalRdfXmlParser<T> {
202        let mut reader = NsReader::from_reader(reader);
203        reader.config_mut().expand_empty_elements = true;
204        InternalRdfXmlParser {
205            reader,
206            state: vec![RdfXmlState::Doc {
207                base_iri: self.base.clone(),
208            }],
209            custom_entities: HashMap::new(),
210            in_literal_depth: 0,
211            known_rdf_id: HashSet::default(),
212            is_end: false,
213            unchecked: self.unchecked,
214        }
215    }
216}
217
218/// Parses a RDF/XML file from a [`Read`] implementation.
219///
220/// Can be built using [`RdfXmlParser::for_reader`].
221///
222/// Count the number of people:
223/// ```
224/// use oxrdf::vocab::rdf;
225/// use oxrdf::NamedNodeRef;
226/// use oxrdfxml::RdfXmlParser;
227///
228/// let file = br#"<?xml version="1.0"?>
229/// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
230///  <rdf:Description rdf:about="http://example.com/foo">
231///    <rdf:type rdf:resource="http://schema.org/Person" />
232///    <schema:name>Foo</schema:name>
233///  </rdf:Description>
234///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
235/// </rdf:RDF>"#;
236///
237/// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
238/// let mut count = 0;
239/// for triple in RdfXmlParser::new().for_reader(file.as_ref()) {
240///     let triple = triple?;
241///     if triple.predicate == rdf::TYPE && triple.object == schema_person.into() {
242///         count += 1;
243///     }
244/// }
245/// assert_eq!(2, count);
246/// # Result::<_, Box<dyn std::error::Error>>::Ok(())
247/// ```
#[must_use]
pub struct ReaderRdfXmlParser<R: Read> {
    /// Triples produced by the internal parser and not yet returned; `next` pops from the end.
    results: Vec<Triple>,
    /// Parsing state machine reading from the buffered input.
    parser: InternalRdfXmlParser<BufReader<R>>,
    /// Scratch buffer handed to the XML reader; cleared before each event is read.
    reader_buffer: Vec<u8>,
}
254
255impl<R: Read> Iterator for ReaderRdfXmlParser<R> {
256    type Item = Result<Triple, RdfXmlParseError>;
257
258    fn next(&mut self) -> Option<Self::Item> {
259        loop {
260            if let Some(triple) = self.results.pop() {
261                return Some(Ok(triple));
262            } else if self.parser.is_end {
263                return None;
264            }
265            if let Err(e) = self.parse_step() {
266                return Some(Err(e));
267            }
268        }
269    }
270}
271
272impl<R: Read> ReaderRdfXmlParser<R> {
273    /// The list of IRI prefixes considered at the current step of the parsing.
274    ///
275    /// This method returns (prefix name, prefix value) tuples.
276    /// It is empty at the beginning of the parsing and gets updated when prefixes are encountered.
277    /// It should be full at the end of the parsing (but if a prefix is overridden, only the latest version will be returned).
278    ///
279    /// ```
280    /// use oxrdfxml::RdfXmlParser;
281    ///
282    /// let file = br#"<?xml version="1.0"?>
283    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
284    ///  <rdf:Description rdf:about="http://example.com/foo">
285    ///    <rdf:type rdf:resource="http://schema.org/Person" />
286    ///    <schema:name>Foo</schema:name>
287    ///  </rdf:Description>
288    ///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
289    /// </rdf:RDF>"#;
290    ///
291    /// let mut parser = RdfXmlParser::new().for_reader(file.as_ref());
292    /// assert_eq!(parser.prefixes().collect::<Vec<_>>(), []); // No prefix at the beginning
293    ///
294    /// parser.next().unwrap()?; // We read the first triple
295    /// assert_eq!(
296    ///     parser.prefixes().collect::<Vec<_>>(),
297    ///     [
298    ///         ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
299    ///         ("schema", "http://schema.org/")
300    ///     ]
301    /// ); // There are now prefixes
302    /// # Result::<_, Box<dyn std::error::Error>>::Ok(())
303    /// ```
304    pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
305        RdfXmlPrefixesIter {
306            inner: self.parser.reader.prefixes(),
307            decoder: self.parser.reader.decoder(),
308            unchecked: self.parser.unchecked,
309        }
310    }
311
312    /// The base IRI considered at the current step of the parsing.
313    ///
314    /// ```
315    /// use oxrdfxml::RdfXmlParser;
316    ///
317    /// let file = br#"<?xml version="1.0"?>
318    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xml:base="http://example.com/">
319    ///  <rdf:Description rdf:about="foo">
320    ///    <rdf:type rdf:resource="http://schema.org/Person" />
321    ///  </rdf:Description>
322    /// </rdf:RDF>"#;
323    ///
324    /// let mut parser = RdfXmlParser::new().for_reader(file.as_ref());
325    /// assert!(parser.base_iri().is_none()); // No base at the beginning because none has been given to the parser.
326    ///
327    /// parser.next().unwrap()?; // We read the first triple
328    /// assert_eq!(parser.base_iri(), Some("http://example.com/")); // There is now a base IRI.
329    /// # Result::<_, Box<dyn std::error::Error>>::Ok(())
330    /// ```
331    pub fn base_iri(&self) -> Option<&str> {
332        Some(self.parser.current_base_iri()?.as_str())
333    }
334
335    /// The current byte position in the input data.
336    pub fn buffer_position(&self) -> u64 {
337        self.parser.reader.buffer_position()
338    }
339
340    fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
341        self.reader_buffer.clear();
342        let event = self
343            .parser
344            .reader
345            .read_event_into(&mut self.reader_buffer)?;
346        self.parser.parse_event(event, &mut self.results)
347    }
348}
349
350/// Parses a RDF/XML file from a [`AsyncRead`] implementation.
351///
352/// Can be built using [`RdfXmlParser::for_tokio_async_reader`].
353///
354/// Count the number of people:
355/// ```
356/// # #[tokio::main(flavor = "current_thread")]
357/// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
358/// use oxrdf::vocab::rdf;
359/// use oxrdf::NamedNodeRef;
360/// use oxrdfxml::RdfXmlParser;
361///
362/// let file = br#"<?xml version="1.0"?>
363/// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
364///   <rdf:Description rdf:about="http://example.com/foo">
365///     <rdf:type rdf:resource="http://schema.org/Person" />
366///     <schema:name>Foo</schema:name>
367///   </rdf:Description>
368///   <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
369/// </rdf:RDF>"#;
370///
371/// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
372/// let mut count = 0;
373/// let mut parser = RdfXmlParser::new().for_tokio_async_reader(file.as_ref());
374/// while let Some(triple) = parser.next().await {
375///     let triple = triple?;
376///     if triple.predicate == rdf::TYPE && triple.object == schema_person.into() {
377///         count += 1;
378///     }
379/// }
380/// assert_eq!(2, count);
381/// # Ok(())
382/// # }
383/// ```
#[cfg(feature = "async-tokio")]
#[must_use]
pub struct TokioAsyncReaderRdfXmlParser<R: AsyncRead + Unpin> {
    /// Triples produced by the internal parser and not yet returned; `next` pops from the end.
    results: Vec<Triple>,
    /// Parsing state machine reading from the buffered async input.
    parser: InternalRdfXmlParser<AsyncBufReader<R>>,
    /// Scratch buffer handed to the XML reader; cleared before each event is read.
    reader_buffer: Vec<u8>,
}
391
392#[cfg(feature = "async-tokio")]
393impl<R: AsyncRead + Unpin> TokioAsyncReaderRdfXmlParser<R> {
394    /// Reads the next triple or returns `None` if the file is finished.
395    pub async fn next(&mut self) -> Option<Result<Triple, RdfXmlParseError>> {
396        loop {
397            if let Some(triple) = self.results.pop() {
398                return Some(Ok(triple));
399            } else if self.parser.is_end {
400                return None;
401            }
402            if let Err(e) = self.parse_step().await {
403                return Some(Err(e));
404            }
405        }
406    }
407
408    /// The list of IRI prefixes considered at the current step of the parsing.
409    ///
410    /// This method returns (prefix name, prefix value) tuples.
411    /// It is empty at the beginning of the parsing and gets updated when prefixes are encountered.
412    /// It should be full at the end of the parsing (but if a prefix is overridden, only the latest version will be returned).
413    ///
414    /// ```
415    /// # #[tokio::main(flavor = "current_thread")]
416    /// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
417    /// use oxrdfxml::RdfXmlParser;
418    ///
419    /// let file = br#"<?xml version="1.0"?>
420    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
421    ///  <rdf:Description rdf:about="http://example.com/foo">
422    ///    <rdf:type rdf:resource="http://schema.org/Person" />
423    ///    <schema:name>Foo</schema:name>
424    ///  </rdf:Description>
425    ///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
426    /// </rdf:RDF>"#;
427    ///
428    /// let mut parser = RdfXmlParser::new().for_tokio_async_reader(file.as_ref());
429    /// assert_eq!(parser.prefixes().collect::<Vec<_>>(), []); // No prefix at the beginning
430    ///
431    /// parser.next().await.unwrap()?; // We read the first triple
432    /// assert_eq!(
433    ///     parser.prefixes().collect::<Vec<_>>(),
434    ///     [
435    ///         ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
436    ///         ("schema", "http://schema.org/")
437    ///     ]
438    /// ); // There are now prefixes
439    /// # Ok(())
440    /// # }
441    /// ```
442    pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
443        RdfXmlPrefixesIter {
444            inner: self.parser.reader.prefixes(),
445            decoder: self.parser.reader.decoder(),
446            unchecked: self.parser.unchecked,
447        }
448    }
449
450    /// The base IRI considered at the current step of the parsing.
451    ///
452    /// ```
453    /// # #[tokio::main(flavor = "current_thread")]
454    /// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
455    /// use oxrdfxml::RdfXmlParser;
456    ///
457    /// let file = br#"<?xml version="1.0"?>
458    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xml:base="http://example.com/">
459    ///  <rdf:Description rdf:about="foo">
460    ///    <rdf:type rdf:resource="http://schema.org/Person" />
461    ///  </rdf:Description>
462    /// </rdf:RDF>"#;
463    ///
464    /// let mut parser = RdfXmlParser::new().for_tokio_async_reader(file.as_ref());
465    /// assert!(parser.base_iri().is_none()); // No base at the beginning because none has been given to the parser.
466    ///
467    /// parser.next().await.unwrap()?; // We read the first triple
468    /// assert_eq!(parser.base_iri(), Some("http://example.com/")); // There is now a base IRI.
469    /// # Ok(())
470    /// # }
471    /// ```
472    pub fn base_iri(&self) -> Option<&str> {
473        Some(self.parser.current_base_iri()?.as_str())
474    }
475
476    /// The current byte position in the input data.
477    pub fn buffer_position(&self) -> u64 {
478        self.parser.reader.buffer_position()
479    }
480
481    async fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
482        self.reader_buffer.clear();
483        let event = self
484            .parser
485            .reader
486            .read_event_into_async(&mut self.reader_buffer)
487            .await?;
488        self.parser.parse_event(event, &mut self.results)
489    }
490}
491
492/// Parses a RDF/XML file from a byte slice.
493///
494/// Can be built using [`RdfXmlParser::for_slice`].
495///
496/// Count the number of people:
497/// ```
498/// use oxrdf::vocab::rdf;
499/// use oxrdf::NamedNodeRef;
500/// use oxrdfxml::RdfXmlParser;
501///
502/// let file = br#"<?xml version="1.0"?>
503/// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
504///  <rdf:Description rdf:about="http://example.com/foo">
505///    <rdf:type rdf:resource="http://schema.org/Person" />
506///    <schema:name>Foo</schema:name>
507///  </rdf:Description>
508///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
509/// </rdf:RDF>"#;
510///
511/// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
512/// let mut count = 0;
513/// for triple in RdfXmlParser::new().for_slice(file) {
514///     let triple = triple?;
515///     if triple.predicate == rdf::TYPE && triple.object == schema_person.into() {
516///         count += 1;
517///     }
518/// }
519/// assert_eq!(2, count);
520/// # Result::<_, Box<dyn std::error::Error>>::Ok(())
521/// ```
#[must_use]
pub struct SliceRdfXmlParser<'a> {
    /// Triples produced by the internal parser and not yet returned; `next` pops from the end.
    results: Vec<Triple>,
    /// Parsing state machine reading directly from the borrowed byte slice.
    parser: InternalRdfXmlParser<&'a [u8]>,
    /// Scratch buffer handed to the XML reader; cleared before each event is read.
    reader_buffer: Vec<u8>,
}
528
529impl Iterator for SliceRdfXmlParser<'_> {
530    type Item = Result<Triple, RdfXmlSyntaxError>;
531
532    fn next(&mut self) -> Option<Self::Item> {
533        loop {
534            if let Some(triple) = self.results.pop() {
535                return Some(Ok(triple));
536            } else if self.parser.is_end {
537                return None;
538            }
539            if let Err(RdfXmlParseError::Syntax(e)) = self.parse_step() {
540                // I/O errors can't happen
541                return Some(Err(e));
542            }
543        }
544    }
545}
546
547impl SliceRdfXmlParser<'_> {
548    /// The list of IRI prefixes considered at the current step of the parsing.
549    ///
550    /// This method returns (prefix name, prefix value) tuples.
551    /// It is empty at the beginning of the parsing and gets updated when prefixes are encountered.
552    /// It should be full at the end of the parsing (but if a prefix is overridden, only the latest version will be returned).
553    ///
554    /// ```
555    /// use oxrdfxml::RdfXmlParser;
556    ///
557    /// let file = br#"<?xml version="1.0"?>
558    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:schema="http://schema.org/">
559    ///  <rdf:Description rdf:about="http://example.com/foo">
560    ///    <rdf:type rdf:resource="http://schema.org/Person" />
561    ///    <schema:name>Foo</schema:name>
562    ///  </rdf:Description>
563    ///  <schema:Person rdf:about="http://example.com/bar" schema:name="Bar" />
564    /// </rdf:RDF>"#;
565    ///
566    /// let mut parser = RdfXmlParser::new().for_slice(file);
567    /// assert_eq!(parser.prefixes().collect::<Vec<_>>(), []); // No prefix at the beginning
568    ///
569    /// parser.next().unwrap()?; // We read the first triple
570    /// assert_eq!(
571    ///     parser.prefixes().collect::<Vec<_>>(),
572    ///     [
573    ///         ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
574    ///         ("schema", "http://schema.org/")
575    ///     ]
576    /// ); // There are now prefixes
577    /// # Result::<_, Box<dyn std::error::Error>>::Ok(())
578    /// ```
579    pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
580        RdfXmlPrefixesIter {
581            inner: self.parser.reader.prefixes(),
582            decoder: self.parser.reader.decoder(),
583            unchecked: self.parser.unchecked,
584        }
585    }
586
587    /// The base IRI considered at the current step of the parsing.
588    ///
589    /// ```
590    /// use oxrdfxml::RdfXmlParser;
591    ///
592    /// let file = br#"<?xml version="1.0"?>
593    /// <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xml:base="http://example.com/">
594    ///  <rdf:Description rdf:about="foo">
595    ///    <rdf:type rdf:resource="http://schema.org/Person" />
596    ///  </rdf:Description>
597    /// </rdf:RDF>"#;
598    ///
599    /// let mut parser = RdfXmlParser::new().for_slice(file);
600    /// assert!(parser.base_iri().is_none()); // No base at the beginning because none has been given to the parser.
601    ///
602    /// parser.next().unwrap()?; // We read the first triple
603    /// assert_eq!(parser.base_iri(), Some("http://example.com/")); // There is now a base IRI.
604    /// # Result::<_, Box<dyn std::error::Error>>::Ok(())
605    /// ```
606    pub fn base_iri(&self) -> Option<&str> {
607        Some(self.parser.current_base_iri()?.as_str())
608    }
609
610    /// The current byte position in the input data.
611    pub fn buffer_position(&self) -> u64 {
612        self.parser.reader.buffer_position()
613    }
614
615    fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
616        self.reader_buffer.clear();
617        let event = self
618            .parser
619            .reader
620            .read_event_into(&mut self.reader_buffer)?;
621        self.parser.parse_event(event, &mut self.results)
622    }
623}
624
/// Iterator on the file prefixes.
///
/// See [`ReaderRdfXmlParser::prefixes`].
pub struct RdfXmlPrefixesIter<'a> {
    /// Raw prefix declarations as exposed by the XML reader.
    inner: PrefixIter<'a>,
    /// Decoder of the XML reader, used to turn raw bytes into `str`.
    decoder: Decoder,
    /// When `true`, prefix names and IRIs are returned without validation.
    unchecked: bool,
}
633
634impl<'a> Iterator for RdfXmlPrefixesIter<'a> {
635    type Item = (&'a str, &'a str);
636
637    #[inline]
638    fn next(&mut self) -> Option<Self::Item> {
639        loop {
640            let (key, value) = self.inner.next()?;
641            return Some((
642                match key {
643                    PrefixDeclaration::Default => "",
644                    PrefixDeclaration::Named(name) => {
645                        let Ok(Cow::Borrowed(name)) = self.decoder.decode(name) else {
646                            continue;
647                        };
648                        let Ok(Cow::Borrowed(name)) = unescape_with(name, |_| None) else {
649                            continue;
650                        };
651                        if !self.unchecked && !is_nc_name(name) {
652                            continue; // We don't return invalid prefixes
653                        }
654                        name
655                    }
656                },
657                {
658                    let Ok(Cow::Borrowed(value)) = self.decoder.decode(value.0) else {
659                        continue;
660                    };
661                    let Ok(Cow::Borrowed(value)) = unescape_with(value, |_| None) else {
662                        continue;
663                    };
664                    if !self.unchecked && Iri::parse(value).is_err() {
665                        continue; // We don't return invalid prefixes
666                    }
667                    value
668                },
669            ));
670        }
671    }
672
673    #[inline]
674    fn size_hint(&self) -> (usize, Option<usize>) {
675        self.inner.size_hint()
676    }
677}
678
// Full IRIs of the terms of the `http://www.w3.org/1999/02/22-rdf-syntax-ns#` namespace
// that are referenced by name in this file.
const RDF_ABOUT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about";
const RDF_ABOUT_EACH: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach";
const RDF_ABOUT_EACH_PREFIX: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix";
const RDF_BAG_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID";
const RDF_DATATYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype";
const RDF_DESCRIPTION: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Description";
const RDF_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#ID";
const RDF_LI: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#li";
const RDF_NODE_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID";
const RDF_PARSE_TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#parseType";
const RDF_RDF: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF";
const RDF_RESOURCE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#resource";
691
// RDF syntax terms treated as reserved when they appear as element names.
// NOTE(review): presumably checked when an element name is turned into an IRI; the use site is not visible here.
const RESERVED_RDF_ELEMENTS: [&str; 11] = [
    RDF_ABOUT,
    RDF_ABOUT_EACH,
    RDF_ABOUT_EACH_PREFIX,
    RDF_BAG_ID,
    RDF_DATATYPE,
    RDF_ID,
    RDF_LI,
    RDF_NODE_ID,
    RDF_PARSE_TYPE,
    RDF_RDF,
    RDF_RESOURCE,
];
// RDF syntax terms treated as reserved when they appear as attribute names.
// NOTE(review): presumably checked when attributes are read; the use site is not visible here.
const RESERVED_RDF_ATTRIBUTES: [&str; 5] = [
    RDF_ABOUT_EACH,
    RDF_ABOUT_EACH_PREFIX,
    RDF_LI,
    RDF_RDF,
    RDF_RESOURCE,
];
712
/// Content of a property element: either a node or raw text.
#[derive(Clone, Debug)]
enum NodeOrText {
    /// A node, stored as a [`Subject`].
    Node(Subject),
    /// Text content, stored as an owned string.
    Text(String),
}
718
/// One entry of the parser state stack (`InternalRdfXmlParser::state`).
///
/// Every variant carries the base IRI in scope; all but `Doc` also carry the language in scope.
// NOTE(review): variant names appear to follow the RDF/XML grammar productions — confirm against the handlers.
enum RdfXmlState {
    /// Initial state pushed by `RdfXmlParser::into_internal`, holding the base IRI given to the parser.
    Doc {
        base_iri: Option<Iri<String>>,
    },
    /// NOTE(review): presumably the state inside the `rdf:RDF` element — confirm.
    Rdf {
        base_iri: Option<Iri<String>>,
        language: Option<String>,
    },
    /// A node element with the subject it describes.
    NodeElt {
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: Subject,
        // NOTE(review): presumably the counter used to number `rdf:li` properties — confirm.
        li_counter: u64,
    },
    PropertyElt {
        // Resource, Literal or Empty property element
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: Subject,
        // Content seen so far, if any: a nested node or accumulated text.
        object: Option<NodeOrText>,
        // NOTE(review): presumably the IRI built from an `rdf:ID` attribute on the property — confirm.
        id_attr: Option<NamedNode>,
        // NOTE(review): presumably the value of an `rdf:datatype` attribute — confirm.
        datatype_attr: Option<NamedNode>,
    },
    /// NOTE(review): presumably a property element with `rdf:parseType="Collection"` — confirm.
    ParseTypeCollectionPropertyElt {
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: Subject,
        // Nodes collected so far for this property.
        objects: Vec<Subject>,
        id_attr: Option<NamedNode>,
    },
    /// Property element whose XML content is re-serialized through `writer`
    /// (see the literal case at the start of `parse_start_event`).
    ParseTypeLiteralPropertyElt {
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: Subject,
        // In-memory XML writer receiving the nested events.
        writer: Writer<Vec<u8>>,
        id_attr: Option<NamedNode>,
        emit: bool, // false for parseTypeOtherPropertyElt support
    },
}
761
/// Parsing state shared by the reader, async reader and slice front-ends.
struct InternalRdfXmlParser<R> {
    /// Namespace-aware XML reader, configured to expand empty elements.
    reader: NsReader<R>,
    /// Stack of parser states; starts with a single `RdfXmlState::Doc`.
    state: Vec<RdfXmlState>,
    /// Entities declared in the DOCTYPE (name -> unescaped value), filled by `parse_doctype`.
    custom_entities: HashMap<String, String>,
    // NOTE(review): presumably the element nesting depth inside an XML literal — confirm against the start/end handlers.
    in_literal_depth: usize,
    // NOTE(review): presumably the `rdf:ID` values already seen, used to detect duplicates — confirm.
    known_rdf_id: HashSet<String>,
    /// Set to `true` once `Event::Eof` has been received.
    is_end: bool,
    /// When `true`, some validations are skipped.
    unchecked: bool,
}
771
772impl<R> InternalRdfXmlParser<R> {
773    fn parse_event(
774        &mut self,
775        event: Event<'_>,
776        results: &mut Vec<Triple>,
777    ) -> Result<(), RdfXmlParseError> {
778        match event {
779            Event::Start(event) => self.parse_start_event(&event, results),
780            Event::End(event) => self.parse_end_event(&event, results),
781            Event::Empty(_) => Err(RdfXmlSyntaxError::msg(
782                "The expand_empty_elements option must be enabled",
783            )
784            .into()),
785            Event::Text(event) => self.parse_text_event(&event),
786            Event::CData(event) => self.parse_text_event(&event.escape()?),
787            Event::Comment(_) | Event::PI(_) => Ok(()),
788            Event::Decl(decl) => {
789                if let Some(encoding) = decl.encoding() {
790                    if !is_utf8(&encoding?) {
791                        return Err(RdfXmlSyntaxError::msg(
792                            "Only UTF-8 is supported by the RDF/XML parser",
793                        )
794                        .into());
795                    }
796                }
797                Ok(())
798            }
799            Event::DocType(dt) => self.parse_doctype(&dt),
800            Event::Eof => {
801                self.is_end = true;
802                Ok(())
803            }
804        }
805    }
806
    /// Extracts the `<!ENTITY ...>` declarations of a DOCTYPE and records them in `custom_entities`.
    ///
    /// The decoded DOCTYPE text is split on `<`; only the pieces starting with `!ENTITY` are considered.
    /// Each entity value is unescaped (with `resolve_entity` as the entity resolver) before being stored.
    ///
    /// # Errors
    ///
    /// Fails if the DOCTYPE cannot be decoded, if a declaration lacks a name or a value,
    /// if the value is not enclosed in double quotes, if the declaration does not end with `>`,
    /// or if unescaping the value fails.
    fn parse_doctype(&mut self, dt: &BytesText<'_>) -> Result<(), RdfXmlParseError> {
        // we extract entities
        for input in self
            .reader
            .decoder()
            .decode(dt.as_ref())?
            .split('<')
            .skip(1) // the text before the first '<' cannot be a declaration
        {
            if let Some(input) = input.strip_prefix("!ENTITY") {
                // An optional '%' marker before the name is dropped.
                let input = input.trim_start().strip_prefix('%').unwrap_or(input);
                // The entity name runs up to the first ASCII whitespace.
                let (entity_name, input) = input.trim_start().split_once(|c: char| c.is_ascii_whitespace()).ok_or_else(|| {
                    RdfXmlSyntaxError::msg(
                        "<!ENTITY declarations should contain both an entity name and an entity value",
                    )
                })?;
                // Only double-quoted values are accepted.
                let input = input.trim_start().strip_prefix('\"').ok_or_else(|| {
                    RdfXmlSyntaxError::msg("<!ENTITY values should be enclosed in double quotes")
                })?;
                let (entity_value, input) = input.split_once('"').ok_or_else(|| {
                    RdfXmlSyntaxError::msg(
                        "<!ENTITY declarations values should be enclosed in double quotes",
                    )
                })?;
                // Nothing but whitespace may separate the closing quote from '>'.
                input.trim_start().strip_prefix('>').ok_or_else(|| {
                    RdfXmlSyntaxError::msg("<!ENTITY declarations values should end with >")
                })?;

                // Resolves custom entities within the current entity definition.
                let entity_value =
                    unescape_with(entity_value, |e| self.resolve_entity(e)).map_err(Error::from)?;
                self.custom_entities
                    .insert(entity_name.to_owned(), entity_value.to_string());
            }
        }
        Ok(())
    }
844
845    fn parse_start_event(
846        &mut self,
847        event: &BytesStart<'_>,
848        results: &mut Vec<Triple>,
849    ) -> Result<(), RdfXmlParseError> {
850        #[derive(PartialEq, Eq)]
851        enum RdfXmlParseType {
852            Default,
853            Collection,
854            Literal,
855            Resource,
856            Other,
857        }
858
859        #[derive(PartialEq, Eq)]
860        enum RdfXmlNextProduction {
861            Rdf,
862            NodeElt,
863            PropertyElt { subject: Subject },
864        }
865
866        // Literal case
867        if let Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) = self.state.last_mut()
868        {
869            let mut clean_event = BytesStart::new(
870                self.reader
871                    .decoder()
872                    .decode(event.name().as_ref())?
873                    .to_string(),
874            );
875            for attr in event.attributes() {
876                clean_event.push_attribute(attr.map_err(Error::InvalidAttr)?);
877            }
878            writer.write_event(Event::Start(clean_event))?;
879            self.in_literal_depth += 1;
880            return Ok(());
881        }
882
883        let tag_name = self.resolve_tag_name(event.name())?;
884
885        // We read attributes
886        let mut language = None;
887        let mut base_iri = None;
888        let mut id_attr = None;
889        let mut node_id_attr = None;
890        let mut about_attr = None;
891        let mut property_attrs = Vec::default();
892        let mut resource_attr = None;
893        let mut datatype_attr = None;
894        let mut parse_type = RdfXmlParseType::Default;
895        let mut type_attr = None;
896
897        for attribute in event.attributes() {
898            let attribute = attribute.map_err(Error::InvalidAttr)?;
899            if attribute.key.as_ref().starts_with(b"xml") {
900                if attribute.key.as_ref() == b"xml:lang" {
901                    let tag = self.convert_attribute(&attribute)?.to_ascii_lowercase();
902                    language = Some(if self.unchecked {
903                        tag
904                    } else {
905                        LanguageTag::parse(tag.to_ascii_lowercase())
906                            .map_err(|error| RdfXmlSyntaxError::invalid_language_tag(tag, error))?
907                            .into_inner()
908                    });
909                } else if attribute.key.as_ref() == b"xml:base" {
910                    let iri = self.convert_attribute(&attribute)?;
911                    base_iri = Some(if self.unchecked {
912                        Iri::parse_unchecked(iri.clone())
913                    } else {
914                        Iri::parse(iri.clone())
915                            .map_err(|error| RdfXmlSyntaxError::invalid_iri(iri, error))?
916                    })
917                } else {
918                    // We ignore other xml attributes
919                }
920            } else {
921                let attribute_url = self.resolve_attribute_name(attribute.key)?;
922                if *attribute_url == *RDF_ID {
923                    let mut id = self.convert_attribute(&attribute)?;
924                    if !is_nc_name(&id) {
925                        return Err(RdfXmlSyntaxError::msg(format!(
926                            "{id} is not a valid rdf:ID value"
927                        ))
928                        .into());
929                    }
930                    id.insert(0, '#');
931                    id_attr = Some(id);
932                } else if *attribute_url == *RDF_BAG_ID {
933                    let bag_id = self.convert_attribute(&attribute)?;
934                    if !is_nc_name(&bag_id) {
935                        return Err(RdfXmlSyntaxError::msg(format!(
936                            "{bag_id} is not a valid rdf:bagID value"
937                        ))
938                        .into());
939                    }
940                } else if *attribute_url == *RDF_NODE_ID {
941                    let id = self.convert_attribute(&attribute)?;
942                    if !is_nc_name(&id) {
943                        return Err(RdfXmlSyntaxError::msg(format!(
944                            "{id} is not a valid rdf:nodeID value"
945                        ))
946                        .into());
947                    }
948                    node_id_attr = Some(BlankNode::new_unchecked(id));
949                } else if *attribute_url == *RDF_ABOUT {
950                    about_attr = Some(attribute);
951                } else if *attribute_url == *RDF_RESOURCE {
952                    resource_attr = Some(attribute);
953                } else if *attribute_url == *RDF_DATATYPE {
954                    datatype_attr = Some(attribute);
955                } else if *attribute_url == *RDF_PARSE_TYPE {
956                    parse_type = match attribute.value.as_ref() {
957                        b"Collection" => RdfXmlParseType::Collection,
958                        b"Literal" => RdfXmlParseType::Literal,
959                        b"Resource" => RdfXmlParseType::Resource,
960                        _ => RdfXmlParseType::Other,
961                    };
962                } else if attribute_url == rdf::TYPE.as_str() {
963                    type_attr = Some(attribute);
964                } else if RESERVED_RDF_ATTRIBUTES.contains(&&*attribute_url) {
965                    return Err(RdfXmlSyntaxError::msg(format!(
966                        "{attribute_url} is not a valid attribute"
967                    ))
968                    .into());
969                } else {
970                    property_attrs.push((
971                        self.parse_iri(attribute_url)?,
972                        self.convert_attribute(&attribute)?,
973                    ));
974                }
975            }
976        }
977
978        // Parsing with the base URI
979        let id_attr = match id_attr {
980            Some(iri) => {
981                let iri = self.resolve_iri(base_iri.as_ref(), iri)?;
982                if !self.unchecked {
983                    if self.known_rdf_id.contains(iri.as_str()) {
984                        return Err(RdfXmlSyntaxError::msg(format!(
985                            "{iri} has already been used as rdf:ID value"
986                        ))
987                        .into());
988                    }
989                    self.known_rdf_id.insert(iri.as_str().into());
990                }
991                Some(iri)
992            }
993            None => None,
994        };
995        let about_attr = match about_attr {
996            Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
997            None => None,
998        };
999        let resource_attr = match resource_attr {
1000            Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1001            None => None,
1002        };
1003        let datatype_attr = match datatype_attr {
1004            Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1005            None => None,
1006        };
1007        let type_attr = match type_attr {
1008            Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
1009            None => None,
1010        };
1011
1012        let expected_production = match self.state.last() {
1013            Some(RdfXmlState::Doc { .. }) => RdfXmlNextProduction::Rdf,
1014            Some(
1015                RdfXmlState::Rdf { .. }
1016                | RdfXmlState::PropertyElt { .. }
1017                | RdfXmlState::ParseTypeCollectionPropertyElt { .. },
1018            ) => RdfXmlNextProduction::NodeElt,
1019            Some(RdfXmlState::NodeElt { subject, .. }) => RdfXmlNextProduction::PropertyElt {
1020                subject: subject.clone(),
1021            },
1022            Some(RdfXmlState::ParseTypeLiteralPropertyElt { .. }) => {
1023                return Err(
1024                    RdfXmlSyntaxError::msg("ParseTypeLiteralPropertyElt production children should never be considered as a RDF/XML content").into()
1025                );
1026            }
1027            None => {
1028                return Err(RdfXmlSyntaxError::msg(
1029                    "No state in the stack: the XML is not balanced",
1030                )
1031                .into());
1032            }
1033        };
1034
1035        let new_state = match expected_production {
1036            RdfXmlNextProduction::Rdf => {
1037                if *tag_name == *RDF_RDF {
1038                    RdfXmlState::Rdf { base_iri, language }
1039                } else if RESERVED_RDF_ELEMENTS.contains(&&*tag_name) {
1040                    return Err(RdfXmlSyntaxError::msg(format!(
1041                        "Invalid node element tag name: {tag_name}"
1042                    ))
1043                    .into());
1044                } else {
1045                    self.build_node_elt(
1046                        self.parse_iri(tag_name)?,
1047                        base_iri,
1048                        language,
1049                        id_attr,
1050                        node_id_attr,
1051                        about_attr,
1052                        type_attr,
1053                        property_attrs,
1054                        results,
1055                    )?
1056                }
1057            }
1058            RdfXmlNextProduction::NodeElt => {
1059                if RESERVED_RDF_ELEMENTS.contains(&&*tag_name) {
1060                    return Err(RdfXmlSyntaxError::msg(format!(
1061                        "Invalid property element tag name: {tag_name}"
1062                    ))
1063                    .into());
1064                }
1065                self.build_node_elt(
1066                    self.parse_iri(tag_name)?,
1067                    base_iri,
1068                    language,
1069                    id_attr,
1070                    node_id_attr,
1071                    about_attr,
1072                    type_attr,
1073                    property_attrs,
1074                    results,
1075                )?
1076            }
1077            RdfXmlNextProduction::PropertyElt { subject } => {
1078                let iri = if *tag_name == *RDF_LI {
1079                    let Some(RdfXmlState::NodeElt { li_counter, .. }) = self.state.last_mut()
1080                    else {
1081                        return Err(RdfXmlSyntaxError::msg(format!(
1082                            "Invalid property element tag name: {tag_name}"
1083                        ))
1084                        .into());
1085                    };
1086                    *li_counter += 1;
1087                    NamedNode::new_unchecked(format!(
1088                        "http://www.w3.org/1999/02/22-rdf-syntax-ns#_{li_counter}"
1089                    ))
1090                } else if RESERVED_RDF_ELEMENTS.contains(&&*tag_name)
1091                    || *tag_name == *RDF_DESCRIPTION
1092                {
1093                    return Err(RdfXmlSyntaxError::msg(format!(
1094                        "Invalid property element tag name: {tag_name}"
1095                    ))
1096                    .into());
1097                } else {
1098                    self.parse_iri(tag_name)?
1099                };
1100                match parse_type {
1101                    RdfXmlParseType::Default => {
1102                        if resource_attr.is_some()
1103                            || node_id_attr.is_some()
1104                            || !property_attrs.is_empty()
1105                        {
1106                            let object = match (resource_attr, node_id_attr)
1107                            {
1108                                (Some(resource_attr), None) => Subject::from(resource_attr),
1109                                (None, Some(node_id_attr)) => node_id_attr.into(),
1110                                (None, None) => BlankNode::default().into(),
1111                                (Some(_), Some(_)) => return Err(RdfXmlSyntaxError::msg("Not both rdf:resource and rdf:nodeID could be set at the same time").into())
1112                            };
1113                            self.emit_property_attrs(
1114                                &object,
1115                                property_attrs,
1116                                language.as_deref(),
1117                                results,
1118                            );
1119                            if let Some(type_attr) = type_attr {
1120                                results.push(Triple::new(object.clone(), rdf::TYPE, type_attr));
1121                            }
1122                            RdfXmlState::PropertyElt {
1123                                iri,
1124                                base_iri,
1125                                language,
1126                                subject,
1127                                object: Some(NodeOrText::Node(object)),
1128                                id_attr,
1129                                datatype_attr,
1130                            }
1131                        } else {
1132                            RdfXmlState::PropertyElt {
1133                                iri,
1134                                base_iri,
1135                                language,
1136                                subject,
1137                                object: None,
1138                                id_attr,
1139                                datatype_attr,
1140                            }
1141                        }
1142                    }
1143                    RdfXmlParseType::Literal => RdfXmlState::ParseTypeLiteralPropertyElt {
1144                        iri,
1145                        base_iri,
1146                        language,
1147                        subject,
1148                        writer: Writer::new(Vec::default()),
1149                        id_attr,
1150                        emit: true,
1151                    },
1152                    RdfXmlParseType::Resource => Self::build_parse_type_resource_property_elt(
1153                        iri, base_iri, language, subject, id_attr, results,
1154                    ),
1155                    RdfXmlParseType::Collection => RdfXmlState::ParseTypeCollectionPropertyElt {
1156                        iri,
1157                        base_iri,
1158                        language,
1159                        subject,
1160                        objects: Vec::default(),
1161                        id_attr,
1162                    },
1163                    RdfXmlParseType::Other => RdfXmlState::ParseTypeLiteralPropertyElt {
1164                        iri,
1165                        base_iri,
1166                        language,
1167                        subject,
1168                        writer: Writer::new(Vec::default()),
1169                        id_attr,
1170                        emit: false,
1171                    },
1172                }
1173            }
1174        };
1175        self.state.push(new_state);
1176        Ok(())
1177    }
1178
1179    fn parse_end_event(
1180        &mut self,
1181        event: &BytesEnd<'_>,
1182        results: &mut Vec<Triple>,
1183    ) -> Result<(), RdfXmlParseError> {
1184        // Literal case
1185        if self.in_literal_depth > 0 {
1186            if let Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) =
1187                self.state.last_mut()
1188            {
1189                writer.write_event(Event::End(BytesEnd::new(
1190                    self.reader.decoder().decode(event.name().as_ref())?,
1191                )))?;
1192                self.in_literal_depth -= 1;
1193                return Ok(());
1194            }
1195        }
1196
1197        if let Some(current_state) = self.state.pop() {
1198            self.end_state(current_state, results)?;
1199        }
1200        Ok(())
1201    }
1202
1203    fn parse_text_event(&mut self, event: &BytesText<'_>) -> Result<(), RdfXmlParseError> {
1204        let text = event.unescape_with(|e| self.resolve_entity(e))?.to_string();
1205        match self.state.last_mut() {
1206            Some(RdfXmlState::PropertyElt { object, .. }) => {
1207                if is_object_defined(object) {
1208                    if text.bytes().all(is_whitespace) {
1209                        Ok(()) // whitespace anyway, we ignore
1210                    } else {
1211                        Err(
1212                            RdfXmlSyntaxError::msg(format!("Unexpected text event: '{text}'"))
1213                                .into(),
1214                        )
1215                    }
1216                } else {
1217                    *object = Some(NodeOrText::Text(text));
1218                    Ok(())
1219                }
1220            }
1221            Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) => {
1222                writer.write_event(Event::Text(BytesText::new(&text)))?;
1223                Ok(())
1224            }
1225            _ => {
1226                if text.bytes().all(is_whitespace) {
1227                    Ok(())
1228                } else {
1229                    Err(RdfXmlSyntaxError::msg(format!("Unexpected text event: '{text}'")).into())
1230                }
1231            }
1232        }
1233    }
1234
1235    fn resolve_tag_name(&self, qname: QName<'_>) -> Result<String, RdfXmlParseError> {
1236        let (namespace, local_name) = self.reader.resolve_element(qname);
1237        self.resolve_ns_name(namespace, local_name)
1238    }
1239
1240    fn resolve_attribute_name(&self, qname: QName<'_>) -> Result<String, RdfXmlParseError> {
1241        let (namespace, local_name) = self.reader.resolve_attribute(qname);
1242        self.resolve_ns_name(namespace, local_name)
1243    }
1244
1245    fn resolve_ns_name(
1246        &self,
1247        namespace: ResolveResult<'_>,
1248        local_name: LocalName<'_>,
1249    ) -> Result<String, RdfXmlParseError> {
1250        match namespace {
1251            ResolveResult::Bound(ns) => {
1252                let mut value = Vec::with_capacity(ns.as_ref().len() + local_name.as_ref().len());
1253                value.extend_from_slice(ns.as_ref());
1254                value.extend_from_slice(local_name.as_ref());
1255                Ok(unescape_with(&self.reader.decoder().decode(&value)?, |e| {
1256                    self.resolve_entity(e)
1257                })
1258                .map_err(Error::from)?
1259                .to_string())
1260            }
1261            ResolveResult::Unbound => {
1262                Err(RdfXmlSyntaxError::msg("XML namespaces are required in RDF/XML").into())
1263            }
1264            ResolveResult::Unknown(v) => Err(RdfXmlSyntaxError::msg(format!(
1265                "Unknown prefix {}:",
1266                self.reader.decoder().decode(&v)?
1267            ))
1268            .into()),
1269        }
1270    }
1271
1272    #[allow(clippy::too_many_arguments)]
1273    fn build_node_elt(
1274        &self,
1275        iri: NamedNode,
1276        base_iri: Option<Iri<String>>,
1277        language: Option<String>,
1278        id_attr: Option<NamedNode>,
1279        node_id_attr: Option<BlankNode>,
1280        about_attr: Option<NamedNode>,
1281        type_attr: Option<NamedNode>,
1282        property_attrs: Vec<(NamedNode, String)>,
1283        results: &mut Vec<Triple>,
1284    ) -> Result<RdfXmlState, RdfXmlSyntaxError> {
1285        let subject = match (id_attr, node_id_attr, about_attr) {
1286            (Some(id_attr), None, None) => Subject::from(id_attr),
1287            (None, Some(node_id_attr), None) => node_id_attr.into(),
1288            (None, None, Some(about_attr)) => about_attr.into(),
1289            (None, None, None) => BlankNode::default().into(),
1290            (Some(_), Some(_), _) => {
1291                return Err(RdfXmlSyntaxError::msg(
1292                    "Not both rdf:ID and rdf:nodeID could be set at the same time",
1293                ))
1294            }
1295            (_, Some(_), Some(_)) => {
1296                return Err(RdfXmlSyntaxError::msg(
1297                    "Not both rdf:nodeID and rdf:resource could be set at the same time",
1298                ))
1299            }
1300            (Some(_), _, Some(_)) => {
1301                return Err(RdfXmlSyntaxError::msg(
1302                    "Not both rdf:ID and rdf:resource could be set at the same time",
1303                ))
1304            }
1305        };
1306
1307        self.emit_property_attrs(&subject, property_attrs, language.as_deref(), results);
1308
1309        if let Some(type_attr) = type_attr {
1310            results.push(Triple::new(subject.clone(), rdf::TYPE, type_attr));
1311        }
1312
1313        if iri != *RDF_DESCRIPTION {
1314            results.push(Triple::new(subject.clone(), rdf::TYPE, iri));
1315        }
1316        Ok(RdfXmlState::NodeElt {
1317            base_iri,
1318            language,
1319            subject,
1320            li_counter: 0,
1321        })
1322    }
1323
1324    fn build_parse_type_resource_property_elt(
1325        iri: NamedNode,
1326        base_iri: Option<Iri<String>>,
1327        language: Option<String>,
1328        subject: Subject,
1329        id_attr: Option<NamedNode>,
1330        results: &mut Vec<Triple>,
1331    ) -> RdfXmlState {
1332        let object = BlankNode::default();
1333        let triple = Triple::new(subject, iri, object.clone());
1334        if let Some(id_attr) = id_attr {
1335            Self::reify(triple.clone(), id_attr, results);
1336        }
1337        results.push(triple);
1338        RdfXmlState::NodeElt {
1339            base_iri,
1340            language,
1341            subject: object.into(),
1342            li_counter: 0,
1343        }
1344    }
1345
1346    fn end_state(
1347        &mut self,
1348        state: RdfXmlState,
1349        results: &mut Vec<Triple>,
1350    ) -> Result<(), RdfXmlSyntaxError> {
1351        match state {
1352            RdfXmlState::PropertyElt {
1353                iri,
1354                language,
1355                subject,
1356                id_attr,
1357                datatype_attr,
1358                object,
1359                ..
1360            } => {
1361                let object = match object {
1362                    Some(NodeOrText::Node(node)) => Term::from(node),
1363                    Some(NodeOrText::Text(text)) => {
1364                        self.new_literal(text, language, datatype_attr).into()
1365                    }
1366                    None => self
1367                        .new_literal(String::new(), language, datatype_attr)
1368                        .into(),
1369                };
1370                let triple = Triple::new(subject, iri, object);
1371                if let Some(id_attr) = id_attr {
1372                    Self::reify(triple.clone(), id_attr, results);
1373                }
1374                results.push(triple);
1375            }
1376            RdfXmlState::ParseTypeCollectionPropertyElt {
1377                iri,
1378                subject,
1379                id_attr,
1380                objects,
1381                ..
1382            } => {
1383                let mut current_node = Subject::from(rdf::NIL);
1384                for object in objects.into_iter().rev() {
1385                    let subject = Subject::from(BlankNode::default());
1386                    results.push(Triple::new(subject.clone(), rdf::FIRST, object));
1387                    results.push(Triple::new(subject.clone(), rdf::REST, current_node));
1388                    current_node = subject;
1389                }
1390                let triple = Triple::new(subject, iri, current_node);
1391                if let Some(id_attr) = id_attr {
1392                    Self::reify(triple.clone(), id_attr, results);
1393                }
1394                results.push(triple);
1395            }
1396            RdfXmlState::ParseTypeLiteralPropertyElt {
1397                iri,
1398                subject,
1399                id_attr,
1400                writer,
1401                emit,
1402                ..
1403            } => {
1404                if emit {
1405                    let object = writer.into_inner();
1406                    if object.is_empty() {
1407                        return Err(RdfXmlSyntaxError::msg(format!(
1408                            "No value found for rdf:XMLLiteral value of property {iri}"
1409                        )));
1410                    }
1411                    let triple = Triple::new(
1412                        subject,
1413                        iri,
1414                        Literal::new_typed_literal(
1415                            str::from_utf8(&object).map_err(|_| {
1416                                RdfXmlSyntaxError::msg(
1417                                    "The XML literal is not in valid UTF-8".to_owned(),
1418                                )
1419                            })?,
1420                            rdf::XML_LITERAL,
1421                        ),
1422                    );
1423                    if let Some(id_attr) = id_attr {
1424                        Self::reify(triple.clone(), id_attr, results);
1425                    }
1426                    results.push(triple);
1427                }
1428            }
1429            RdfXmlState::NodeElt { subject, .. } => match self.state.last_mut() {
1430                Some(RdfXmlState::PropertyElt { object, .. }) => {
1431                    if is_object_defined(object) {
1432                        return Err(RdfXmlSyntaxError::msg(
1433                            "Unexpected node, a text value is already present",
1434                        ));
1435                    }
1436                    *object = Some(NodeOrText::Node(subject))
1437                }
1438                Some(RdfXmlState::ParseTypeCollectionPropertyElt { objects, .. }) => {
1439                    objects.push(subject)
1440                }
1441                _ => (),
1442            },
1443            _ => (),
1444        }
1445        Ok(())
1446    }
1447
1448    fn new_literal(
1449        &self,
1450        value: String,
1451        language: Option<String>,
1452        datatype: Option<NamedNode>,
1453    ) -> Literal {
1454        if let Some(datatype) = datatype {
1455            Literal::new_typed_literal(value, datatype)
1456        } else if let Some(language) =
1457            language.or_else(|| self.current_language().map(ToOwned::to_owned))
1458        {
1459            Literal::new_language_tagged_literal_unchecked(value, language)
1460        } else {
1461            Literal::new_simple_literal(value)
1462        }
1463    }
1464
1465    fn reify(triple: Triple, statement_id: NamedNode, results: &mut Vec<Triple>) {
1466        results.push(Triple::new(statement_id.clone(), rdf::TYPE, rdf::STATEMENT));
1467        results.push(Triple::new(
1468            statement_id.clone(),
1469            rdf::SUBJECT,
1470            triple.subject,
1471        ));
1472        results.push(Triple::new(
1473            statement_id.clone(),
1474            rdf::PREDICATE,
1475            triple.predicate,
1476        ));
1477        results.push(Triple::new(statement_id, rdf::OBJECT, triple.object));
1478    }
1479
1480    fn emit_property_attrs(
1481        &self,
1482        subject: &Subject,
1483        literal_attributes: Vec<(NamedNode, String)>,
1484        language: Option<&str>,
1485        results: &mut Vec<Triple>,
1486    ) {
1487        for (literal_predicate, literal_value) in literal_attributes {
1488            results.push(Triple::new(
1489                subject.clone(),
1490                literal_predicate,
1491                if let Some(language) = language.or_else(|| self.current_language()) {
1492                    Literal::new_language_tagged_literal_unchecked(literal_value, language)
1493                } else {
1494                    Literal::new_simple_literal(literal_value)
1495                },
1496            ));
1497        }
1498    }
1499
1500    fn convert_attribute(&self, attribute: &Attribute<'_>) -> Result<String, RdfXmlParseError> {
1501        Ok(attribute
1502            .decode_and_unescape_value_with(self.reader.decoder(), |e| self.resolve_entity(e))?
1503            .into_owned())
1504    }
1505
1506    fn convert_iri_attribute(
1507        &self,
1508        base_iri: Option<&Iri<String>>,
1509        attribute: &Attribute<'_>,
1510    ) -> Result<NamedNode, RdfXmlParseError> {
1511        Ok(self.resolve_iri(base_iri, self.convert_attribute(attribute)?)?)
1512    }
1513
1514    fn resolve_iri(
1515        &self,
1516        base_iri: Option<&Iri<String>>,
1517        relative_iri: String,
1518    ) -> Result<NamedNode, RdfXmlSyntaxError> {
1519        if let Some(base_iri) = base_iri.or_else(|| self.current_base_iri()) {
1520            Ok(NamedNode::new_unchecked(
1521                if self.unchecked {
1522                    base_iri.resolve_unchecked(&relative_iri)
1523                } else {
1524                    base_iri
1525                        .resolve(&relative_iri)
1526                        .map_err(|error| RdfXmlSyntaxError::invalid_iri(relative_iri, error))?
1527                }
1528                .into_inner(),
1529            ))
1530        } else {
1531            self.parse_iri(relative_iri)
1532        }
1533    }
1534
1535    fn parse_iri(&self, relative_iri: String) -> Result<NamedNode, RdfXmlSyntaxError> {
1536        Ok(NamedNode::new_unchecked(if self.unchecked {
1537            relative_iri
1538        } else {
1539            Iri::parse(relative_iri.clone())
1540                .map_err(|error| RdfXmlSyntaxError::invalid_iri(relative_iri, error))?
1541                .into_inner()
1542        }))
1543    }
1544
1545    fn current_language(&self) -> Option<&str> {
1546        for state in self.state.iter().rev() {
1547            match state {
1548                RdfXmlState::Doc { .. } => (),
1549                RdfXmlState::Rdf { language, .. }
1550                | RdfXmlState::NodeElt { language, .. }
1551                | RdfXmlState::PropertyElt { language, .. }
1552                | RdfXmlState::ParseTypeCollectionPropertyElt { language, .. }
1553                | RdfXmlState::ParseTypeLiteralPropertyElt { language, .. } => {
1554                    if let Some(language) = language {
1555                        return Some(language);
1556                    }
1557                }
1558            }
1559        }
1560        None
1561    }
1562
1563    fn current_base_iri(&self) -> Option<&Iri<String>> {
1564        for state in self.state.iter().rev() {
1565            match state {
1566                RdfXmlState::Doc { base_iri }
1567                | RdfXmlState::Rdf { base_iri, .. }
1568                | RdfXmlState::NodeElt { base_iri, .. }
1569                | RdfXmlState::PropertyElt { base_iri, .. }
1570                | RdfXmlState::ParseTypeCollectionPropertyElt { base_iri, .. }
1571                | RdfXmlState::ParseTypeLiteralPropertyElt { base_iri, .. } => {
1572                    if let Some(base_iri) = base_iri {
1573                        return Some(base_iri);
1574                    }
1575                }
1576            }
1577        }
1578        None
1579    }
1580
1581    fn resolve_entity(&self, e: &str) -> Option<&str> {
1582        resolve_xml_entity(e).or_else(|| self.custom_entities.get(e).map(String::as_str))
1583    }
1584}
1585
1586fn is_object_defined(object: &Option<NodeOrText>) -> bool {
1587    match object {
1588        Some(NodeOrText::Node(_)) => true,
1589        Some(NodeOrText::Text(t)) => !t.bytes().all(is_whitespace),
1590        None => false,
1591    }
1592}
1593
/// Tells whether a byte is a space, a tab, a line feed or a carriage return.
fn is_whitespace(c: u8) -> bool {
    [b' ', b'\t', b'\n', b'\r'].contains(&c)
}
1597
/// Tells whether an encoding label designates UTF-8, ignoring ASCII case.
fn is_utf8(encoding: &[u8]) -> bool {
    const UTF8_LABELS: [&[u8]; 6] = [
        b"unicode-1-1-utf-8",
        b"unicode11utf8",
        b"unicode20utf8",
        b"utf-8",
        b"utf8",
        b"x-unicode20utf8",
    ];
    UTF8_LABELS
        .iter()
        .any(|label| encoding.eq_ignore_ascii_case(label))
}