1use crate::error::{RdfXmlParseError, RdfXmlSyntaxError};
2use crate::utils::*;
3use oxilangtag::LanguageTag;
4use oxiri::{Iri, IriParseError};
5use oxrdf::vocab::rdf;
6use oxrdf::{BlankNode, Literal, NamedNode, Subject, Term, Triple};
7use quick_xml::escape::{resolve_xml_entity, unescape_with};
8use quick_xml::events::attributes::Attribute;
9use quick_xml::events::*;
10use quick_xml::name::{LocalName, PrefixDeclaration, PrefixIter, QName, ResolveResult};
11use quick_xml::{Decoder, Error, NsReader, Writer};
12use std::borrow::Cow;
13use std::collections::{HashMap, HashSet};
14use std::io::{BufReader, Read};
15use std::str;
16#[cfg(feature = "async-tokio")]
17use tokio::io::{AsyncRead, BufReader as AsyncBufReader};
18
/// Configurable entry point used to build RDF/XML parsers.
///
/// The options stored here are handed over to the internal parser state when one of the
/// `for_*` methods is called.
#[derive(Default, Clone)]
#[must_use]
pub struct RdfXmlParser {
    /// When `true`, the validations guarded by this flag (language tags, IRIs,
    /// `rdf:ID` uniqueness, returned prefixes) are skipped.
    unchecked: bool,
    /// Base IRI stored in the initial document state and used to resolve relative IRIs.
    base: Option<Iri<String>>,
}
60
61impl RdfXmlParser {
62 #[inline]
64 pub fn new() -> Self {
65 Self::default()
66 }
67
68 #[inline]
74 pub fn unchecked(mut self) -> Self {
75 self.unchecked = true;
76 self
77 }
78
79 #[inline]
80 pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> {
81 self.base = Some(Iri::parse(base_iri.into())?);
82 Ok(self)
83 }
84
85 pub fn for_reader<R: Read>(self, reader: R) -> ReaderRdfXmlParser<R> {
114 ReaderRdfXmlParser {
115 results: Vec::new(),
116 parser: self.into_internal(BufReader::new(reader)),
117 reader_buffer: Vec::default(),
118 }
119 }
120
121 #[cfg(feature = "async-tokio")]
154 pub fn for_tokio_async_reader<R: AsyncRead + Unpin>(
155 self,
156 reader: R,
157 ) -> TokioAsyncReaderRdfXmlParser<R> {
158 TokioAsyncReaderRdfXmlParser {
159 results: Vec::new(),
160 parser: self.into_internal(AsyncBufReader::new(reader)),
161 reader_buffer: Vec::default(),
162 }
163 }
164
165 pub fn for_slice(self, slice: &[u8]) -> SliceRdfXmlParser<'_> {
194 SliceRdfXmlParser {
195 results: Vec::new(),
196 parser: self.into_internal(slice),
197 reader_buffer: Vec::default(),
198 }
199 }
200
201 fn into_internal<T>(self, reader: T) -> InternalRdfXmlParser<T> {
202 let mut reader = NsReader::from_reader(reader);
203 reader.config_mut().expand_empty_elements = true;
204 InternalRdfXmlParser {
205 reader,
206 state: vec![RdfXmlState::Doc {
207 base_iri: self.base.clone(),
208 }],
209 custom_entities: HashMap::new(),
210 in_literal_depth: 0,
211 known_rdf_id: HashSet::default(),
212 is_end: false,
213 unchecked: self.unchecked,
214 }
215 }
216}
217
/// Parses an RDF/XML document from a [`Read`] implementation.
///
/// Built by [`RdfXmlParser::for_reader`]; iterate over it to get the parsed triples.
#[must_use]
pub struct ReaderRdfXmlParser<R: Read> {
    /// Triples already produced by the parser and waiting to be returned.
    results: Vec<Triple>,
    /// Parser state wrapping the buffered XML reader.
    parser: InternalRdfXmlParser<BufReader<R>>,
    /// Scratch buffer handed to the XML reader; cleared before each event is read.
    reader_buffer: Vec<u8>,
}
254
255impl<R: Read> Iterator for ReaderRdfXmlParser<R> {
256 type Item = Result<Triple, RdfXmlParseError>;
257
258 fn next(&mut self) -> Option<Self::Item> {
259 loop {
260 if let Some(triple) = self.results.pop() {
261 return Some(Ok(triple));
262 } else if self.parser.is_end {
263 return None;
264 }
265 if let Err(e) = self.parse_step() {
266 return Some(Err(e));
267 }
268 }
269 }
270}
271
272impl<R: Read> ReaderRdfXmlParser<R> {
273 pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
305 RdfXmlPrefixesIter {
306 inner: self.parser.reader.prefixes(),
307 decoder: self.parser.reader.decoder(),
308 unchecked: self.parser.unchecked,
309 }
310 }
311
312 pub fn base_iri(&self) -> Option<&str> {
332 Some(self.parser.current_base_iri()?.as_str())
333 }
334
335 pub fn buffer_position(&self) -> u64 {
337 self.parser.reader.buffer_position()
338 }
339
340 fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
341 self.reader_buffer.clear();
342 let event = self
343 .parser
344 .reader
345 .read_event_into(&mut self.reader_buffer)?;
346 self.parser.parse_event(event, &mut self.results)
347 }
348}
349
/// Parses an RDF/XML document from a Tokio [`AsyncRead`] implementation.
///
/// Built by [`RdfXmlParser::for_tokio_async_reader`]; call `next` to get the parsed triples.
#[cfg(feature = "async-tokio")]
#[must_use]
pub struct TokioAsyncReaderRdfXmlParser<R: AsyncRead + Unpin> {
    /// Triples already produced by the parser and waiting to be returned.
    results: Vec<Triple>,
    /// Parser state wrapping the buffered asynchronous XML reader.
    parser: InternalRdfXmlParser<AsyncBufReader<R>>,
    /// Scratch buffer handed to the XML reader; cleared before each event is read.
    reader_buffer: Vec<u8>,
}
391
#[cfg(feature = "async-tokio")]
impl<R: AsyncRead + Unpin> TokioAsyncReaderRdfXmlParser<R> {
    /// Returns the next parsed triple, reading more XML events when no triple is pending.
    ///
    /// Yields `None` once the end of the document has been reached and every pending
    /// triple has been returned.
    pub async fn next(&mut self) -> Option<Result<Triple, RdfXmlParseError>> {
        // Feed the parser until at least one triple is available.
        while self.results.is_empty() {
            if self.parser.is_end {
                return None;
            }
            if let Err(error) = self.parse_step().await {
                return Some(Err(error));
            }
        }
        self.results.pop().map(Ok)
    }

    /// Iterates over the prefix declarations currently known to the XML reader.
    pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
        let xml_reader = &self.parser.reader;
        RdfXmlPrefixesIter {
            inner: xml_reader.prefixes(),
            decoder: xml_reader.decoder(),
            unchecked: self.parser.unchecked,
        }
    }

    /// Returns the base IRI currently in scope, if there is one.
    pub fn base_iri(&self) -> Option<&str> {
        self.parser.current_base_iri().map(|iri| iri.as_str())
    }

    /// Returns the position reported by the underlying XML reader.
    pub fn buffer_position(&self) -> u64 {
        let xml_reader = &self.parser.reader;
        xml_reader.buffer_position()
    }

    /// Reads a single XML event asynchronously and lets the parser process it.
    ///
    /// Triples produced by the event are appended to the pending results.
    async fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
        let Self {
            results,
            parser,
            reader_buffer,
        } = self;
        reader_buffer.clear();
        let event = parser.reader.read_event_into_async(reader_buffer).await?;
        parser.parse_event(event, results)
    }
}
491
/// Parses an RDF/XML document from a byte slice.
///
/// Built by [`RdfXmlParser::for_slice`]; iterate over it to get the parsed triples.
#[must_use]
pub struct SliceRdfXmlParser<'a> {
    /// Triples already produced by the parser and waiting to be returned.
    results: Vec<Triple>,
    /// Parser state wrapping the XML reader over the input slice.
    parser: InternalRdfXmlParser<&'a [u8]>,
    /// Scratch buffer handed to the XML reader; cleared before each event is read.
    reader_buffer: Vec<u8>,
}
528
529impl Iterator for SliceRdfXmlParser<'_> {
530 type Item = Result<Triple, RdfXmlSyntaxError>;
531
532 fn next(&mut self) -> Option<Self::Item> {
533 loop {
534 if let Some(triple) = self.results.pop() {
535 return Some(Ok(triple));
536 } else if self.parser.is_end {
537 return None;
538 }
539 if let Err(RdfXmlParseError::Syntax(e)) = self.parse_step() {
540 return Some(Err(e));
542 }
543 }
544 }
545}
546
547impl SliceRdfXmlParser<'_> {
548 pub fn prefixes(&self) -> RdfXmlPrefixesIter<'_> {
580 RdfXmlPrefixesIter {
581 inner: self.parser.reader.prefixes(),
582 decoder: self.parser.reader.decoder(),
583 unchecked: self.parser.unchecked,
584 }
585 }
586
587 pub fn base_iri(&self) -> Option<&str> {
607 Some(self.parser.current_base_iri()?.as_str())
608 }
609
610 pub fn buffer_position(&self) -> u64 {
612 self.parser.reader.buffer_position()
613 }
614
615 fn parse_step(&mut self) -> Result<(), RdfXmlParseError> {
616 self.reader_buffer.clear();
617 let event = self
618 .parser
619 .reader
620 .read_event_into(&mut self.reader_buffer)?;
621 self.parser.parse_event(event, &mut self.results)
622 }
623}
624
/// Iterator over the prefix declarations known to the XML reader.
///
/// Built by the `prefixes` methods of the parsers; yields `(prefix, IRI)` pairs.
pub struct RdfXmlPrefixesIter<'a> {
    /// Prefix declarations exposed by the XML reader.
    inner: PrefixIter<'a>,
    /// Decoder used to turn the raw prefix and namespace bytes into strings.
    decoder: Decoder,
    /// When `true`, prefixes and IRIs are returned without being validated.
    unchecked: bool,
}
633
634impl<'a> Iterator for RdfXmlPrefixesIter<'a> {
635 type Item = (&'a str, &'a str);
636
637 #[inline]
638 fn next(&mut self) -> Option<Self::Item> {
639 loop {
640 let (key, value) = self.inner.next()?;
641 return Some((
642 match key {
643 PrefixDeclaration::Default => "",
644 PrefixDeclaration::Named(name) => {
645 let Ok(Cow::Borrowed(name)) = self.decoder.decode(name) else {
646 continue;
647 };
648 let Ok(Cow::Borrowed(name)) = unescape_with(name, |_| None) else {
649 continue;
650 };
651 if !self.unchecked && !is_nc_name(name) {
652 continue; }
654 name
655 }
656 },
657 {
658 let Ok(Cow::Borrowed(value)) = self.decoder.decode(value.0) else {
659 continue;
660 };
661 let Ok(Cow::Borrowed(value)) = unescape_with(value, |_| None) else {
662 continue;
663 };
664 if !self.unchecked && Iri::parse(value).is_err() {
665 continue; }
667 value
668 },
669 ));
670 }
671 }
672
673 #[inline]
674 fn size_hint(&self) -> (usize, Option<usize>) {
675 self.inner.size_hint()
676 }
677}
678
// IRIs of the RDF/XML syntax terms, all in the
// `http://www.w3.org/1999/02/22-rdf-syntax-ns#` namespace.
const RDF_ABOUT: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about";
const RDF_ABOUT_EACH: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach";
const RDF_ABOUT_EACH_PREFIX: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix";
const RDF_BAG_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID";
const RDF_DATATYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype";
const RDF_DESCRIPTION: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Description";
const RDF_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#ID";
const RDF_LI: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#li";
const RDF_NODE_ID: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID";
const RDF_PARSE_TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#parseType";
const RDF_RDF: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF";
const RDF_RESOURCE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#resource";

// Syntax terms rejected by `parse_start_event` when used as an element name.
// `rdf:RDF` (as document root) and `rdf:li` (as property element) are handled
// before this list is consulted.
const RESERVED_RDF_ELEMENTS: [&str; 11] = [
    RDF_ABOUT,
    RDF_ABOUT_EACH,
    RDF_ABOUT_EACH_PREFIX,
    RDF_BAG_ID,
    RDF_DATATYPE,
    RDF_ID,
    RDF_LI,
    RDF_NODE_ID,
    RDF_PARSE_TYPE,
    RDF_RDF,
    RDF_RESOURCE,
];
// Syntax terms rejected by `parse_start_event` when used as an attribute name.
// `rdf:resource` is matched by an earlier branch, so its entry here is never reached.
const RESERVED_RDF_ATTRIBUTES: [&str; 5] = [
    RDF_ABOUT_EACH,
    RDF_ABOUT_EACH_PREFIX,
    RDF_LI,
    RDF_RDF,
    RDF_RESOURCE,
];
712
/// Content collected so far for a property element.
#[derive(Clone, Debug)]
enum NodeOrText {
    /// A node: either set from the attributes of the property element or taken from a
    /// nested node element.
    Node(Subject),
    /// Character data read from a text event.
    Text(String),
}
718
/// One entry of the parser state stack; an entry is pushed for each opened element that
/// is not nested inside an XML literal.
///
/// Every variant carries the `xml:base` set on its element (the document state carries
/// the parser-level base IRI) and, except for `Doc`, the `xml:lang` set on its element.
enum RdfXmlState {
    /// Initial state, before any element has been opened.
    Doc {
        base_iri: Option<Iri<String>>,
    },
    /// Inside the `rdf:RDF` root element.
    Rdf {
        base_iri: Option<Iri<String>>,
        language: Option<String>,
    },
    /// Inside a node element describing `subject`.
    NodeElt {
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: Subject,
        /// Number of `rdf:li` children seen so far, used to build the `rdf:_n` predicates.
        li_counter: u64,
    },
    /// Inside a property element without a recognized `rdf:parseType`.
    PropertyElt {
        /// Predicate of the triple emitted when the element ends.
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: Subject,
        /// Object collected so far, if any.
        object: Option<NodeOrText>,
        /// Resolved `rdf:ID`, used to reify the emitted triple.
        id_attr: Option<NamedNode>,
        /// Resolved `rdf:datatype`, used when the object is a literal.
        datatype_attr: Option<NamedNode>,
    },
    /// Inside a property element with `rdf:parseType="Collection"`.
    ParseTypeCollectionPropertyElt {
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: Subject,
        /// Subjects of the nested node elements, in document order.
        objects: Vec<Subject>,
        id_attr: Option<NamedNode>,
    },
    /// Inside a property element whose content is copied verbatim into `writer`.
    ParseTypeLiteralPropertyElt {
        iri: NamedNode,
        base_iri: Option<Iri<String>>,
        language: Option<String>,
        subject: Subject,
        /// Receives the XML events found inside the element.
        writer: Writer<Vec<u8>>,
        id_attr: Option<NamedNode>,
        /// `false` for unrecognized `rdf:parseType` values: the content is consumed but
        /// no triple is emitted.
        emit: bool,
    },
}
761
/// Parser state shared by the reader, async reader and slice front ends.
struct InternalRdfXmlParser<R> {
    /// Namespace-aware XML reader producing the events.
    reader: NsReader<R>,
    /// Stack of the currently opened grammar productions; starts with `RdfXmlState::Doc`.
    state: Vec<RdfXmlState>,
    /// Entities declared in the DOCTYPE, keyed by entity name.
    custom_entities: HashMap<String, String>,
    /// Number of currently opened elements nested inside an XML literal.
    in_literal_depth: usize,
    /// Resolved `rdf:ID` IRIs already seen, used to reject duplicates.
    known_rdf_id: HashSet<String>,
    /// Set once the end of the document has been reached.
    is_end: bool,
    /// When `true`, the validations guarded by this flag are skipped.
    unchecked: bool,
}
771
772impl<R> InternalRdfXmlParser<R> {
773 fn parse_event(
774 &mut self,
775 event: Event<'_>,
776 results: &mut Vec<Triple>,
777 ) -> Result<(), RdfXmlParseError> {
778 match event {
779 Event::Start(event) => self.parse_start_event(&event, results),
780 Event::End(event) => self.parse_end_event(&event, results),
781 Event::Empty(_) => Err(RdfXmlSyntaxError::msg(
782 "The expand_empty_elements option must be enabled",
783 )
784 .into()),
785 Event::Text(event) => self.parse_text_event(&event),
786 Event::CData(event) => self.parse_text_event(&event.escape()?),
787 Event::Comment(_) | Event::PI(_) => Ok(()),
788 Event::Decl(decl) => {
789 if let Some(encoding) = decl.encoding() {
790 if !is_utf8(&encoding?) {
791 return Err(RdfXmlSyntaxError::msg(
792 "Only UTF-8 is supported by the RDF/XML parser",
793 )
794 .into());
795 }
796 }
797 Ok(())
798 }
799 Event::DocType(dt) => self.parse_doctype(&dt),
800 Event::Eof => {
801 self.is_end = true;
802 Ok(())
803 }
804 }
805 }
806
    /// Extracts the `<!ENTITY name "value">` declarations of a DOCTYPE and stores them in
    /// `custom_entities`.
    ///
    /// # Errors
    ///
    /// Fails if an `<!ENTITY` declaration lacks a name or value, if the value is not
    /// enclosed in double quotes, if the declaration does not end with `>`, or if the
    /// value references an entity that cannot be resolved.
    fn parse_doctype(&mut self, dt: &BytesText<'_>) -> Result<(), RdfXmlParseError> {
        // The DOCTYPE content is split on `<`; the text before the first `<` is skipped.
        for input in self
            .reader
            .decoder()
            .decode(dt.as_ref())?
            .split('<')
            .skip(1)
        {
            // Chunks that are not entity declarations are ignored.
            if let Some(input) = input.strip_prefix("!ENTITY") {
                // A `%` right after the leading whitespace is dropped.
                let input = input.trim_start().strip_prefix('%').unwrap_or(input);
                let (entity_name, input) = input.trim_start().split_once(|c: char| c.is_ascii_whitespace()).ok_or_else(|| {
                    RdfXmlSyntaxError::msg(
                        "<!ENTITY declarations should contain both an entity name and an entity value",
                    )
                })?;
                let input = input.trim_start().strip_prefix('\"').ok_or_else(|| {
                    RdfXmlSyntaxError::msg("<!ENTITY values should be enclosed in double quotes")
                })?;
                let (entity_value, input) = input.split_once('"').ok_or_else(|| {
                    RdfXmlSyntaxError::msg(
                        "<!ENTITY declarations values should be enclosed in double quotes",
                    )
                })?;
                // Only whitespace is allowed between the closing quote and `>`.
                input.trim_start().strip_prefix('>').ok_or_else(|| {
                    RdfXmlSyntaxError::msg("<!ENTITY declarations values should end with >")
                })?;

                // The value may itself reference predefined or previously declared entities.
                let entity_value =
                    unescape_with(entity_value, |e| self.resolve_entity(e)).map_err(Error::from)?;
                self.custom_entities
                    .insert(entity_name.to_owned(), entity_value.to_string());
            }
        }
        Ok(())
    }
844
    /// Handles an element start tag.
    ///
    /// Inside an XML literal the tag is copied to the literal writer. Otherwise the
    /// attributes are read, the production expected by the state on top of the stack is
    /// determined, the triples that can already be emitted are appended to `results`
    /// and the state of the new element is pushed on the stack.
    ///
    /// # Errors
    ///
    /// Fails on invalid XML attributes, on invalid `rdf:ID`/`rdf:bagID`/`rdf:nodeID`
    /// values, on reserved names used as elements or attributes, on conflicting
    /// attributes and, unless `unchecked` is set, on invalid language tags or IRIs and
    /// on duplicated `rdf:ID` values.
    fn parse_start_event(
        &mut self,
        event: &BytesStart<'_>,
        results: &mut Vec<Triple>,
    ) -> Result<(), RdfXmlParseError> {
        // Value of the `rdf:parseType` attribute of the element.
        #[derive(PartialEq, Eq)]
        enum RdfXmlParseType {
            Default,
            Collection,
            Literal,
            Resource,
            Other,
        }

        // Production expected for the element, derived from the state on top of the stack.
        #[derive(PartialEq, Eq)]
        enum RdfXmlNextProduction {
            Rdf,
            NodeElt,
            PropertyElt { subject: Subject },
        }

        // Literal case: the start tag and its attributes are copied to the literal writer
        if let Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) = self.state.last_mut()
        {
            let mut clean_event = BytesStart::new(
                self.reader
                    .decoder()
                    .decode(event.name().as_ref())?
                    .to_string(),
            );
            for attr in event.attributes() {
                clean_event.push_attribute(attr.map_err(Error::InvalidAttr)?);
            }
            writer.write_event(Event::Start(clean_event))?;
            self.in_literal_depth += 1;
            return Ok(());
        }

        let tag_name = self.resolve_tag_name(event.name())?;

        // Attributes of the element
        let mut language = None;
        let mut base_iri = None;
        let mut id_attr = None;
        let mut node_id_attr = None;
        let mut about_attr = None;
        let mut property_attrs = Vec::default();
        let mut resource_attr = None;
        let mut datatype_attr = None;
        let mut parse_type = RdfXmlParseType::Default;
        let mut type_attr = None;

        for attribute in event.attributes() {
            let attribute = attribute.map_err(Error::InvalidAttr)?;
            if attribute.key.as_ref().starts_with(b"xml") {
                if attribute.key.as_ref() == b"xml:lang" {
                    // The language tag is lowercased and, unless unchecked, validated
                    let tag = self.convert_attribute(&attribute)?.to_ascii_lowercase();
                    language = Some(if self.unchecked {
                        tag
                    } else {
                        LanguageTag::parse(tag.to_ascii_lowercase())
                            .map_err(|error| RdfXmlSyntaxError::invalid_language_tag(tag, error))?
                            .into_inner()
                    });
                } else if attribute.key.as_ref() == b"xml:base" {
                    let iri = self.convert_attribute(&attribute)?;
                    base_iri = Some(if self.unchecked {
                        Iri::parse_unchecked(iri.clone())
                    } else {
                        Iri::parse(iri.clone())
                            .map_err(|error| RdfXmlSyntaxError::invalid_iri(iri, error))?
                    })
                } else {
                    // Any other attribute whose name starts with `xml` is ignored
                }
            } else {
                let attribute_url = self.resolve_attribute_name(attribute.key)?;
                if *attribute_url == *RDF_ID {
                    // NOTE(review): this check also runs in unchecked mode — confirm intended
                    let mut id = self.convert_attribute(&attribute)?;
                    if !is_nc_name(&id) {
                        return Err(RdfXmlSyntaxError::msg(format!(
                            "{id} is not a valid rdf:ID value"
                        ))
                        .into());
                    }
                    // Turned into a fragment so it can be resolved against the base IRI below
                    id.insert(0, '#');
                    id_attr = Some(id);
                } else if *attribute_url == *RDF_BAG_ID {
                    // rdf:bagID is validated but otherwise unused
                    let bag_id = self.convert_attribute(&attribute)?;
                    if !is_nc_name(&bag_id) {
                        return Err(RdfXmlSyntaxError::msg(format!(
                            "{bag_id} is not a valid rdf:bagID value"
                        ))
                        .into());
                    }
                } else if *attribute_url == *RDF_NODE_ID {
                    let id = self.convert_attribute(&attribute)?;
                    if !is_nc_name(&id) {
                        return Err(RdfXmlSyntaxError::msg(format!(
                            "{id} is not a valid rdf:nodeID value"
                        ))
                        .into());
                    }
                    node_id_attr = Some(BlankNode::new_unchecked(id));
                } else if *attribute_url == *RDF_ABOUT {
                    about_attr = Some(attribute);
                } else if *attribute_url == *RDF_RESOURCE {
                    resource_attr = Some(attribute);
                } else if *attribute_url == *RDF_DATATYPE {
                    datatype_attr = Some(attribute);
                } else if *attribute_url == *RDF_PARSE_TYPE {
                    // The raw attribute bytes are compared; unknown values map to `Other`
                    parse_type = match attribute.value.as_ref() {
                        b"Collection" => RdfXmlParseType::Collection,
                        b"Literal" => RdfXmlParseType::Literal,
                        b"Resource" => RdfXmlParseType::Resource,
                        _ => RdfXmlParseType::Other,
                    };
                } else if attribute_url == rdf::TYPE.as_str() {
                    type_attr = Some(attribute);
                } else if RESERVED_RDF_ATTRIBUTES.contains(&&*attribute_url) {
                    return Err(RdfXmlSyntaxError::msg(format!(
                        "{attribute_url} is not a valid attribute"
                    ))
                    .into());
                } else {
                    // Any other attribute is a property attribute
                    property_attrs.push((
                        self.parse_iri(attribute_url)?,
                        self.convert_attribute(&attribute)?,
                    ));
                }
            }
        }

        // IRI-valued attributes are resolved once the whole attribute list has been read,
        // using the `xml:base` of this element when there is one
        let id_attr = match id_attr {
            Some(iri) => {
                let iri = self.resolve_iri(base_iri.as_ref(), iri)?;
                if !self.unchecked {
                    // The same resolved rdf:ID must not be used twice
                    if self.known_rdf_id.contains(iri.as_str()) {
                        return Err(RdfXmlSyntaxError::msg(format!(
                            "{iri} has already been used as rdf:ID value"
                        ))
                        .into());
                    }
                    self.known_rdf_id.insert(iri.as_str().into());
                }
                Some(iri)
            }
            None => None,
        };
        let about_attr = match about_attr {
            Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
            None => None,
        };
        let resource_attr = match resource_attr {
            Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
            None => None,
        };
        let datatype_attr = match datatype_attr {
            Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
            None => None,
        };
        let type_attr = match type_attr {
            Some(attr) => Some(self.convert_iri_attribute(base_iri.as_ref(), &attr)?),
            None => None,
        };

        // The production expected for this element depends on the enclosing state
        let expected_production = match self.state.last() {
            Some(RdfXmlState::Doc { .. }) => RdfXmlNextProduction::Rdf,
            Some(
                RdfXmlState::Rdf { .. }
                | RdfXmlState::PropertyElt { .. }
                | RdfXmlState::ParseTypeCollectionPropertyElt { .. },
            ) => RdfXmlNextProduction::NodeElt,
            Some(RdfXmlState::NodeElt { subject, .. }) => RdfXmlNextProduction::PropertyElt {
                subject: subject.clone(),
            },
            Some(RdfXmlState::ParseTypeLiteralPropertyElt { .. }) => {
                return Err(
                    RdfXmlSyntaxError::msg("ParseTypeLiteralPropertyElt production children should never be considered as a RDF/XML content").into()
                );
            }
            None => {
                return Err(RdfXmlSyntaxError::msg(
                    "No state in the stack: the XML is not balanced",
                )
                .into());
            }
        };

        let new_state = match expected_production {
            RdfXmlNextProduction::Rdf => {
                // The root is either rdf:RDF or directly a node element
                if *tag_name == *RDF_RDF {
                    RdfXmlState::Rdf { base_iri, language }
                } else if RESERVED_RDF_ELEMENTS.contains(&&*tag_name) {
                    return Err(RdfXmlSyntaxError::msg(format!(
                        "Invalid node element tag name: {tag_name}"
                    ))
                    .into());
                } else {
                    self.build_node_elt(
                        self.parse_iri(tag_name)?,
                        base_iri,
                        language,
                        id_attr,
                        node_id_attr,
                        about_attr,
                        type_attr,
                        property_attrs,
                        results,
                    )?
                }
            }
            RdfXmlNextProduction::NodeElt => {
                // NOTE(review): the message mentions a property element although a node
                // element is expected in this branch — confirm the wording
                if RESERVED_RDF_ELEMENTS.contains(&&*tag_name) {
                    return Err(RdfXmlSyntaxError::msg(format!(
                        "Invalid property element tag name: {tag_name}"
                    ))
                    .into());
                }
                self.build_node_elt(
                    self.parse_iri(tag_name)?,
                    base_iri,
                    language,
                    id_attr,
                    node_id_attr,
                    about_attr,
                    type_attr,
                    property_attrs,
                    results,
                )?
            }
            RdfXmlNextProduction::PropertyElt { subject } => {
                // rdf:li is rewritten to rdf:_n using the counter of the enclosing node element
                let iri = if *tag_name == *RDF_LI {
                    let Some(RdfXmlState::NodeElt { li_counter, .. }) = self.state.last_mut()
                    else {
                        return Err(RdfXmlSyntaxError::msg(format!(
                            "Invalid property element tag name: {tag_name}"
                        ))
                        .into());
                    };
                    *li_counter += 1;
                    NamedNode::new_unchecked(format!(
                        "http://www.w3.org/1999/02/22-rdf-syntax-ns#_{li_counter}"
                    ))
                } else if RESERVED_RDF_ELEMENTS.contains(&&*tag_name)
                    || *tag_name == *RDF_DESCRIPTION
                {
                    return Err(RdfXmlSyntaxError::msg(format!(
                        "Invalid property element tag name: {tag_name}"
                    ))
                    .into());
                } else {
                    self.parse_iri(tag_name)?
                };
                match parse_type {
                    RdfXmlParseType::Default => {
                        // The object is already known when it is given by the attributes
                        if resource_attr.is_some()
                            || node_id_attr.is_some()
                            || !property_attrs.is_empty()
                        {
                            let object = match (resource_attr, node_id_attr)
                            {
                                (Some(resource_attr), None) => Subject::from(resource_attr),
                                (None, Some(node_id_attr)) => node_id_attr.into(),
                                (None, None) => BlankNode::default().into(),
                                (Some(_), Some(_)) => return Err(RdfXmlSyntaxError::msg("Not both rdf:resource and rdf:nodeID could be set at the same time").into())
                            };
                            self.emit_property_attrs(
                                &object,
                                property_attrs,
                                language.as_deref(),
                                results,
                            );
                            if let Some(type_attr) = type_attr {
                                results.push(Triple::new(object.clone(), rdf::TYPE, type_attr));
                            }
                            RdfXmlState::PropertyElt {
                                iri,
                                base_iri,
                                language,
                                subject,
                                object: Some(NodeOrText::Node(object)),
                                id_attr,
                                datatype_attr,
                            }
                        } else {
                            // The object will come from the element content
                            RdfXmlState::PropertyElt {
                                iri,
                                base_iri,
                                language,
                                subject,
                                object: None,
                                id_attr,
                                datatype_attr,
                            }
                        }
                    }
                    RdfXmlParseType::Literal => RdfXmlState::ParseTypeLiteralPropertyElt {
                        iri,
                        base_iri,
                        language,
                        subject,
                        writer: Writer::new(Vec::default()),
                        id_attr,
                        emit: true,
                    },
                    RdfXmlParseType::Resource => Self::build_parse_type_resource_property_elt(
                        iri, base_iri, language, subject, id_attr, results,
                    ),
                    RdfXmlParseType::Collection => RdfXmlState::ParseTypeCollectionPropertyElt {
                        iri,
                        base_iri,
                        language,
                        subject,
                        objects: Vec::default(),
                        id_attr,
                    },
                    // Unknown parse types are consumed like literals but emit nothing
                    RdfXmlParseType::Other => RdfXmlState::ParseTypeLiteralPropertyElt {
                        iri,
                        base_iri,
                        language,
                        subject,
                        writer: Writer::new(Vec::default()),
                        id_attr,
                        emit: false,
                    },
                }
            }
        };
        self.state.push(new_state);
        Ok(())
    }
1178
1179 fn parse_end_event(
1180 &mut self,
1181 event: &BytesEnd<'_>,
1182 results: &mut Vec<Triple>,
1183 ) -> Result<(), RdfXmlParseError> {
1184 if self.in_literal_depth > 0 {
1186 if let Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) =
1187 self.state.last_mut()
1188 {
1189 writer.write_event(Event::End(BytesEnd::new(
1190 self.reader.decoder().decode(event.name().as_ref())?,
1191 )))?;
1192 self.in_literal_depth -= 1;
1193 return Ok(());
1194 }
1195 }
1196
1197 if let Some(current_state) = self.state.pop() {
1198 self.end_state(current_state, results)?;
1199 }
1200 Ok(())
1201 }
1202
1203 fn parse_text_event(&mut self, event: &BytesText<'_>) -> Result<(), RdfXmlParseError> {
1204 let text = event.unescape_with(|e| self.resolve_entity(e))?.to_string();
1205 match self.state.last_mut() {
1206 Some(RdfXmlState::PropertyElt { object, .. }) => {
1207 if is_object_defined(object) {
1208 if text.bytes().all(is_whitespace) {
1209 Ok(()) } else {
1211 Err(
1212 RdfXmlSyntaxError::msg(format!("Unexpected text event: '{text}'"))
1213 .into(),
1214 )
1215 }
1216 } else {
1217 *object = Some(NodeOrText::Text(text));
1218 Ok(())
1219 }
1220 }
1221 Some(RdfXmlState::ParseTypeLiteralPropertyElt { writer, .. }) => {
1222 writer.write_event(Event::Text(BytesText::new(&text)))?;
1223 Ok(())
1224 }
1225 _ => {
1226 if text.bytes().all(is_whitespace) {
1227 Ok(())
1228 } else {
1229 Err(RdfXmlSyntaxError::msg(format!("Unexpected text event: '{text}'")).into())
1230 }
1231 }
1232 }
1233 }
1234
1235 fn resolve_tag_name(&self, qname: QName<'_>) -> Result<String, RdfXmlParseError> {
1236 let (namespace, local_name) = self.reader.resolve_element(qname);
1237 self.resolve_ns_name(namespace, local_name)
1238 }
1239
1240 fn resolve_attribute_name(&self, qname: QName<'_>) -> Result<String, RdfXmlParseError> {
1241 let (namespace, local_name) = self.reader.resolve_attribute(qname);
1242 self.resolve_ns_name(namespace, local_name)
1243 }
1244
1245 fn resolve_ns_name(
1246 &self,
1247 namespace: ResolveResult<'_>,
1248 local_name: LocalName<'_>,
1249 ) -> Result<String, RdfXmlParseError> {
1250 match namespace {
1251 ResolveResult::Bound(ns) => {
1252 let mut value = Vec::with_capacity(ns.as_ref().len() + local_name.as_ref().len());
1253 value.extend_from_slice(ns.as_ref());
1254 value.extend_from_slice(local_name.as_ref());
1255 Ok(unescape_with(&self.reader.decoder().decode(&value)?, |e| {
1256 self.resolve_entity(e)
1257 })
1258 .map_err(Error::from)?
1259 .to_string())
1260 }
1261 ResolveResult::Unbound => {
1262 Err(RdfXmlSyntaxError::msg("XML namespaces are required in RDF/XML").into())
1263 }
1264 ResolveResult::Unknown(v) => Err(RdfXmlSyntaxError::msg(format!(
1265 "Unknown prefix {}:",
1266 self.reader.decoder().decode(&v)?
1267 ))
1268 .into()),
1269 }
1270 }
1271
1272 #[allow(clippy::too_many_arguments)]
1273 fn build_node_elt(
1274 &self,
1275 iri: NamedNode,
1276 base_iri: Option<Iri<String>>,
1277 language: Option<String>,
1278 id_attr: Option<NamedNode>,
1279 node_id_attr: Option<BlankNode>,
1280 about_attr: Option<NamedNode>,
1281 type_attr: Option<NamedNode>,
1282 property_attrs: Vec<(NamedNode, String)>,
1283 results: &mut Vec<Triple>,
1284 ) -> Result<RdfXmlState, RdfXmlSyntaxError> {
1285 let subject = match (id_attr, node_id_attr, about_attr) {
1286 (Some(id_attr), None, None) => Subject::from(id_attr),
1287 (None, Some(node_id_attr), None) => node_id_attr.into(),
1288 (None, None, Some(about_attr)) => about_attr.into(),
1289 (None, None, None) => BlankNode::default().into(),
1290 (Some(_), Some(_), _) => {
1291 return Err(RdfXmlSyntaxError::msg(
1292 "Not both rdf:ID and rdf:nodeID could be set at the same time",
1293 ))
1294 }
1295 (_, Some(_), Some(_)) => {
1296 return Err(RdfXmlSyntaxError::msg(
1297 "Not both rdf:nodeID and rdf:resource could be set at the same time",
1298 ))
1299 }
1300 (Some(_), _, Some(_)) => {
1301 return Err(RdfXmlSyntaxError::msg(
1302 "Not both rdf:ID and rdf:resource could be set at the same time",
1303 ))
1304 }
1305 };
1306
1307 self.emit_property_attrs(&subject, property_attrs, language.as_deref(), results);
1308
1309 if let Some(type_attr) = type_attr {
1310 results.push(Triple::new(subject.clone(), rdf::TYPE, type_attr));
1311 }
1312
1313 if iri != *RDF_DESCRIPTION {
1314 results.push(Triple::new(subject.clone(), rdf::TYPE, iri));
1315 }
1316 Ok(RdfXmlState::NodeElt {
1317 base_iri,
1318 language,
1319 subject,
1320 li_counter: 0,
1321 })
1322 }
1323
1324 fn build_parse_type_resource_property_elt(
1325 iri: NamedNode,
1326 base_iri: Option<Iri<String>>,
1327 language: Option<String>,
1328 subject: Subject,
1329 id_attr: Option<NamedNode>,
1330 results: &mut Vec<Triple>,
1331 ) -> RdfXmlState {
1332 let object = BlankNode::default();
1333 let triple = Triple::new(subject, iri, object.clone());
1334 if let Some(id_attr) = id_attr {
1335 Self::reify(triple.clone(), id_attr, results);
1336 }
1337 results.push(triple);
1338 RdfXmlState::NodeElt {
1339 base_iri,
1340 language,
1341 subject: object.into(),
1342 li_counter: 0,
1343 }
1344 }
1345
1346 fn end_state(
1347 &mut self,
1348 state: RdfXmlState,
1349 results: &mut Vec<Triple>,
1350 ) -> Result<(), RdfXmlSyntaxError> {
1351 match state {
1352 RdfXmlState::PropertyElt {
1353 iri,
1354 language,
1355 subject,
1356 id_attr,
1357 datatype_attr,
1358 object,
1359 ..
1360 } => {
1361 let object = match object {
1362 Some(NodeOrText::Node(node)) => Term::from(node),
1363 Some(NodeOrText::Text(text)) => {
1364 self.new_literal(text, language, datatype_attr).into()
1365 }
1366 None => self
1367 .new_literal(String::new(), language, datatype_attr)
1368 .into(),
1369 };
1370 let triple = Triple::new(subject, iri, object);
1371 if let Some(id_attr) = id_attr {
1372 Self::reify(triple.clone(), id_attr, results);
1373 }
1374 results.push(triple);
1375 }
1376 RdfXmlState::ParseTypeCollectionPropertyElt {
1377 iri,
1378 subject,
1379 id_attr,
1380 objects,
1381 ..
1382 } => {
1383 let mut current_node = Subject::from(rdf::NIL);
1384 for object in objects.into_iter().rev() {
1385 let subject = Subject::from(BlankNode::default());
1386 results.push(Triple::new(subject.clone(), rdf::FIRST, object));
1387 results.push(Triple::new(subject.clone(), rdf::REST, current_node));
1388 current_node = subject;
1389 }
1390 let triple = Triple::new(subject, iri, current_node);
1391 if let Some(id_attr) = id_attr {
1392 Self::reify(triple.clone(), id_attr, results);
1393 }
1394 results.push(triple);
1395 }
1396 RdfXmlState::ParseTypeLiteralPropertyElt {
1397 iri,
1398 subject,
1399 id_attr,
1400 writer,
1401 emit,
1402 ..
1403 } => {
1404 if emit {
1405 let object = writer.into_inner();
1406 if object.is_empty() {
1407 return Err(RdfXmlSyntaxError::msg(format!(
1408 "No value found for rdf:XMLLiteral value of property {iri}"
1409 )));
1410 }
1411 let triple = Triple::new(
1412 subject,
1413 iri,
1414 Literal::new_typed_literal(
1415 str::from_utf8(&object).map_err(|_| {
1416 RdfXmlSyntaxError::msg(
1417 "The XML literal is not in valid UTF-8".to_owned(),
1418 )
1419 })?,
1420 rdf::XML_LITERAL,
1421 ),
1422 );
1423 if let Some(id_attr) = id_attr {
1424 Self::reify(triple.clone(), id_attr, results);
1425 }
1426 results.push(triple);
1427 }
1428 }
1429 RdfXmlState::NodeElt { subject, .. } => match self.state.last_mut() {
1430 Some(RdfXmlState::PropertyElt { object, .. }) => {
1431 if is_object_defined(object) {
1432 return Err(RdfXmlSyntaxError::msg(
1433 "Unexpected node, a text value is already present",
1434 ));
1435 }
1436 *object = Some(NodeOrText::Node(subject))
1437 }
1438 Some(RdfXmlState::ParseTypeCollectionPropertyElt { objects, .. }) => {
1439 objects.push(subject)
1440 }
1441 _ => (),
1442 },
1443 _ => (),
1444 }
1445 Ok(())
1446 }
1447
1448 fn new_literal(
1449 &self,
1450 value: String,
1451 language: Option<String>,
1452 datatype: Option<NamedNode>,
1453 ) -> Literal {
1454 if let Some(datatype) = datatype {
1455 Literal::new_typed_literal(value, datatype)
1456 } else if let Some(language) =
1457 language.or_else(|| self.current_language().map(ToOwned::to_owned))
1458 {
1459 Literal::new_language_tagged_literal_unchecked(value, language)
1460 } else {
1461 Literal::new_simple_literal(value)
1462 }
1463 }
1464
1465 fn reify(triple: Triple, statement_id: NamedNode, results: &mut Vec<Triple>) {
1466 results.push(Triple::new(statement_id.clone(), rdf::TYPE, rdf::STATEMENT));
1467 results.push(Triple::new(
1468 statement_id.clone(),
1469 rdf::SUBJECT,
1470 triple.subject,
1471 ));
1472 results.push(Triple::new(
1473 statement_id.clone(),
1474 rdf::PREDICATE,
1475 triple.predicate,
1476 ));
1477 results.push(Triple::new(statement_id, rdf::OBJECT, triple.object));
1478 }
1479
1480 fn emit_property_attrs(
1481 &self,
1482 subject: &Subject,
1483 literal_attributes: Vec<(NamedNode, String)>,
1484 language: Option<&str>,
1485 results: &mut Vec<Triple>,
1486 ) {
1487 for (literal_predicate, literal_value) in literal_attributes {
1488 results.push(Triple::new(
1489 subject.clone(),
1490 literal_predicate,
1491 if let Some(language) = language.or_else(|| self.current_language()) {
1492 Literal::new_language_tagged_literal_unchecked(literal_value, language)
1493 } else {
1494 Literal::new_simple_literal(literal_value)
1495 },
1496 ));
1497 }
1498 }
1499
1500 fn convert_attribute(&self, attribute: &Attribute<'_>) -> Result<String, RdfXmlParseError> {
1501 Ok(attribute
1502 .decode_and_unescape_value_with(self.reader.decoder(), |e| self.resolve_entity(e))?
1503 .into_owned())
1504 }
1505
1506 fn convert_iri_attribute(
1507 &self,
1508 base_iri: Option<&Iri<String>>,
1509 attribute: &Attribute<'_>,
1510 ) -> Result<NamedNode, RdfXmlParseError> {
1511 Ok(self.resolve_iri(base_iri, self.convert_attribute(attribute)?)?)
1512 }
1513
1514 fn resolve_iri(
1515 &self,
1516 base_iri: Option<&Iri<String>>,
1517 relative_iri: String,
1518 ) -> Result<NamedNode, RdfXmlSyntaxError> {
1519 if let Some(base_iri) = base_iri.or_else(|| self.current_base_iri()) {
1520 Ok(NamedNode::new_unchecked(
1521 if self.unchecked {
1522 base_iri.resolve_unchecked(&relative_iri)
1523 } else {
1524 base_iri
1525 .resolve(&relative_iri)
1526 .map_err(|error| RdfXmlSyntaxError::invalid_iri(relative_iri, error))?
1527 }
1528 .into_inner(),
1529 ))
1530 } else {
1531 self.parse_iri(relative_iri)
1532 }
1533 }
1534
1535 fn parse_iri(&self, relative_iri: String) -> Result<NamedNode, RdfXmlSyntaxError> {
1536 Ok(NamedNode::new_unchecked(if self.unchecked {
1537 relative_iri
1538 } else {
1539 Iri::parse(relative_iri.clone())
1540 .map_err(|error| RdfXmlSyntaxError::invalid_iri(relative_iri, error))?
1541 .into_inner()
1542 }))
1543 }
1544
1545 fn current_language(&self) -> Option<&str> {
1546 for state in self.state.iter().rev() {
1547 match state {
1548 RdfXmlState::Doc { .. } => (),
1549 RdfXmlState::Rdf { language, .. }
1550 | RdfXmlState::NodeElt { language, .. }
1551 | RdfXmlState::PropertyElt { language, .. }
1552 | RdfXmlState::ParseTypeCollectionPropertyElt { language, .. }
1553 | RdfXmlState::ParseTypeLiteralPropertyElt { language, .. } => {
1554 if let Some(language) = language {
1555 return Some(language);
1556 }
1557 }
1558 }
1559 }
1560 None
1561 }
1562
1563 fn current_base_iri(&self) -> Option<&Iri<String>> {
1564 for state in self.state.iter().rev() {
1565 match state {
1566 RdfXmlState::Doc { base_iri }
1567 | RdfXmlState::Rdf { base_iri, .. }
1568 | RdfXmlState::NodeElt { base_iri, .. }
1569 | RdfXmlState::PropertyElt { base_iri, .. }
1570 | RdfXmlState::ParseTypeCollectionPropertyElt { base_iri, .. }
1571 | RdfXmlState::ParseTypeLiteralPropertyElt { base_iri, .. } => {
1572 if let Some(base_iri) = base_iri {
1573 return Some(base_iri);
1574 }
1575 }
1576 }
1577 }
1578 None
1579 }
1580
1581 fn resolve_entity(&self, e: &str) -> Option<&str> {
1582 resolve_xml_entity(e).or_else(|| self.custom_entities.get(e).map(String::as_str))
1583 }
1584}
1585
1586fn is_object_defined(object: &Option<NodeOrText>) -> bool {
1587 match object {
1588 Some(NodeOrText::Node(_)) => true,
1589 Some(NodeOrText::Text(t)) => !t.bytes().all(is_whitespace),
1590 None => false,
1591 }
1592}
1593
/// Tells whether `c` is a space, a tab, a line feed or a carriage return.
fn is_whitespace(c: u8) -> bool {
    [b' ', b'\t', b'\n', b'\r'].contains(&c)
}
1597
/// Tells whether `encoding` is one of the accepted labels for UTF-8, ignoring ASCII case.
fn is_utf8(encoding: &[u8]) -> bool {
    const UTF8_LABELS: [&[u8]; 6] = [
        b"unicode-1-1-utf-8",
        b"unicode11utf8",
        b"unicode20utf8",
        b"utf-8",
        b"utf8",
        b"x-unicode20utf8",
    ];
    UTF8_LABELS
        .iter()
        .any(|label| encoding.eq_ignore_ascii_case(label))
}