rio_turtle/
utils.rs

1use crate::error::*;
2use crate::MAX_STACK_SIZE;
3use rio_api::parser::LineBytePosition;
4use std::collections::VecDeque;
5use std::io::{BufRead, ErrorKind, Read};
6use std::str;
7
/// Streaming byte reader that buffers upcoming input so callers can peek
/// ahead of the byte currently being processed.
pub struct LookAheadByteReader<R>
where
    R: Read,
{
    /// Underlying source the bytes are pulled from.
    inner: R,
    /// Bytes already pulled from `inner` but not yet consumed; the front
    /// element is the current byte.
    buffer: VecDeque<u8>,
    /// Copy of the byte at the front of `buffer`, or `None` once the input
    /// is exhausted.
    current: Option<u8>,
    /// Line of the current byte (incremented each time a `\n` is consumed).
    line_number: u64,
    /// Position of the current byte inside its line (reset to 1 after a `\n`).
    byte_number: u64,
    /// Nesting depth counter maintained through `increment_stack_size` /
    /// `decrement_stack_size`.
    stack_size: usize,
}
17
18const DEFAULT_CAPACITY: usize = 8 * 1024;
19
20impl<R: BufRead> LookAheadByteReader<R> {
21    pub fn new(inner: R) -> Self {
22        let mut buffer = VecDeque::with_capacity(DEFAULT_CAPACITY);
23        buffer.push_back(b'\n');
24        Self {
25            inner,
26            buffer,
27            current: Some(b'\n'),
28            line_number: 0,
29            byte_number: 1,
30            stack_size: 0,
31        }
32    }
33
34    /// Returns the current byte if it exists
35    pub fn current(&self) -> Option<u8> {
36        self.current
37    }
38    /// Returns the current byte if it exists or fail if it does not
39    pub fn required_current(&self) -> Result<u8, TurtleError> {
40        self.current()
41            .ok_or_else(|| self.parse_error(TurtleErrorKind::PrematureEof))
42    }
43
44    /// Returns the next byte if it exists
45    pub fn next(&mut self) -> Result<Option<u8>, TurtleError> {
46        self.ahead(1)
47    }
48
49    /// Returns the next byte if it exists or fail if it does not
50    pub fn required_next(&mut self) -> Result<u8, TurtleError> {
51        self.ahead(1)?
52            .ok_or_else(|| self.parse_error(TurtleErrorKind::PrematureEof))
53    }
54
55    /// Returns a future byte if it exists
56    pub fn ahead(&mut self, count: usize) -> Result<Option<u8>, TurtleError> {
57        loop {
58            let mut position = count;
59            let (first, second) = self.buffer.as_slices();
60            if position < first.len() {
61                return Ok(Some(first[position]));
62            }
63            position -= first.len();
64            if position < second.len() {
65                return Ok(Some(second[position]));
66            }
67            if self.fill_and_is_end()? {
68                return Ok(None);
69            }
70        }
71    }
72
73    /// Consumes the current char and moves to the next one
74    pub fn consume(&mut self) -> Result<(), TurtleError> {
75        self.consume_many(1)
76    }
77
78    /// Consumes the many chars and moves to the next one
79    pub fn consume_many(&mut self, count: usize) -> Result<(), TurtleError> {
80        for _ in 0..count {
81            if self.buffer.is_empty() {
82                self.fill_and_is_end()?;
83            }
84            if let Some(c) = self.buffer.pop_front() {
85                if c == b'\n' {
86                    self.line_number += 1;
87                    self.byte_number = 1;
88                } else {
89                    self.byte_number += 1;
90                }
91            } else {
92                return Err(self.parse_error(TurtleErrorKind::PrematureEof));
93            }
94        }
95        if self.buffer.is_empty() {
96            self.fill_and_is_end()?;
97        }
98        self.current = self.buffer.front().cloned();
99        Ok(())
100    }
101
102    /// Returns the line number of the current byte starting at 1
103    pub fn line_number(&self) -> u64 {
104        self.line_number
105    }
106    /// Returns the byte number of the current byte in the line starting at 1
107    pub fn byte_number(&self) -> u64 {
108        self.byte_number
109    }
110
111    /// Returns if the current buffer starts with a given byte string. Does not work cross line boundaries
112    pub fn starts_with(&mut self, prefix: &[u8]) -> bool {
113        self.starts_with_with_eq(prefix, |a, b| a == b)
114    }
115
116    /// Returns if the current buffer starts with a given byte string in an ASCII case insensitive manner.
117    /// Does not work cross line boundaries
118    pub fn starts_with_ignore_ascii_case(&mut self, prefix: &[u8]) -> bool {
119        self.starts_with_with_eq(prefix, |a, b| a.eq_ignore_ascii_case(b))
120    }
121
122    pub fn unexpected_char_error<T>(&self) -> Result<T, TurtleError> {
123        Err(self.parse_error(match self.current() {
124            Some(c) => TurtleErrorKind::UnexpectedByte(c),
125            None => TurtleErrorKind::PrematureEof,
126        }))
127    }
128
129    pub fn check_is_current(&self, expected: u8) -> Result<(), TurtleError> {
130        if self.current() == Some(expected) {
131            Ok(())
132        } else {
133            self.unexpected_char_error()
134        }
135    }
136
137    pub fn check_is_next(&mut self, expected: u8) -> Result<(), TurtleError> {
138        if self.next()? == Some(expected) {
139            Ok(())
140        } else {
141            self.unexpected_char_error()
142        }
143    }
144
145    pub fn parse_error(&self, kind: TurtleErrorKind) -> TurtleError {
146        TurtleError {
147            kind,
148            position: Some(LineBytePosition::new(
149                self.line_number(),
150                self.byte_number(),
151            )),
152        }
153    }
154
155    pub fn consume_line_end(&mut self) -> Result<(), TurtleError> {
156        loop {
157            match self.current() {
158                None => return Ok(()),
159                Some(b'\n') => return self.consume(),
160                _ => self.consume()?,
161            }
162        }
163    }
164
165    fn fill_and_is_end(&mut self) -> Result<bool, TurtleError> {
166        loop {
167            let mut buf = [0; DEFAULT_CAPACITY]; //TODO: increase?
168            match self.inner.read(&mut buf) {
169                Ok(0) => return Ok(true),
170                Ok(read) => {
171                    self.buffer.extend(buf[..read].iter());
172                    return Ok(false);
173                }
174                Err(e) if e.kind() == ErrorKind::Interrupted => {}
175                Err(e) => return Err(e.into()),
176            }
177        }
178    }
179
180    fn starts_with_with_eq(&mut self, prefix: &[u8], eq: impl Fn(&[u8], &[u8]) -> bool) -> bool {
181        loop {
182            let (first, second) = self.buffer.as_slices();
183            if prefix.len() <= first.len() {
184                return eq(&first[..prefix.len()], prefix);
185            } else if prefix.len() <= first.len() + second.len() {
186                return eq(first, &prefix[..first.len()])
187                    && eq(
188                        &second[..prefix.len() - first.len()],
189                        &prefix[first.len()..],
190                    );
191            }
192            if let Ok(true) | Err(_) = self.fill_and_is_end() {
193                return false;
194            }
195        }
196    }
197
198    pub fn increment_stack_size(&mut self) -> Result<(), TurtleError> {
199        self.stack_size += 1;
200        if self.stack_size > MAX_STACK_SIZE {
201            Err(self.parse_error(TurtleErrorKind::StackOverflow))
202        } else {
203            Ok(())
204        }
205    }
206
207    pub fn decrement_stack_size(&mut self) {
208        self.stack_size -= 1;
209    }
210}
211
/// Stack of reusable `String` buffers.
///
/// Only the first `len` entries of `inner` are live; entries above that are
/// empty strings kept around so a later `push` can reuse them.
#[derive(Default)]
pub struct StringBufferStack {
    /// Backing storage, live buffers first.
    inner: Vec<String>,
    /// Number of live buffers.
    len: usize,
}

impl StringBufferStack {
    /// Creates an empty stack whose backing vector can hold `cap` buffers
    /// without reallocating.
    pub fn with_capacity(cap: usize) -> Self {
        Self {
            len: 0,
            inner: Vec::with_capacity(cap),
        }
    }

    /// Pushes one buffer and returns it, reusing a previously popped buffer
    /// when one is available.
    pub fn push(&mut self) -> &mut String {
        let slot = self.len;
        if slot >= self.inner.len() {
            self.inner.push(String::default());
        }
        self.len = slot + 1;
        &mut self.inner[slot]
    }

    /// Pushes two buffers and returns them in push order (lower one first).
    pub fn push2(&mut self) -> (&mut String, &mut String) {
        self.push();
        self.push();
        let top = self.len;
        match &mut self.inner[top - 2..top] {
            [lower, upper] => (lower, upper),
            // A two-element range always yields a two-element slice.
            _ => unreachable!(),
        }
    }

    /// Pops the top buffer, emptying it but keeping it for reuse.
    ///
    /// Panics if the stack is empty.
    pub fn pop(&mut self) {
        let top = self.len - 1;
        self.inner[top].clear();
        self.len = top;
    }

    /// Removes every buffer, dropping the stored strings.
    pub fn clear(&mut self) {
        self.len = 0;
        self.inner.clear();
    }
}
250
251#[derive(Default)]
252pub struct BlankNodeIdGenerator {
253    //TODO: avoid collisions
254    counter: u64,
255}
256
257impl BlankNodeIdGenerator {
258    pub fn generate(&mut self) -> BlankNodeId {
259        let mut id: [u8; 12] = [
260            // IMPORTANT: if this is modified, disambiguate must be updated accordingly
261            b'r', b'i', b'o', b'g', b'0', b'0', b'0', b'0', b'0', b'0', b'0', b'0',
262        ];
263        self.counter += 1;
264        write_u64_to_slice(self.counter, &mut id[4..]);
265        BlankNodeId { id }
266    }
267
268    /// If label could have been generated by self, turn it into a label that could not.
269    pub fn disambiguate(&self, label: &mut String) {
270        const SUFFIX: u8 = b'd';
271        let bytes = label.as_bytes();
272        if bytes.len() >= 12
273            && &bytes[..4] == b"riog"
274            && bytes[4..12].iter().all(u8::is_ascii_digit)
275            && bytes[12..].iter().all(|b| b == &SUFFIX)
276        {
277            label.push(SUFFIX as char)
278        }
279    }
280}
281
/// Fills `s` with the decimal digits of `v` as ASCII, right-aligned and
/// zero-padded on the left. Digits that do not fit in `s` are dropped (only
/// the lowest `s.len()` digits are written).
fn write_u64_to_slice(mut v: u64, s: &mut [u8]) {
    // Walk from the least significant position backwards.
    for digit in s.iter_mut().rev() {
        *digit = b'0' + (v % 10) as u8;
        v /= 10;
    }
}
288
/// Fixed-size, copyable blank node label stored as twelve raw bytes.
#[derive(Eq, PartialEq, Copy, Clone, Hash)]
pub struct BlankNodeId {
    /// Raw bytes of the label.
    id: [u8; 12],
}

impl AsRef<str> for BlankNodeId {
    /// Views the label as a string slice.
    fn as_ref(&self) -> &str {
        // We know what id is and it's always valid UTF8, so decoding is not
        // expected to fail.
        let decoded = str::from_utf8(&self.id[..]);
        decoded.unwrap()
    }
}
299}