oxttl/toolkit/lexer.rs

use crate::toolkit::error::{TextPosition, TurtleSyntaxError};
use memchr::{memchr2, memchr2_iter};
use std::borrow::Cow;
use std::cmp::min;
use std::io::{self, Read};
use std::ops::{Deref, Range, RangeInclusive};
use std::str;
#[cfg(feature = "async-tokio")]
use tokio::io::{AsyncRead, AsyncReadExt};

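/// A recognizer for tokens, driven by [`Lexer`] over its internal buffer.
///
/// `recognize_next_token` returns the number of consumed bytes together with the
/// recognized token (or a recognition error), or `None` when no token can be
/// recognized yet, e.g. because more input is needed.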
pub trait TokenRecognizer {
    type Token<'a>
    where
        Self: 'a;
    type Options: Default;

    fn recognize_next_token<'a>(
        &mut self,
        data: &'a [u8],
        is_ending: bool,
        options: &Self::Options,
    ) -> Option<(usize, Result<Self::Token<'a>, TokenRecognizerError>)>;
}
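
// A minimal sketch of an implementation, for illustration only (hypothetical,
// not one of the recognizers actually used by this crate): it yields
// whitespace-delimited words as tokens.
//
//     struct WordRecognizer;
//
//     impl TokenRecognizer for WordRecognizer {
//         type Token<'a> = &'a str;
//         type Options = ();
//
//         fn recognize_next_token<'a>(
//             &mut self,
//             data: &'a [u8],
//             is_ending: bool,
//             _options: &(),
//         ) -> Option<(usize, Result<&'a str, TokenRecognizerError>)> {
//             // Stop at the next whitespace byte; once the input is complete,
//             // the remaining bytes form the last word.
//             let end = match data.iter().position(|b| b.is_ascii_whitespace()) {
//                 Some(end) => end,
//                 None if is_ending => data.len(),
//                 None => return None, // We need more data
//             };
//             (end > 0).then(|| {
//                 (
//                     end,
//                     str::from_utf8(&data[..end]).map_err(|e| (0..end, e.to_string()).into()),
//                 )
//             })
//         }
//     }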

/// What [`Lexer::parse_next`] emits: a proper token, or a jump to the next line.
#[derive(Debug, PartialEq, Eq)]
pub enum TokenOrLineJump<T> {
    Token(T),
    LineJump,
}

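/// An error raised by a [`TokenRecognizer`], with a location relative to the
/// start of the data slice the recognizer was given.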
pub struct TokenRecognizerError {
    pub location: Range<usize>,
    pub message: String,
}

impl<S: Into<String>> From<(Range<usize>, S)> for TokenRecognizerError {
    fn from((location, message): (Range<usize>, S)) -> Self {
        Self {
            location,
            message: message.into(),
        }
    }
}

#[allow(clippy::range_plus_one)]
impl<S: Into<String>> From<(RangeInclusive<usize>, S)> for TokenRecognizerError {
    fn from((location, message): (RangeInclusive<usize>, S)) -> Self {
        (*location.start()..*location.end() + 1, message).into()
    }
}

impl<S: Into<String>> From<(usize, S)> for TokenRecognizerError {
    fn from((location, message): (usize, S)) -> Self {
        (location..=location, message).into()
    }
}

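/// A buffering lexer: it skips whitespace and comments, drives a
/// [`TokenRecognizer`] over its internal buffer and tracks line/column/offset
/// positions for error reporting.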
pub struct Lexer<B, R: TokenRecognizer> {
    parser: R,
    data: B,
    position: Position,
    previous_position: Position, // Lexer position before the last emitted token
    is_ending: bool,
    min_buffer_size: usize,
    max_buffer_size: usize,
    line_comment_start: Option<&'static [u8]>,
}

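/// A lexer position, tracked both relative to the internal buffer
/// (`buffer_offset` and the buffer offset of the current line's start) and
/// absolute in the input (`global_offset` in bytes and `global_line`).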
#[derive(Clone, Copy)]
struct Position {
    line_start_buffer_offset: usize,
    buffer_offset: usize,
    global_offset: u64,
    global_line: u64,
}

impl<B, R: TokenRecognizer> Lexer<B, R> {
    pub fn new(
        parser: R,
        data: B,
        is_ending: bool,
        min_buffer_size: usize,
        max_buffer_size: usize,
        line_comment_start: Option<&'static [u8]>,
    ) -> Self {
        Self {
            parser,
            data,
            position: Position {
                line_start_buffer_offset: 0,
                buffer_offset: 0,
                global_offset: 0,
                global_line: 0,
            },
            previous_position: Position {
                line_start_buffer_offset: 0,
                buffer_offset: 0,
                global_offset: 0,
                global_line: 0,
            },
            is_ending,
            min_buffer_size,
            max_buffer_size,
            line_comment_start,
        }
    }
}

impl<R: TokenRecognizer> Lexer<Vec<u8>, R> {
    pub fn extend_from_slice(&mut self, other: &[u8]) {
        self.shrink_data();
        self.data.extend_from_slice(other);
    }

    /// Tells the lexer that no more data will be added.
    #[inline]
    pub fn end(&mut self) {
        self.is_ending = true;
    }

    pub fn extend_from_reader(&mut self, reader: &mut impl Read) -> io::Result<()> {
        self.shrink_data();
        if self.data.len() == self.max_buffer_size {
            return Err(io::Error::new(
                io::ErrorKind::OutOfMemory,
                format!(
                    "Reached the maximal buffer size of {}",
                    self.max_buffer_size
                ),
            ));
        }
        let min_end = min(self.data.len() + self.min_buffer_size, self.max_buffer_size);
        let new_start = self.data.len();
        self.data.resize(min_end, 0);
        if self.data.len() < self.data.capacity() {
            // We keep extending to use as much of the already allocated space as
            // possible without reallocation, but never past max_buffer_size
            self.data
                .resize(min(self.data.capacity(), self.max_buffer_size), 0);
        }
        let read = reader.read(&mut self.data[new_start..])?;
        self.data.truncate(new_start + read);
        self.is_ending = read == 0;
        Ok(())
    }

    #[cfg(feature = "async-tokio")]
    pub async fn extend_from_tokio_async_read(
        &mut self,
        reader: &mut (impl AsyncRead + Unpin),
    ) -> io::Result<()> {
        self.shrink_data();
        if self.data.len() == self.max_buffer_size {
            return Err(io::Error::new(
                io::ErrorKind::OutOfMemory,
                format!(
                    "Reached the maximal buffer size of {}",
                    self.max_buffer_size
                ),
            ));
        }
        let min_end = min(self.data.len() + self.min_buffer_size, self.max_buffer_size);
        let new_start = self.data.len();
        self.data.resize(min_end, 0);
        if self.data.len() < self.data.capacity() {
            // We keep extending to use as much of the already allocated space as
            // possible without reallocation, but never past max_buffer_size
            self.data
                .resize(min(self.data.capacity(), self.max_buffer_size), 0);
        }
        let read = reader.read(&mut self.data[new_start..]).await?;
        self.data.truncate(new_start + read);
        self.is_ending = read == 0;
        Ok(())
    }

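    /// Discards already consumed data from the buffer, keeping the current line
    /// when possible so that error locations stay meaningful.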
    fn shrink_data(&mut self) {
        if self.position.line_start_buffer_offset > 0 {
            self.shrink_data_by(self.position.line_start_buffer_offset);
        }
        if self.position.buffer_offset > self.max_buffer_size / 2 {
            // We really need to shrink, let's forget about error quality
            self.shrink_data_by(self.position.buffer_offset);
        }
    }

    fn shrink_data_by(&mut self, shift_amount: usize) {
        self.data.copy_within(shift_amount.., 0);
        self.data.truncate(self.data.len() - shift_amount);
        self.position.buffer_offset -= shift_amount;
        self.position.line_start_buffer_offset = self
            .position
            .line_start_buffer_offset
            .saturating_sub(shift_amount);
        self.previous_position = self.position;
    }
}
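
// A typical feeding loop, as a sketch (reusing the hypothetical WordRecognizer
// above; the buffer sizes are arbitrary and the surrounding function is assumed
// to return `io::Result<()>`):
//
//     let mut lexer = Lexer::new(WordRecognizer, Vec::new(), false, 4096, 1024 * 1024, Some(b"#"));
//     let mut reader = std::io::Cursor::new("hello # comment\nworld");
//     while !lexer.is_end() {
//         // The match ends the borrow of `lexer` before we refill the buffer
//         let need_more_data = match lexer.parse_next(&()) {
//             Some(Ok(TokenOrLineJump::Token(word))) => {
//                 println!("word: {word}");
//                 false
//             }
//             Some(Ok(TokenOrLineJump::LineJump)) => false,
//             Some(Err(error)) => {
//                 eprintln!("syntax error: {error}");
//                 break;
//             }
//             None => true, // Not enough data to decide on the next token
//         };
//         if need_more_data {
//             lexer.extend_from_reader(&mut reader)?; // Refill from the reader
//         }
//     }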

impl<B: Deref<Target = [u8]>, R: TokenRecognizer> Lexer<B, R> {
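    /// Skips whitespace and comments, then lets the recognizer read the next token.
    ///
    /// Returns `None` if the input is exhausted or if more data is needed first,
    /// `Some(Ok(_))` for a token or a line jump, and `Some(Err(_))` for a syntax error.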
    #[allow(clippy::unwrap_in_result)]
    pub fn parse_next(
        &mut self,
        options: &R::Options,
    ) -> Option<Result<TokenOrLineJump<R::Token<'_>>, TurtleSyntaxError>> {
        if self.skip_whitespaces_and_comments()? {
            self.previous_position = self.position;
            return Some(Ok(TokenOrLineJump::LineJump));
        }
        self.previous_position = self.position;
        let Some((consumed, result)) = self.parser.recognize_next_token(
            &self.data[self.position.buffer_offset..],
            self.is_ending,
            options,
        ) else {
            return if self.is_ending {
                if self.position.buffer_offset == self.data.len() {
                    None // We have finished
                } else {
                    let (new_line_jumps, new_line_start) =
                        Self::find_number_of_line_jumps_and_start_of_last_line(
                            &self.data[self.position.buffer_offset..],
                        );
                    if new_line_jumps > 0 {
                        self.position.line_start_buffer_offset =
                            self.position.buffer_offset + new_line_start;
                    }
                    self.position.global_offset +=
                        u64::try_from(self.data.len() - self.position.buffer_offset).unwrap();
                    self.position.buffer_offset = self.data.len();
                    self.position.global_line += new_line_jumps;
                    let error = TurtleSyntaxError::new(
                        self.last_token_location(),
                        "Unexpected end of file",
                    );
                    Some(Err(error))
                }
            } else {
                None
            };
        };
        debug_assert!(
            consumed > 0,
            "The lexer must consume at least one byte each time"
        );
        debug_assert!(
            self.position.buffer_offset + consumed <= self.data.len(),
            "The lexer tried to consume {consumed} bytes but only {} bytes are readable",
            self.data.len() - self.position.buffer_offset
        );
        let (new_line_jumps, new_line_start) =
            Self::find_number_of_line_jumps_and_start_of_last_line(
                &self.data[self.position.buffer_offset..self.position.buffer_offset + consumed],
            );
        if new_line_jumps > 0 {
            self.position.line_start_buffer_offset = self.position.buffer_offset + new_line_start;
        }
        self.position.buffer_offset += consumed;
        self.position.global_offset += u64::try_from(consumed).unwrap();
        self.position.global_line += new_line_jumps;
        Some(result.map(TokenOrLineJump::Token).map_err(|e| {
            TurtleSyntaxError::new(
                self.location_from_buffer_offset_range(e.location),
                e.message,
            )
        }))
    }

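    /// Converts a byte range relative to the start of the last token into
    /// absolute line/column/offset text positions.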
    pub fn location_from_buffer_offset_range(
        &self,
        offset_range: Range<usize>,
    ) -> Range<TextPosition> {
        let start_offset = self.previous_position.buffer_offset + offset_range.start;
        let (start_extra_line_jumps, start_line_start) =
            Self::find_number_of_line_jumps_and_start_of_last_line(
                &self.data[self.previous_position.buffer_offset..start_offset],
            );
        let start_line_start = if start_extra_line_jumps > 0 {
            start_line_start + self.previous_position.buffer_offset
        } else {
            self.previous_position.line_start_buffer_offset
        };
        let end_offset = self.previous_position.buffer_offset + offset_range.end;
        let (end_extra_line_jumps, end_line_start) =
            Self::find_number_of_line_jumps_and_start_of_last_line(
                &self.data[self.previous_position.buffer_offset..end_offset],
            );
        let end_line_start = if end_extra_line_jumps > 0 {
            end_line_start + self.previous_position.buffer_offset
        } else {
            self.previous_position.line_start_buffer_offset
        };
        TextPosition {
            line: self.previous_position.global_line + start_extra_line_jumps,
            column: Self::column_from_bytes(&self.data[start_line_start..start_offset]),
            offset: self.previous_position.global_offset
                + u64::try_from(offset_range.start).unwrap(),
        }..TextPosition {
            line: self.previous_position.global_line + end_extra_line_jumps,
            column: Self::column_from_bytes(&self.data[end_line_start..end_offset]),
            offset: self.previous_position.global_offset + u64::try_from(offset_range.end).unwrap(),
        }
    }

    /// The location of the last token (or line jump) emitted by `parse_next`.
    pub fn last_token_location(&self) -> Range<TextPosition> {
        self.text_position_from_position(&self.previous_position)
            ..self.text_position_from_position(&self.position)
    }

    fn text_position_from_position(&self, position: &Position) -> TextPosition {
        TextPosition {
            line: position.global_line,
            column: Self::column_from_bytes(
                &self.data[position.line_start_buffer_offset..position.buffer_offset],
            ),
            offset: position.global_offset,
        }
    }

    /// The raw text of the last token emitted by `parse_next`, lossily decoded as UTF-8.
    pub fn last_token_source(&self) -> Cow<'_, str> {
        String::from_utf8_lossy(
            &self.data[self.previous_position.buffer_offset..self.position.buffer_offset],
        )
    }

    /// Returns `true` when the end of the input has been reached and fully consumed.
    pub fn is_end(&self) -> bool {
        self.is_ending && self.data.len() == self.position.buffer_offset
    }

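    /// Returns `None` if more data is needed, `Some(true)` if a line jump has been
    /// skipped (possibly with a comment) and `Some(false)` otherwise.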
    #[allow(clippy::unwrap_in_result)]
    fn skip_whitespaces_and_comments(&mut self) -> Option<bool> {
        if self.skip_whitespaces()? {
            return Some(true);
        }

        let buf = &self.data[self.position.buffer_offset..];
        if let Some(line_comment_start) = self.line_comment_start {
            if buf.starts_with(line_comment_start) {
                // Comment
                if let Some(end) = memchr2(b'\r', b'\n', &buf[line_comment_start.len()..]) {
                    let mut end_position = line_comment_start.len() + end;
                    if buf.get(end_position).copied() == Some(b'\r') {
                        // We look for \n for Windows line end style
                        if let Some(c) = buf.get(end_position + 1) {
                            if *c == b'\n' {
                                end_position += 1;
                            }
                        } else if !self.is_ending {
                            return None; // We need to read more
                        }
                    }
                    let comment_size = end_position + 1;
                    self.position.buffer_offset += comment_size;
                    self.position.line_start_buffer_offset = self.position.buffer_offset;
                    self.position.global_offset += u64::try_from(comment_size).unwrap();
                    self.position.global_line += 1;
                    return Some(true);
                }
                if self.is_ending {
                    self.position.buffer_offset = self.data.len(); // EOF
                    return Some(false);
                }
                return None; // We need more data
            } else if !self.is_ending && buf.len() < line_comment_start.len() {
                return None; // We need more data
            }
        }
        Some(false)
    }

    fn skip_whitespaces(&mut self) -> Option<bool> {
        let mut i = self.position.buffer_offset;
        while let Some(c) = self.data.get(i) {
            match c {
                b' ' | b'\t' => {
                    self.position.buffer_offset += 1;
                    self.position.global_offset += 1;
                }
                b'\r' => {
                    // We look for \n for Windows line end style
                    let mut increment: u8 = 1;
                    if let Some(c) = self.data.get(i + 1) {
                        if *c == b'\n' {
                            increment += 1;
                        }
                    } else if !self.is_ending {
                        return None; // We need to read more
                    }
                    self.position.buffer_offset += usize::from(increment);
                    self.position.line_start_buffer_offset = self.position.buffer_offset;
                    self.position.global_offset += u64::from(increment);
                    self.position.global_line += 1;
                    return Some(true);
                }
                b'\n' => {
                    self.position.buffer_offset += 1;
                    self.position.line_start_buffer_offset = self.position.buffer_offset;
                    self.position.global_offset += 1;
                    self.position.global_line += 1;
                    return Some(true);
                }
                _ => return Some(false),
            }
            i += 1;
            // TODO: SIMD
        }
        self.is_ending.then_some(false) // We return None if there is not enough data
    }

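    /// Counts the line jumps in `bytes`, treating `\r\n` as a single jump, and
    /// returns the offset just after the last jump, i.e. the start of the last line.
    /// For example, `b"a\r\nb\nc"` contains 2 line jumps and its last line starts at offset 5.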
    fn find_number_of_line_jumps_and_start_of_last_line(bytes: &[u8]) -> (u64, usize) {
        let mut num_of_jumps = 0;
        let mut last_jump_pos = 0;
        // Byte offset of the last \r seen, used to count \r\n as a single line jump
        let mut previous_cr = None;
        for pos in memchr2_iter(b'\r', b'\n', bytes) {
            if bytes[pos] == b'\r' {
                previous_cr = Some(pos);
                num_of_jumps += 1;
            } else if previous_cr.map_or(true, |cr| cr + 1 != pos) {
                // This \n does not directly follow a \r, so it is a line jump on its own
                num_of_jumps += 1;
            }
            last_jump_pos = pos + 1;
        }
        (num_of_jumps, last_jump_pos)
    }

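    /// Columns are counted in Unicode characters of the longest valid UTF-8 prefix of `bytes`.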
    fn column_from_bytes(bytes: &[u8]) -> u64 {
        match str::from_utf8(bytes) {
            Ok(s) => u64::try_from(s.chars().count()).unwrap(),
            Err(e) => {
                if e.valid_up_to() == 0 {
                    0
                } else {
                    Self::column_from_bytes(&bytes[..e.valid_up_to()])
                }
            }
        }
    }
}