use crate::toolkit::error::{TextPosition, TurtleSyntaxError};
use memchr::{memchr2, memchr2_iter};
use std::borrow::Cow;
use std::cmp::min;
use std::io::{self, Read};
use std::ops::{Deref, Range, RangeInclusive};
use std::str;
#[cfg(feature = "async-tokio")]
use tokio::io::{AsyncRead, AsyncReadExt};

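/// Recognizes tokens at the beginning of a byte buffer.
///
/// `recognize_next_token` returns `None` when more input is required (which can
/// only happen while `is_ending` is `false`), otherwise the number of consumed
/// bytes together with the recognized token or an error.
///
/// A minimal sketch of an implementation splitting on ASCII spaces
/// (hypothetical, for illustration only):
/// ```ignore
/// struct WordRecognizer;
///
/// impl TokenRecognizer for WordRecognizer {
///     type Token<'a> = &'a str;
///     type Options = ();
///
///     fn recognize_next_token<'a>(
///         &mut self,
///         data: &'a [u8],
///         is_ending: bool,
///         _options: &(),
///     ) -> Option<(usize, Result<&'a str, TokenRecognizerError>)> {
///         // Consume bytes up to the next ASCII space, or everything at the end of input.
///         let (len, consumed) = match data.iter().position(|b| *b == b' ') {
///             Some(i) => (i, i + 1),
///             None if is_ending && !data.is_empty() => (data.len(), data.len()),
///             None => return None, // We need more data
///         };
///         Some((
///             consumed,
///             str::from_utf8(&data[..len]).map_err(|e| (0..len, e.to_string()).into()),
///         ))
///     }
/// }
/// ```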
pub trait TokenRecognizer {
    type Token<'a>
    where
        Self: 'a;
    type Options: Default;

    fn recognize_next_token<'a>(
        &mut self,
        data: &'a [u8],
        is_ending: bool,
        options: &Self::Options,
    ) -> Option<(usize, Result<Self::Token<'a>, TokenRecognizerError>)>;
}

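/// An item returned by [`Lexer::parse_next`]: either a recognized token or a
/// line jump (end of line or line comment).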
#[derive(Debug, PartialEq, Eq)]
pub enum TokenOrLineJump<T> {
    Token(T),
    LineJump,
}

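/// An error emitted by a [`TokenRecognizer`], located by a byte range in the
/// current token.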
pub struct TokenRecognizerError {
    pub location: Range<usize>,
    pub message: String,
}

impl<S: Into<String>> From<(Range<usize>, S)> for TokenRecognizerError {
    fn from((location, message): (Range<usize>, S)) -> Self {
        Self {
            location,
            message: message.into(),
        }
    }
}

#[allow(clippy::range_plus_one)]
impl<S: Into<String>> From<(RangeInclusive<usize>, S)> for TokenRecognizerError {
    fn from((location, message): (RangeInclusive<usize>, S)) -> Self {
        (*location.start()..*location.end() + 1, message).into()
    }
}

impl<S: Into<String>> From<(usize, S)> for TokenRecognizerError {
    fn from((location, message): (usize, S)) -> Self {
        (location..=location, message).into()
    }
}

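/// A push-based lexer: feed it bytes, then pull tokens with
/// [`parse_next`](Lexer::parse_next).
///
/// A minimal usage sketch, reusing the hypothetical `WordRecognizer` above:
/// ```ignore
/// let mut lexer = Lexer::new(WordRecognizer, Vec::new(), false, 1024, 1024 * 1024, Some(b"#"));
/// lexer.extend_from_slice(b"hello world");
/// lexer.end();
/// while let Some(result) = lexer.parse_next(&()) {
///     // Yields TokenOrLineJump::Token("hello") then TokenOrLineJump::Token("world")
///     println!("{:?}", result.unwrap());
/// }
/// ```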
pub struct Lexer<B, R: TokenRecognizer> {
    parser: R,
    data: B,
    position: Position,
    previous_position: Position, // Position before the last emitted token or line jump
    is_ending: bool,
    min_buffer_size: usize,
    max_buffer_size: usize,
    line_comment_start: Option<&'static [u8]>,
}

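/// A position in the input, tracked as both an offset in the current buffer and
/// global line/offset counters for error reporting.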
#[derive(Clone, Copy)]
struct Position {
    line_start_buffer_offset: usize,
    buffer_offset: usize,
    global_offset: u64,
    global_line: u64,
}

impl<B, R: TokenRecognizer> Lexer<B, R> {
    pub fn new(
        parser: R,
        data: B,
        is_ending: bool,
        min_buffer_size: usize,
        max_buffer_size: usize,
        line_comment_start: Option<&'static [u8]>,
    ) -> Self {
        Self {
            parser,
            data,
            position: Position {
                line_start_buffer_offset: 0,
                buffer_offset: 0,
                global_offset: 0,
                global_line: 0,
            },
            previous_position: Position {
                line_start_buffer_offset: 0,
                buffer_offset: 0,
                global_offset: 0,
                global_line: 0,
            },
            is_ending,
            min_buffer_size,
            max_buffer_size,
            line_comment_start,
        }
    }
}

impl<R: TokenRecognizer> Lexer<Vec<u8>, R> {
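    /// Appends bytes to the internal buffer, dropping already-consumed data first.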
    pub fn extend_from_slice(&mut self, other: &[u8]) {
        self.shrink_data();
        self.data.extend_from_slice(other);
    }

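    /// Marks that no more input will be provided.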
    #[inline]
    pub fn end(&mut self) {
        self.is_ending = true;
    }

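    /// Fills the buffer with a single `read` from `reader`, growing it between
    /// `min_buffer_size` and `max_buffer_size` as needed.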
    pub fn extend_from_reader(&mut self, reader: &mut impl Read) -> io::Result<()> {
        self.shrink_data();
        if self.data.len() == self.max_buffer_size {
            return Err(io::Error::new(
                io::ErrorKind::OutOfMemory,
                format!(
                    "Reached the maximal buffer size of {}",
                    self.max_buffer_size
                ),
            ));
        }
        let min_end = min(self.data.len() + self.min_buffer_size, self.max_buffer_size);
        let new_start = self.data.len();
        self.data.resize(min_end, 0);
        if self.data.len() < self.data.capacity() {
            // We use as much space as is available without reallocation
            self.data.resize(self.data.capacity(), 0);
        }
        let read = reader.read(&mut self.data[new_start..])?;
        self.data.truncate(new_start + read);
        self.is_ending = read == 0;
        Ok(())
    }

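    /// Same as [`extend_from_reader`](Self::extend_from_reader) but reading from a
    /// Tokio [`AsyncRead`].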
    #[cfg(feature = "async-tokio")]
    pub async fn extend_from_tokio_async_read(
        &mut self,
        reader: &mut (impl AsyncRead + Unpin),
    ) -> io::Result<()> {
        self.shrink_data();
        if self.data.len() == self.max_buffer_size {
            return Err(io::Error::new(
                io::ErrorKind::OutOfMemory,
                format!(
                    "Reached the maximal buffer size of {}",
                    self.max_buffer_size
                ),
            ));
        }
        let min_end = min(self.data.len() + self.min_buffer_size, self.max_buffer_size);
        let new_start = self.data.len();
        self.data.resize(min_end, 0);
        if self.data.len() < self.data.capacity() {
            // We use as much space as is available without reallocation
            self.data.resize(self.data.capacity(), 0);
        }
        let read = reader.read(&mut self.data[new_start..]).await?;
        self.data.truncate(new_start + read);
        self.is_ending = read == 0;
        Ok(())
    }

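    /// Drops already-consumed bytes from the front of the buffer when possible.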
    fn shrink_data(&mut self) {
        if self.position.line_start_buffer_offset > 0 {
            // We keep the current line in the buffer for error reporting
            self.shrink_data_by(self.position.line_start_buffer_offset);
        }
        if self.position.buffer_offset > self.max_buffer_size / 2 {
            // The current line got too long, we drop its consumed part anyway
            self.shrink_data_by(self.position.buffer_offset);
        }
    }

    fn shrink_data_by(&mut self, shift_amount: usize) {
        self.data.copy_within(shift_amount.., 0);
        self.data.truncate(self.data.len() - shift_amount);
        self.position.buffer_offset -= shift_amount;
        self.position.line_start_buffer_offset = self
            .position
            .line_start_buffer_offset
            .saturating_sub(shift_amount);
        self.previous_position = self.position;
    }
}

impl<B: Deref<Target = [u8]>, R: TokenRecognizer> Lexer<B, R> {
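    /// Returns the next token or line jump, an error, or `None` if the input is
    /// exhausted (or, while `is_ending` is `false`, if more data is needed).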
    #[allow(clippy::unwrap_in_result)]
    pub fn parse_next(
        &mut self,
        options: &R::Options,
    ) -> Option<Result<TokenOrLineJump<R::Token<'_>>, TurtleSyntaxError>> {
        if self.skip_whitespaces_and_comments()? {
            self.previous_position = self.position;
            return Some(Ok(TokenOrLineJump::LineJump));
        }
        self.previous_position = self.position;
        let Some((consumed, result)) = self.parser.recognize_next_token(
            &self.data[self.position.buffer_offset..],
            self.is_ending,
            options,
        ) else {
            return if self.is_ending {
                if self.position.buffer_offset == self.data.len() {
                    None // We have finished
                } else {
                    let (new_line_jumps, new_line_start) =
                        Self::find_number_of_line_jumps_and_start_of_last_line(
                            &self.data[self.position.buffer_offset..],
                        );
                    if new_line_jumps > 0 {
                        self.position.line_start_buffer_offset =
                            self.position.buffer_offset + new_line_start;
                    }
                    self.position.global_offset +=
                        u64::try_from(self.data.len() - self.position.buffer_offset).unwrap();
                    self.position.buffer_offset = self.data.len();
                    self.position.global_line += new_line_jumps;
                    let error = TurtleSyntaxError::new(
                        self.last_token_location(),
                        "Unexpected end of file",
                    );
                    Some(Err(error))
                }
            } else {
                None // We need more data
            };
        };
        debug_assert!(
            consumed > 0,
            "The lexer must consume at least one byte each time"
        );
        debug_assert!(
            self.position.buffer_offset + consumed <= self.data.len(),
            "The lexer tried to consume {consumed} bytes but only {} bytes are readable",
            self.data.len() - self.position.buffer_offset
        );
        let (new_line_jumps, new_line_start) =
            Self::find_number_of_line_jumps_and_start_of_last_line(
                &self.data[self.position.buffer_offset..self.position.buffer_offset + consumed],
            );
        if new_line_jumps > 0 {
            self.position.line_start_buffer_offset = self.position.buffer_offset + new_line_start;
        }
        self.position.buffer_offset += consumed;
        self.position.global_offset += u64::try_from(consumed).unwrap();
        self.position.global_line += new_line_jumps;
        Some(result.map(TokenOrLineJump::Token).map_err(|e| {
            TurtleSyntaxError::new(
                self.location_from_buffer_offset_range(e.location),
                e.message,
            )
        }))
    }

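    /// Converts a byte-offset range relative to the start of the last token into
    /// line/column [`TextPosition`]s.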
    pub fn location_from_buffer_offset_range(
        &self,
        offset_range: Range<usize>,
    ) -> Range<TextPosition> {
        let start_offset = self.previous_position.buffer_offset + offset_range.start;
        let (start_extra_line_jumps, start_line_start) =
            Self::find_number_of_line_jumps_and_start_of_last_line(
                &self.data[self.previous_position.buffer_offset..start_offset],
            );
        let start_line_start = if start_extra_line_jumps > 0 {
            start_line_start + self.previous_position.buffer_offset
        } else {
            self.previous_position.line_start_buffer_offset
        };
        let end_offset = self.previous_position.buffer_offset + offset_range.end;
        let (end_extra_line_jumps, end_line_start) =
            Self::find_number_of_line_jumps_and_start_of_last_line(
                &self.data[self.previous_position.buffer_offset..end_offset],
            );
        let end_line_start = if end_extra_line_jumps > 0 {
            end_line_start + self.previous_position.buffer_offset
        } else {
            self.previous_position.line_start_buffer_offset
        };
        TextPosition {
            line: self.previous_position.global_line + start_extra_line_jumps,
            column: Self::column_from_bytes(&self.data[start_line_start..start_offset]),
            offset: self.previous_position.global_offset
                + u64::try_from(offset_range.start).unwrap(),
        }..TextPosition {
            line: self.previous_position.global_line + end_extra_line_jumps,
            column: Self::column_from_bytes(&self.data[end_line_start..end_offset]),
            offset: self.previous_position.global_offset + u64::try_from(offset_range.end).unwrap(),
        }
    }

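    /// Returns the location of the last token or line jump returned by
    /// [`parse_next`](Self::parse_next).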
    pub fn last_token_location(&self) -> Range<TextPosition> {
        self.text_position_from_position(&self.previous_position)
            ..self.text_position_from_position(&self.position)
    }

    fn text_position_from_position(&self, position: &Position) -> TextPosition {
        TextPosition {
            line: position.global_line,
            column: Self::column_from_bytes(
                &self.data[position.line_start_buffer_offset..position.buffer_offset],
            ),
            offset: position.global_offset,
        }
    }

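    /// Returns the bytes of the last token, lossily decoded as UTF-8.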
    pub fn last_token_source(&self) -> Cow<'_, str> {
        String::from_utf8_lossy(
            &self.data[self.previous_position.buffer_offset..self.position.buffer_offset],
        )
    }

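    /// Returns `true` if all the input has been consumed.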
    pub fn is_end(&self) -> bool {
        self.is_ending && self.data.len() == self.position.buffer_offset
    }

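    /// Skips whitespace and line comments. Returns `Some(true)` if a line jump was
    /// consumed, `Some(false)` otherwise, and `None` if more data is needed.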
    #[allow(clippy::unwrap_in_result)]
    fn skip_whitespaces_and_comments(&mut self) -> Option<bool> {
        if self.skip_whitespaces()? {
            return Some(true);
        }

        let buf = &self.data[self.position.buffer_offset..];
        if let Some(line_comment_start) = self.line_comment_start {
            if buf.starts_with(line_comment_start) {
                // Comment: it runs until the end of the line
                if let Some(end) = memchr2(b'\r', b'\n', &buf[line_comment_start.len()..]) {
                    let mut end_position = line_comment_start.len() + end;
                    if buf.get(end_position).copied() == Some(b'\r') {
                        // We look for a following \n to consume a full \r\n line end
                        if let Some(c) = buf.get(end_position + 1) {
                            if *c == b'\n' {
                                end_position += 1;
                            }
                        } else if !self.is_ending {
                            return None; // We need more data
                        }
                    }
                    let comment_size = end_position + 1;
                    self.position.buffer_offset += comment_size;
                    self.position.line_start_buffer_offset = self.position.buffer_offset;
                    self.position.global_offset += u64::try_from(comment_size).unwrap();
                    self.position.global_line += 1;
                    return Some(true);
                }
                if self.is_ending {
                    self.position.buffer_offset = self.data.len(); // We consume the remaining comment
                    return Some(false);
                }
                return None; // We need more data
            } else if !self.is_ending && buf.len() < line_comment_start.len() {
                return None; // We need more data to know if this is a comment
            }
        }
        Some(false)
    }

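    /// Skips spaces, tabs and line ends. Returns `Some(true)` when a line jump was
    /// consumed, `Some(false)` when a non-whitespace byte was reached, and `None`
    /// if more data is needed to decide whether a \r is followed by a \n.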
    fn skip_whitespaces(&mut self) -> Option<bool> {
        let mut i = self.position.buffer_offset;
        while let Some(c) = self.data.get(i) {
            match c {
                b' ' | b'\t' => {
                    self.position.buffer_offset += 1;
                    self.position.global_offset += 1;
                }
                b'\r' => {
                    // We check if a \n follows to count \r\n as a single line jump
                    let mut increment: u8 = 1;
                    if let Some(c) = self.data.get(i + 1) {
                        if *c == b'\n' {
                            increment += 1;
                        }
                    } else if !self.is_ending {
                        return None; // We need more data
                    }
                    self.position.buffer_offset += usize::from(increment);
                    self.position.line_start_buffer_offset = self.position.buffer_offset;
                    self.position.global_offset += u64::from(increment);
                    self.position.global_line += 1;
                    return Some(true);
                }
                b'\n' => {
                    self.position.buffer_offset += 1;
                    self.position.line_start_buffer_offset = self.position.buffer_offset;
                    self.position.global_offset += 1;
                    self.position.global_line += 1;
                    return Some(true);
                }
                _ => return Some(false),
            }
            i += 1;
        }
        // We only saw whitespace, so we are done only if the input is ending
        self.is_ending.then_some(false)
    }

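    /// Counts the line jumps in `bytes` (\r, \n and \r\n each count once) and
    /// returns the offset just after the last one.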
    fn find_number_of_line_jumps_and_start_of_last_line(bytes: &[u8]) -> (u64, usize) {
        let mut num_of_jumps = 0;
        let mut last_jump_pos = 0;
        for pos in memchr2_iter(b'\r', b'\n', bytes) {
            if bytes[pos] == b'\r' {
                num_of_jumps += 1;
                last_jump_pos = pos + 1;
            } else {
                if pos == 0 || bytes[pos - 1] != b'\r' {
                    // A \n not preceded by \r: a \r\n pair has already been counted once
                    num_of_jumps += 1;
                }
                last_jump_pos = pos + 1;
            }
        }
        (num_of_jumps, last_jump_pos)
    }

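    /// Counts the characters in `bytes`, ignoring a trailing incomplete or invalid
    /// UTF-8 sequence that might not have been fully read yet.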
    fn column_from_bytes(bytes: &[u8]) -> u64 {
        match str::from_utf8(bytes) {
            Ok(s) => u64::try_from(s.chars().count()).unwrap(),
            Err(e) => {
                if e.valid_up_to() == 0 {
                    0
                } else {
                    // We only count the characters in the valid UTF-8 prefix
                    Self::column_from_bytes(&bytes[..e.valid_up_to()])
                }
            }
        }
    }
}