json_event_parser/
read.rs

1use crate::JsonEvent;
2use std::borrow::Cow;
3use std::cmp::{max, min};
4use std::error::Error;
5use std::io::{self, Read};
6use std::ops::Range;
7use std::{fmt, str};
8#[cfg(feature = "async-tokio")]
9use tokio::io::{AsyncRead, AsyncReadExt};
10
/// Default limit on the size of the parser state stack, used by `LowLevelJsonParser::new`.
const MAX_STATE_STACK_SIZE: usize = 65_536;
/// Number of free bytes the readers try to make available in their buffer before each read.
const MIN_BUFFER_SIZE: usize = 4096;
/// Default maximal size in bytes of the readers' internal buffer (16 MiB).
const MAX_BUFFER_SIZE: usize = 4096 * 4096;
14
/// Parses a JSON file from a [`Read`] implementation.
///
/// ```
/// use json_event_parser::{JsonEvent, ReaderJsonParser};
///
/// let mut reader = ReaderJsonParser::new(b"{\"foo\": 1}".as_slice());
/// assert_eq!(reader.parse_next()?, JsonEvent::StartObject);
/// assert_eq!(reader.parse_next()?, JsonEvent::ObjectKey("foo".into()));
/// assert_eq!(reader.parse_next()?, JsonEvent::Number("1".into()));
/// assert_eq!(reader.parse_next()?, JsonEvent::EndObject);
/// assert_eq!(reader.parse_next()?, JsonEvent::Eof);
/// # std::io::Result::Ok(())
/// ```
pub struct ReaderJsonParser<R: Read> {
    /// Storage for the bytes read from `read`; only `input_buffer_start..input_buffer_end` holds pending input.
    input_buffer: Vec<u8>,
    /// Index in `input_buffer` of the first byte not yet consumed by `parser`.
    input_buffer_start: usize,
    /// Index in `input_buffer` just after the last byte filled by a read.
    input_buffer_end: usize,
    /// Maximal size of `input_buffer` in bytes.
    max_buffer_size: usize,
    /// Set when the last read returned zero bytes, i.e. the input is exhausted.
    is_ending: bool,
    /// Source of the JSON bytes.
    read: R,
    /// Parser fed with the pending bytes of `input_buffer`.
    parser: LowLevelJsonParser,
}
38
39impl<R: Read> ReaderJsonParser<R> {
40    pub const fn new(read: R) -> Self {
41        Self {
42            input_buffer: Vec::new(),
43            input_buffer_start: 0,
44            input_buffer_end: 0,
45            max_buffer_size: MAX_BUFFER_SIZE,
46            is_ending: false,
47            read,
48            parser: LowLevelJsonParser::new(),
49        }
50    }
51
52    /// Sets the max size of the internal buffer in bytes
53    pub fn with_max_buffer_size(mut self, size: usize) -> Self {
54        self.max_buffer_size = size;
55        self
56    }
57
58    pub fn parse_next(&mut self) -> Result<JsonEvent<'_>, JsonParseError> {
59        loop {
60            {
61                let LowLevelJsonParserResult {
62                    event,
63                    consumed_bytes,
64                } = self.parser.parse_next(
65                    #[allow(unsafe_code)]
66                    unsafe {
67                        let input_buffer_ptr: *const [u8] =
68                            &self.input_buffer[self.input_buffer_start..self.input_buffer_end];
69                        &*input_buffer_ptr
70                    }, // SAFETY: Borrow checker workaround https://github.com/rust-lang/rust/issues/70255
71                    self.is_ending,
72                );
73                self.input_buffer_start += consumed_bytes;
74                if let Some(event) = event {
75                    return Ok(event?);
76                }
77            }
78            if self.input_buffer_start > 0 {
79                self.input_buffer
80                    .copy_within(self.input_buffer_start..self.input_buffer_end, 0);
81                self.input_buffer_end -= self.input_buffer_start;
82                self.input_buffer_start = 0;
83            }
84            if self.input_buffer.len() == self.max_buffer_size {
85                return Err(io::Error::new(
86                    io::ErrorKind::OutOfMemory,
87                    format!(
88                        "Reached the buffer maximal size of {}",
89                        self.max_buffer_size
90                    ),
91                )
92                .into());
93            }
94            let min_end = min(
95                self.input_buffer_end + MIN_BUFFER_SIZE,
96                self.max_buffer_size,
97            );
98            if self.input_buffer.len() < min_end {
99                self.input_buffer.resize(min_end, 0);
100            }
101            if self.input_buffer.len() < self.input_buffer.capacity() {
102                // We keep extending to have as much space as available without reallocation
103                self.input_buffer.resize(self.input_buffer.capacity(), 0);
104            }
105            let read = self
106                .read
107                .read(&mut self.input_buffer[self.input_buffer_end..])?;
108            self.input_buffer_end += read;
109            self.is_ending = read == 0;
110        }
111    }
112
113    #[deprecated(note = "Use parse_next() instead")]
114    pub fn read_next_event(&mut self) -> Result<JsonEvent<'_>, JsonParseError> {
115        self.parse_next()
116    }
117}
118
119/// Parses a JSON file from an [`AsyncRead`] implementation.
120///
121/// ```
122/// use json_event_parser::{JsonEvent, TokioAsyncReaderJsonParser};
123///
124/// # #[tokio::main(flavor = "current_thread")]
125/// # async fn main() -> ::std::io::Result<()> {
126/// let mut reader = TokioAsyncReaderJsonParser::new(b"{\"foo\": 1}".as_slice());
127/// assert_eq!(reader.parse_next().await?, JsonEvent::StartObject);
128/// assert_eq!(
129///     reader.parse_next().await?,
130///     JsonEvent::ObjectKey("foo".into())
131/// );
132/// assert_eq!(reader.parse_next().await?, JsonEvent::Number("1".into()));
133/// assert_eq!(reader.parse_next().await?, JsonEvent::EndObject);
134/// assert_eq!(reader.parse_next().await?, JsonEvent::Eof);
135/// # Ok(())
136/// # }
137/// ```
#[cfg(feature = "async-tokio")]
pub struct TokioAsyncReaderJsonParser<R: AsyncRead + Unpin> {
    /// Storage for the bytes read from `read`; only `input_buffer_start..input_buffer_end` holds pending input.
    input_buffer: Vec<u8>,
    /// Index in `input_buffer` of the first byte not yet consumed by `parser`.
    input_buffer_start: usize,
    /// Index in `input_buffer` just after the last byte filled by a read.
    input_buffer_end: usize,
    /// Maximal size of `input_buffer` in bytes.
    max_buffer_size: usize,
    /// Set when the last read returned zero bytes, i.e. the input is exhausted.
    is_ending: bool,
    /// Asynchronous source of the JSON bytes.
    read: R,
    /// Parser fed with the pending bytes of `input_buffer`.
    parser: LowLevelJsonParser,
}
148
149#[cfg(feature = "async-tokio")]
150impl<R: AsyncRead + Unpin> TokioAsyncReaderJsonParser<R> {
151    pub const fn new(read: R) -> Self {
152        Self {
153            input_buffer: Vec::new(),
154            input_buffer_start: 0,
155            input_buffer_end: 0,
156            max_buffer_size: MAX_BUFFER_SIZE,
157            is_ending: false,
158            read,
159            parser: LowLevelJsonParser::new(),
160        }
161    }
162
163    /// Sets the max size of the internal buffer in bytes
164    pub fn with_max_buffer_size(mut self, size: usize) -> Self {
165        self.max_buffer_size = size;
166        self
167    }
168
169    pub async fn parse_next(&mut self) -> Result<JsonEvent<'_>, JsonParseError> {
170        loop {
171            {
172                let LowLevelJsonParserResult {
173                    event,
174                    consumed_bytes,
175                } = self.parser.parse_next(
176                    #[allow(unsafe_code)]
177                    unsafe {
178                        let input_buffer_ptr: *const [u8] =
179                            &self.input_buffer[self.input_buffer_start..self.input_buffer_end];
180                        &*input_buffer_ptr
181                    }, // Borrow checker workaround https://github.com/rust-lang/rust/issues/70255
182                    self.is_ending,
183                );
184                self.input_buffer_start += consumed_bytes;
185                if let Some(event) = event {
186                    return Ok(event?);
187                }
188            }
189            if self.input_buffer_start > 0 {
190                self.input_buffer
191                    .copy_within(self.input_buffer_start..self.input_buffer_end, 0);
192                self.input_buffer_end -= self.input_buffer_start;
193                self.input_buffer_start = 0;
194            }
195            if self.input_buffer.len() == self.max_buffer_size {
196                return Err(io::Error::new(
197                    io::ErrorKind::OutOfMemory,
198                    format!(
199                        "Reached the buffer maximal size of {}",
200                        self.max_buffer_size
201                    ),
202                )
203                .into());
204            }
205            let min_end = min(
206                self.input_buffer_end + MIN_BUFFER_SIZE,
207                self.max_buffer_size,
208            );
209            if self.input_buffer.len() < min_end {
210                self.input_buffer.resize(min_end, 0);
211            }
212            if self.input_buffer.len() < self.input_buffer.capacity() {
213                // We keep extending to have as much space as available without reallocation
214                self.input_buffer.resize(self.input_buffer.capacity(), 0);
215            }
216            let read = self
217                .read
218                .read(&mut self.input_buffer[self.input_buffer_end..])
219                .await?;
220            self.input_buffer_end += read;
221            self.is_ending = read == 0;
222        }
223    }
224
225    #[deprecated(note = "Use parse_next() instead")]
226    pub async fn read_next_event(&mut self) -> Result<JsonEvent<'_>, JsonParseError> {
227        self.parse_next().await
228    }
229}
230
231/// Parses a JSON file from a `&[u8]`.
232///
233/// ```
234/// use json_event_parser::{JsonEvent, SliceJsonParser};
235///
236/// let mut reader = SliceJsonParser::new(b"{\"foo\": 1}");
237/// assert_eq!(reader.parse_next()?, JsonEvent::StartObject);
238/// assert_eq!(reader.parse_next()?, JsonEvent::ObjectKey("foo".into()));
239/// assert_eq!(reader.parse_next()?, JsonEvent::Number("1".into()));
240/// assert_eq!(reader.parse_next()?, JsonEvent::EndObject);
241/// assert_eq!(reader.parse_next()?, JsonEvent::Eof);
242/// # std::io::Result::Ok(())
243/// ```
pub struct SliceJsonParser<'a> {
    /// Part of the input slice that has not been consumed by `parser` yet.
    input_buffer: &'a [u8],
    /// Parser fed with `input_buffer`.
    parser: LowLevelJsonParser,
}
248
249impl<'a> SliceJsonParser<'a> {
250    pub const fn new(buffer: &'a [u8]) -> Self {
251        Self {
252            input_buffer: buffer,
253            parser: LowLevelJsonParser::new(),
254        }
255    }
256
257    pub fn parse_next(&mut self) -> Result<JsonEvent<'a>, JsonSyntaxError> {
258        loop {
259            let LowLevelJsonParserResult {
260                event,
261                consumed_bytes,
262            } = self.parser.parse_next(self.input_buffer, true);
263            self.input_buffer = &self.input_buffer[consumed_bytes..];
264            if let Some(event) = event {
265                return event;
266            }
267        }
268    }
269
270    #[deprecated(note = "Use parse_next() instead")]
271    pub fn read_next_event(&mut self) -> Result<JsonEvent<'_>, JsonSyntaxError> {
272        self.parse_next()
273    }
274}
275
276/// A low-level JSON parser acting on a provided buffer.
277///
278/// Does not allocate except a stack to check if array and object opening and closing are properly nested.
279/// This stack size might be limited using the method [`with_max_stack_size`](LowLevelJsonParser::with_max_stack_size).
280///
281/// ```
282/// # use std::borrow::Cow;
283/// use json_event_parser::{JsonEvent, LowLevelJsonParser, LowLevelJsonParserResult};
284///
285/// let mut reader = LowLevelJsonParser::new();
286/// assert!(matches!(
287///     reader.parse_next(b"{\"foo".as_slice(), false),
288///     LowLevelJsonParserResult {
289///         consumed_bytes: 1,
290///         event: Some(Ok(JsonEvent::StartObject))
291///     }
292/// ));
293/// assert!(matches!(
294///     reader.parse_next(b"\"foo".as_slice(), false),
295///     LowLevelJsonParserResult {
296///         consumed_bytes: 0,
297///         event: None
298///     }
299/// ));
300/// assert!(matches!(
301///     reader.parse_next(b"\"foo\": 1}".as_slice(), false),
302///     LowLevelJsonParserResult {
303///         consumed_bytes: 5,
304///         event: Some(Ok(JsonEvent::ObjectKey(Cow::Borrowed("foo"))))
305///     }
306/// ));
307/// assert!(matches!(
308///     reader.parse_next(b": 1}".as_slice(), false),
309///     LowLevelJsonParserResult {
310///         consumed_bytes: 3,
311///         event: Some(Ok(JsonEvent::Number(Cow::Borrowed("1"))))
312///     }
313/// ));
314/// assert!(matches!(
315///     reader.parse_next(b"}".as_slice(), false),
316///     LowLevelJsonParserResult {
317///         consumed_bytes: 1,
318///         event: Some(Ok(JsonEvent::EndObject))
319///     }
320/// ));
321/// assert!(matches!(
322///     reader.parse_next(b"".as_slice(), true),
323///     LowLevelJsonParserResult {
324///         consumed_bytes: 0,
325///         event: Some(Ok(JsonEvent::Eof))
326///     }
327/// ));
328/// # std::io::Result::Ok(())
329/// ```
pub struct LowLevelJsonParser {
    /// Tokenizer; also tracks the position in the input for error reporting.
    lexer: JsonLexer,
    /// Stack of what is expected next in each object or array currently open.
    state_stack: Vec<JsonState>,
    /// Limit on the size of `state_stack`.
    max_state_stack_size: usize,
    /// Set once the first token of the root value has been processed.
    element_read: bool,
    /// Event produced together with an error; it is returned by the next call to `parse_next`.
    buffered_event: Option<JsonEvent<'static>>,
}
337
338impl LowLevelJsonParser {
339    pub const fn new() -> Self {
340        Self {
341            lexer: JsonLexer {
342                file_offset: 0,
343                file_line: 0,
344                file_start_of_last_line: 0,
345                file_start_of_last_token: 0,
346                is_start: true,
347            },
348            state_stack: Vec::new(),
349            max_state_stack_size: MAX_STATE_STACK_SIZE,
350            element_read: false,
351            buffered_event: None,
352        }
353    }
354
355    /// Maximal allowed number of nested object and array openings. Infinite by default.
356    pub fn with_max_stack_size(mut self, size: usize) -> Self {
357        self.max_state_stack_size = size;
358        self
359    }
360
361    /// Reads a new event from the data in `input_buffer`.
362    ///
363    /// `is_ending` must be set to true if all the JSON data have been already consumed or are in `input_buffer`.
364    pub fn parse_next<'a>(
365        &mut self,
366        input_buffer: &'a [u8],
367        is_ending: bool,
368    ) -> LowLevelJsonParserResult<'a> {
369        if let Some(event) = self.buffered_event.take() {
370            return LowLevelJsonParserResult {
371                consumed_bytes: 0,
372                event: Some(Ok(event)),
373            };
374        }
375        let start_file_offset = self.lexer.file_offset;
376        while let Some(token) = self.lexer.read_next_token(
377            &input_buffer[usize::try_from(self.lexer.file_offset - start_file_offset).unwrap()..],
378            is_ending,
379        ) {
380            let consumed_bytes = (self.lexer.file_offset - start_file_offset)
381                .try_into()
382                .unwrap();
383            match token {
384                Ok(token) => {
385                    let (event, error) = self.apply_new_token(token);
386                    let error = error.map(|e| {
387                        self.lexer.syntax_error(
388                            self.lexer.file_start_of_last_token..self.lexer.file_offset,
389                            e,
390                        )
391                    });
392                    if let Some(error) = error {
393                        self.buffered_event = event.map(owned_event);
394                        return LowLevelJsonParserResult {
395                            consumed_bytes,
396                            event: Some(Err(error)),
397                        };
398                    }
399                    if let Some(event) = event {
400                        return LowLevelJsonParserResult {
401                            consumed_bytes,
402                            event: Some(Ok(event)),
403                        };
404                    }
405                }
406                Err(error) => {
407                    return LowLevelJsonParserResult {
408                        consumed_bytes,
409                        event: Some(Err(error)),
410                    }
411                }
412            }
413        }
414        LowLevelJsonParserResult {
415            consumed_bytes: (self.lexer.file_offset - start_file_offset)
416                .try_into()
417                .unwrap(),
418            event: if is_ending {
419                self.buffered_event = Some(JsonEvent::Eof);
420                Some(Err(self.lexer.syntax_error(
421                    self.lexer.file_offset..self.lexer.file_offset + 1,
422                    "Unexpected end of file",
423                )))
424            } else {
425                None
426            },
427        }
428    }
429
430    #[deprecated(note = "Use parse_next() instead")]
431    pub fn read_next_event<'a>(
432        &mut self,
433        input_buffer: &'a [u8],
434        is_ending: bool,
435    ) -> LowLevelJsonParserResult<'a> {
436        self.parse_next(input_buffer, is_ending)
437    }
438
439    fn apply_new_token<'a>(
440        &mut self,
441        token: JsonToken<'a>,
442    ) -> (Option<JsonEvent<'a>>, Option<String>) {
443        match self.state_stack.pop() {
444            Some(JsonState::ObjectKeyOrEnd) => {
445                if token == JsonToken::ClosingCurlyBracket {
446                    (Some(JsonEvent::EndObject), None)
447                } else {
448                    if let Err(e) = self.push_state_stack(JsonState::ObjectKey) {
449                        return (None, Some(e));
450                    }
451                    self.apply_new_token(token)
452                }
453            }
454            Some(JsonState::ObjectKey) => {
455                if token == JsonToken::ClosingCurlyBracket {
456                    return (Some(JsonEvent::EndObject), Some("Trailing commas are not allowed".into()));
457                }
458                if let Err(e) = self.push_state_stack(JsonState::ObjectColon) {
459                    return (None, Some(e));
460                }
461                if let JsonToken::String(key) = token {
462                    (Some(JsonEvent::ObjectKey(key)), None)
463                } else {
464                    (None, Some("Object keys must be strings".into()))
465                }
466            }
467            Some(JsonState::ObjectColon) => {
468                if let Err(e) = self.push_state_stack(JsonState::ObjectValue) {
469                    return (None, Some(e));
470                }
471                if token == JsonToken::Colon {
472                    (None, None)
473                } else {
474                    let (event, _) = self.apply_new_token(token);
475                    (event, Some("Object keys must be strings".into()))
476                }
477            }
478            Some(JsonState::ObjectValue) => {
479                if let Err(e) = self.push_state_stack(JsonState::ObjectCommaOrEnd) {
480                    return (None, Some(e));
481                }
482                self.apply_new_token_for_value(token)
483            }
484            Some(JsonState::ObjectCommaOrEnd) => match token {
485                JsonToken::Comma => {
486                    (None, self.push_state_stack(JsonState::ObjectKey).err())
487                }
488                JsonToken::ClosingCurlyBracket => (Some(JsonEvent::EndObject), None),
489                _ => (None, Some("Object values must be followed by a comma to add a new value or a curly bracket to end the object".into())),
490            },
491            Some(JsonState::ArrayValueOrEnd) =>{
492                if token == JsonToken::ClosingSquareBracket {
493                    return (Some(JsonEvent::EndArray), None);
494                }
495                if let Err(e) = self.push_state_stack(JsonState::ArrayValue) {
496                    return (None, Some(e));
497                }
498                self.apply_new_token(token)
499            }
500            Some(JsonState::ArrayValue) => {
501                if token == JsonToken::ClosingSquareBracket {
502                    return (Some(JsonEvent::EndArray), Some("Trailing commas are not allowed".into()));
503                }
504                if let Err(e) = self.push_state_stack(JsonState::ArrayCommaOrEnd) {
505                    return (None, Some(e));
506                }
507                self.apply_new_token_for_value(token)
508            }
509            Some(JsonState::ArrayCommaOrEnd) => match token {
510                JsonToken::Comma => {
511                    (None, self.push_state_stack(JsonState::ArrayValue).err())
512                }
513                JsonToken::ClosingSquareBracket => (Some(JsonEvent::EndArray), None),
514                _ => {
515                    let _ = self.push_state_stack(JsonState::ArrayValue); // We already have an error
516                    let (event, _) = self.apply_new_token(token);
517                    (event, Some("Array values must be followed by a comma to add a new value or a squared bracket to end the array".into()))
518                }
519            }
520            None => if self.element_read {
521                if token == JsonToken::Eof {
522                    (Some(JsonEvent::Eof), None)
523                } else {
524                    (None, Some("The JSON already contains one root element".into()))
525                }
526            } else {
527                self.element_read = true;
528                self.apply_new_token_for_value(token)
529            }
530        }
531    }
532
533    fn apply_new_token_for_value<'a>(
534        &mut self,
535        token: JsonToken<'a>,
536    ) -> (Option<JsonEvent<'a>>, Option<String>) {
537        match token {
538            JsonToken::OpeningSquareBracket => (
539                Some(JsonEvent::StartArray),
540                self.push_state_stack(JsonState::ArrayValueOrEnd).err(),
541            ),
542            JsonToken::ClosingSquareBracket => (
543                None,
544                Some("Unexpected closing square bracket, no array to close".into()),
545            ),
546            JsonToken::OpeningCurlyBracket => (
547                Some(JsonEvent::StartObject),
548                self.push_state_stack(JsonState::ObjectKeyOrEnd).err(),
549            ),
550            JsonToken::ClosingCurlyBracket => (
551                None,
552                Some("Unexpected closing curly bracket, no array to close".into()),
553            ),
554            JsonToken::Comma => (None, Some("Unexpected comma, no values to separate".into())),
555            JsonToken::Colon => (None, Some("Unexpected colon, no key to follow".into())),
556            JsonToken::String(string) => (Some(JsonEvent::String(string)), None),
557            JsonToken::Number(number) => (Some(JsonEvent::Number(number)), None),
558            JsonToken::True => (Some(JsonEvent::Boolean(true)), None),
559            JsonToken::False => (Some(JsonEvent::Boolean(false)), None),
560            JsonToken::Null => (Some(JsonEvent::Null), None),
561            JsonToken::Eof => (
562                Some(JsonEvent::Eof),
563                Some("Unexpected end of file, a value was expected".into()),
564            ),
565        }
566    }
567
568    fn push_state_stack(&mut self, state: JsonState) -> Result<(), String> {
569        self.check_stack_size()?;
570        self.state_stack.push(state);
571        Ok(())
572    }
573
574    fn check_stack_size(&self) -> Result<(), String> {
575        if self.state_stack.len() > self.max_state_stack_size {
576            Err(format!(
577                "Max stack size of {} reached on an object opening",
578                self.max_state_stack_size
579            ))
580        } else {
581            Ok(())
582        }
583    }
584}
585
586impl Default for LowLevelJsonParser {
587    fn default() -> Self {
588        Self::new()
589    }
590}
591
/// What the parser expects next inside the object or array currently open.
///
/// One entry is kept on the state stack of `LowLevelJsonParser` per open object or array.
#[derive(Eq, PartialEq, Copy, Clone, Debug)]
enum JsonState {
    /// A key is expected; a `}` here is reported as a trailing comma.
    ObjectKey,
    /// Just after `{`: a key or `}` is expected.
    ObjectKeyOrEnd,
    /// After a key: `:` is expected.
    ObjectColon,
    /// After `:`: a value is expected.
    ObjectValue,
    /// After a value: `,` or `}` is expected.
    ObjectCommaOrEnd,
    /// A value is expected; a `]` here is reported as a trailing comma.
    ArrayValue,
    /// Just after `[`: a value or `]` is expected.
    ArrayValueOrEnd,
    /// After a value: `,` or `]` is expected.
    ArrayCommaOrEnd,
}
603
/// Lexical tokens produced by `JsonLexer` and consumed by `LowLevelJsonParser`.
#[derive(Eq, PartialEq, Clone, Debug)]
enum JsonToken<'a> {
    OpeningSquareBracket, // [
    ClosingSquareBracket, // ]
    OpeningCurlyBracket,  // {
    ClosingCurlyBracket,  // }
    Comma,                // ,
    Colon,                // :
    String(Cow<'a, str>), // "..."
    Number(Cow<'a, str>), // 1.2e3
    True,                 // true
    False,                // false
    Null,                 // null
    Eof,                  // EOF (emitted when the input is exhausted)
}
619
/// Splits the input into `JsonToken`s while keeping track of the position in the file.
struct JsonLexer {
    /// Number of input bytes consumed so far.
    file_offset: u64,
    /// Number of line breaks consumed so far.
    file_line: u64,
    /// Offset of the first byte after the last line break consumed.
    file_start_of_last_line: u64,
    /// Offset at which the last token read starts, after the whitespaces before it.
    file_start_of_last_token: u64,
    /// `true` until the check for a leading byte order mark has been done.
    is_start: bool,
}
627
628impl JsonLexer {
629    fn read_next_token<'a>(
630        &mut self,
631        mut input_buffer: &'a [u8],
632        is_ending: bool,
633    ) -> Option<Result<JsonToken<'a>, JsonSyntaxError>> {
634        // We remove BOM at the beginning
635        if self.is_start {
636            if input_buffer.len() < 3 && !is_ending {
637                return None;
638            }
639            self.is_start = false;
640            if input_buffer.starts_with(&[0xEF, 0xBB, 0xBF]) {
641                input_buffer = &input_buffer[3..];
642                self.file_offset += 3;
643            }
644        }
645
646        // We skip whitespaces
647        let mut i = 0;
648        while let Some(c) = input_buffer.get(i) {
649            match *c {
650                b' ' | b'\t' => {
651                    i += 1;
652                }
653                b'\n' => {
654                    i += 1;
655                    self.file_line += 1;
656                    self.file_start_of_last_line = self.file_offset + u64::try_from(i).unwrap();
657                }
658                b'\r' => {
659                    i += 1;
660                    if let Some(c) = input_buffer.get(i) {
661                        if *c == b'\n' {
662                            i += 1; // \r\n
663                        }
664                    } else if !is_ending {
665                        // We need an extra byte to check if followed by \n
666                        i -= 1;
667                        self.file_offset += u64::try_from(i).unwrap();
668                        return None;
669                    }
670                    self.file_line += 1;
671                    self.file_start_of_last_line = self.file_offset + u64::try_from(i).unwrap();
672                }
673                _ => {
674                    break;
675                }
676            }
677        }
678        self.file_offset += u64::try_from(i).unwrap();
679        input_buffer = &input_buffer[i..];
680        self.file_start_of_last_token = self.file_offset;
681
682        if is_ending && input_buffer.is_empty() {
683            return Some(Ok(JsonToken::Eof));
684        }
685
686        // we get the first character
687        match *input_buffer.first()? {
688            b'{' => {
689                self.file_offset += 1;
690                Some(Ok(JsonToken::OpeningCurlyBracket))
691            }
692            b'}' => {
693                self.file_offset += 1;
694                Some(Ok(JsonToken::ClosingCurlyBracket))
695            }
696            b'[' => {
697                self.file_offset += 1;
698                Some(Ok(JsonToken::OpeningSquareBracket))
699            }
700            b']' => {
701                self.file_offset += 1;
702                Some(Ok(JsonToken::ClosingSquareBracket))
703            }
704            b',' => {
705                self.file_offset += 1;
706                Some(Ok(JsonToken::Comma))
707            }
708            b':' => {
709                self.file_offset += 1;
710                Some(Ok(JsonToken::Colon))
711            }
712            b'"' => self.read_string(input_buffer),
713            b't' => self.read_constant(input_buffer, is_ending, "true", JsonToken::True),
714            b'f' => self.read_constant(input_buffer, is_ending, "false", JsonToken::False),
715            b'n' => self.read_constant(input_buffer, is_ending, "null", JsonToken::Null),
716            b'-' | b'0'..=b'9' => self.read_number(input_buffer, is_ending),
717            c => {
718                self.file_offset += 1;
719                Some(Err(self.syntax_error(
720                    self.file_offset - 1..self.file_offset,
721                    if c < 128 {
722                        format!("Unexpected char: '{}'", char::from(c))
723                    } else {
724                        format!("Unexpected byte: \\x{c:X}")
725                    },
726                )))
727            }
728        }
729    }
730
731    fn read_string<'a>(
732        &mut self,
733        input_buffer: &'a [u8],
734    ) -> Option<Result<JsonToken<'a>, JsonSyntaxError>> {
735        let mut error = None;
736        let mut string: Option<(String, usize)> = None;
737        let mut next_byte_offset = 1;
738        loop {
739            match *input_buffer.get(next_byte_offset)? {
740                b'"' => {
741                    // end of string
742                    let result = Some(if let Some(error) = error {
743                        Err(error)
744                    } else if let Some((mut string, read_until)) = string {
745                        if read_until < next_byte_offset {
746                            let (str, e) = self.decode_utf8(
747                                &input_buffer[read_until..next_byte_offset],
748                                self.file_offset + u64::try_from(read_until).unwrap(),
749                            );
750                            error = error.or(e);
751                            string.push_str(&str);
752                        }
753                        if let Some(error) = error {
754                            Err(error)
755                        } else {
756                            Ok(JsonToken::String(Cow::Owned(string)))
757                        }
758                    } else {
759                        let (string, error) = self
760                            .decode_utf8(&input_buffer[1..next_byte_offset], self.file_offset + 1);
761                        if let Some(error) = error {
762                            Err(error)
763                        } else {
764                            Ok(JsonToken::String(string))
765                        }
766                    });
767                    self.file_offset += u64::try_from(next_byte_offset).unwrap() + 1;
768                    return result;
769                }
770                b'\\' => {
771                    // Escape sequences
772                    if string.is_none() {
773                        string = Some((String::new(), 1))
774                    }
775                    let (string, read_until) = string.as_mut().unwrap();
776                    if *read_until < next_byte_offset {
777                        let (str, e) = self.decode_utf8(
778                            &input_buffer[*read_until..next_byte_offset],
779                            self.file_offset + u64::try_from(*read_until).unwrap(),
780                        );
781                        error = error.or(e);
782                        string.push_str(&str);
783                    }
784                    next_byte_offset += 1;
785                    match *input_buffer.get(next_byte_offset)? {
786                        b'"' => {
787                            string.push('"');
788                            next_byte_offset += 1;
789                        }
790                        b'\\' => {
791                            string.push('\\');
792                            next_byte_offset += 1;
793                        }
794                        b'/' => {
795                            string.push('/');
796                            next_byte_offset += 1;
797                        }
798                        b'b' => {
799                            string.push('\u{8}');
800                            next_byte_offset += 1;
801                        }
802                        b'f' => {
803                            string.push('\u{C}');
804                            next_byte_offset += 1;
805                        }
806                        b'n' => {
807                            string.push('\n');
808                            next_byte_offset += 1;
809                        }
810                        b'r' => {
811                            string.push('\r');
812                            next_byte_offset += 1;
813                        }
814                        b't' => {
815                            string.push('\t');
816                            next_byte_offset += 1;
817                        }
818                        b'u' => {
819                            next_byte_offset += 1;
820                            let val = input_buffer.get(next_byte_offset..next_byte_offset + 4)?;
821                            next_byte_offset += 4;
822                            let code_point = match read_hexa_char(val) {
823                                Ok(cp) => cp,
824                                Err(e) => {
825                                    error = error.or_else(|| {
826                                        let pos = self.file_offset
827                                            + u64::try_from(next_byte_offset).unwrap();
828                                        Some(self.syntax_error(pos - 4..pos, e))
829                                    });
830                                    char::REPLACEMENT_CHARACTER.into()
831                                }
832                            };
833                            if let Some(c) = char::from_u32(code_point) {
834                                string.push(c);
835                            } else {
836                                let high_surrogate = code_point;
837                                if !(0xD800..=0xDBFF).contains(&high_surrogate) {
838                                    error = error.or_else(|| {
839                                        let pos = self.file_offset
840                                            + u64::try_from(next_byte_offset).unwrap();
841                                        Some(self.syntax_error(
842                                            pos - 6..pos,
843                                            format!(
844                                                "\\u{:X} is not a valid high surrogate",
845                                                high_surrogate
846                                            ),
847                                        ))
848                                    });
849                                }
850                                let val =
851                                    input_buffer.get(next_byte_offset..next_byte_offset + 6)?;
852                                next_byte_offset += 6;
853                                if !val.starts_with(b"\\u") {
854                                    error = error.or_else(|| {
855                                        let pos = self.file_offset + u64::try_from(next_byte_offset).unwrap();
856                                        Some(self.syntax_error(
857                                            pos - 6..pos,
858                                            format!(
859                                                "\\u{:X} is a high surrogate and should be followed by a low surrogate \\uXXXX",
860                                                high_surrogate
861                                            )
862                                        ))
863                                    });
864                                }
865                                let low_surrogate = match read_hexa_char(&val[2..]) {
866                                    Ok(cp) => cp,
867                                    Err(e) => {
868                                        error = error.or_else(|| {
869                                            let pos = self.file_offset
870                                                + u64::try_from(next_byte_offset).unwrap();
871                                            Some(self.syntax_error(pos - 6..pos, e))
872                                        });
873                                        char::REPLACEMENT_CHARACTER.into()
874                                    }
875                                };
876                                if !(0xDC00..=0xDFFF).contains(&low_surrogate) {
877                                    error = error.or_else(|| {
878                                        let pos = self.file_offset
879                                            + u64::try_from(next_byte_offset).unwrap();
880                                        Some(self.syntax_error(
881                                            pos - 6..pos,
882                                            format!(
883                                                "\\u{:X} is not a valid low surrogate",
884                                                low_surrogate
885                                            ),
886                                        ))
887                                    });
888                                }
889                                let code_point = 0x10000
890                                    + ((high_surrogate & 0x03FF) << 10)
891                                    + (low_surrogate & 0x03FF);
892                                if let Some(c) = char::from_u32(code_point) {
893                                    string.push(c)
894                                } else {
895                                    string.push(char::REPLACEMENT_CHARACTER);
896                                    error = error.or_else(|| {
897                                        let pos = self.file_offset
898                                            + u64::try_from(next_byte_offset).unwrap();
899                                        Some(self.syntax_error(
900                                            pos - 12..pos,
901                                            format!(
902                                                "\\u{:X}\\u{:X} is an invalid surrogate pair",
903                                                high_surrogate, low_surrogate
904                                            ),
905                                        ))
906                                    });
907                                }
908                            }
909                        }
910                        c => {
911                            next_byte_offset += 1;
912                            error = error.or_else(|| {
913                                let pos =
914                                    self.file_offset + u64::try_from(next_byte_offset).unwrap();
915                                Some(self.syntax_error(
916                                    pos - 2..pos,
917                                    format!("'\\{}' is not a valid escape sequence", char::from(c)),
918                                ))
919                            });
920                            string.push(char::REPLACEMENT_CHARACTER);
921                        }
922                    }
923                    *read_until = next_byte_offset;
924                }
925                c @ (0..=0x1F) => {
926                    error = error.or_else(|| {
927                        let pos = self.file_offset + u64::try_from(next_byte_offset).unwrap();
928                        Some(self.syntax_error(
929                            pos..pos + 1,
930                            format!("'{}' is not allowed in JSON strings", char::from(c)),
931                        ))
932                    });
933                    next_byte_offset += 1;
934                }
935                _ => {
936                    next_byte_offset += 1;
937                }
938            }
939        }
940    }
941
942    fn read_constant(
943        &mut self,
944        input_buffer: &[u8],
945        is_ending: bool,
946        expected: &str,
947        value: JsonToken<'static>,
948    ) -> Option<Result<JsonToken<'static>, JsonSyntaxError>> {
949        if input_buffer.get(..expected.len())? == expected.as_bytes() {
950            self.file_offset += u64::try_from(expected.len()).unwrap();
951            return Some(Ok(value));
952        }
953        let ascii_chars = input_buffer
954            .iter()
955            .take_while(|c| c.is_ascii_alphabetic())
956            .count();
957        if ascii_chars == input_buffer.len() && !is_ending {
958            return None; // We might read a bigger token
959        }
960        let read = max(1, ascii_chars); // We want to consume at least a byte
961        let start_offset = self.file_offset;
962        self.file_offset += u64::try_from(read).unwrap();
963        Some(Err(self.syntax_error(
964            start_offset..self.file_offset,
965            format!("{} expected", expected),
966        )))
967    }
968
969    fn read_number<'a>(
970        &mut self,
971        input_buffer: &'a [u8],
972        is_ending: bool,
973    ) -> Option<Result<JsonToken<'a>, JsonSyntaxError>> {
974        let mut next_byte_offset = 0;
975        if *input_buffer.get(next_byte_offset)? == b'-' {
976            next_byte_offset += 1;
977        }
978        // integer starting with first bytes
979        match *input_buffer.get(next_byte_offset)? {
980            b'0' => {
981                next_byte_offset += 1;
982            }
983            b'1'..=b'9' => {
984                next_byte_offset += 1;
985                next_byte_offset += read_digits(&input_buffer[next_byte_offset..], is_ending)?;
986            }
987            c => {
988                next_byte_offset += 1;
989                self.file_offset += u64::try_from(next_byte_offset).unwrap();
990                return Some(Err(self.syntax_error(
991                    self.file_offset - 1..self.file_offset,
992                    format!("A number is not allowed to start with '{}'", char::from(c)),
993                )));
994            }
995        }
996
997        // Dot
998        if input_buffer.get(next_byte_offset).map_or_else(
999            || if is_ending { Some(None) } else { None },
1000            |c| Some(Some(*c)),
1001        )? == Some(b'.')
1002        {
1003            next_byte_offset += 1;
1004            let c = *input_buffer.get(next_byte_offset)?;
1005            next_byte_offset += 1;
1006            if !c.is_ascii_digit() {
1007                self.file_offset += u64::try_from(next_byte_offset).unwrap();
1008                return Some(Err(self.syntax_error(
1009                    self.file_offset - 1..self.file_offset,
1010                    format!(
1011                        "A number fractional part must start with a digit and not '{}'",
1012                        char::from(c)
1013                    ),
1014                )));
1015            }
1016            next_byte_offset += read_digits(&input_buffer[next_byte_offset..], is_ending)?;
1017        }
1018
1019        // Exp
1020        let c = input_buffer.get(next_byte_offset).map_or_else(
1021            || if is_ending { Some(None) } else { None },
1022            |c| Some(Some(*c)),
1023        )?;
1024        if c == Some(b'e') || c == Some(b'E') {
1025            next_byte_offset += 1;
1026            match *input_buffer.get(next_byte_offset)? {
1027                b'-' | b'+' => {
1028                    next_byte_offset += 1;
1029                    let c = *input_buffer.get(next_byte_offset)?;
1030                    next_byte_offset += 1;
1031                    if !c.is_ascii_digit() {
1032                        self.file_offset += u64::try_from(next_byte_offset).unwrap();
1033                        return Some(Err(self.syntax_error(
1034                            self.file_offset - 1..self.file_offset,
1035                            format!(
1036                                "A number exponential part must contain at least a digit, '{}' found",
1037                                char::from(c)
1038                            ),
1039                        )));
1040                    }
1041                }
1042                b'0'..=b'9' => {
1043                    next_byte_offset += 1;
1044                }
1045                c => {
1046                    next_byte_offset += 1;
1047                    self.file_offset += u64::try_from(next_byte_offset).unwrap();
1048                    return Some(Err(self.syntax_error(
1049                        self.file_offset - 1..self.file_offset,
1050                        format!(
1051                            "A number exponential part must start with +, - or a digit, '{}' found",
1052                            char::from(c)
1053                        ),
1054                    )));
1055                }
1056            }
1057            next_byte_offset += read_digits(&input_buffer[next_byte_offset..], is_ending)?;
1058        }
1059        self.file_offset += u64::try_from(next_byte_offset).unwrap();
1060        Some(Ok(JsonToken::Number(Cow::Borrowed(
1061            str::from_utf8(&input_buffer[..next_byte_offset]).unwrap(),
1062        ))))
1063    }
1064
1065    fn decode_utf8<'a>(
1066        &self,
1067        input_buffer: &'a [u8],
1068        start_position: u64,
1069    ) -> (Cow<'a, str>, Option<JsonSyntaxError>) {
1070        match str::from_utf8(input_buffer) {
1071            Ok(str) => (Cow::Borrowed(str), None),
1072            Err(e) => (
1073                String::from_utf8_lossy(input_buffer),
1074                Some({
1075                    let pos = start_position + u64::try_from(e.valid_up_to()).unwrap();
1076                    self.syntax_error(pos..pos + 1, format!("Invalid UTF-8: {e}"))
1077                }),
1078            ),
1079        }
1080    }
1081
1082    fn syntax_error(&self, file_offset: Range<u64>, message: impl Into<String>) -> JsonSyntaxError {
1083        let start_file_offset = max(file_offset.start, self.file_start_of_last_line);
1084        JsonSyntaxError {
1085            location: TextPosition {
1086                line: self.file_line,
1087                column: start_file_offset - self.file_start_of_last_line, // TODO: unicode
1088                offset: start_file_offset,
1089            }..TextPosition {
1090                line: self.file_line,
1091                column: file_offset.end - self.file_start_of_last_line, // TODO: unicode
1092                offset: file_offset.end,
1093            },
1094            message: message.into(),
1095        }
1096    }
1097}
1098
/// Parses `input` as a big-endian sequence of ASCII hexadecimal digits.
///
/// Both lower and upper case digits are accepted and an empty input yields `0`.
/// The first byte that is not a hexadecimal digit aborts the parsing with an
/// error message quoting it.
fn read_hexa_char(input: &[u8]) -> Result<u32, String> {
    input.iter().try_fold(0, |value, &c| {
        let digit = match c {
            b'0'..=b'9' => c - b'0',
            b'a'..=b'f' => c - b'a' + 10,
            b'A'..=b'F' => c - b'A' + 10,
            _ => {
                return Err(format!(
                    "Unexpected character in a unicode escape: '{}'",
                    char::from(c)
                ))
            }
        };
        Ok(value * 16 + u32::from(digit))
    })
}
1117
/// Counts the ASCII digits at the start of `input_buffer`.
///
/// Returns `None` when the whole buffer is made of digits (this includes the empty
/// buffer) and `is_ending` is false, because more digits may still arrive.
/// Otherwise returns the length of the leading digit run.
fn read_digits(input_buffer: &[u8], is_ending: bool) -> Option<usize> {
    match input_buffer.iter().position(|byte| !byte.is_ascii_digit()) {
        // A non-digit byte delimits the run
        Some(first_non_digit) => Some(first_non_digit),
        // Only digits, but the input is over: the run is the whole buffer
        None if is_ending => Some(input_buffer.len()),
        None => None,
    }
}
1128
1129fn owned_event(event: JsonEvent<'_>) -> JsonEvent<'static> {
1130    match event {
1131        JsonEvent::String(s) => JsonEvent::String(s.into_owned().into()),
1132        JsonEvent::Number(n) => JsonEvent::Number(n.into_owned().into()),
1133        JsonEvent::Boolean(b) => JsonEvent::Boolean(b),
1134        JsonEvent::Null => JsonEvent::Null,
1135        JsonEvent::StartArray => JsonEvent::StartArray,
1136        JsonEvent::EndArray => JsonEvent::EndArray,
1137        JsonEvent::StartObject => JsonEvent::StartObject,
1138        JsonEvent::EndObject => JsonEvent::EndObject,
1139        JsonEvent::ObjectKey(k) => JsonEvent::ObjectKey(k.into_owned().into()),
1140        JsonEvent::Eof => JsonEvent::Eof,
1141    }
1142}
1143
1144/// Result of [`LowLevelJsonParser::parse_next`].
1145#[derive(Debug)]
1146pub struct LowLevelJsonParserResult<'a> {
1147    /// How many bytes have been read from `input_buffer` and should be removed from it.
1148    pub consumed_bytes: usize,
1149    /// A possible new event
1150    pub event: Option<Result<JsonEvent<'a>, JsonSyntaxError>>,
1151}
1152
/// A position in a text.
///
/// It is made of a `line` number starting from 0, a `column` number starting from 0
/// (in number of code points) and a global file `offset` starting from 0 (in number of bytes).
// NOTE(review): the error builder of this file currently derives `column` from byte
// offsets (see its "TODO: unicode" markers), so columns only match code points on
// ASCII lines.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TextPosition {
    /// Line number, starting from 0.
    pub line: u64,
    /// Column number inside of the line, starting from 0.
    pub column: u64,
    /// Offset from the beginning of the file in bytes, starting from 0.
    pub offset: u64,
}
1160
1161/// An error in the syntax of the parsed file.
1162///
1163/// It is composed of a message and a byte range in the input.
1164#[derive(Debug)]
1165pub struct JsonSyntaxError {
1166    location: Range<TextPosition>,
1167    message: String,
1168}
1169
1170impl JsonSyntaxError {
1171    /// The location of the error inside of the file.
1172    #[inline]
1173    pub fn location(&self) -> Range<TextPosition> {
1174        self.location.clone()
1175    }
1176
1177    /// The error message.
1178    #[inline]
1179    pub fn message(&self) -> &str {
1180        &self.message
1181    }
1182}
1183
1184impl fmt::Display for JsonSyntaxError {
1185    #[inline]
1186    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1187        if self.location.start.offset + 1 >= self.location.end.offset {
1188            write!(
1189                f,
1190                "Parser error at line {} column {}: {}",
1191                self.location.start.line + 1,
1192                self.location.start.column + 1,
1193                self.message
1194            )
1195        } else if self.location.start.line == self.location.end.line {
1196            write!(
1197                f,
1198                "Parser error at line {} between columns {} and column {}: {}",
1199                self.location.start.line + 1,
1200                self.location.start.column + 1,
1201                self.location.end.column + 1,
1202                self.message
1203            )
1204        } else {
1205            write!(
1206                f,
1207                "Parser error between line {} column {} and line {} column {}: {}",
1208                self.location.start.line + 1,
1209                self.location.start.column + 1,
1210                self.location.end.line + 1,
1211                self.location.end.column + 1,
1212                self.message
1213            )
1214        }
1215    }
1216}
1217
1218impl Error for JsonSyntaxError {}
1219
1220impl From<JsonSyntaxError> for io::Error {
1221    #[inline]
1222    fn from(error: JsonSyntaxError) -> Self {
1223        io::Error::new(io::ErrorKind::InvalidData, error)
1224    }
1225}
1226
1227/// A parsing error.
1228///
1229/// It is the union of [`JsonSyntaxError`] and [`std::io::Error`].
1230#[derive(Debug)]
1231pub enum JsonParseError {
1232    /// I/O error during parsing (file not found...).
1233    Io(io::Error),
1234    /// An error in the file syntax.
1235    Syntax(JsonSyntaxError),
1236}
1237
1238impl fmt::Display for JsonParseError {
1239    #[inline]
1240    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1241        match self {
1242            Self::Io(e) => e.fmt(f),
1243            Self::Syntax(e) => e.fmt(f),
1244        }
1245    }
1246}
1247
1248impl Error for JsonParseError {
1249    #[inline]
1250    fn source(&self) -> Option<&(dyn Error + 'static)> {
1251        Some(match self {
1252            Self::Io(e) => e,
1253            Self::Syntax(e) => e,
1254        })
1255    }
1256}
1257
1258impl From<JsonSyntaxError> for JsonParseError {
1259    #[inline]
1260    fn from(error: JsonSyntaxError) -> Self {
1261        Self::Syntax(error)
1262    }
1263}
1264
1265impl From<io::Error> for JsonParseError {
1266    #[inline]
1267    fn from(error: io::Error) -> Self {
1268        Self::Io(error)
1269    }
1270}
1271
1272impl From<JsonParseError> for io::Error {
1273    #[inline]
1274    fn from(error: JsonParseError) -> Self {
1275        match error {
1276            JsonParseError::Syntax(e) => e.into(),
1277            JsonParseError::Io(e) => e,
1278        }
1279    }
1280}