chumsky/stream.rs

//! Token streams and tools for converting to and from them.
//!
//! *“What’s up?” “I don’t know,” said Marvin, “I’ve never been there.”*
//!
//! [`Stream`] is the primary type used to feed input data into a chumsky parser. You can create one in a number of
//! ways: from strings, iterators, arrays, etc.
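//!
//! # Examples
//!
//! A minimal sketch of two ways a stream might be constructed (the tokens and spans below are illustrative, not part
//! of the crate):
//!
//! ```
//! # use chumsky::Stream;
//! // Directly from a string slice (spans are character-index ranges).
//! let from_str = Stream::from("hello");
//!
//! // From an explicit iterator of `(Token, Span)` pairs, plus an end-of-input span.
//! let tokens = vec![('h', 0..1), ('i', 1..2)];
//! let from_pairs = Stream::from_iter(2..2, tokens.into_iter());
//! ```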

use super::*;
use alloc::vec;

trait StreamExtend<T>: Iterator<Item = T> {
    /// Extend the vector with input. The actual amount can be more or less than `n`, but must be at least 1 (0 implies
    /// that the stream has been exhausted).
    fn extend(&mut self, v: &mut Vec<T>, n: usize);
}

#[allow(deprecated)]
impl<I: Iterator> StreamExtend<I::Item> for I {
    fn extend(&mut self, v: &mut Vec<I::Item>, n: usize) {
        v.reserve(n);
        v.extend(self.take(n));
    }
}

/// A utility type used to flatten input trees. See [`Stream::from_nested`].
pub enum Flat<I, Iter> {
    /// The input tree flattens into a single input.
    Single(I),
    /// The input tree flattens into many sub-trees.
    Many(Iter),
}

/// A type that represents a stream of input tokens. Unlike [`Iterator`], this type supports backtracking and a few
/// other features required by the crate.
#[allow(deprecated)]
pub struct Stream<
    'a,
    I,
    S: Span,
    Iter: Iterator<Item = (I, S)> + ?Sized = dyn Iterator<Item = (I, S)> + 'a,
> {
    pub(crate) phantom: PhantomData<&'a ()>,
    pub(crate) eoi: S,
    pub(crate) offset: usize,
    pub(crate) buffer: Vec<(I, S)>,
    pub(crate) iter: Iter,
}

/// A [`Stream`] that pulls tokens from a boxed [`Iterator`].
pub type BoxStream<'a, I, S> = Stream<'a, I, S, Box<dyn Iterator<Item = (I, S)> + 'a>>;

impl<'a, I, S: Span, Iter: Iterator<Item = (I, S)>> Stream<'a, I, S, Iter> {
    /// Create a new stream from an iterator of `(Token, Span)` pairs. A span representing the end of input must also
    /// be provided.
    ///
    /// There is no requirement that spans map exactly to the position of inputs in the stream, but they should
    /// be non-overlapping and should appear in monotonically-increasing order.
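    ///
    /// # Examples
    ///
    /// A minimal sketch (the tokens and spans here are illustrative stand-ins):
    ///
    /// ```
    /// # use chumsky::Stream;
    /// let tokens = vec![('a', 0..1), ('b', 1..2), ('c', 2..3)];
    /// // `3..3` is the span representing the end of input.
    /// let stream = Stream::from_iter(3..3, tokens.into_iter());
    /// ```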
    pub fn from_iter(eoi: S, iter: Iter) -> Self {
        Self {
            phantom: PhantomData,
            eoi,
            offset: 0,
            buffer: Vec::new(),
            iter,
        }
    }

    /// Eagerly evaluate the token stream, returning an iterator over the tokens in it (but without modifying the
    /// stream's state so that it can still be used for parsing).
    ///
    /// This is most useful when you wish to check the input of a parser during debugging.
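    ///
    /// # Examples
    ///
    /// A small sketch of inspecting a stream's tokens (the input here is illustrative):
    ///
    /// ```
    /// # use chumsky::Stream;
    /// let mut stream = Stream::from("abc");
    /// let tokens: Vec<_> = stream.fetch_tokens().collect();
    /// assert_eq!(tokens.len(), 3);
    /// ```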
    pub fn fetch_tokens(&mut self) -> impl Iterator<Item = (I, S)> + '_
    where
        (I, S): Clone,
    {
        self.buffer.extend(&mut self.iter);
        self.buffer.iter().cloned()
    }
}

impl<'a, I: Clone, S: Span + 'a> BoxStream<'a, I, S> {
    /// Create a new `Stream` from an iterator of nested tokens and a function that flattens them.
    ///
    /// It's not uncommon for compilers to perform delimiter parsing during the lexing stage (Rust does this!). When
    /// this is done, the output of the lexing stage is usually a series of nested token trees. This function allows
    /// you to easily flatten such token trees into a linear token stream so that they can be parsed (Chumsky currently
    /// only supports parsing linear streams of inputs).
    ///
    /// For reference, [here](https://docs.rs/syn/0.11.1/syn/enum.TokenTree.html) is `syn`'s `TokenTree` type that it
    /// uses when parsing Rust syntax.
    ///
    /// # Examples
    ///
    /// ```
    /// # use chumsky::{Stream, BoxStream, Flat};
    /// type Span = std::ops::Range<usize>;
    ///
    /// fn span_at(at: usize) -> Span { at..at + 1 }
    ///
    /// #[derive(Clone)]
    /// enum Token {
    ///     Local(String),
    ///     Int(i64),
    ///     Bool(bool),
    ///     Add,
    ///     Sub,
    ///     OpenParen,
    ///     CloseParen,
    ///     OpenBrace,
    ///     CloseBrace,
    ///     // etc.
    /// }
    ///
    /// enum Delimiter {
    ///     Paren, // ( ... )
    ///     Brace, // { ... }
    /// }
    ///
    /// // The structure of this token tree is very similar to that which Rust uses.
    /// // See: https://docs.rs/syn/0.11.1/syn/enum.TokenTree.html
    /// enum TokenTree {
    ///     Token(Token),
    ///     Tree(Delimiter, Vec<(TokenTree, Span)>),
    /// }
    ///
    /// // A function that turns a series of nested token trees into a linear stream that can be used for parsing.
    /// fn flatten_tts(eoi: Span, token_trees: Vec<(TokenTree, Span)>) -> BoxStream<'static, Token, Span> {
    ///     use std::iter::once;
    ///     // Currently, this is quite an explicit process: it will likely become easier in future versions of Chumsky.
    ///     Stream::from_nested(
    ///         eoi,
    ///         token_trees.into_iter(),
    ///         |(tt, span)| match tt {
    ///             // For token trees that contain just a single token, no flattening needs to occur!
    ///             TokenTree::Token(token) => Flat::Single((token, span)),
    ///             // Flatten a parenthesised token tree into an iterator of the inner token trees, surrounded by parenthesis tokens
    ///             TokenTree::Tree(Delimiter::Paren, tree) => Flat::Many(once((TokenTree::Token(Token::OpenParen), span_at(span.start)))
    ///                 .chain(tree.into_iter())
    ///                 .chain(once((TokenTree::Token(Token::CloseParen), span_at(span.end - 1))))),
    ///             // Flatten a braced token tree into an iterator of the inner token trees, surrounded by brace tokens
    ///             TokenTree::Tree(Delimiter::Brace, tree) => Flat::Many(once((TokenTree::Token(Token::OpenBrace), span_at(span.start)))
    ///                 .chain(tree.into_iter())
    ///                 .chain(once((TokenTree::Token(Token::CloseBrace), span_at(span.end - 1))))),
    ///         }
    ///     )
    /// }
    /// ```
    pub fn from_nested<
        P: 'a,
        Iter: Iterator<Item = (P, S)>,
        Many: Iterator<Item = (P, S)>,
        F: FnMut((P, S)) -> Flat<(I, S), Many> + 'a,
    >(
        eoi: S,
        iter: Iter,
        mut flatten: F,
    ) -> Self {
        // A stack of token-tree levels: the top of the stack is the level currently being flattened.
        let mut v: Vec<alloc::collections::VecDeque<(P, S)>> = vec![iter.collect()];
        Self::from_iter(
            eoi,
            Box::new(core::iter::from_fn(move || loop {
                if let Some(many) = v.last_mut() {
                    match many.pop_front().map(&mut flatten) {
                        // A leaf token: emit it directly.
                        Some(Flat::Single(input)) => break Some(input),
                        // A sub-tree: push its contents as a new level to be flattened first.
                        Some(Flat::Many(many)) => v.push(many.collect()),
                        // The current level is exhausted: pop back to the parent level.
                        None => {
                            v.pop();
                        }
                    }
                } else {
                    break None;
                }
            })),
        )
    }
}

impl<'a, I: Clone, S: Span> Stream<'a, I, S> {
    pub(crate) fn offset(&self) -> usize {
        self.offset
    }

    /// Save the current position so that it can be restored later with `revert`.
    pub(crate) fn save(&self) -> usize {
        self.offset
    }

    /// Revert to a position previously returned by `save`.
    pub(crate) fn revert(&mut self, offset: usize) {
        self.offset = offset;
    }

    /// Buffer tokens from the underlying iterator until the token at `offset` is available (tokens are pulled in
    /// batches of at least 1024 to amortise the cost), returning it if it exists.
    fn pull_until(&mut self, offset: usize) -> Option<&(I, S)> {
        let additional = offset.saturating_sub(self.buffer.len()) + 1024;
        #[allow(deprecated)]
        (&mut &mut self.iter as &mut dyn StreamExtend<_>).extend(&mut self.buffer, additional);
        self.buffer.get(offset)
    }

    /// Consume the next token only if it satisfies the given predicate, returning whether it was consumed.
    pub(crate) fn skip_if(&mut self, f: impl FnOnce(&I) -> bool) -> bool {
        match self.pull_until(self.offset).cloned() {
            Some((out, _)) if f(&out) => {
                self.offset += 1;
                true
            }
            Some(_) => false,
            None => false,
        }
    }

    /// Consume the next token, returning its offset, its span, and the token itself (or `None` if the end of input
    /// has been reached).
    pub(crate) fn next(&mut self) -> (usize, S, Option<I>) {
        match self.pull_until(self.offset).cloned() {
            Some((out, span)) => {
                self.offset += 1;
                (self.offset - 1, span, Some(out))
            }
            None => (self.offset, self.eoi.clone(), None),
        }
    }

    /// Create a span stretching from the token at `start_offset` to the most recently consumed token (or the end of
    /// input, if the stream is exhausted).
    pub(crate) fn span_since(&mut self, start_offset: usize) -> S {
        debug_assert!(
            start_offset <= self.offset,
            "{} > {}",
            start_offset,
            self.offset
        );
        let start = self
            .pull_until(start_offset)
            .as_ref()
            .map(|(_, s)| s.start())
            .unwrap_or_else(|| self.eoi.start());
        let end = self
            .pull_until(self.offset.saturating_sub(1).max(start_offset))
            .as_ref()
            .map(|(_, s)| s.end())
            .unwrap_or_else(|| self.eoi.end());
        S::new(self.eoi.context(), start..end)
    }

    /// Run a parse attempt, rolling the stream back to its original position if the attempt does not commit.
    pub(crate) fn attempt<R, F: FnOnce(&mut Self) -> (bool, R)>(&mut self, f: F) -> R {
        let old_offset = self.offset;
        let (commit, out) = f(self);
        if !commit {
            self.offset = old_offset;
        }
        out
    }

    /// Run a fallible parse, rolling the stream back if it produces an error.
    pub(crate) fn try_parse<O, E, F: FnOnce(&mut Self) -> PResult<I, O, E>>(
        &mut self,
        f: F,
    ) -> PResult<I, O, E> {
        self.attempt(move |stream| {
            let out = f(stream);
            (out.1.is_ok(), out)
        })
    }
}

impl<'a> From<&'a str>
    for Stream<'a, char, Range<usize>, Box<dyn Iterator<Item = (char, Range<usize>)> + 'a>>
{
    /// Please note that Chumsky currently uses character indices and not byte offsets in this impl. This is likely to
    /// change in the future. If you wish to use byte offsets, you can do so with [`Stream::from_iter`].
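    ///
    /// For example, a byte-offset stream might be built manually like this (a sketch, not part of this impl; the
    /// input string is illustrative):
    ///
    /// ```
    /// # use chumsky::Stream;
    /// let src = "héllo";
    /// let stream = Stream::from_iter(
    ///     src.len()..src.len(),
    ///     src.char_indices().map(|(i, c)| (c, i..i + c.len_utf8())),
    /// );
    /// ```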
    fn from(s: &'a str) -> Self {
        let len = s.chars().count();
        Self::from_iter(
            len..len,
            Box::new(s.chars().enumerate().map(|(i, c)| (c, i..i + 1))),
        )
    }
}

impl<'a> From<String>
    for Stream<'a, char, Range<usize>, Box<dyn Iterator<Item = (char, Range<usize>)>>>
{
    /// Please note that Chumsky currently uses character indices and not byte offsets in this impl. This is likely to
    /// change in the future. If you wish to use byte offsets, you can do so with [`Stream::from_iter`].
    fn from(s: String) -> Self {
        let chars = s.chars().collect::<Vec<_>>();
        Self::from_iter(
            chars.len()..chars.len(),
            Box::new(chars.into_iter().enumerate().map(|(i, c)| (c, i..i + 1))),
        )
    }
}

impl<'a, T: Clone> From<&'a [T]>
    for Stream<'a, T, Range<usize>, Box<dyn Iterator<Item = (T, Range<usize>)> + 'a>>
{
    fn from(s: &'a [T]) -> Self {
        let len = s.len();
        Self::from_iter(
            len..len,
            Box::new(s.iter().cloned().enumerate().map(|(i, x)| (x, i..i + 1))),
        )
    }
}

impl<'a, T: Clone + 'a> From<Vec<T>>
    for Stream<'a, T, Range<usize>, Box<dyn Iterator<Item = (T, Range<usize>)> + 'a>>
{
    fn from(s: Vec<T>) -> Self {
        let len = s.len();
        Self::from_iter(
            len..len,
            Box::new(s.into_iter().enumerate().map(|(i, x)| (x, i..i + 1))),
        )
    }
}

impl<'a, T: Clone + 'a, const N: usize> From<[T; N]>
    for Stream<'a, T, Range<usize>, Box<dyn Iterator<Item = (T, Range<usize>)> + 'a>>
{
    fn from(s: [T; N]) -> Self {
        Self::from_iter(
            N..N,
            Box::new(
                core::array::IntoIter::new(s)
                    .enumerate()
                    .map(|(i, x)| (x, i..i + 1)),
            ),
        )
    }
}

impl<'a, T: Clone, const N: usize> From<&'a [T; N]>
    for Stream<'a, T, Range<usize>, Box<dyn Iterator<Item = (T, Range<usize>)> + 'a>>
{
    fn from(s: &'a [T; N]) -> Self {
        Self::from_iter(
            N..N,
            Box::new(s.iter().cloned().enumerate().map(|(i, x)| (x, i..i + 1))),
        )
    }
}

// impl<'a, T: Clone, S: Clone + Span<Context = ()>> From<&'a [(T, S)]> for Stream<'a, T, S, Box<dyn Iterator<Item = (T, S)> + 'a>>
//     where S::Offset: Default
// {
//     fn from(s: &'a [(T, S)]) -> Self {
//         Self::from_iter(Default::default(), Box::new(s.iter().cloned()))
//     }
// }