// chumsky/stream.rs
//! Token streams and tools converting to and from them.
2//!
3//! *“What’s up?” “I don’t know,” said Marvin, “I’ve never been there.”*
4//!
5//! [`Stream`] is the primary type used to feed input data into a chumsky parser. You can create them in a number of
6//! ways: from strings, iterators, arrays, etc.
7
8use super::*;
9use alloc::vec;
10
/// Internal helper trait allowing a token iterator to append a batch of tokens onto a buffer.
trait StreamExtend<T>: Iterator<Item = T> {
    /// Extend the vector with input. The actual amount pulled can be more or less than `n`, but must be at least 1
    /// (0 implies that the stream has been exhausted).
    fn extend(&mut self, v: &mut Vec<T>, n: usize);
}
16
17#[allow(deprecated)]
18impl<I: Iterator> StreamExtend<I::Item> for I {
19 fn extend(&mut self, v: &mut Vec<I::Item>, n: usize) {
20 v.reserve(n);
21 v.extend(self.take(n));
22 }
23}
24
/// A utility type used to flatten input trees. See [`Stream::from_nested`].
///
/// Each node of a token tree is mapped to either a single flattened input or an iterator over further sub-trees.
pub enum Flat<I, Iter> {
    /// The input tree flattens into a single input.
    Single(I),
    /// The input tree flattens into many sub-trees.
    Many(Iter),
}
32
/// A type that represents a stream of input tokens. Unlike [`Iterator`], this type supports backtracking and a few
/// other features required by the crate.
#[allow(deprecated)]
pub struct Stream<
    'a,
    I,
    S: Span,
    // Defaults to a type-erased iterator; `?Sized` so the unsized `dyn` default is permitted as the last field.
    Iter: Iterator<Item = (I, S)> + ?Sized = dyn Iterator<Item = (I, S)> + 'a,
> {
    // Ties the stream to the `'a` lifetime without storing a borrow directly.
    pub(crate) phantom: PhantomData<&'a ()>,
    // Span representing the end of the input; also supplies the span context (see `span_since`).
    pub(crate) eoi: S,
    // Index (into `buffer`) of the next token to be consumed.
    pub(crate) offset: usize,
    // Tokens pulled from `iter` so far; retained to allow backtracking via `offset`.
    pub(crate) buffer: Vec<(I, S)>,
    // The underlying token source. Must be the last field because it may be unsized.
    pub(crate) iter: Iter,
}
48
/// A [`Stream`] that pulls tokens from a boxed [`Iterator`], erasing the concrete iterator type.
pub type BoxStream<'a, I, S> = Stream<'a, I, S, Box<dyn Iterator<Item = (I, S)> + 'a>>;
51
52impl<'a, I, S: Span, Iter: Iterator<Item = (I, S)>> Stream<'a, I, S, Iter> {
53 /// Create a new stream from an iterator of `(Token, Span)` pairs. A span representing the end of input must also
54 /// be provided.
55 ///
56 /// There is no requirement that spans must map exactly to the position of inputs in the stream, but they should
57 /// be non-overlapping and should appear in a monotonically-increasing order.
58 pub fn from_iter(eoi: S, iter: Iter) -> Self {
59 Self {
60 phantom: PhantomData,
61 eoi,
62 offset: 0,
63 buffer: Vec::new(),
64 iter,
65 }
66 }
67
68 /// Eagerly evaluate the token stream, returning an iterator over the tokens in it (but without modifying the
69 /// stream's state so that it can still be used for parsing).
70 ///
71 /// This is most useful when you wish to check the input of a parser during debugging.
72 pub fn fetch_tokens(&mut self) -> impl Iterator<Item = (I, S)> + '_
73 where
74 (I, S): Clone,
75 {
76 self.buffer.extend(&mut self.iter);
77 self.buffer.iter().cloned()
78 }
79}
80
81impl<'a, I: Clone, S: Span + 'a> BoxStream<'a, I, S> {
82 /// Create a new `Stream` from an iterator of nested tokens and a function that flattens them.
83 ///
84 /// It's not uncommon for compilers to perform delimiter parsing during the lexing stage (Rust does this!). When
85 /// this is done, the output of the lexing stage is usually a series of nested token trees. This functions allows
86 /// you to easily flatten such token trees into a linear token stream so that they can be parsed (Chumsky currently
87 /// only support parsing linear streams of inputs).
88 ///
89 /// For reference, [here](https://docs.rs/syn/0.11.1/syn/enum.TokenTree.html) is `syn`'s `TokenTree` type that it
90 /// uses when parsing Rust syntax.
91 ///
92 /// # Examples
93 ///
94 /// ```
95 /// # use chumsky::{Stream, BoxStream, Flat};
96 /// type Span = std::ops::Range<usize>;
97 ///
98 /// fn span_at(at: usize) -> Span { at..at + 1 }
99 ///
100 /// #[derive(Clone)]
101 /// enum Token {
102 /// Local(String),
103 /// Int(i64),
104 /// Bool(bool),
105 /// Add,
106 /// Sub,
107 /// OpenParen,
108 /// CloseParen,
109 /// OpenBrace,
110 /// CloseBrace,
111 /// // etc.
112 /// }
113 ///
114 /// enum Delimiter {
115 /// Paren, // ( ... )
116 /// Brace, // { ... }
117 /// }
118 ///
119 /// // The structure of this token tree is very similar to that which Rust uses.
120 /// // See: https://docs.rs/syn/0.11.1/syn/enum.TokenTree.html
121 /// enum TokenTree {
122 /// Token(Token),
123 /// Tree(Delimiter, Vec<(TokenTree, Span)>),
124 /// }
125 ///
126 /// // A function that turns a series of nested token trees into a linear stream that can be used for parsing.
127 /// fn flatten_tts(eoi: Span, token_trees: Vec<(TokenTree, Span)>) -> BoxStream<'static, Token, Span> {
128 /// use std::iter::once;
129 /// // Currently, this is quite an explicit process: it will likely become easier in future versions of Chumsky.
130 /// Stream::from_nested(
131 /// eoi,
132 /// token_trees.into_iter(),
133 /// |(tt, span)| match tt {
134 /// // For token trees that contain just a single token, no flattening needs to occur!
135 /// TokenTree::Token(token) => Flat::Single((token, span)),
136 /// // Flatten a parenthesised token tree into an iterator of the inner token trees, surrounded by parenthesis tokens
137 /// TokenTree::Tree(Delimiter::Paren, tree) => Flat::Many(once((TokenTree::Token(Token::OpenParen), span_at(span.start)))
138 /// .chain(tree.into_iter())
139 /// .chain(once((TokenTree::Token(Token::CloseParen), span_at(span.end - 1))))),
140 /// // Flatten a braced token tree into an iterator of the inner token trees, surrounded by brace tokens
141 /// TokenTree::Tree(Delimiter::Brace, tree) => Flat::Many(once((TokenTree::Token(Token::OpenBrace), span_at(span.start)))
142 /// .chain(tree.into_iter())
143 /// .chain(once((TokenTree::Token(Token::CloseBrace), span_at(span.end - 1))))),
144 /// }
145 /// )
146 /// }
147 /// ```
148 pub fn from_nested<
149 P: 'a,
150 Iter: Iterator<Item = (P, S)>,
151 Many: Iterator<Item = (P, S)>,
152 F: FnMut((P, S)) -> Flat<(I, S), Many> + 'a,
153 >(
154 eoi: S,
155 iter: Iter,
156 mut flatten: F,
157 ) -> Self {
158 let mut v: Vec<alloc::collections::VecDeque<(P, S)>> = vec![iter.collect()];
159 Self::from_iter(
160 eoi,
161 Box::new(core::iter::from_fn(move || loop {
162 if let Some(many) = v.last_mut() {
163 match many.pop_front().map(&mut flatten) {
164 Some(Flat::Single(input)) => break Some(input),
165 Some(Flat::Many(many)) => v.push(many.collect()),
166 None => {
167 v.pop();
168 }
169 }
170 } else {
171 break None;
172 }
173 })),
174 )
175 }
176}
177
178impl<'a, I: Clone, S: Span> Stream<'a, I, S> {
179 pub(crate) fn offset(&self) -> usize {
180 self.offset
181 }
182
183 pub(crate) fn save(&self) -> usize {
184 self.offset
185 }
186 pub(crate) fn revert(&mut self, offset: usize) {
187 self.offset = offset;
188 }
189
190 fn pull_until(&mut self, offset: usize) -> Option<&(I, S)> {
191 let additional = offset.saturating_sub(self.buffer.len()) + 1024;
192 #[allow(deprecated)]
193 (&mut &mut self.iter as &mut dyn StreamExtend<_>).extend(&mut self.buffer, additional);
194 self.buffer.get(offset)
195 }
196
197 pub(crate) fn skip_if(&mut self, f: impl FnOnce(&I) -> bool) -> bool {
198 match self.pull_until(self.offset).cloned() {
199 Some((out, _)) if f(&out) => {
200 self.offset += 1;
201 true
202 }
203 Some(_) => false,
204 None => false,
205 }
206 }
207
208 pub(crate) fn next(&mut self) -> (usize, S, Option<I>) {
209 match self.pull_until(self.offset).cloned() {
210 Some((out, span)) => {
211 self.offset += 1;
212 (self.offset - 1, span, Some(out))
213 }
214 None => (self.offset, self.eoi.clone(), None),
215 }
216 }
217
218 pub(crate) fn span_since(&mut self, start_offset: usize) -> S {
219 debug_assert!(
220 start_offset <= self.offset,
221 "{} > {}",
222 self.offset,
223 start_offset
224 );
225 let start = self
226 .pull_until(start_offset)
227 .as_ref()
228 .map(|(_, s)| s.start())
229 .unwrap_or_else(|| self.eoi.start());
230 let end = self
231 .pull_until(self.offset.saturating_sub(1).max(start_offset))
232 .as_ref()
233 .map(|(_, s)| s.end())
234 .unwrap_or_else(|| self.eoi.end());
235 S::new(self.eoi.context(), start..end)
236 }
237
238 pub(crate) fn attempt<R, F: FnOnce(&mut Self) -> (bool, R)>(&mut self, f: F) -> R {
239 let old_offset = self.offset;
240 let (commit, out) = f(self);
241 if !commit {
242 self.offset = old_offset;
243 }
244 out
245 }
246
247 pub(crate) fn try_parse<O, E, F: FnOnce(&mut Self) -> PResult<I, O, E>>(
248 &mut self,
249 f: F,
250 ) -> PResult<I, O, E> {
251 self.attempt(move |stream| {
252 let out = f(stream);
253 (out.1.is_ok(), out)
254 })
255 }
256}
257
258impl<'a> From<&'a str>
259 for Stream<'a, char, Range<usize>, Box<dyn Iterator<Item = (char, Range<usize>)> + 'a>>
260{
261 /// Please note that Chumsky currently uses character indices and not byte offsets in this impl. This is likely to
262 /// change in the future. If you wish to use byte offsets, you can do so with [`Stream::from_iter`].
263 fn from(s: &'a str) -> Self {
264 let len = s.chars().count();
265 Self::from_iter(
266 len..len,
267 Box::new(s.chars().enumerate().map(|(i, c)| (c, i..i + 1))),
268 )
269 }
270}
271
272impl<'a> From<String>
273 for Stream<'a, char, Range<usize>, Box<dyn Iterator<Item = (char, Range<usize>)>>>
274{
275 /// Please note that Chumsky currently uses character indices and not byte offsets in this impl. This is likely to
276 /// change in the future. If you wish to use byte offsets, you can do so with [`Stream::from_iter`].
277 fn from(s: String) -> Self {
278 let chars = s.chars().collect::<Vec<_>>();
279 Self::from_iter(
280 chars.len()..chars.len(),
281 Box::new(chars.into_iter().enumerate().map(|(i, c)| (c, i..i + 1))),
282 )
283 }
284}
285
286impl<'a, T: Clone> From<&'a [T]>
287 for Stream<'a, T, Range<usize>, Box<dyn Iterator<Item = (T, Range<usize>)> + 'a>>
288{
289 fn from(s: &'a [T]) -> Self {
290 let len = s.len();
291 Self::from_iter(
292 len..len,
293 Box::new(s.iter().cloned().enumerate().map(|(i, x)| (x, i..i + 1))),
294 )
295 }
296}
297
298impl<'a, T: Clone + 'a> From<Vec<T>>
299 for Stream<'a, T, Range<usize>, Box<dyn Iterator<Item = (T, Range<usize>)> + 'a>>
300{
301 fn from(s: Vec<T>) -> Self {
302 let len = s.len();
303 Self::from_iter(
304 len..len,
305 Box::new(s.into_iter().enumerate().map(|(i, x)| (x, i..i + 1))),
306 )
307 }
308}
309
310impl<'a, T: Clone + 'a, const N: usize> From<[T; N]>
311 for Stream<'a, T, Range<usize>, Box<dyn Iterator<Item = (T, Range<usize>)> + 'a>>
312{
313 fn from(s: [T; N]) -> Self {
314 Self::from_iter(
315 N..N,
316 Box::new(
317 core::array::IntoIter::new(s)
318 .enumerate()
319 .map(|(i, x)| (x, i..i + 1)),
320 ),
321 )
322 }
323}
324
325impl<'a, T: Clone, const N: usize> From<&'a [T; N]>
326 for Stream<'a, T, Range<usize>, Box<dyn Iterator<Item = (T, Range<usize>)> + 'a>>
327{
328 fn from(s: &'a [T; N]) -> Self {
329 Self::from_iter(
330 N..N,
331 Box::new(s.iter().cloned().enumerate().map(|(i, x)| (x, i..i + 1))),
332 )
333 }
334}
335
336// impl<'a, T: Clone, S: Clone + Span<Context = ()>> From<&'a [(T, S)]> for Stream<'a, T, S, Box<dyn Iterator<Item = (T, S)> + 'a>>
337// where S::Offset: Default
338// {
339// fn from(s: &'a [(T, S)]) -> Self {
340// Self::from_iter(Default::default(), Box::new(s.iter().cloned()))
341// }
342// }