chumsky/
text.rs

1//! Text-specific parsers and utilities.
2//!
//! *“Ford!” he said, “there's an infinite number of monkeys outside who want to talk to us about this script for
//! Hamlet they've worked out.”*
5//!
6//! The parsers in this module are generic over both Unicode ([`char`]) and ASCII ([`u8`]) characters. Most parsers take
7//! a type parameter, `C`, that can be either [`u8`] or [`char`] in order to handle either case.
8//!
9//! The [`TextParser`] trait is an extension on top of the main [`Parser`] trait that adds combinators unique to the
10//! parsing of text.
11
12use super::*;
13use core::iter::FromIterator;
14
/// The type of a parser that accepts (and ignores) any number of whitespace characters.
///
/// This is an alias for a `Custom` parser parameterised with a plain function pointer: the function receives the
/// input stream mutably and yields a parse result whose output is `()`.
pub type Padding<I, E> = Custom<fn(&mut StreamOf<I, E>) -> PResult<I, (), E>, E>;
17
// NOTE: `Padded` was previously sketched as a type alias built from `ThenIgnore`, `IgnoreThen` and `Padding`. It is
// now a dedicated wrapper type with its own `Parser` implementation, so the alias is no longer kept around.

/// A parser that accepts (and ignores) any number of whitespace characters before or after another pattern.
///
/// The wrapped parser is stored as the only field; its output is passed through unchanged. Values of this type are
/// normally created with [`TextParser::padded`].
#[must_use]
#[derive(Copy, Clone)]
pub struct Padded<A>(A);
31
32impl<C: Character, O, A: Parser<C, O, Error = E>, E: Error<C>> Parser<C, O> for Padded<A> {
33    type Error = E;
34
35    #[inline]
36    fn parse_inner<D: Debugger>(
37        &self,
38        debugger: &mut D,
39        stream: &mut StreamOf<C, E>,
40    ) -> PResult<C, O, E> {
41        while stream.skip_if(|c| c.is_whitespace()) {}
42        match self.0.parse_inner(debugger, stream) {
43            (a_errors, Ok((a_out, a_alt))) => {
44                while stream.skip_if(|c| c.is_whitespace()) {}
45                (a_errors, Ok((a_out, a_alt)))
46            }
47            (a_errors, Err(err)) => (a_errors, Err(err)),
48        }
49    }
50
51    #[inline]
52    fn parse_inner_verbose(&self, d: &mut Verbose, s: &mut StreamOf<C, E>) -> PResult<C, O, E> {
53        #[allow(deprecated)]
54        self.parse_inner(d, s)
55    }
56    #[inline]
57    fn parse_inner_silent(&self, d: &mut Silent, s: &mut StreamOf<C, E>) -> PResult<C, O, E> {
58        #[allow(deprecated)]
59        self.parse_inner(d, s)
60    }
61}
62
mod private {
    /// Marker supertrait used to seal `Character`.
    ///
    /// The trait itself is `pub`, but the module around it is private, so code outside this crate cannot name it.
    pub trait Sealed {}

    impl Sealed for char {}
    impl Sealed for u8 {}
}
69
70/// A trait implemented by textual character types (currently, [`u8`] and [`char`]).
71///
72/// Avoid implementing this trait yourself if you can: it's *very* likely to be expanded in future versions!
73pub trait Character: private::Sealed + Copy + PartialEq {
74    /// The default unsized [`str`]-like type of a linear sequence of this character.
75    ///
76    /// For [`char`], this is [`str`]. For [`u8`], this is [`[u8]`].
77    type Str: ?Sized + PartialEq;
78
79    /// The default type that this character collects into.
80    ///
81    /// For [`char`], this is [`String`]. For [`u8`], this is [`Vec<u8>`].
82    type Collection: Chain<Self> + FromIterator<Self> + AsRef<Self::Str> + 'static;
83
84    /// Convert the given ASCII character to this character type.
85    fn from_ascii(c: u8) -> Self;
86
87    /// Returns true if the character is canonically considered to be inline whitespace (i.e: not part of a newline).
88    fn is_inline_whitespace(&self) -> bool;
89
90    /// Returns true if the character is canonically considered to be whitespace.
91    fn is_whitespace(&self) -> bool;
92
93    /// Return the '0' digit of the character.
94    fn digit_zero() -> Self;
95
96    /// Returns true if the character is canonically considered to be a numeric digit.
97    fn is_digit(&self, radix: u32) -> bool;
98
99    /// Returns this character as a [`char`].
100    fn to_char(&self) -> char;
101}
102
103impl Character for u8 {
104    type Str = [u8];
105    type Collection = Vec<u8>;
106
107    fn from_ascii(c: u8) -> Self {
108        c
109    }
110    fn is_inline_whitespace(&self) -> bool {
111        *self == b' ' || *self == b'\t'
112    }
113    fn is_whitespace(&self) -> bool {
114        self.is_ascii_whitespace()
115    }
116    fn digit_zero() -> Self {
117        b'0'
118    }
119    fn is_digit(&self, radix: u32) -> bool {
120        (*self as char).is_digit(radix)
121    }
122    fn to_char(&self) -> char {
123        *self as char
124    }
125}
126
127impl Character for char {
128    type Str = str;
129    type Collection = String;
130
131    fn from_ascii(c: u8) -> Self {
132        c as char
133    }
134    fn is_inline_whitespace(&self) -> bool {
135        *self == ' ' || *self == '\t'
136    }
137    fn is_whitespace(&self) -> bool {
138        char::is_whitespace(*self)
139    }
140    fn digit_zero() -> Self {
141        '0'
142    }
143    fn is_digit(&self, radix: u32) -> bool {
144        char::is_digit(*self, radix)
145    }
146    fn to_char(&self) -> char {
147        *self
148    }
149}
150
151/// A trait containing text-specific functionality that extends the [`Parser`] trait.
152pub trait TextParser<I: Character, O>: Parser<I, O> {
153    /// Parse a pattern, ignoring any amount of whitespace both before and after the pattern.
154    ///
155    /// The output type of this parser is `O`, the same as the original parser.
156    ///
157    /// # Examples
158    ///
159    /// ```
160    /// # use chumsky::prelude::*;
161    /// let ident = text::ident::<_, Simple<char>>().padded();
162    ///
163    /// // A pattern with no whitespace surrounding it is accepted
164    /// assert_eq!(ident.parse("hello"), Ok("hello".to_string()));
165    /// // A pattern with arbitrary whitespace surrounding it is also accepted
166    /// assert_eq!(ident.parse(" \t \n  \t   world  \t  "), Ok("world".to_string()));
167    /// ```
168    fn padded(self) -> Padded<Self>
169    where
170        Self: Sized,
171    {
172        Padded(self)
173        // whitespace().ignore_then(self).then_ignore(whitespace())
174    }
175}
176
177impl<I: Character, O, P: Parser<I, O>> TextParser<I, O> for P {}
178
179/// A parser that accepts (and ignores) any number of whitespace characters.
180///
181/// This parser is a `Parser::Repeated` and so methods such as `at_least()` can be called on it.
182///
183/// The output type of this parser is `Vec<()>`.
184///
185/// # Examples
186///
187/// ```
188/// # use chumsky::prelude::*;
189/// let whitespace = text::whitespace::<_, Simple<char>>();
190///
191/// // Any amount of whitespace is parsed...
192/// assert_eq!(whitespace.parse("\t \n  \r "), Ok(vec![(), (), (), (), (), (), ()]));
193/// // ...including none at all!
194/// assert_eq!(whitespace.parse(""), Ok(vec![]));
195/// ```
196pub fn whitespace<'a, C: Character + 'a, E: Error<C> + 'a>(
197) -> Repeated<impl Parser<C, (), Error = E> + Copy + Clone + 'a> {
198    filter(|c: &C| c.is_whitespace()).ignored().repeated()
199}
200
201/// A parser that accepts (and ignores) any newline characters or character sequences.
202///
203/// The output type of this parser is `()`.
204///
205/// This parser is quite extensive, recognising:
206///
207/// - Line feed (`\n`)
208/// - Carriage return (`\r`)
209/// - Carriage return + line feed (`\r\n`)
210/// - Vertical tab (`\x0B`)
211/// - Form feed (`\x0C`)
212/// - Next line (`\u{0085}`)
213/// - Line separator (`\u{2028}`)
214/// - Paragraph separator (`\u{2029}`)
215///
216/// # Examples
217///
218/// ```
219/// # use chumsky::prelude::*;
220/// let newline = text::newline::<char, Simple<char>>()
221///     .then_ignore(end());
222///
223/// assert_eq!(newline.parse("\n"), Ok(()));
224/// assert_eq!(newline.parse("\r"), Ok(()));
225/// assert_eq!(newline.parse("\r\n"), Ok(()));
226/// assert_eq!(newline.parse("\x0B"), Ok(()));
227/// assert_eq!(newline.parse("\x0C"), Ok(()));
228/// assert_eq!(newline.parse("\u{0085}"), Ok(()));
229/// assert_eq!(newline.parse("\u{2028}"), Ok(()));
230/// assert_eq!(newline.parse("\u{2029}"), Ok(()));
231/// ```
232#[must_use]
233pub fn newline<'a, C: Character + 'a, E: Error<C> + 'a>(
234) -> impl Parser<C, (), Error = E> + Copy + Clone + 'a {
235    just(C::from_ascii(b'\r'))
236        .or_not()
237        .ignore_then(just(C::from_ascii(b'\n')))
238        .or(filter(|c: &C| {
239            [
240                '\r',       // Carriage return
241                '\x0B',     // Vertical tab
242                '\x0C',     // Form feed
243                '\u{0085}', // Next line
244                '\u{2028}', // Line separator
245                '\u{2029}', // Paragraph separator
246            ]
247            .contains(&c.to_char())
248        }))
249        .ignored()
250}
251
252/// A parser that accepts one or more ASCII digits.
253///
254/// The output type of this parser is [`Character::Collection`] (i.e: [`String`] when `C` is [`char`], and [`Vec<u8>`]
255/// when `C` is [`u8`]).
256///
257/// The `radix` parameter functions identically to [`char::is_digit`]. If in doubt, choose `10`.
258///
259/// # Examples
260///
261/// ```
262/// # use chumsky::prelude::*;
263/// let digits = text::digits::<_, Simple<char>>(10);
264///
265/// assert_eq!(digits.parse("0"), Ok("0".to_string()));
266/// assert_eq!(digits.parse("1"), Ok("1".to_string()));
267/// assert_eq!(digits.parse("01234"), Ok("01234".to_string()));
268/// assert_eq!(digits.parse("98345"), Ok("98345".to_string()));
269/// // A string of zeroes is still valid. Use `int` if this is not desirable.
270/// assert_eq!(digits.parse("0000"), Ok("0000".to_string()));
271/// assert!(digits.parse("").is_err());
272/// ```
273#[must_use]
274pub fn digits<C: Character, E: Error<C>>(
275    radix: u32,
276) -> impl Parser<C, C::Collection, Error = E> + Copy + Clone {
277    filter(move |c: &C| c.is_digit(radix))
278        .repeated()
279        .at_least(1)
280        .collect()
281}
282
283/// A parser that accepts a non-negative integer.
284///
285/// An integer is defined as a non-empty sequence of ASCII digits, where the first digit is non-zero or the sequence
286/// has length one.
287///
288/// The output type of this parser is [`Character::Collection`] (i.e: [`String`] when `C` is [`char`], and [`Vec<u8>`]
289/// when `C` is [`u8`]).
290///
291/// The `radix` parameter functions identically to [`char::is_digit`]. If in doubt, choose `10`.
292///
293/// # Examples
294///
295/// ```
296/// # use chumsky::prelude::*;
297/// let dec = text::int::<_, Simple<char>>(10)
298///     .then_ignore(end());
299///
300/// assert_eq!(dec.parse("0"), Ok("0".to_string()));
301/// assert_eq!(dec.parse("1"), Ok("1".to_string()));
302/// assert_eq!(dec.parse("1452"), Ok("1452".to_string()));
303/// // No leading zeroes are permitted!
304/// assert!(dec.parse("04").is_err());
305///
306/// let hex = text::int::<_, Simple<char>>(16)
307///     .then_ignore(end());
308///
309/// assert_eq!(hex.parse("2A"), Ok("2A".to_string()));
310/// assert_eq!(hex.parse("d"), Ok("d".to_string()));
311/// assert_eq!(hex.parse("b4"), Ok("b4".to_string()));
312/// assert!(hex.parse("0B").is_err());
313/// ```
314#[must_use]
315pub fn int<C: Character, E: Error<C>>(
316    radix: u32,
317) -> impl Parser<C, C::Collection, Error = E> + Copy + Clone {
318    filter(move |c: &C| c.is_digit(radix) && c != &C::digit_zero())
319        .map(Some)
320        .chain::<C, Vec<_>, _>(filter(move |c: &C| c.is_digit(radix)).repeated())
321        .collect()
322        .or(just(C::digit_zero()).map(|c| core::iter::once(c).collect()))
323}
324
325/// A parser that accepts a C-style identifier.
326///
327/// The output type of this parser is [`Character::Collection`] (i.e: [`String`] when `C` is [`char`], and [`Vec<u8>`]
328/// when `C` is [`u8`]).
329///
330/// An identifier is defined as an ASCII alphabetic character or an underscore followed by any number of alphanumeric
331/// characters or underscores. The regex pattern for it is `[a-zA-Z_][a-zA-Z0-9_]*`.
332#[must_use]
333pub fn ident<C: Character, E: Error<C>>() -> impl Parser<C, C::Collection, Error = E> + Copy + Clone
334{
335    filter(|c: &C| c.to_char().is_ascii_alphabetic() || c.to_char() == '_')
336        .map(Some)
337        .chain::<C, Vec<_>, _>(
338            filter(|c: &C| c.to_char().is_ascii_alphanumeric() || c.to_char() == '_').repeated(),
339        )
340        .collect()
341}
342
343/// Like [`ident`], but only accepts an exact identifier while ignoring trailing identifier characters.
344///
345/// The output type of this parser is `()`.
346///
347/// # Examples
348///
349/// ```
350/// # use chumsky::prelude::*;
351/// let def = text::keyword::<_, _, Simple<char>>("def");
352///
353/// // Exactly 'def' was found
354/// assert_eq!(def.parse("def"), Ok(()));
355/// // Exactly 'def' was found, with non-identifier trailing characters
356/// assert_eq!(def.parse("def(foo, bar)"), Ok(()));
357/// // 'def' was found, but only as part of a larger identifier, so this fails to parse
358/// assert!(def.parse("define").is_err());
359/// ```
360#[must_use]
361pub fn keyword<'a, C: Character + 'a, S: AsRef<C::Str> + 'a + Clone, E: Error<C> + 'a>(
362    keyword: S,
363) -> impl Parser<C, (), Error = E> + Clone + 'a {
364    // TODO: use .filter(...), improve error messages
365    ident().try_map(move |s: C::Collection, span| {
366        if s.as_ref() == keyword.as_ref() {
367            Ok(())
368        } else {
369            Err(E::expected_input_found(span, None, None))
370        }
371    })
372}
373
374/// A parser that consumes text and generates tokens using semantic whitespace rules and the given token parser.
375///
376/// Also required is a function that collects a [`Vec`] of tokens into a whitespace-indicated token tree.
377#[must_use]
378pub fn semantic_indentation<'a, C, Tok, T, F, E: Error<C> + 'a>(
379    token: T,
380    make_group: F,
381) -> impl Parser<C, Vec<Tok>, Error = E> + Clone + 'a
382where
383    C: Character + 'a,
384    Tok: 'a,
385    T: Parser<C, Tok, Error = E> + Clone + 'a,
386    F: Fn(Vec<Tok>, E::Span) -> Tok + Clone + 'a,
387{
388    let line_ws = filter(|c: &C| c.is_inline_whitespace());
389
390    let line = token.padded_by(line_ws.ignored().repeated()).repeated();
391
392    let lines = line_ws
393        .repeated()
394        .then(line.map_with_span(|line, span| (line, span)))
395        .separated_by(newline())
396        .padded();
397
398    lines.map(move |lines| {
399        fn collapse<C, Tok, F, S>(
400            mut tree: Vec<(Vec<C>, Vec<Tok>, Option<S>)>,
401            make_group: &F,
402        ) -> Option<Tok>
403        where
404            F: Fn(Vec<Tok>, S) -> Tok,
405        {
406            while let Some((_, tts, line_span)) = tree.pop() {
407                let tt = make_group(tts, line_span?);
408                if let Some(last) = tree.last_mut() {
409                    last.1.push(tt);
410                } else {
411                    return Some(tt);
412                }
413            }
414            None
415        }
416
417        let mut nesting = vec![(Vec::new(), Vec::new(), None)];
418        for (indent, (mut line, line_span)) in lines {
419            let mut indent = indent.as_slice();
420            let mut i = 0;
421            while let Some(tail) = nesting
422                .get(i)
423                .and_then(|(n, _, _)| indent.strip_prefix(n.as_slice()))
424            {
425                indent = tail;
426                i += 1;
427            }
428            if let Some(tail) = collapse(nesting.split_off(i), &make_group) {
429                nesting.last_mut().unwrap().1.push(tail);
430            }
431            if !indent.is_empty() {
432                nesting.push((indent.to_vec(), line, Some(line_span)));
433            } else {
434                nesting.last_mut().unwrap().1.append(&mut line);
435            }
436        }
437
438        nesting.remove(0).1
439    })
440}