chumsky/text.rs
1//! Text-specific parsers and utilities.
2//!
//! *"Ford!" he said, "there's an infinite number of monkeys outside who want to talk to us about this script for
//! Hamlet they've worked out."*
5//!
6//! The parsers in this module are generic over both Unicode ([`char`]) and ASCII ([`u8`]) characters. Most parsers take
7//! a type parameter, `C`, that can be either [`u8`] or [`char`] in order to handle either case.
8//!
9//! The [`TextParser`] trait is an extension on top of the main [`Parser`] trait that adds combinators unique to the
10//! parsing of text.
11
12use super::*;
13use core::iter::FromIterator;
14
/// The type of a parser that accepts (and ignores) any number of whitespace characters.
///
/// This is an alias for a [`Custom`] parser driven by a plain function pointer: the function receives the input
/// stream mutably and yields `()` as its output.
pub type Padding<I, E> = Custom<fn(&mut StreamOf<I, E>) -> PResult<I, (), E>, E>;
17
// NOTE: the alias below is disabled. It is written with plain `//` comments (rather than `///`) so that its
// description is not picked up as documentation for the next item.
// NOTE(review): this looks like an earlier, combinator-based definition of `Padded` — confirm before deleting.
//
// The type of a parser that accepts (and ignores) any number of whitespace characters before or after another
// pattern.
// pub type Padded<P, I, O> = ThenIgnore<
//     IgnoreThen<Padding<I, <P as Parser<I, O>>::Error>, P, (), O>,
//     Padding<I, <P as Parser<I, O>>::Error>,
//     O,
//     (),
// >;

/// A parser that accepts (and ignores) any number of whitespace characters before or after another pattern.
///
/// The wrapped parser is stored in the single tuple field. See [`TextParser::padded`], which constructs this type.
#[must_use]
#[derive(Copy, Clone)]
pub struct Padded<A>(A);
31
32impl<C: Character, O, A: Parser<C, O, Error = E>, E: Error<C>> Parser<C, O> for Padded<A> {
33 type Error = E;
34
35 #[inline]
36 fn parse_inner<D: Debugger>(
37 &self,
38 debugger: &mut D,
39 stream: &mut StreamOf<C, E>,
40 ) -> PResult<C, O, E> {
41 while stream.skip_if(|c| c.is_whitespace()) {}
42 match self.0.parse_inner(debugger, stream) {
43 (a_errors, Ok((a_out, a_alt))) => {
44 while stream.skip_if(|c| c.is_whitespace()) {}
45 (a_errors, Ok((a_out, a_alt)))
46 }
47 (a_errors, Err(err)) => (a_errors, Err(err)),
48 }
49 }
50
51 #[inline]
52 fn parse_inner_verbose(&self, d: &mut Verbose, s: &mut StreamOf<C, E>) -> PResult<C, O, E> {
53 #[allow(deprecated)]
54 self.parse_inner(d, s)
55 }
56 #[inline]
57 fn parse_inner_silent(&self, d: &mut Silent, s: &mut StreamOf<C, E>) -> PResult<C, O, E> {
58 #[allow(deprecated)]
59 self.parse_inner(d, s)
60 }
61}
62
// Non-public module holding the `Sealed` marker trait. `Sealed` is a supertrait of `Character`, and this module
// is not exported, so `Sealed` cannot be named through it from outside the parent module.
mod private {
    /// Marker trait implemented only for the character types supported by this module.
    pub trait Sealed {}

    impl Sealed for u8 {}
    impl Sealed for char {}
}
69
/// A trait implemented by textual character types (currently, [`u8`] and [`char`]).
///
/// Avoid implementing this trait yourself if you can: it's *very* likely to be expanded in future versions!
///
/// Note that the trait requires `private::Sealed`, a marker trait that lives in a non-public module.
pub trait Character: private::Sealed + Copy + PartialEq {
    /// The default unsized [`str`]-like type of a linear sequence of this character.
    ///
    /// For [`char`], this is [`str`]. For [`u8`], this is `[u8]`.
    type Str: ?Sized + PartialEq;

    /// The default type that this character collects into.
    ///
    /// For [`char`], this is [`String`]. For [`u8`], this is [`Vec<u8>`].
    type Collection: Chain<Self> + FromIterator<Self> + AsRef<Self::Str> + 'static;

    /// Convert the given ASCII character to this character type.
    fn from_ascii(c: u8) -> Self;

    /// Returns true if the character is canonically considered to be inline whitespace (i.e: not part of a newline).
    ///
    /// Both implementations in this module accept exactly the space and horizontal tab characters.
    fn is_inline_whitespace(&self) -> bool;

    /// Returns true if the character is canonically considered to be whitespace.
    ///
    /// For [`u8`] this is the ASCII definition of whitespace; for [`char`] it is [`char::is_whitespace`].
    fn is_whitespace(&self) -> bool;

    /// Return the '0' digit of the character.
    fn digit_zero() -> Self;

    /// Returns true if the character is canonically considered to be a numeric digit.
    ///
    /// The `radix` parameter is passed on to [`char::is_digit`] by both implementations in this module.
    fn is_digit(&self, radix: u32) -> bool;

    /// Returns this character as a [`char`].
    fn to_char(&self) -> char;
}
102
103impl Character for u8 {
104 type Str = [u8];
105 type Collection = Vec<u8>;
106
107 fn from_ascii(c: u8) -> Self {
108 c
109 }
110 fn is_inline_whitespace(&self) -> bool {
111 *self == b' ' || *self == b'\t'
112 }
113 fn is_whitespace(&self) -> bool {
114 self.is_ascii_whitespace()
115 }
116 fn digit_zero() -> Self {
117 b'0'
118 }
119 fn is_digit(&self, radix: u32) -> bool {
120 (*self as char).is_digit(radix)
121 }
122 fn to_char(&self) -> char {
123 *self as char
124 }
125}
126
127impl Character for char {
128 type Str = str;
129 type Collection = String;
130
131 fn from_ascii(c: u8) -> Self {
132 c as char
133 }
134 fn is_inline_whitespace(&self) -> bool {
135 *self == ' ' || *self == '\t'
136 }
137 fn is_whitespace(&self) -> bool {
138 char::is_whitespace(*self)
139 }
140 fn digit_zero() -> Self {
141 '0'
142 }
143 fn is_digit(&self, radix: u32) -> bool {
144 char::is_digit(*self, radix)
145 }
146 fn to_char(&self) -> char {
147 *self
148 }
149}
150
/// A trait containing text-specific functionality that extends the [`Parser`] trait.
///
/// The input type `I` must implement [`Character`].
pub trait TextParser<I: Character, O>: Parser<I, O> {
    /// Parse a pattern, ignoring any amount of whitespace both before and after the pattern.
    ///
    /// The output type of this parser is `O`, the same as the original parser.
    ///
    /// This simply wraps `self` in [`Padded`].
    ///
    /// # Examples
    ///
    /// ```
    /// # use chumsky::prelude::*;
    /// let ident = text::ident::<_, Simple<char>>().padded();
    ///
    /// // A pattern with no whitespace surrounding it is accepted
    /// assert_eq!(ident.parse("hello"), Ok("hello".to_string()));
    /// // A pattern with arbitrary whitespace surrounding it is also accepted
    /// assert_eq!(ident.parse(" \t \n \t world \t "), Ok("world".to_string()));
    /// ```
    fn padded(self) -> Padded<Self>
    where
        Self: Sized,
    {
        Padded(self)
        // Combinator-based alternative, currently disabled:
        // whitespace().ignore_then(self).then_ignore(whitespace())
    }
}
176
// Blanket implementation: every parser whose input type is a `Character` gets the `TextParser` methods.
impl<I: Character, O, P: Parser<I, O>> TextParser<I, O> for P {}
178
179/// A parser that accepts (and ignores) any number of whitespace characters.
180///
181/// This parser is a `Parser::Repeated` and so methods such as `at_least()` can be called on it.
182///
183/// The output type of this parser is `Vec<()>`.
184///
185/// # Examples
186///
187/// ```
188/// # use chumsky::prelude::*;
189/// let whitespace = text::whitespace::<_, Simple<char>>();
190///
191/// // Any amount of whitespace is parsed...
192/// assert_eq!(whitespace.parse("\t \n \r "), Ok(vec![(), (), (), (), (), (), ()]));
193/// // ...including none at all!
194/// assert_eq!(whitespace.parse(""), Ok(vec![]));
195/// ```
196pub fn whitespace<'a, C: Character + 'a, E: Error<C> + 'a>(
197) -> Repeated<impl Parser<C, (), Error = E> + Copy + Clone + 'a> {
198 filter(|c: &C| c.is_whitespace()).ignored().repeated()
199}
200
201/// A parser that accepts (and ignores) any newline characters or character sequences.
202///
203/// The output type of this parser is `()`.
204///
205/// This parser is quite extensive, recognising:
206///
207/// - Line feed (`\n`)
208/// - Carriage return (`\r`)
209/// - Carriage return + line feed (`\r\n`)
210/// - Vertical tab (`\x0B`)
211/// - Form feed (`\x0C`)
212/// - Next line (`\u{0085}`)
213/// - Line separator (`\u{2028}`)
214/// - Paragraph separator (`\u{2029}`)
215///
216/// # Examples
217///
218/// ```
219/// # use chumsky::prelude::*;
220/// let newline = text::newline::<char, Simple<char>>()
221/// .then_ignore(end());
222///
223/// assert_eq!(newline.parse("\n"), Ok(()));
224/// assert_eq!(newline.parse("\r"), Ok(()));
225/// assert_eq!(newline.parse("\r\n"), Ok(()));
226/// assert_eq!(newline.parse("\x0B"), Ok(()));
227/// assert_eq!(newline.parse("\x0C"), Ok(()));
228/// assert_eq!(newline.parse("\u{0085}"), Ok(()));
229/// assert_eq!(newline.parse("\u{2028}"), Ok(()));
230/// assert_eq!(newline.parse("\u{2029}"), Ok(()));
231/// ```
232#[must_use]
233pub fn newline<'a, C: Character + 'a, E: Error<C> + 'a>(
234) -> impl Parser<C, (), Error = E> + Copy + Clone + 'a {
235 just(C::from_ascii(b'\r'))
236 .or_not()
237 .ignore_then(just(C::from_ascii(b'\n')))
238 .or(filter(|c: &C| {
239 [
240 '\r', // Carriage return
241 '\x0B', // Vertical tab
242 '\x0C', // Form feed
243 '\u{0085}', // Next line
244 '\u{2028}', // Line separator
245 '\u{2029}', // Paragraph separator
246 ]
247 .contains(&c.to_char())
248 }))
249 .ignored()
250}
251
252/// A parser that accepts one or more ASCII digits.
253///
254/// The output type of this parser is [`Character::Collection`] (i.e: [`String`] when `C` is [`char`], and [`Vec<u8>`]
255/// when `C` is [`u8`]).
256///
257/// The `radix` parameter functions identically to [`char::is_digit`]. If in doubt, choose `10`.
258///
259/// # Examples
260///
261/// ```
262/// # use chumsky::prelude::*;
263/// let digits = text::digits::<_, Simple<char>>(10);
264///
265/// assert_eq!(digits.parse("0"), Ok("0".to_string()));
266/// assert_eq!(digits.parse("1"), Ok("1".to_string()));
267/// assert_eq!(digits.parse("01234"), Ok("01234".to_string()));
268/// assert_eq!(digits.parse("98345"), Ok("98345".to_string()));
269/// // A string of zeroes is still valid. Use `int` if this is not desirable.
270/// assert_eq!(digits.parse("0000"), Ok("0000".to_string()));
271/// assert!(digits.parse("").is_err());
272/// ```
273#[must_use]
274pub fn digits<C: Character, E: Error<C>>(
275 radix: u32,
276) -> impl Parser<C, C::Collection, Error = E> + Copy + Clone {
277 filter(move |c: &C| c.is_digit(radix))
278 .repeated()
279 .at_least(1)
280 .collect()
281}
282
283/// A parser that accepts a non-negative integer.
284///
285/// An integer is defined as a non-empty sequence of ASCII digits, where the first digit is non-zero or the sequence
286/// has length one.
287///
288/// The output type of this parser is [`Character::Collection`] (i.e: [`String`] when `C` is [`char`], and [`Vec<u8>`]
289/// when `C` is [`u8`]).
290///
291/// The `radix` parameter functions identically to [`char::is_digit`]. If in doubt, choose `10`.
292///
293/// # Examples
294///
295/// ```
296/// # use chumsky::prelude::*;
297/// let dec = text::int::<_, Simple<char>>(10)
298/// .then_ignore(end());
299///
300/// assert_eq!(dec.parse("0"), Ok("0".to_string()));
301/// assert_eq!(dec.parse("1"), Ok("1".to_string()));
302/// assert_eq!(dec.parse("1452"), Ok("1452".to_string()));
303/// // No leading zeroes are permitted!
304/// assert!(dec.parse("04").is_err());
305///
306/// let hex = text::int::<_, Simple<char>>(16)
307/// .then_ignore(end());
308///
309/// assert_eq!(hex.parse("2A"), Ok("2A".to_string()));
310/// assert_eq!(hex.parse("d"), Ok("d".to_string()));
311/// assert_eq!(hex.parse("b4"), Ok("b4".to_string()));
312/// assert!(hex.parse("0B").is_err());
313/// ```
314#[must_use]
315pub fn int<C: Character, E: Error<C>>(
316 radix: u32,
317) -> impl Parser<C, C::Collection, Error = E> + Copy + Clone {
318 filter(move |c: &C| c.is_digit(radix) && c != &C::digit_zero())
319 .map(Some)
320 .chain::<C, Vec<_>, _>(filter(move |c: &C| c.is_digit(radix)).repeated())
321 .collect()
322 .or(just(C::digit_zero()).map(|c| core::iter::once(c).collect()))
323}
324
325/// A parser that accepts a C-style identifier.
326///
327/// The output type of this parser is [`Character::Collection`] (i.e: [`String`] when `C` is [`char`], and [`Vec<u8>`]
328/// when `C` is [`u8`]).
329///
330/// An identifier is defined as an ASCII alphabetic character or an underscore followed by any number of alphanumeric
331/// characters or underscores. The regex pattern for it is `[a-zA-Z_][a-zA-Z0-9_]*`.
332#[must_use]
333pub fn ident<C: Character, E: Error<C>>() -> impl Parser<C, C::Collection, Error = E> + Copy + Clone
334{
335 filter(|c: &C| c.to_char().is_ascii_alphabetic() || c.to_char() == '_')
336 .map(Some)
337 .chain::<C, Vec<_>, _>(
338 filter(|c: &C| c.to_char().is_ascii_alphanumeric() || c.to_char() == '_').repeated(),
339 )
340 .collect()
341}
342
343/// Like [`ident`], but only accepts an exact identifier while ignoring trailing identifier characters.
344///
345/// The output type of this parser is `()`.
346///
347/// # Examples
348///
349/// ```
350/// # use chumsky::prelude::*;
351/// let def = text::keyword::<_, _, Simple<char>>("def");
352///
353/// // Exactly 'def' was found
354/// assert_eq!(def.parse("def"), Ok(()));
355/// // Exactly 'def' was found, with non-identifier trailing characters
356/// assert_eq!(def.parse("def(foo, bar)"), Ok(()));
357/// // 'def' was found, but only as part of a larger identifier, so this fails to parse
358/// assert!(def.parse("define").is_err());
359/// ```
360#[must_use]
361pub fn keyword<'a, C: Character + 'a, S: AsRef<C::Str> + 'a + Clone, E: Error<C> + 'a>(
362 keyword: S,
363) -> impl Parser<C, (), Error = E> + Clone + 'a {
364 // TODO: use .filter(...), improve error messages
365 ident().try_map(move |s: C::Collection, span| {
366 if s.as_ref() == keyword.as_ref() {
367 Ok(())
368 } else {
369 Err(E::expected_input_found(span, None, None))
370 }
371 })
372}
373
374/// A parser that consumes text and generates tokens using semantic whitespace rules and the given token parser.
375///
376/// Also required is a function that collects a [`Vec`] of tokens into a whitespace-indicated token tree.
377#[must_use]
378pub fn semantic_indentation<'a, C, Tok, T, F, E: Error<C> + 'a>(
379 token: T,
380 make_group: F,
381) -> impl Parser<C, Vec<Tok>, Error = E> + Clone + 'a
382where
383 C: Character + 'a,
384 Tok: 'a,
385 T: Parser<C, Tok, Error = E> + Clone + 'a,
386 F: Fn(Vec<Tok>, E::Span) -> Tok + Clone + 'a,
387{
388 let line_ws = filter(|c: &C| c.is_inline_whitespace());
389
390 let line = token.padded_by(line_ws.ignored().repeated()).repeated();
391
392 let lines = line_ws
393 .repeated()
394 .then(line.map_with_span(|line, span| (line, span)))
395 .separated_by(newline())
396 .padded();
397
398 lines.map(move |lines| {
399 fn collapse<C, Tok, F, S>(
400 mut tree: Vec<(Vec<C>, Vec<Tok>, Option<S>)>,
401 make_group: &F,
402 ) -> Option<Tok>
403 where
404 F: Fn(Vec<Tok>, S) -> Tok,
405 {
406 while let Some((_, tts, line_span)) = tree.pop() {
407 let tt = make_group(tts, line_span?);
408 if let Some(last) = tree.last_mut() {
409 last.1.push(tt);
410 } else {
411 return Some(tt);
412 }
413 }
414 None
415 }
416
417 let mut nesting = vec![(Vec::new(), Vec::new(), None)];
418 for (indent, (mut line, line_span)) in lines {
419 let mut indent = indent.as_slice();
420 let mut i = 0;
421 while let Some(tail) = nesting
422 .get(i)
423 .and_then(|(n, _, _)| indent.strip_prefix(n.as_slice()))
424 {
425 indent = tail;
426 i += 1;
427 }
428 if let Some(tail) = collapse(nesting.split_off(i), &make_group) {
429 nesting.last_mut().unwrap().1.push(tail);
430 }
431 if !indent.is_empty() {
432 nesting.push((indent.to_vec(), line, Some(line_span)));
433 } else {
434 nesting.last_mut().unwrap().1.append(&mut line);
435 }
436 }
437
438 nesting.remove(0).1
439 })
440}