// minijinja/compiler/lexer.rs

1use std::borrow::Cow;
2use std::ops::ControlFlow;
3
4use crate::compiler::tokens::{Span, Token};
5use crate::error::{Error, ErrorKind};
6use crate::syntax::SyntaxConfig;
7use crate::utils::{memchr, memstr, unescape};
8
/// Internal config struct to control whitespace in the engine.
#[derive(Copy, Clone, Debug, Default)]
pub struct WhitespaceConfig {
    // when false, a single trailing newline is stripped from the source (see `Tokenizer::new`)
    pub keep_trailing_newline: bool,
    // strip non-newline whitespace in front of a block tag that starts a line
    pub lstrip_blocks: bool,
    // consume the newline that directly follows a block tag
    pub trim_blocks: bool,
}
16
/// Tokenizes jinja templates.
pub struct Tokenizer<'s> {
    // lexer state stack; tags push/pop states so nesting resolves correctly
    stack: Vec<LexerState>,
    // template source (possibly with the trailing newline removed)
    source: &'s str,
    // filename used for error reporting
    filename: &'s str,
    // current position, maintained by `advance` for span construction
    current_line: u16,
    current_col: u16,
    current_offset: usize,
    // set when a `-` whitespace control requests trimming of upcoming template data
    trim_leading_whitespace: bool,
    // start marker discovered by `tokenize_root`, consumed on the next pass
    pending_start_marker: Option<(StartMarker, usize)>,
    // open parens/brackets/braces; end delimiters are only honored at balance 0
    paren_balance: isize,
    syntax_config: SyntaxConfig,
    ws_config: WhitespaceConfig,
}
31
/// Current mode of the tokenizer (top of the state stack).
enum LexerState {
    // outside of tags, emitting raw template data
    Template,
    // inside a variable tag
    Variable,
    // inside a block tag
    Block,
    // inside a line statement (custom syntax only)
    #[cfg(feature = "custom_syntax")]
    LineStatement,
}
39
/// Utility enum that defines a marker.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum StartMarker {
    // `{{` style variable markers (default syntax)
    Variable,
    // `{%` style block markers (default syntax)
    Block,
    // `{#` style comment markers (default syntax)
    Comment,
    // prefix that starts a one-line statement (custom syntax only)
    #[cfg(feature = "custom_syntax")]
    LineStatement,
    // prefix that starts a comment running to end of line (custom syntax only)
    #[cfg(feature = "custom_syntax")]
    LineComment,
}
51
/// What ends this block tokenization?
#[derive(Debug, Copy, Clone)]
enum BlockSentinel {
    // terminated by the variable end delimiter
    Variable,
    // terminated by the block end delimiter
    Block,
    // terminated by the end of the line (custom syntax only)
    #[cfg(feature = "custom_syntax")]
    LineStatement,
}
60
/// Whitespace control mode attached to a tag delimiter.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum Whitespace {
    Default,
    Preserve,
    Remove,
}

impl Whitespace {
    /// Maps the optional control byte after a delimiter to its mode:
    /// `-` removes surrounding whitespace, `+` preserves it, anything
    /// else (or end of input) means default behavior.
    fn from_byte(b: Option<u8>) -> Whitespace {
        if b == Some(b'-') {
            Whitespace::Remove
        } else if b == Some(b'+') {
            Whitespace::Preserve
        } else {
            Whitespace::Default
        }
    }

    /// Number of source bytes the control character occupies.
    fn len(&self) -> usize {
        if matches!(self, Whitespace::Default) {
            0
        } else {
            1
        }
    }
}
84
85fn find_start_marker_memchr(a: &str) -> Option<(usize, StartMarker, usize, Whitespace)> {
86    let bytes = a.as_bytes();
87    let mut offset = 0;
88    loop {
89        let idx = some!(memchr(&bytes[offset..], b'{'));
90        let marker = match bytes.get(offset + idx + 1).copied() {
91            Some(b'{') => StartMarker::Variable,
92            Some(b'%') => StartMarker::Block,
93            Some(b'#') => StartMarker::Comment,
94            _ => {
95                offset += idx + 1;
96                continue;
97            }
98        };
99        let ws = Whitespace::from_byte(bytes.get(offset + idx + 2).copied());
100        return Some((offset + idx, marker, 2 + ws.len(), ws));
101    }
102}
103
/// Locates the next start marker in `a` at or after `offset`.
///
/// The returned tuple is `(start, marker, skip, ws)` where `start` is
/// relative to `offset`, `skip` is the byte length of the marker including
/// an optional whitespace control character, and `ws` is that control mode.
#[cfg(feature = "custom_syntax")]
fn find_start_marker(
    a: &str,
    offset: usize,
    syntax_config: &SyntaxConfig,
) -> Option<(usize, StartMarker, usize, Whitespace)> {
    // If we have a custom delimiter we need to use the aho-corasick
    // otherwise we can use internal memchr.
    let Some(ref ac) = syntax_config.aho_corasick else {
        return find_start_marker_memchr(&a[offset..]);
    };

    let bytes = &a.as_bytes()[offset..];
    let mut state = aho_corasick::automaton::OverlappingState::start();
    let mut longest_match = None::<(usize, StartMarker, usize, Whitespace)>;

    loop {
        ac.find_overlapping(bytes, &mut state);
        let m = match state.get_match() {
            None => break,
            Some(m) => m,
        };

        let marker = syntax_config.pattern_to_marker(m.pattern());
        let ws = if matches!(marker, StartMarker::LineStatement) {
            // line statements only count when nothing but spaces/tabs sit
            // between the last newline (or start of input) and the prefix
            let prefix = &a.as_bytes()[..offset + m.start()];
            if matches!(
                prefix
                    .iter()
                    .copied()
                    .rev()
                    .find(|&x| x != b' ' && x != b'\t'),
                None | Some(b'\r') | Some(b'\n')
            ) {
                Whitespace::Default
            } else {
                continue;
            }
        } else {
            Whitespace::from_byte(bytes.get(m.start() + m.len()).copied())
        };
        let new_match = (m.start(), marker, m.len() + ws.len(), ws);

        // Overlapping matches at the same start offset keep refining the
        // candidate (so the longest delimiter wins); once a match begins
        // past the current best start we can stop searching.
        if longest_match.as_ref().is_some_and(|x| new_match.0 > x.0) {
            break;
        }
        longest_match = Some(new_match);
    }

    longest_match
}
155
/// Locates the next start marker in `a` at or after `offset`.
///
/// Without the `custom_syntax` feature only the default delimiters exist,
/// so this always defers to the memchr based scanner.  The returned start
/// position is relative to `offset`.
#[cfg(not(feature = "custom_syntax"))]
fn find_start_marker(
    a: &str,
    offset: usize,
    _syntax_config: &SyntaxConfig,
) -> Option<(usize, StartMarker, usize, Whitespace)> {
    find_start_marker_memchr(&a[offset..])
}
164
/// Returns the byte length of the identifier at the start of `s`.
///
/// Identifiers follow Unicode XID rules, with `_` additionally allowed
/// anywhere, including as the first character.
#[cfg(feature = "unicode")]
fn lex_identifier(s: &str) -> usize {
    let mut len = 0;
    for (idx, c) in s.char_indices() {
        let is_ident_char = c == '_'
            || if idx == 0 {
                unicode_ident::is_xid_start(c)
            } else {
                unicode_ident::is_xid_continue(c)
            };
        if !is_ident_char {
            break;
        }
        len += c.len_utf8();
    }
    len
}
181
/// Returns the byte length of the identifier at the start of `s`.
///
/// ASCII-only fallback: a letter or `_` may start an identifier; letters,
/// digits and `_` may continue it.
#[cfg(not(feature = "unicode"))]
fn lex_identifier(s: &str) -> usize {
    let mut len = 0;
    for &b in s.as_bytes() {
        let is_ident_char = b == b'_'
            || if len == 0 {
                b.is_ascii_alphabetic()
            } else {
                b.is_ascii_alphanumeric()
            };
        if !is_ident_char {
            break;
        }
        len += 1;
    }
    len
}
198
/// Returns `true` for the two line terminator characters.
fn is_nl(c: char) -> bool {
    matches!(c, '\r' | '\n')
}

/// Skips a single newline at the start of `rest`.
///
/// First an optional `\n` is consumed, then an optional `\r`.  Returns
/// `(reached_eol, bytes_skipped)`; `reached_eol` is also true when the
/// remaining input is empty.
#[cfg(feature = "custom_syntax")]
fn skip_nl(rest: &str) -> (bool, usize) {
    let bytes = rest.as_bytes();
    let mut skip = 0;
    if bytes.first() == Some(&b'\n') {
        skip += 1;
    }
    if bytes.get(skip) == Some(&b'\r') {
        skip += 1;
    }
    (skip > 0 || bytes.len() == skip, skip)
}

/// Applies lstrip-blocks style trimming to a piece of template data.
///
/// Trailing non-newline whitespace is removed only when what remains ends
/// in a newline (or nothing remains at all); otherwise the input is
/// returned untouched.
fn lstrip_block(s: &str) -> &str {
    let trimmed = s.trim_end_matches(|c: char| c.is_whitespace() && !is_nl(c));
    match trimmed.as_bytes().last() {
        None | Some(b'\n') => trimmed,
        _ => s,
    }
}
228
229fn should_lstrip_block(flag: bool, marker: StartMarker, prefix: &str) -> bool {
230    if flag && !matches!(marker, StartMarker::Variable) {
231        // Only strip if we're at the start of a line
232        for c in prefix.chars().rev() {
233            if is_nl(c) {
234                return true;
235            } else if !c.is_whitespace() {
236                return false;
237            }
238        }
239        // If we get here, we're at the start of the file
240        return true;
241    }
242    #[cfg(feature = "custom_syntax")]
243    {
244        if matches!(
245            marker,
246            StartMarker::LineStatement | StartMarker::LineComment
247        ) {
248            return true;
249        }
250    }
251    false
252}
253
254fn skip_basic_tag(
255    block_str: &str,
256    name: &str,
257    block_end: &str,
258    skip_ws_control: bool,
259) -> Option<(usize, Whitespace)> {
260    let mut ptr = block_str;
261
262    if skip_ws_control {
263        if let Some(rest) = ptr.strip_prefix(['-', '+']) {
264            ptr = rest;
265        }
266    }
267    while let Some(rest) = ptr.strip_prefix(|x: char| x.is_ascii_whitespace()) {
268        ptr = rest;
269    }
270
271    ptr = some!(ptr.strip_prefix(name));
272
273    while let Some(rest) = ptr.strip_prefix(|x: char| x.is_ascii_whitespace()) {
274        ptr = rest;
275    }
276
277    let ws = if let Some(rest) = ptr.strip_prefix('-') {
278        ptr = rest;
279        Whitespace::Remove
280    } else if let Some(rest) = ptr.strip_prefix('+') {
281        ptr = rest;
282        Whitespace::Preserve
283    } else {
284        Whitespace::Default
285    };
286
287    ptr.strip_prefix(block_end)
288        .map(|ptr| (block_str.len() - ptr.len(), ws))
289}
290
291impl<'s> Tokenizer<'s> {
292    /// Creates a new tokenizer.
293    pub fn new(
294        input: &'s str,
295        filename: &'s str,
296        in_expr: bool,
297        syntax_config: SyntaxConfig,
298        whitespace_config: WhitespaceConfig,
299    ) -> Tokenizer<'s> {
300        let mut source = input;
301        if !whitespace_config.keep_trailing_newline {
302            if source.ends_with('\n') {
303                source = &source[..source.len() - 1];
304            }
305            if source.ends_with('\r') {
306                source = &source[..source.len() - 1];
307            }
308        }
309        Tokenizer {
310            source,
311            filename,
312            stack: vec![if in_expr {
313                LexerState::Variable
314            } else {
315                LexerState::Template
316            }],
317            current_line: 1,
318            current_col: 0,
319            current_offset: 0,
320            paren_balance: 0,
321            trim_leading_whitespace: false,
322            pending_start_marker: None,
323            syntax_config,
324            ws_config: whitespace_config,
325        }
326    }
327
    /// Returns the current filename (as passed to `new`; used in errors).
    pub fn filename(&self) -> &str {
        self.filename
    }
332
    /// Produces the next token from the tokenizer.
    ///
    /// Returns `Ok(None)` once the entire input has been consumed.
    pub fn next_token(&mut self) -> Result<Option<(Token<'s>, Span)>, Error> {
        loop {
            if self.rest_bytes().is_empty() {
                // line statements normally close with newlines.  At the end of the file
                // however we need to use the stack to close out the block instead.
                #[cfg(feature = "custom_syntax")]
                {
                    if matches!(self.stack.pop(), Some(LexerState::LineStatement)) {
                        return Ok(Some((Token::BlockEnd, self.span(self.loc()))));
                    }
                }
                return Ok(None);
            }
            // dispatch on the current lexer state
            let outcome = match self.stack.last() {
                Some(LexerState::Template) => self.tokenize_root(),
                Some(LexerState::Block) => self.tokenize_block_or_var(BlockSentinel::Block),
                #[cfg(feature = "custom_syntax")]
                Some(LexerState::LineStatement) => {
                    self.tokenize_block_or_var(BlockSentinel::LineStatement)
                }
                Some(LexerState::Variable) => self.tokenize_block_or_var(BlockSentinel::Variable),
                None => panic!("empty lexer stack"),
            };
            // Continue means progress was made without producing a token
            // (e.g. a comment was skipped); run another pass.
            match ok!(outcome) {
                ControlFlow::Break(rv) => return Ok(Some(rv)),
                ControlFlow::Continue(()) => continue,
            }
        }
    }
363
    /// Remaining unconsumed input as a string slice.
    #[inline]
    fn rest(&self) -> &'s str {
        &self.source[self.current_offset..]
    }

    /// Remaining unconsumed input as raw bytes.
    #[inline]
    fn rest_bytes(&self) -> &'s [u8] {
        &self.source.as_bytes()[self.current_offset..]
    }
373
374    fn advance(&mut self, bytes: usize) -> &'s str {
375        let skipped = &self.rest()[..bytes];
376        for c in skipped.chars() {
377            match c {
378                '\n' => {
379                    self.current_line = self.current_line.saturating_add(1);
380                    self.current_col = 0;
381                }
382                _ => self.current_col = self.current_col.saturating_add(1),
383            }
384        }
385        self.current_offset += bytes;
386        skipped
387    }
388
    /// Snapshot of the current position: (line, column, byte offset).
    #[inline]
    fn loc(&self) -> (u16, u16, u32) {
        (
            self.current_line,
            self.current_col,
            self.current_offset as u32,
        )
    }

    /// Builds a span from a previously saved start location (`loc()`) to
    /// the current position.
    #[inline]
    fn span(&self, (start_line, start_col, start_offset): (u16, u16, u32)) -> Span {
        Span {
            start_line,
            start_col,
            start_offset,
            end_line: self.current_line,
            end_col: self.current_col,
            end_offset: self.current_offset as u32,
        }
    }
409
    /// Creates a syntax error pointing at the current position.
    #[inline]
    fn syntax_error(&mut self, msg: &'static str) -> Error {
        let mut span = self.span(self.loc());
        // widen zero-width spans so the error always highlights something
        if span.start_col == span.end_col {
            span.end_col += 1;
            span.end_offset += 1;
        }
        let mut err = Error::new(ErrorKind::SyntaxError, msg);
        err.set_filename_and_span(self.filename, span);
        err
    }
421
    /// Lexes a numeric literal (integer or float) off the input.
    ///
    /// Supports binary/octal/hex radix prefixes (`0b`, `0o`, `0x`), decimal
    /// floats with fraction and exponent, and `_` digit separators which
    /// are stripped before parsing.
    fn eat_number(&mut self) -> Result<(Token<'s>, Span), Error> {
        // small state machine tracking which section of the literal we're in
        #[derive(Copy, Clone)]
        enum State {
            RadixInteger, // 0x10
            Integer,      // 123
            Fraction,     // .123
            Exponent,     // E | e
            ExponentSign, // +|-
        }

        let old_loc = self.loc();

        let radix = match self.rest_bytes().get(..2) {
            Some(b"0b" | b"0B") => 2,
            Some(b"0o" | b"0O") => 8,
            Some(b"0x" | b"0X") => 16,
            _ => 10,
        };

        let mut state = if radix == 10 {
            State::Integer
        } else {
            // consume the two-byte radix prefix (e.g. `0x`)
            self.advance(2);
            State::RadixInteger
        };

        // fast path: count the leading run of plain ASCII digits
        let mut num_len = self
            .rest_bytes()
            .iter()
            .take_while(|&c| c.is_ascii_digit())
            .count();
        let mut has_underscore = false;
        // walk the remainder through the state machine to pick up fractions,
        // exponents, hex digits and `_` separators
        for c in self.rest_bytes()[num_len..].iter().copied() {
            state = match (c, state) {
                (b'.', State::Integer) => State::Fraction,
                (b'E' | b'e', State::Integer | State::Fraction) => State::Exponent,
                (b'+' | b'-', State::Exponent) => State::ExponentSign,
                // a digit right after `e` locks out further signs/dots
                (b'0'..=b'9', State::Exponent) => State::ExponentSign,
                (b'0'..=b'9', state) => state,
                (b'a'..=b'f' | b'A'..=b'F', State::RadixInteger) if radix == 16 => state,
                (b'_', _) => {
                    has_underscore = true;
                    state
                }
                _ => break,
            };
            num_len += 1;
        }
        let is_float = !matches!(state, State::Integer | State::RadixInteger);

        let mut num = Cow::Borrowed(self.advance(num_len));
        if has_underscore {
            if num.ends_with('_') {
                return Err(self.syntax_error("'_' may not occur at end of number"));
            }
            // separators are purely cosmetic; drop them before parsing
            num = Cow::Owned(num.replace('_', ""));
        }

        Ok((
            ok!(if is_float {
                num.parse()
                    .map(Token::Float)
                    .map_err(|_| self.syntax_error("invalid float"))
            } else if let Ok(int) = u64::from_str_radix(&num, radix) {
                Ok(Token::Int(int))
            } else {
                // fall back to 128-bit integers for very large literals
                u128::from_str_radix(&num, radix)
                    .map(|x| Token::Int128(Box::new(x)))
                    .map_err(|_| self.syntax_error("invalid integer (too large)"))
            }),
            self.span(old_loc),
        ))
    }
495
496    fn eat_identifier(&mut self) -> Result<(Token<'s>, Span), Error> {
497        let ident_len = lex_identifier(self.rest());
498        if ident_len > 0 {
499            let old_loc = self.loc();
500            let ident = self.advance(ident_len);
501            Ok((Token::Ident(ident), self.span(old_loc)))
502        } else {
503            Err(self.syntax_error("unexpected character"))
504        }
505    }
506
    /// Lexes a string literal delimited by `delim` (`'` or `"`).
    ///
    /// Returns a borrowed `Token::Str` when the literal contains no
    /// backslash escapes, otherwise an owned, unescaped `Token::String`.
    fn eat_string(&mut self, delim: u8) -> Result<(Token<'s>, Span), Error> {
        let old_loc = self.loc();
        let mut escaped = false;
        let mut has_escapes = false;
        // Measure the string contents (skipping the opening delimiter)
        // while honoring backslash escapes.
        let str_len = self
            .rest_bytes()
            .iter()
            .skip(1)
            .take_while(|&&c| match (escaped, c) {
                (true, _) => {
                    // the byte right after a backslash is always consumed
                    escaped = false;
                    true
                }
                (_, b'\\') => {
                    escaped = true;
                    has_escapes = true;
                    true
                }
                (_, c) if c == delim => false,
                _ => true,
            })
            .count();
        // A dangling backslash or a missing closing delimiter both mean
        // the string never terminated.
        if escaped || self.rest_bytes().get(str_len + 1) != Some(&delim) {
            self.advance(str_len + 1);
            return Err(self.syntax_error("unexpected end of string"));
        }
        let s = self.advance(str_len + 2);
        Ok(if has_escapes {
            (
                Token::String(ok!(unescape(&s[1..s.len() - 1])).into_boxed_str()),
                self.span(old_loc),
            )
        } else {
            (Token::Str(&s[1..s.len() - 1]), self.span(old_loc))
        })
    }
543
544    fn skip_whitespace(&mut self) {
545        let skipped = self
546            .rest()
547            .chars()
548            .map_while(|c| c.is_whitespace().then(|| c.len_utf8()))
549            .sum();
550        if skipped > 0 {
551            self.advance(skipped);
552        }
553    }
554
555    fn skip_newline_if_trim_blocks(&mut self) {
556        if self.ws_config.trim_blocks {
557            if self.rest_bytes().get(0) == Some(&b'\r') {
558                self.advance(1);
559            }
560            if self.rest_bytes().get(0) == Some(&b'\n') {
561                self.advance(1);
562            }
563        }
564    }
565
566    fn handle_tail_ws(&mut self, ws: Whitespace) {
567        match ws {
568            Whitespace::Preserve => {}
569            Whitespace::Default => {
570                self.skip_newline_if_trim_blocks();
571            }
572            Whitespace::Remove => {
573                self.trim_leading_whitespace = true;
574            }
575        }
576    }
577
    /// End delimiter for variable tags from the syntax config.
    fn variable_end(&self) -> &str {
        self.syntax_config.variable_delimiters().1
    }

    /// Start delimiter for block tags from the syntax config.
    fn block_start(&self) -> &str {
        self.syntax_config.block_delimiters().0
    }

    /// End delimiter for block tags from the syntax config.
    fn block_end(&self) -> &str {
        self.syntax_config.block_delimiters().1
    }

    /// End delimiter for comments from the syntax config.
    fn comment_end(&self) -> &str {
        self.syntax_config.comment_delimiters().1
    }
593
    /// Tokenizes template data (everything outside of tags).
    ///
    /// Emits `TemplateData` up to the next start marker, applying the
    /// configured whitespace trimming, and queues the marker itself in
    /// `pending_start_marker` so the next pass can handle it.
    fn tokenize_root(&mut self) -> Result<ControlFlow<(Token<'s>, Span)>, Error> {
        // a marker discovered on the previous pass is processed first
        if let Some((marker, len)) = self.pending_start_marker.take() {
            return self.handle_start_marker(marker, len);
        }
        // honor a pending `-` from the previous tag's closing delimiter
        if self.trim_leading_whitespace {
            self.trim_leading_whitespace = false;
            self.skip_whitespace();
        }
        let old_loc = self.loc();
        let (lead, span) =
            match find_start_marker(self.source, self.current_offset, &self.syntax_config) {
                Some((start, marker, len, whitespace)) => {
                    self.pending_start_marker = Some((marker, len));
                    match whitespace {
                        // lstrip_blocks: drop non-newline whitespace before a
                        // tag that begins its own line
                        Whitespace::Default
                            if should_lstrip_block(
                                self.ws_config.lstrip_blocks,
                                marker,
                                &self.source[..self.current_offset + start],
                            ) =>
                        {
                            let peeked = &self.rest()[..start];
                            let trimmed = lstrip_block(peeked);
                            let lead = self.advance(trimmed.len());
                            let span = self.span(old_loc);
                            // skip what was trimmed so offsets stay accurate
                            self.advance(peeked.len() - trimmed.len());
                            (lead, span)
                        }
                        Whitespace::Default | Whitespace::Preserve => {
                            (self.advance(start), self.span(old_loc))
                        }
                        // `-` control: trim all trailing whitespace before the tag
                        Whitespace::Remove => {
                            let peeked = &self.rest()[..start];
                            let trimmed = peeked.trim_end();
                            let lead = self.advance(trimmed.len());
                            let span = self.span(old_loc);
                            self.advance(peeked.len() - trimmed.len());
                            (lead, span)
                        }
                    }
                }
                // no further marker: the rest of the source is template data
                None => (self.advance(self.rest().len()), self.span(old_loc)),
            };

        if lead.is_empty() {
            Ok(ControlFlow::Continue(()))
        } else {
            Ok(ControlFlow::Break((Token::TemplateData(lead), span)))
        }
    }
644
    /// Processes a start marker queued by `tokenize_root`.
    ///
    /// `skip` is the byte length of the marker itself, including any
    /// whitespace control character.  Comments and line comments are
    /// consumed entirely here without emitting a token; the other markers
    /// push a lexer state and emit the matching start token.
    fn handle_start_marker(
        &mut self,
        marker: StartMarker,
        skip: usize,
    ) -> Result<ControlFlow<(Token<'s>, Span)>, Error> {
        match marker {
            StartMarker::Comment => {
                if let Some(end) = memstr(&self.rest_bytes()[skip..], self.comment_end().as_bytes())
                {
                    // look at the byte right before the closing delimiter
                    // for whitespace control (e.g. `-#}`)
                    let ws = Whitespace::from_byte(
                        self.rest_bytes().get(end.saturating_sub(1) + skip).copied(),
                    );
                    self.advance(end + skip + self.comment_end().len());
                    self.handle_tail_ws(ws);
                    Ok(ControlFlow::Continue(()))
                } else {
                    self.advance(self.rest_bytes().len());
                    Err(self.syntax_error("unexpected end of comment"))
                }
            }
            StartMarker::Variable => {
                let old_loc = self.loc();
                self.advance(skip);
                self.stack.push(LexerState::Variable);
                Ok(ControlFlow::Break((
                    Token::VariableStart,
                    self.span(old_loc),
                )))
            }
            StartMarker::Block => {
                // raw blocks require some special handling.  If we are at the beginning of a raw
                // block we want to skip everything until {% endraw %} completely ignoring interior
                // syntax and emit the entire raw block as TemplateData.
                if let Some((raw, ws_start)) =
                    skip_basic_tag(&self.rest()[skip..], "raw", self.block_end(), false)
                {
                    self.advance(raw + skip);
                    self.handle_raw_tag(ws_start)
                } else {
                    let old_loc = self.loc();
                    self.advance(skip);
                    self.stack.push(LexerState::Block);
                    Ok(ControlFlow::Break((Token::BlockStart, self.span(old_loc))))
                }
            }
            #[cfg(feature = "custom_syntax")]
            StartMarker::LineStatement => {
                let old_loc = self.loc();
                self.advance(skip);
                self.stack.push(LexerState::LineStatement);
                Ok(ControlFlow::Break((Token::BlockStart, self.span(old_loc))))
            }
            #[cfg(feature = "custom_syntax")]
            StartMarker::LineComment => {
                // swallow everything up to and including the end of the
                // line; no token is produced
                let comment_skip = self.rest_bytes()[skip..]
                    .iter()
                    .take_while(|&&c| c != b'\r' && c != b'\n')
                    .count();
                let (_, nl_skip) = skip_nl(&self.rest()[skip + comment_skip..]);
                self.advance(skip + comment_skip + nl_skip);
                Ok(ControlFlow::Continue(()))
            }
        }
    }
709
    /// Consumes everything up to the matching `endraw` tag and emits the
    /// contents as a single `TemplateData` token.
    ///
    /// `ws_start` is the whitespace control of the opening `raw` tag; the
    /// controls on the `endraw` tag are discovered here and applied to the
    /// emitted data and the following template text.
    fn handle_raw_tag(
        &mut self,
        ws_start: Whitespace,
    ) -> Result<ControlFlow<(Token<'s>, Span)>, Error> {
        let old_loc = self.loc();
        let mut ptr = 0;
        // scan block-start delimiters until one opens a valid endraw tag
        while let Some(block) = memstr(&self.rest_bytes()[ptr..], self.block_start().as_bytes()) {
            ptr += block + self.block_start().len();
            if let Some((endraw, ws_next)) =
                skip_basic_tag(&self.rest()[ptr..], "endraw", self.block_end(), true)
            {
                // whitespace control directly after the endraw tag's opening
                // delimiter (e.g. `{%-`)
                let ws = Whitespace::from_byte(self.rest_bytes().get(ptr).copied());
                let end = ptr - self.block_start().len();
                let mut result = &self.rest()[..end];
                self.advance(end);
                let span = self.span(old_loc);
                self.advance(self.block_start().len() + endraw);
                // apply the opening tag's whitespace behavior to the front
                match ws_start {
                    Whitespace::Default if self.ws_config.trim_blocks => {
                        if result.starts_with('\r') {
                            result = &result[1..];
                        }
                        if result.starts_with('\n') {
                            result = &result[1..];
                        }
                    }
                    Whitespace::Remove => {
                        result = result.trim_start();
                    }
                    _ => {}
                }
                // and the closing tag's behavior to the back
                result = match ws {
                    Whitespace::Default if self.ws_config.lstrip_blocks => lstrip_block(result),
                    Whitespace::Remove => result.trim_end(),
                    _ => result,
                };
                self.handle_tail_ws(ws_next);
                return Ok(ControlFlow::Break((Token::TemplateData(result), span)));
            }
        }
        self.advance(self.rest_bytes().len());
        Err(self.syntax_error("unexpected end of raw block"))
    }
753
    /// Tokenizes the interior of a variable or block tag.
    ///
    /// `sentinel` selects which end delimiter terminates the tag.  End
    /// delimiters are only honored when no parens/brackets/braces are
    /// open, so delimiters nested inside expressions don't end the tag
    /// early.
    fn tokenize_block_or_var(
        &mut self,
        sentinel: BlockSentinel,
    ) -> Result<ControlFlow<(Token<'s>, Span)>, Error> {
        let old_loc = self.loc();
        let rest = self.rest();

        // special case for looking for the end of a line statements if there are no
        // open parens, braces etc.  This can only happen with custom syntax
        #[cfg(feature = "custom_syntax")]
        {
            if matches!(sentinel, BlockSentinel::LineStatement)
                && self.paren_balance == 0
                && self.syntax_config.line_statement_prefix().is_some()
            {
                let skip = rest
                    .chars()
                    .take_while(|&x| x.is_whitespace() && !is_nl(x))
                    .map(|x| x.len_utf8())
                    .sum();
                let (was_nl, nl_skip) = skip_nl(&rest[skip..]);
                if was_nl {
                    self.advance(skip + nl_skip);
                    self.stack.pop();
                    return Ok(ControlFlow::Break((Token::BlockEnd, self.span(old_loc))));
                }
            }
        }

        // in blocks whitespace is generally ignored, skip it.
        match rest
            .as_bytes()
            .iter()
            .position(|&x| !x.is_ascii_whitespace())
        {
            Some(0) => {}
            None => {
                self.advance(rest.len());
                return Ok(ControlFlow::Continue(()));
            }
            Some(offset) => {
                self.advance(offset);
                return Ok(ControlFlow::Continue(()));
            }
        }

        // look out for the end of blocks
        if self.paren_balance == 0 {
            match sentinel {
                BlockSentinel::Block => {
                    // end delimiter preceded by whitespace control (`-` / `+`)
                    if matches!(rest.get(..1), Some("-" | "+"))
                        && rest[1..].starts_with(self.block_end())
                    {
                        self.stack.pop();
                        let was_minus = &rest[..1] == "-";
                        self.advance(self.block_end().len() + 1);
                        let span = self.span(old_loc);
                        if was_minus {
                            self.trim_leading_whitespace = true;
                        }
                        return Ok(ControlFlow::Break((Token::BlockEnd, span)));
                    }
                    if rest.starts_with(self.block_end()) {
                        self.stack.pop();
                        self.advance(self.block_end().len());
                        let span = self.span(old_loc);
                        self.skip_newline_if_trim_blocks();
                        return Ok(ControlFlow::Break((Token::BlockEnd, span)));
                    }
                }
                BlockSentinel::Variable => {
                    // end delimiter preceded by whitespace control (`-` / `+`)
                    if matches!(rest.get(..1), Some("-" | "+"))
                        && rest[1..].starts_with(self.variable_end())
                    {
                        self.stack.pop();
                        let was_minus = &rest[..1] == "-";
                        self.advance(self.variable_end().len() + 1);
                        let span = self.span(old_loc);
                        if was_minus {
                            self.trim_leading_whitespace = true;
                        }
                        return Ok(ControlFlow::Break((Token::VariableEnd, span)));
                    }
                    if rest.starts_with(self.variable_end()) {
                        self.stack.pop();
                        self.advance(self.variable_end().len());
                        return Ok(ControlFlow::Break((Token::VariableEnd, self.span(old_loc))));
                    }
                }
                // line statements are handled above
                #[cfg(feature = "custom_syntax")]
                BlockSentinel::LineStatement => {}
            }
        }

        // two character operators
        let op = match rest.as_bytes().get(..2) {
            Some(b"//") => Some(Token::FloorDiv),
            Some(b"**") => Some(Token::Pow),
            Some(b"==") => Some(Token::Eq),
            Some(b"!=") => Some(Token::Ne),
            Some(b">=") => Some(Token::Gte),
            Some(b"<=") => Some(Token::Lte),
            _ => None,
        };
        if let Some(op) = op {
            self.advance(2);
            return Ok(ControlFlow::Break((op, self.span(old_loc))));
        }

        // tracks open/close of parens, brackets and braces so that end
        // delimiters inside nested expressions are not honored
        macro_rules! with_paren_balance {
            ($delta:expr, $tok:expr) => {{
                self.paren_balance += $delta;
                Some($tok)
            }};
        }

        // single character operators (and strings)
        let op = match rest.as_bytes().get(0) {
            Some(b'+') => Some(Token::Plus),
            Some(b'-') => Some(Token::Minus),
            Some(b'*') => Some(Token::Mul),
            Some(b'/') => Some(Token::Div),
            Some(b'%') => Some(Token::Mod),
            Some(b'.') => Some(Token::Dot),
            Some(b',') => Some(Token::Comma),
            Some(b':') => Some(Token::Colon),
            Some(b'~') => Some(Token::Tilde),
            Some(b'|') => Some(Token::Pipe),
            Some(b'=') => Some(Token::Assign),
            Some(b'>') => Some(Token::Gt),
            Some(b'<') => Some(Token::Lt),
            Some(b'(') => with_paren_balance!(1, Token::ParenOpen),
            Some(b')') => with_paren_balance!(-1, Token::ParenClose),
            Some(b'[') => with_paren_balance!(1, Token::BracketOpen),
            Some(b']') => with_paren_balance!(-1, Token::BracketClose),
            Some(b'{') => with_paren_balance!(1, Token::BraceOpen),
            Some(b'}') => with_paren_balance!(-1, Token::BraceClose),
            Some(b'\'') => {
                return Ok(ControlFlow::Break(ok!(self.eat_string(b'\''))));
            }
            Some(b'"') => {
                return Ok(ControlFlow::Break(ok!(self.eat_string(b'"'))));
            }
            Some(c) if c.is_ascii_digit() => return Ok(ControlFlow::Break(ok!(self.eat_number()))),
            _ => None,
        };
        if let Some(op) = op {
            self.advance(1);
            Ok(ControlFlow::Break((op, self.span(old_loc))))
        } else {
            // anything else must be an identifier (or a syntax error)
            Ok(ControlFlow::Break(ok!(self.eat_identifier())))
        }
    }
908}
909
/// Utility function to quickly tokenize into an iterator.
#[cfg(any(test, feature = "unstable_machinery"))]
pub fn tokenize(
    input: &str,
    in_expr: bool,
    syntax_config: SyntaxConfig,
    whitespace_config: WhitespaceConfig,
) -> impl Iterator<Item = Result<(Token<'_>, Span), Error>> {
    // Not used by the engine itself; tests and the unstable machinery use it
    // as a convenient iterator facade over the raw `Tokenizer`.
    let mut state = Tokenizer::new(input, "<string>", in_expr, syntax_config, whitespace_config);
    // `next_token` yields `Result<Option<_>>`; `transpose` flips that into the
    // `Option<Result<_>>` shape `from_fn` needs to terminate the iterator.
    std::iter::from_fn(move || state.next_token().transpose())
}
924
#[cfg(test)]
mod tests {
    use super::*;

    use similar_asserts::assert_eq;

    #[test]
    fn test_is_basic_tag() {
        // `skip_basic_tag` should accept the tag name (with arbitrary
        // surrounding spaces) followed by the block end marker, reporting the
        // number of bytes consumed and the whitespace control mode.
        let accepted: &[(&str, usize, Whitespace)] = &[
            (" raw %}", 7, Whitespace::Default),
            ("  raw  %}", 9, Whitespace::Default),
            ("  raw  -%}", 10, Whitespace::Remove),
            ("  raw  +%}", 10, Whitespace::Preserve),
        ];
        for &(input, len, ws) in accepted {
            assert_eq!(skip_basic_tag(input, "raw", "%}", false), Some((len, ws)));
        }
        // a mismatched tag name must not be skipped
        assert_eq!(skip_basic_tag(" raw %}", "endraw", "%}", false), None);
    }

    #[test]
    fn test_basic_identifiers() {
        // Lexing a lone identifier in expression mode must produce a single
        // `Ident` token whose slice equals the full input.
        fn assert_ident(s: &str) {
            let first = tokenize(s, true, Default::default(), Default::default()).next();
            match first {
                Some(Ok((Token::Ident(ident), _))) if ident == s => {}
                _ => panic!("did not get a matching token result: {s:?}"),
            }
        }

        // Inputs that must NOT lex cleanly as exactly one identifier token
        // (either they fail to tokenize or produce a different token stream).
        fn assert_not_ident(s: &str) {
            let collected = tokenize(s, true, Default::default(), Default::default())
                .collect::<Result<Vec<_>, _>>();
            if let Ok(tokens) = collected {
                if matches!(tokens.as_slice(), [(Token::Ident(_), _)]) {
                    panic!("got a single ident for {s:?}")
                }
            }
        }

        assert_ident("foo_bar_baz");
        assert_ident("_foo_bar_baz");
        assert_ident("_42world");
        assert_ident("_world42");
        assert_ident("world42");
        assert_not_ident("42world");

        #[cfg(feature = "unicode")]
        {
            assert_ident("foo");
            assert_ident("föö");
            assert_ident("き");
            assert_ident("_");
            assert_not_ident("1a");
            assert_not_ident("a-");
            assert_not_ident("🐍a");
            assert_not_ident("a🐍🐍");
            assert_ident("ᢅ");
            assert_ident("ᢆ");
            assert_ident("℘");
            assert_ident("℮");
            assert_not_ident("·");
            assert_ident("a·");
        }
    }
}