use std::borrow::Cow;
use std::ops::ControlFlow;

use crate::compiler::tokens::{Span, Token};
use crate::error::{Error, ErrorKind};
use crate::syntax::SyntaxConfig;
use crate::utils::{memchr, memstr, unescape};

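/// Controls how the tokenizer treats whitespace: whether a single trailing
/// newline is kept, and whether the `lstrip_blocks` / `trim_blocks`
/// behaviors are enabled.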
#[derive(Copy, Clone, Debug, Default)]
pub struct WhitespaceConfig {
    pub keep_trailing_newline: bool,
    pub lstrip_blocks: bool,
    pub trim_blocks: bool,
}

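/// Tokenizes Jinja-style template source into a stream of `(Token, Span)` pairs.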
pub struct Tokenizer<'s> {
    stack: Vec<LexerState>,
    source: &'s str,
    filename: &'s str,
    current_line: u16,
    current_col: u16,
    current_offset: usize,
    trim_leading_whitespace: bool,
    pending_start_marker: Option<(StartMarker, usize)>,
    paren_balance: isize,
    syntax_config: SyntaxConfig,
    ws_config: WhitespaceConfig,
}

enum LexerState {
    Template,
    Variable,
    Block,
    #[cfg(feature = "custom_syntax")]
    LineStatement,
}

#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum StartMarker {
    Variable,
    Block,
    Comment,
    #[cfg(feature = "custom_syntax")]
    LineStatement,
    #[cfg(feature = "custom_syntax")]
    LineComment,
}

#[derive(Debug, Copy, Clone)]
enum BlockSentinel {
    Variable,
    Block,
    #[cfg(feature = "custom_syntax")]
    LineStatement,
}

#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum Whitespace {
    Default,
    Preserve,
    Remove,
}

impl Whitespace {
    fn from_byte(b: Option<u8>) -> Whitespace {
        match b {
            Some(b'-') => Whitespace::Remove,
            Some(b'+') => Whitespace::Preserve,
            _ => Whitespace::Default,
        }
    }

    fn len(&self) -> usize {
        match self {
            Whitespace::Default => 0,
            Whitespace::Preserve | Whitespace::Remove => 1,
        }
    }
}

fn find_start_marker_memchr(a: &str) -> Option<(usize, StartMarker, usize, Whitespace)> {
    let bytes = a.as_bytes();
    let mut offset = 0;
    loop {
        let idx = some!(memchr(&bytes[offset..], b'{'));
        let marker = match bytes.get(offset + idx + 1).copied() {
            Some(b'{') => StartMarker::Variable,
            Some(b'%') => StartMarker::Block,
            Some(b'#') => StartMarker::Comment,
            _ => {
                offset += idx + 1;
                continue;
            }
        };
        let ws = Whitespace::from_byte(bytes.get(offset + idx + 2).copied());
        return Some((offset + idx, marker, 2 + ws.len(), ws));
    }
}

#[cfg(feature = "custom_syntax")]
fn find_start_marker(
    a: &str,
    offset: usize,
    syntax_config: &SyntaxConfig,
) -> Option<(usize, StartMarker, usize, Whitespace)> {
    let Some(ref ac) = syntax_config.aho_corasick else {
        return find_start_marker_memchr(&a[offset..]);
    };

    let bytes = &a.as_bytes()[offset..];
    let mut state = aho_corasick::automaton::OverlappingState::start();
    let mut longest_match = None::<(usize, StartMarker, usize, Whitespace)>;

    loop {
        ac.find_overlapping(bytes, &mut state);
        let m = match state.get_match() {
            None => break,
            Some(m) => m,
        };

        let marker = syntax_config.pattern_to_marker(m.pattern());
        let ws = if matches!(marker, StartMarker::LineStatement) {
            let prefix = &a.as_bytes()[..offset + m.start()];
            if matches!(
                prefix
                    .iter()
                    .copied()
                    .rev()
                    .find(|&x| x != b' ' && x != b'\t'),
                None | Some(b'\r') | Some(b'\n')
            ) {
                Whitespace::Default
            } else {
                continue;
            }
        } else {
            Whitespace::from_byte(bytes.get(m.start() + m.len()).copied())
        };
        let new_match = (m.start(), marker, m.len() + ws.len(), ws);

        if longest_match.as_ref().is_some_and(|x| new_match.0 > x.0) {
            break;
        }
        longest_match = Some(new_match);
    }

    longest_match
}

#[cfg(not(feature = "custom_syntax"))]
fn find_start_marker(
    a: &str,
    offset: usize,
    _syntax_config: &SyntaxConfig,
) -> Option<(usize, StartMarker, usize, Whitespace)> {
    find_start_marker_memchr(&a[offset..])
}

#[cfg(feature = "unicode")]
fn lex_identifier(s: &str) -> usize {
    s.chars()
        .enumerate()
        .map_while(|(idx, c)| {
            let cont = if c == '_' {
                true
            } else if idx == 0 {
                unicode_ident::is_xid_start(c)
            } else {
                unicode_ident::is_xid_continue(c)
            };
            cont.then(|| c.len_utf8())
        })
        .sum::<usize>()
}

#[cfg(not(feature = "unicode"))]
fn lex_identifier(s: &str) -> usize {
    s.as_bytes()
        .iter()
        .enumerate()
        .take_while(|&(idx, &c)| {
            if c == b'_' {
                true
            } else if idx == 0 {
                c.is_ascii_alphabetic()
            } else {
                c.is_ascii_alphanumeric()
            }
        })
        .count()
}

fn is_nl(c: char) -> bool {
    c == '\r' || c == '\n'
}

#[cfg(feature = "custom_syntax")]
fn skip_nl(mut rest: &str) -> (bool, usize) {
    let mut skip = 0;
    let mut was_nl = false;
    if let Some(new_rest) = rest.strip_prefix('\r') {
        rest = new_rest;
        skip += 1;
        was_nl = true;
    }
    if let Some(new_rest) = rest.strip_prefix('\n') {
        rest = new_rest;
        skip += 1;
        was_nl = true;
    }
    (was_nl || rest.is_empty(), skip)
}

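// Strips trailing spaces and tabs from `s`, but only when that whitespace
// directly follows a newline (or the start of the input); used for the
// `lstrip_blocks` behavior.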
fn lstrip_block(s: &str) -> &str {
    let trimmed = s.trim_end_matches(|x: char| x.is_whitespace() && !is_nl(x));
    if trimmed.is_empty() || trimmed.as_bytes().get(trimmed.len() - 1) == Some(&b'\n') {
        trimmed
    } else {
        s
    }
}

fn should_lstrip_block(flag: bool, marker: StartMarker, prefix: &str) -> bool {
    if flag && !matches!(marker, StartMarker::Variable) {
        for c in prefix.chars().rev() {
            if is_nl(c) {
                return true;
            } else if !c.is_whitespace() {
                return false;
            }
        }
        return true;
    }
    #[cfg(feature = "custom_syntax")]
    {
        if matches!(
            marker,
            StartMarker::LineStatement | StartMarker::LineComment
        ) {
            return true;
        }
    }
    false
}

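// Attempts to match a simple tag such as `raw %}` or `endraw %}` at the start
// of `block_str` and returns the number of bytes consumed together with the
// trailing whitespace control, if it matched.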
fn skip_basic_tag(
    block_str: &str,
    name: &str,
    block_end: &str,
    skip_ws_control: bool,
) -> Option<(usize, Whitespace)> {
    let mut ptr = block_str;

    if skip_ws_control {
        if let Some(rest) = ptr.strip_prefix(['-', '+']) {
            ptr = rest;
        }
    }
    while let Some(rest) = ptr.strip_prefix(|x: char| x.is_ascii_whitespace()) {
        ptr = rest;
    }

    ptr = some!(ptr.strip_prefix(name));

    while let Some(rest) = ptr.strip_prefix(|x: char| x.is_ascii_whitespace()) {
        ptr = rest;
    }

    let ws = if let Some(rest) = ptr.strip_prefix('-') {
        ptr = rest;
        Whitespace::Remove
    } else if let Some(rest) = ptr.strip_prefix('+') {
        ptr = rest;
        Whitespace::Preserve
    } else {
        Whitespace::Default
    };

    ptr.strip_prefix(block_end)
        .map(|ptr| (block_str.len() - ptr.len(), ws))
}

impl<'s> Tokenizer<'s> {
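    /// Creates a new tokenizer for `input`.
    ///
    /// When `in_expr` is true the tokenizer starts in expression mode rather
    /// than template mode.  Unless `keep_trailing_newline` is enabled, a
    /// single trailing newline is stripped from the input.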
    pub fn new(
        input: &'s str,
        filename: &'s str,
        in_expr: bool,
        syntax_config: SyntaxConfig,
        whitespace_config: WhitespaceConfig,
    ) -> Tokenizer<'s> {
        let mut source = input;
        if !whitespace_config.keep_trailing_newline {
            if source.ends_with('\n') {
                source = &source[..source.len() - 1];
            }
            if source.ends_with('\r') {
                source = &source[..source.len() - 1];
            }
        }
        Tokenizer {
            source,
            filename,
            stack: vec![if in_expr {
                LexerState::Variable
            } else {
                LexerState::Template
            }],
            current_line: 1,
            current_col: 0,
            current_offset: 0,
            paren_balance: 0,
            trim_leading_whitespace: false,
            pending_start_marker: None,
            syntax_config,
            ws_config: whitespace_config,
        }
    }

    pub fn filename(&self) -> &str {
        self.filename
    }

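    /// Returns the next token (with its span), or `Ok(None)` once the input
    /// is exhausted.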
    pub fn next_token(&mut self) -> Result<Option<(Token<'s>, Span)>, Error> {
        loop {
            if self.rest_bytes().is_empty() {
                #[cfg(feature = "custom_syntax")]
                {
                    if matches!(self.stack.pop(), Some(LexerState::LineStatement)) {
                        return Ok(Some((Token::BlockEnd, self.span(self.loc()))));
                    }
                }
                return Ok(None);
            }
            let outcome = match self.stack.last() {
                Some(LexerState::Template) => self.tokenize_root(),
                Some(LexerState::Block) => self.tokenize_block_or_var(BlockSentinel::Block),
                #[cfg(feature = "custom_syntax")]
                Some(LexerState::LineStatement) => {
                    self.tokenize_block_or_var(BlockSentinel::LineStatement)
                }
                Some(LexerState::Variable) => self.tokenize_block_or_var(BlockSentinel::Variable),
                None => panic!("empty lexer stack"),
            };
            match ok!(outcome) {
                ControlFlow::Break(rv) => return Ok(Some(rv)),
                ControlFlow::Continue(()) => continue,
            }
        }
    }

    #[inline]
    fn rest(&self) -> &'s str {
        &self.source[self.current_offset..]
    }

    #[inline]
    fn rest_bytes(&self) -> &'s [u8] {
        &self.source.as_bytes()[self.current_offset..]
    }

    fn advance(&mut self, bytes: usize) -> &'s str {
        let skipped = &self.rest()[..bytes];
        for c in skipped.chars() {
            match c {
                '\n' => {
                    self.current_line = self.current_line.saturating_add(1);
                    self.current_col = 0;
                }
                _ => self.current_col = self.current_col.saturating_add(1),
            }
        }
        self.current_offset += bytes;
        skipped
    }

    #[inline]
    fn loc(&self) -> (u16, u16, u32) {
        (
            self.current_line,
            self.current_col,
            self.current_offset as u32,
        )
    }

    #[inline]
    fn span(&self, (start_line, start_col, start_offset): (u16, u16, u32)) -> Span {
        Span {
            start_line,
            start_col,
            start_offset,
            end_line: self.current_line,
            end_col: self.current_col,
            end_offset: self.current_offset as u32,
        }
    }

    #[inline]
    fn syntax_error(&mut self, msg: &'static str) -> Error {
        let mut span = self.span(self.loc());
        if span.start_col == span.end_col {
            span.end_col += 1;
            span.end_offset += 1;
        }
        let mut err = Error::new(ErrorKind::SyntaxError, msg);
        err.set_filename_and_span(self.filename, span);
        err
    }

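    // Lexes an integer or float literal, including `0b`/`0o`/`0x` radix
    // prefixes, fractions, exponents and `_` digit separators.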
    fn eat_number(&mut self) -> Result<(Token<'s>, Span), Error> {
        #[derive(Copy, Clone)]
        enum State {
            RadixInteger,
            Integer,
            Fraction,
            Exponent,
            ExponentSign,
        }

        let old_loc = self.loc();

        let radix = match self.rest_bytes().get(..2) {
            Some(b"0b" | b"0B") => 2,
            Some(b"0o" | b"0O") => 8,
            Some(b"0x" | b"0X") => 16,
            _ => 10,
        };

        let mut state = if radix == 10 {
            State::Integer
        } else {
            self.advance(2);
            State::RadixInteger
        };

        let mut num_len = self
            .rest_bytes()
            .iter()
            .take_while(|&c| c.is_ascii_digit())
            .count();
        let mut has_underscore = false;
        for c in self.rest_bytes()[num_len..].iter().copied() {
            state = match (c, state) {
                (b'.', State::Integer) => State::Fraction,
                (b'E' | b'e', State::Integer | State::Fraction) => State::Exponent,
                (b'+' | b'-', State::Exponent) => State::ExponentSign,
                (b'0'..=b'9', State::Exponent) => State::ExponentSign,
                (b'0'..=b'9', state) => state,
                (b'a'..=b'f' | b'A'..=b'F', State::RadixInteger) if radix == 16 => state,
                (b'_', _) => {
                    has_underscore = true;
                    state
                }
                _ => break,
            };
            num_len += 1;
        }
        let is_float = !matches!(state, State::Integer | State::RadixInteger);

        let mut num = Cow::Borrowed(self.advance(num_len));
        if has_underscore {
            if num.ends_with('_') {
                return Err(self.syntax_error("'_' may not occur at end of number"));
            }
            num = Cow::Owned(num.replace('_', ""));
        }

        Ok((
            ok!(if is_float {
                num.parse()
                    .map(Token::Float)
                    .map_err(|_| self.syntax_error("invalid float"))
            } else if let Ok(int) = u64::from_str_radix(&num, radix) {
                Ok(Token::Int(int))
            } else {
                u128::from_str_radix(&num, radix)
                    .map(|x| Token::Int128(Box::new(x)))
                    .map_err(|_| self.syntax_error("invalid integer (too large)"))
            }),
            self.span(old_loc),
        ))
    }

    fn eat_identifier(&mut self) -> Result<(Token<'s>, Span), Error> {
        let ident_len = lex_identifier(self.rest());
        if ident_len > 0 {
            let old_loc = self.loc();
            let ident = self.advance(ident_len);
            Ok((Token::Ident(ident), self.span(old_loc)))
        } else {
            Err(self.syntax_error("unexpected character"))
        }
    }

    fn eat_string(&mut self, delim: u8) -> Result<(Token<'s>, Span), Error> {
        let old_loc = self.loc();
        let mut escaped = false;
        let mut has_escapes = false;
        let str_len = self
            .rest_bytes()
            .iter()
            .skip(1)
            .take_while(|&&c| match (escaped, c) {
                (true, _) => {
                    escaped = false;
                    true
                }
                (_, b'\\') => {
                    escaped = true;
                    has_escapes = true;
                    true
                }
                (_, c) if c == delim => false,
                _ => true,
            })
            .count();
        if escaped || self.rest_bytes().get(str_len + 1) != Some(&delim) {
            self.advance(str_len + 1);
            return Err(self.syntax_error("unexpected end of string"));
        }
        let s = self.advance(str_len + 2);
        Ok(if has_escapes {
            (
                Token::String(ok!(unescape(&s[1..s.len() - 1])).into_boxed_str()),
                self.span(old_loc),
            )
        } else {
            (Token::Str(&s[1..s.len() - 1]), self.span(old_loc))
        })
    }

    fn skip_whitespace(&mut self) {
        let skipped = self
            .rest()
            .chars()
            .map_while(|c| c.is_whitespace().then(|| c.len_utf8()))
            .sum();
        if skipped > 0 {
            self.advance(skipped);
        }
    }

    fn skip_newline_if_trim_blocks(&mut self) {
        if self.ws_config.trim_blocks {
            if self.rest_bytes().get(0) == Some(&b'\r') {
                self.advance(1);
            }
            if self.rest_bytes().get(0) == Some(&b'\n') {
                self.advance(1);
            }
        }
    }

    fn handle_tail_ws(&mut self, ws: Whitespace) {
        match ws {
            Whitespace::Preserve => {}
            Whitespace::Default => {
                self.skip_newline_if_trim_blocks();
            }
            Whitespace::Remove => {
                self.trim_leading_whitespace = true;
            }
        }
    }

    fn variable_end(&self) -> &str {
        self.syntax_config.variable_delimiters().1
    }

    fn block_start(&self) -> &str {
        self.syntax_config.block_delimiters().0
    }

    fn block_end(&self) -> &str {
        self.syntax_config.block_delimiters().1
    }

    fn comment_end(&self) -> &str {
        self.syntax_config.comment_delimiters().1
    }

    fn tokenize_root(&mut self) -> Result<ControlFlow<(Token<'s>, Span)>, Error> {
        if let Some((marker, len)) = self.pending_start_marker.take() {
            return self.handle_start_marker(marker, len);
        }
        if self.trim_leading_whitespace {
            self.trim_leading_whitespace = false;
            self.skip_whitespace();
        }
        let old_loc = self.loc();
        let (lead, span) =
            match find_start_marker(self.source, self.current_offset, &self.syntax_config) {
                Some((start, marker, len, whitespace)) => {
                    self.pending_start_marker = Some((marker, len));
                    match whitespace {
                        Whitespace::Default
                            if should_lstrip_block(
                                self.ws_config.lstrip_blocks,
                                marker,
                                &self.source[..self.current_offset + start],
                            ) =>
                        {
                            let peeked = &self.rest()[..start];
                            let trimmed = lstrip_block(peeked);
                            let lead = self.advance(trimmed.len());
                            let span = self.span(old_loc);
                            self.advance(peeked.len() - trimmed.len());
                            (lead, span)
                        }
                        Whitespace::Default | Whitespace::Preserve => {
                            (self.advance(start), self.span(old_loc))
                        }
                        Whitespace::Remove => {
                            let peeked = &self.rest()[..start];
                            let trimmed = peeked.trim_end();
                            let lead = self.advance(trimmed.len());
                            let span = self.span(old_loc);
                            self.advance(peeked.len() - trimmed.len());
                            (lead, span)
                        }
                    }
                }
                None => (self.advance(self.rest().len()), self.span(old_loc)),
            };

        if lead.is_empty() {
            Ok(ControlFlow::Continue(()))
        } else {
            Ok(ControlFlow::Break((Token::TemplateData(lead), span)))
        }
    }

    fn handle_start_marker(
        &mut self,
        marker: StartMarker,
        skip: usize,
    ) -> Result<ControlFlow<(Token<'s>, Span)>, Error> {
        match marker {
            StartMarker::Comment => {
                if let Some(end) = memstr(&self.rest_bytes()[skip..], self.comment_end().as_bytes())
                {
                    let ws = Whitespace::from_byte(
                        self.rest_bytes().get(end.saturating_sub(1) + skip).copied(),
                    );
                    self.advance(end + skip + self.comment_end().len());
                    self.handle_tail_ws(ws);
                    Ok(ControlFlow::Continue(()))
                } else {
                    self.advance(self.rest_bytes().len());
                    Err(self.syntax_error("unexpected end of comment"))
                }
            }
            StartMarker::Variable => {
                let old_loc = self.loc();
                self.advance(skip);
                self.stack.push(LexerState::Variable);
                Ok(ControlFlow::Break((
                    Token::VariableStart,
                    self.span(old_loc),
                )))
            }
            StartMarker::Block => {
                if let Some((raw, ws_start)) =
                    skip_basic_tag(&self.rest()[skip..], "raw", self.block_end(), false)
                {
                    self.advance(raw + skip);
                    self.handle_raw_tag(ws_start)
                } else {
                    let old_loc = self.loc();
                    self.advance(skip);
                    self.stack.push(LexerState::Block);
                    Ok(ControlFlow::Break((Token::BlockStart, self.span(old_loc))))
                }
            }
            #[cfg(feature = "custom_syntax")]
            StartMarker::LineStatement => {
                let old_loc = self.loc();
                self.advance(skip);
                self.stack.push(LexerState::LineStatement);
                Ok(ControlFlow::Break((Token::BlockStart, self.span(old_loc))))
            }
            #[cfg(feature = "custom_syntax")]
            StartMarker::LineComment => {
                let comment_skip = self.rest_bytes()[skip..]
                    .iter()
                    .take_while(|&&c| c != b'\r' && c != b'\n')
                    .count();
                let (_, nl_skip) = skip_nl(&self.rest()[skip + comment_skip..]);
                self.advance(skip + comment_skip + nl_skip);
                Ok(ControlFlow::Continue(()))
            }
        }
    }

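    // Called after a `raw` block tag was consumed: scans ahead for the
    // matching `endraw` tag and emits everything in between verbatim as
    // template data, applying whitespace control on both ends.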
    fn handle_raw_tag(
        &mut self,
        ws_start: Whitespace,
    ) -> Result<ControlFlow<(Token<'s>, Span)>, Error> {
        let old_loc = self.loc();
        let mut ptr = 0;
        while let Some(block) = memstr(&self.rest_bytes()[ptr..], self.block_start().as_bytes()) {
            ptr += block + self.block_start().len();
            if let Some((endraw, ws_next)) =
                skip_basic_tag(&self.rest()[ptr..], "endraw", self.block_end(), true)
            {
                let ws = Whitespace::from_byte(self.rest_bytes().get(ptr).copied());
                let end = ptr - self.block_start().len();
                let mut result = &self.rest()[..end];
                self.advance(end);
                let span = self.span(old_loc);
                self.advance(self.block_start().len() + endraw);
                match ws_start {
                    Whitespace::Default if self.ws_config.trim_blocks => {
                        if result.starts_with('\r') {
                            result = &result[1..];
                        }
                        if result.starts_with('\n') {
                            result = &result[1..];
                        }
                    }
                    Whitespace::Remove => {
                        result = result.trim_start();
                    }
                    _ => {}
                }
                result = match ws {
                    Whitespace::Default if self.ws_config.lstrip_blocks => lstrip_block(result),
                    Whitespace::Remove => result.trim_end(),
                    _ => result,
                };
                self.handle_tail_ws(ws_next);
                return Ok(ControlFlow::Break((Token::TemplateData(result), span)));
            }
        }
        self.advance(self.rest_bytes().len());
        Err(self.syntax_error("unexpected end of raw block"))
    }

    fn tokenize_block_or_var(
        &mut self,
        sentinel: BlockSentinel,
    ) -> Result<ControlFlow<(Token<'s>, Span)>, Error> {
        let old_loc = self.loc();
        let rest = self.rest();

        #[cfg(feature = "custom_syntax")]
        {
            if matches!(sentinel, BlockSentinel::LineStatement)
                && self.paren_balance == 0
                && self.syntax_config.line_statement_prefix().is_some()
            {
                let skip = rest
                    .chars()
                    .take_while(|&x| x.is_whitespace() && !is_nl(x))
                    .map(|x| x.len_utf8())
                    .sum();
                let (was_nl, nl_skip) = skip_nl(&rest[skip..]);
                if was_nl {
                    self.advance(skip + nl_skip);
                    self.stack.pop();
                    return Ok(ControlFlow::Break((Token::BlockEnd, self.span(old_loc))));
                }
            }
        }

        match rest
            .as_bytes()
            .iter()
            .position(|&x| !x.is_ascii_whitespace())
        {
            Some(0) => {}
            None => {
                self.advance(rest.len());
                return Ok(ControlFlow::Continue(()));
            }
            Some(offset) => {
                self.advance(offset);
                return Ok(ControlFlow::Continue(()));
            }
        }

        if self.paren_balance == 0 {
            match sentinel {
                BlockSentinel::Block => {
                    if matches!(rest.get(..1), Some("-" | "+"))
                        && rest[1..].starts_with(self.block_end())
                    {
                        self.stack.pop();
                        let was_minus = &rest[..1] == "-";
                        self.advance(self.block_end().len() + 1);
                        let span = self.span(old_loc);
                        if was_minus {
                            self.trim_leading_whitespace = true;
                        }
                        return Ok(ControlFlow::Break((Token::BlockEnd, span)));
                    }
                    if rest.starts_with(self.block_end()) {
                        self.stack.pop();
                        self.advance(self.block_end().len());
                        let span = self.span(old_loc);
                        self.skip_newline_if_trim_blocks();
                        return Ok(ControlFlow::Break((Token::BlockEnd, span)));
                    }
                }
                BlockSentinel::Variable => {
                    if matches!(rest.get(..1), Some("-" | "+"))
                        && rest[1..].starts_with(self.variable_end())
                    {
                        self.stack.pop();
                        let was_minus = &rest[..1] == "-";
                        self.advance(self.variable_end().len() + 1);
                        let span = self.span(old_loc);
                        if was_minus {
                            self.trim_leading_whitespace = true;
                        }
                        return Ok(ControlFlow::Break((Token::VariableEnd, span)));
                    }
                    if rest.starts_with(self.variable_end()) {
                        self.stack.pop();
                        self.advance(self.variable_end().len());
                        return Ok(ControlFlow::Break((Token::VariableEnd, self.span(old_loc))));
                    }
                }
                #[cfg(feature = "custom_syntax")]
                BlockSentinel::LineStatement => {}
            }
        }

        let op = match rest.as_bytes().get(..2) {
            Some(b"//") => Some(Token::FloorDiv),
            Some(b"**") => Some(Token::Pow),
            Some(b"==") => Some(Token::Eq),
            Some(b"!=") => Some(Token::Ne),
            Some(b">=") => Some(Token::Gte),
            Some(b"<=") => Some(Token::Lte),
            _ => None,
        };
        if let Some(op) = op {
            self.advance(2);
            return Ok(ControlFlow::Break((op, self.span(old_loc))));
        }

        macro_rules! with_paren_balance {
            ($delta:expr, $tok:expr) => {{
                self.paren_balance += $delta;
                Some($tok)
            }};
        }

        let op = match rest.as_bytes().get(0) {
            Some(b'+') => Some(Token::Plus),
            Some(b'-') => Some(Token::Minus),
            Some(b'*') => Some(Token::Mul),
            Some(b'/') => Some(Token::Div),
            Some(b'%') => Some(Token::Mod),
            Some(b'.') => Some(Token::Dot),
            Some(b',') => Some(Token::Comma),
            Some(b':') => Some(Token::Colon),
            Some(b'~') => Some(Token::Tilde),
            Some(b'|') => Some(Token::Pipe),
            Some(b'=') => Some(Token::Assign),
            Some(b'>') => Some(Token::Gt),
            Some(b'<') => Some(Token::Lt),
            Some(b'(') => with_paren_balance!(1, Token::ParenOpen),
            Some(b')') => with_paren_balance!(-1, Token::ParenClose),
            Some(b'[') => with_paren_balance!(1, Token::BracketOpen),
            Some(b']') => with_paren_balance!(-1, Token::BracketClose),
            Some(b'{') => with_paren_balance!(1, Token::BraceOpen),
            Some(b'}') => with_paren_balance!(-1, Token::BraceClose),
            Some(b'\'') => {
                return Ok(ControlFlow::Break(ok!(self.eat_string(b'\''))));
            }
            Some(b'"') => {
                return Ok(ControlFlow::Break(ok!(self.eat_string(b'"'))));
            }
            Some(c) if c.is_ascii_digit() => return Ok(ControlFlow::Break(ok!(self.eat_number()))),
            _ => None,
        };
        if let Some(op) = op {
            self.advance(1);
            Ok(ControlFlow::Break((op, self.span(old_loc))))
        } else {
            Ok(ControlFlow::Break(ok!(self.eat_identifier())))
        }
    }
}

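/// Convenience helper that tokenizes `input` into an iterator of
/// `(Token, Span)` results (compiled for tests and the `unstable_machinery`
/// feature).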
#[cfg(any(test, feature = "unstable_machinery"))]
pub fn tokenize(
    input: &str,
    in_expr: bool,
    syntax_config: SyntaxConfig,
    whitespace_config: WhitespaceConfig,
) -> impl Iterator<Item = Result<(Token<'_>, Span), Error>> {
    let mut tokenizer =
        Tokenizer::new(input, "<string>", in_expr, syntax_config, whitespace_config);
    std::iter::from_fn(move || tokenizer.next_token().transpose())
}

#[cfg(test)]
mod tests {
    use super::*;

    use similar_asserts::assert_eq;

    #[test]
    fn test_is_basic_tag() {
        assert_eq!(
            skip_basic_tag(" raw %}", "raw", "%}", false),
            Some((7, Whitespace::Default))
        );
        assert_eq!(skip_basic_tag(" raw %}", "endraw", "%}", false), None);
        assert_eq!(
            skip_basic_tag("  raw  %}", "raw", "%}", false),
            Some((9, Whitespace::Default))
        );
        assert_eq!(
            skip_basic_tag("  raw  -%}", "raw", "%}", false),
            Some((10, Whitespace::Remove))
        );
        assert_eq!(
            skip_basic_tag("  raw  +%}", "raw", "%}", false),
            Some((10, Whitespace::Preserve))
        );
    }

    #[test]
    fn test_basic_identifiers() {
        fn assert_ident(s: &str) {
            match tokenize(s, true, Default::default(), Default::default()).next() {
                Some(Ok((Token::Ident(ident), _))) if ident == s => {}
                _ => panic!("did not get a matching token result: {s:?}"),
            }
        }

        fn assert_not_ident(s: &str) {
            let res = tokenize(s, true, Default::default(), Default::default())
                .collect::<Result<Vec<_>, _>>();
            if let Ok(tokens) = res {
                if let &[(Token::Ident(_), _)] = &tokens[..] {
                    panic!("got a single ident for {s:?}")
                }
            }
        }

        assert_ident("foo_bar_baz");
        assert_ident("_foo_bar_baz");
        assert_ident("_42world");
        assert_ident("_world42");
        assert_ident("world42");
        assert_not_ident("42world");

        #[cfg(feature = "unicode")]
        {
            assert_ident("foo");
            assert_ident("föö");
            assert_ident("き");
            assert_ident("_");
            assert_not_ident("1a");
            assert_not_ident("a-");
            assert_not_ident("🐍a");
            assert_not_ident("a🐍🐍");
            assert_ident("ᢅ");
            assert_ident("ᢆ");
            assert_ident("℘");
            assert_ident("℮");
            assert_not_ident("·");
            assert_ident("a·");
        }
    }
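
    // The two tests below are added sketches rather than part of the original
    // suite: they assume the default `{{ ... }}` / `{% ... %}` delimiters and
    // the default whitespace configuration.
    #[test]
    fn test_find_start_marker_memchr() {
        // Exercises the fast-path start marker scan used without custom syntax.
        assert_eq!(
            find_start_marker_memchr("foo {{ bar"),
            Some((4, StartMarker::Variable, 2, Whitespace::Default))
        );
        assert_eq!(
            find_start_marker_memchr("{%- if x %}"),
            Some((0, StartMarker::Block, 3, Whitespace::Remove))
        );
        assert_eq!(find_start_marker_memchr("no markers here"), None);
    }

    #[test]
    fn test_tokenize_basic_template() {
        // Minimal end-to-end smoke test: checks the token kinds only, spans
        // are intentionally ignored.
        let tokens = tokenize(
            "Hello {{ name }}!",
            false,
            Default::default(),
            Default::default(),
        )
        .map(|x| x.unwrap().0)
        .collect::<Vec<_>>();
        assert!(matches!(
            &tokens[..],
            [
                Token::TemplateData("Hello "),
                Token::VariableStart,
                Token::Ident("name"),
                Token::VariableEnd,
                Token::TemplateData("!"),
            ]
        ));
    }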
}