logos_codegen/
mir.rs

1use std::convert::TryFrom;
2
3use lazy_static::lazy_static;
4use regex_syntax::hir::{Dot, Hir, HirKind};
5use regex_syntax::ParserBuilder;
6
7pub use regex_syntax::hir::{Class, ClassUnicode, Literal};
8
9use crate::error::{Error, Result};
10
11lazy_static! {
12    static ref DOT_UTF8: Hir = Hir::dot(Dot::AnyChar);
13    static ref DOT_BYTES: Hir = Hir::dot(Dot::AnyByte);
14}
15
16/// Middle Intermediate Representation of the regex, built from
17/// `regex_syntax`'s `Hir`. The goal here is to strip and canonicalize
18/// the tree, so that we don't have to do transformations later on the
19/// graph, with the potential of running into looping references.
20#[derive(Clone, Debug)]
21pub enum Mir {
22    Empty,
23    Loop(Box<Mir>),
24    Maybe(Box<Mir>),
25    Concat(Vec<Mir>),
26    Alternation(Vec<Mir>),
27    Class(Class),
28    Literal(Literal),
29}
30
31impl Mir {
32    pub fn utf8(source: &str) -> Result<Mir> {
33        Mir::try_from(ParserBuilder::new().build().parse(source)?)
34    }
35
36    pub fn utf8_ignore_case(source: &str) -> Result<Mir> {
37        Mir::try_from(
38            ParserBuilder::new()
39                .case_insensitive(true)
40                .build()
41                .parse(source)?,
42        )
43    }
44
45    pub fn binary(source: &str) -> Result<Mir> {
46        Mir::try_from(
47            ParserBuilder::new()
48                .utf8(false)
49                .unicode(false)
50                .build()
51                .parse(source)?,
52        )
53    }
54
55    pub fn binary_ignore_case(source: &str) -> Result<Mir> {
56        Mir::try_from(
57            ParserBuilder::new()
58                .utf8(false)
59                .unicode(false)
60                .case_insensitive(true)
61                .build()
62                .parse(source)?,
63        )
64    }
65
66    pub fn priority(&self) -> usize {
67        match self {
68            Mir::Empty | Mir::Loop(_) | Mir::Maybe(_) => 0,
69            Mir::Concat(concat) => concat.iter().map(Mir::priority).sum(),
70            Mir::Alternation(alt) => alt.iter().map(Mir::priority).min().unwrap_or(0),
71            Mir::Class(_) => 2,
72            Mir::Literal(lit) => match std::str::from_utf8(&lit.0) {
73                Ok(s) => 2 * s.chars().count(),
74                Err(_) => 2 * lit.0.len(),
75            },
76        }
77    }
78}
79
80impl TryFrom<Hir> for Mir {
81    type Error = Error;
82
83    fn try_from(hir: Hir) -> Result<Mir> {
84        match hir.into_kind() {
85            HirKind::Empty => Ok(Mir::Empty),
86            HirKind::Concat(concat) => {
87                let mut out = Vec::with_capacity(concat.len());
88
89                fn extend(mir: Mir, out: &mut Vec<Mir>) {
90                    match mir {
91                        Mir::Concat(nested) => {
92                            for child in nested {
93                                extend(child, out);
94                            }
95                        }
96                        mir => out.push(mir),
97                    }
98                }
99
100                for hir in concat {
101                    extend(Mir::try_from(hir)?, &mut out);
102                }
103
104                Ok(Mir::Concat(out))
105            }
106            HirKind::Alternation(alternation) => {
107                let alternation = alternation
108                    .into_iter()
109                    .map(Mir::try_from)
110                    .collect::<Result<_>>()?;
111
112                Ok(Mir::Alternation(alternation))
113            }
114            HirKind::Literal(literal) => Ok(Mir::Literal(literal)),
115            HirKind::Class(class) => Ok(Mir::Class(class)),
116            HirKind::Repetition(repetition) => {
117                if !repetition.greedy {
118                    return Err("#[regex]: non-greedy parsing is currently unsupported.".into());
119                }
120
121                let is_dot = if repetition.sub.properties().is_utf8() {
122                    *repetition.sub == *DOT_UTF8
123                } else {
124                    *repetition.sub == *DOT_BYTES
125                };
126                let mir = Mir::try_from(*repetition.sub)?;
127
128                match (repetition.min, repetition.max) {
129                    (0..=1, None) if is_dot => {
130                        Err(
131                            "#[regex]: \".+\" and \".*\" patterns will greedily consume \
132                            the entire source till the end as Logos does not allow \
133                            backtracking. If you are looking to match everything until \
134                            a specific character, you should use a negative character \
135                            class. E.g., use regex r\"'[^']*'\" to match anything in \
136                            between two quotes. Read more about that here: \
137                            https://github.com/maciejhirsz/logos/issues/302#issuecomment-1521342541."
138                            .into()
139                        )
140                    }
141                    // 0 or 1
142                    (0, Some(1)) => Ok(Mir::Maybe(Box::new(mir))),
143                    // 0 or more
144                    (0, None) => Ok(Mir::Loop(Box::new(mir))),
145                    // 1 or more
146                    (1, None) => {
147                        Ok(Mir::Concat(vec![mir.clone(), Mir::Loop(Box::new(mir))]))
148                    }
149                    // Exact {n}
150                    (n, Some(m)) if m == n => {
151                        let mut out = Vec::with_capacity(n as usize);
152                        for _ in 0..n {
153                            out.push(mir.clone());
154                        }
155                        Ok(Mir::Concat(out))
156                    }
157                    // At least {n,}
158                    (n, None) => {
159                        let mut out = Vec::with_capacity(n as usize);
160                        for _ in 0..n {
161                            out.push(mir.clone());
162                        }
163                        out.push(Mir::Loop(Box::new(mir)));
164                        Ok(Mir::Concat(out))
165                    }
166                    // Bounded {n, m}
167                    (n, Some(m)) => {
168                        let mut out = Vec::with_capacity(m as usize);
169                        for _ in 0..n {
170                            out.push(mir.clone());
171                        }
172                        for _ in n..m {
173                            out.push(Mir::Maybe(Box::new(mir.clone())));
174                        }
175                        Ok(Mir::Concat(out))
176                    }
177                }
178            }
179            HirKind::Capture(capture) => Mir::try_from(*capture.sub),
180            HirKind::Look(_) => {
181                Err("#[regex]: look-around assertions are currently unsupported.".into())
182            }
183        }
184    }
185}
186
187#[cfg(test)]
188mod tests {
189    use super::Mir;
190
191    #[test]
192    fn priorities() {
193        let regexes = [
194            ("a", 2),
195            ("à", 2),
196            ("京", 2),
197            ("Eté", 6),
198            ("Été", 6),
199            ("[a-z]+", 2),
200            ("a|b", 2),
201            ("a|[b-z]", 2),
202            ("(foo)+", 6),
203            ("foobar", 12),
204            ("(fooz|bar)+qux", 12),
205        ];
206
207        for (regex, expected) in regexes.iter() {
208            let mir = Mir::utf8(regex).unwrap();
209            assert_eq!(mir.priority(), *expected, "Failed for regex \"{}\"", regex);
210        }
211    }
212
213    #[test]
214    fn equivalent_patterns() {
215        let regexes = [
216            ("a|b", "[a-b]"),
217            ("1|2|3", "[1-3]"),
218            ("1+", "[1]+"),
219            ("c*", "[c]*"),
220            ("aaa", "a{3}"),
221            ("a[a]{2}", "a{3}"),
222        ];
223
224        for (regex_left, regex_right) in regexes.iter() {
225            let mir_left = Mir::utf8(regex_left).unwrap();
226            let mir_right = Mir::utf8(regex_right).unwrap();
227            assert_eq!(
228                mir_left.priority(),
229                mir_right.priority(),
230                "Regexes \"{regex_left}\" and \"{regex_right}\" \
231                are equivalent but have different priorities"
232            );
233        }
234    }
235}