1use std::convert::TryFrom;
2
3use lazy_static::lazy_static;
4use regex_syntax::hir::{Dot, Hir, HirKind};
5use regex_syntax::ParserBuilder;
6
7pub use regex_syntax::hir::{Class, ClassUnicode, Literal};
8
9use crate::error::{Error, Result};
10
11lazy_static! {
12 static ref DOT_UTF8: Hir = Hir::dot(Dot::AnyChar);
13 static ref DOT_BYTES: Hir = Hir::dot(Dot::AnyByte);
14}
15
16#[derive(Clone, Debug)]
21pub enum Mir {
22 Empty,
23 Loop(Box<Mir>),
24 Maybe(Box<Mir>),
25 Concat(Vec<Mir>),
26 Alternation(Vec<Mir>),
27 Class(Class),
28 Literal(Literal),
29}
30
31impl Mir {
32 pub fn utf8(source: &str) -> Result<Mir> {
33 Mir::try_from(ParserBuilder::new().build().parse(source)?)
34 }
35
36 pub fn utf8_ignore_case(source: &str) -> Result<Mir> {
37 Mir::try_from(
38 ParserBuilder::new()
39 .case_insensitive(true)
40 .build()
41 .parse(source)?,
42 )
43 }
44
45 pub fn binary(source: &str) -> Result<Mir> {
46 Mir::try_from(
47 ParserBuilder::new()
48 .utf8(false)
49 .unicode(false)
50 .build()
51 .parse(source)?,
52 )
53 }
54
55 pub fn binary_ignore_case(source: &str) -> Result<Mir> {
56 Mir::try_from(
57 ParserBuilder::new()
58 .utf8(false)
59 .unicode(false)
60 .case_insensitive(true)
61 .build()
62 .parse(source)?,
63 )
64 }
65
66 pub fn priority(&self) -> usize {
67 match self {
68 Mir::Empty | Mir::Loop(_) | Mir::Maybe(_) => 0,
69 Mir::Concat(concat) => concat.iter().map(Mir::priority).sum(),
70 Mir::Alternation(alt) => alt.iter().map(Mir::priority).min().unwrap_or(0),
71 Mir::Class(_) => 2,
72 Mir::Literal(lit) => match std::str::from_utf8(&lit.0) {
73 Ok(s) => 2 * s.chars().count(),
74 Err(_) => 2 * lit.0.len(),
75 },
76 }
77 }
78}
79
80impl TryFrom<Hir> for Mir {
81 type Error = Error;
82
83 fn try_from(hir: Hir) -> Result<Mir> {
84 match hir.into_kind() {
85 HirKind::Empty => Ok(Mir::Empty),
86 HirKind::Concat(concat) => {
87 let mut out = Vec::with_capacity(concat.len());
88
89 fn extend(mir: Mir, out: &mut Vec<Mir>) {
90 match mir {
91 Mir::Concat(nested) => {
92 for child in nested {
93 extend(child, out);
94 }
95 }
96 mir => out.push(mir),
97 }
98 }
99
100 for hir in concat {
101 extend(Mir::try_from(hir)?, &mut out);
102 }
103
104 Ok(Mir::Concat(out))
105 }
106 HirKind::Alternation(alternation) => {
107 let alternation = alternation
108 .into_iter()
109 .map(Mir::try_from)
110 .collect::<Result<_>>()?;
111
112 Ok(Mir::Alternation(alternation))
113 }
114 HirKind::Literal(literal) => Ok(Mir::Literal(literal)),
115 HirKind::Class(class) => Ok(Mir::Class(class)),
116 HirKind::Repetition(repetition) => {
117 if !repetition.greedy {
118 return Err("#[regex]: non-greedy parsing is currently unsupported.".into());
119 }
120
121 let is_dot = if repetition.sub.properties().is_utf8() {
122 *repetition.sub == *DOT_UTF8
123 } else {
124 *repetition.sub == *DOT_BYTES
125 };
126 let mir = Mir::try_from(*repetition.sub)?;
127
128 match (repetition.min, repetition.max) {
129 (0..=1, None) if is_dot => {
130 Err(
131 "#[regex]: \".+\" and \".*\" patterns will greedily consume \
132 the entire source till the end as Logos does not allow \
133 backtracking. If you are looking to match everything until \
134 a specific character, you should use a negative character \
135 class. E.g., use regex r\"'[^']*'\" to match anything in \
136 between two quotes. Read more about that here: \
137 https://github.com/maciejhirsz/logos/issues/302#issuecomment-1521342541."
138 .into()
139 )
140 }
141 (0, Some(1)) => Ok(Mir::Maybe(Box::new(mir))),
143 (0, None) => Ok(Mir::Loop(Box::new(mir))),
145 (1, None) => {
147 Ok(Mir::Concat(vec![mir.clone(), Mir::Loop(Box::new(mir))]))
148 }
149 (n, Some(m)) if m == n => {
151 let mut out = Vec::with_capacity(n as usize);
152 for _ in 0..n {
153 out.push(mir.clone());
154 }
155 Ok(Mir::Concat(out))
156 }
157 (n, None) => {
159 let mut out = Vec::with_capacity(n as usize);
160 for _ in 0..n {
161 out.push(mir.clone());
162 }
163 out.push(Mir::Loop(Box::new(mir)));
164 Ok(Mir::Concat(out))
165 }
166 (n, Some(m)) => {
168 let mut out = Vec::with_capacity(m as usize);
169 for _ in 0..n {
170 out.push(mir.clone());
171 }
172 for _ in n..m {
173 out.push(Mir::Maybe(Box::new(mir.clone())));
174 }
175 Ok(Mir::Concat(out))
176 }
177 }
178 }
179 HirKind::Capture(capture) => Mir::try_from(*capture.sub),
180 HirKind::Look(_) => {
181 Err("#[regex]: look-around assertions are currently unsupported.".into())
182 }
183 }
184 }
185}
186
187#[cfg(test)]
188mod tests {
189 use super::Mir;
190
191 #[test]
192 fn priorities() {
193 let regexes = [
194 ("a", 2),
195 ("à", 2),
196 ("京", 2),
197 ("Eté", 6),
198 ("Été", 6),
199 ("[a-z]+", 2),
200 ("a|b", 2),
201 ("a|[b-z]", 2),
202 ("(foo)+", 6),
203 ("foobar", 12),
204 ("(fooz|bar)+qux", 12),
205 ];
206
207 for (regex, expected) in regexes.iter() {
208 let mir = Mir::utf8(regex).unwrap();
209 assert_eq!(mir.priority(), *expected, "Failed for regex \"{}\"", regex);
210 }
211 }
212
213 #[test]
214 fn equivalent_patterns() {
215 let regexes = [
216 ("a|b", "[a-b]"),
217 ("1|2|3", "[1-3]"),
218 ("1+", "[1]+"),
219 ("c*", "[c]*"),
220 ("aaa", "a{3}"),
221 ("a[a]{2}", "a{3}"),
222 ];
223
224 for (regex_left, regex_right) in regexes.iter() {
225 let mir_left = Mir::utf8(regex_left).unwrap();
226 let mir_right = Mir::utf8(regex_right).unwrap();
227 assert_eq!(
228 mir_left.priority(),
229 mir_right.priority(),
230 "Regexes \"{regex_left}\" and \"{regex_right}\" \
231 are equivalent but have different priorities"
232 );
233 }
234 }
235}