logos_codegen/graph/
mod.rs

1use std::cmp::Ordering;
2use std::collections::btree_map::Entry;
3use std::collections::BTreeMap as Map;
4use std::hash::{Hash, Hasher};
5use std::num::NonZeroU32;
6use std::ops::Index;
7
8use fnv::FnvHasher;
9
10mod fork;
11mod impls;
12mod meta;
13mod range;
14mod regex;
15mod rope;
16
17pub use self::fork::Fork;
18pub use self::meta::Meta;
19pub use self::range::Range;
20pub use self::rope::Rope;
21
22/// Disambiguation error during the attempt to merge two leaf
23/// nodes with the same priority
24#[derive(Debug)]
25pub struct DisambiguationError(pub NodeId, pub NodeId);
26
27pub struct Graph<Leaf> {
28    /// Internal storage of all allocated nodes. Once a node is
29    /// put here, it should never be mutated.
30    nodes: Vec<Option<Node<Leaf>>>,
31    /// When merging two nodes into a new node, we store the two
32    /// entry keys and the result, so that we don't merge the same
33    /// two nodes multiple times.
34    ///
35    /// Most of the time the entry we want to find will be the last
36    /// one that has been inserted, so we can use a vec with reverse
37    /// order search to get O(1) searches much faster than any *Map.
38    merges: Map<Merge, NodeId>,
39    /// Another map used for accounting. Before `.push`ing a new node
40    /// onto the graph (inserts are exempt), we hash it and find if
41    /// an identical(!) node has been created before.
42    hashes: Map<u64, NodeId>,
43    /// Instead of handling errors over return types, opt to collect
44    /// them internally.
45    errors: Vec<DisambiguationError>,
46    /// Deferred merges. When when attempting to merge a node with an
47    /// empty reserved slot, the merging operation is deferred until
48    /// the reserved slot is populated. This is a stack that keeps track
49    /// of all such deferred merges
50    deferred: Vec<DeferredMerge>,
51}
52
53/// Trait to be implemented on `Leaf` nodes in order to disambiguate
54/// between them.
55pub trait Disambiguate {
56    fn cmp(left: &Self, right: &Self) -> Ordering;
57}
58
59/// Id of a Node in the graph. `NodeId` can be referencing an empty
60/// slot that is going to be populated later in time.
61#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
62pub struct NodeId(NonZeroU32);
63
64impl Hash for NodeId {
65    fn hash<H: Hasher>(&self, state: &mut H) {
66        // Always use little-endian byte order for hashing to avoid
67        // different code generation on big-endian platforms due to
68        // iteration over a HashMap,
69        // see https://github.com/maciejhirsz/logos/issues/427.
70        state.write(&self.0.get().to_le_bytes())
71    }
72}
73
74impl NodeId {
75    fn get(self) -> usize {
76        self.0.get() as usize
77    }
78
79    fn new(n: usize) -> NodeId {
80        NodeId(NonZeroU32::new(n as u32).expect("Invalid NodeId"))
81    }
82}
83
84/// Unique reserved `NodeId` that is guaranteed to point to an
85/// empty allocated slot in the graph. It's safe to create multiple
86/// `NodeId` copies of `ReservedId`, however API users should never
87/// be able to clone a `ReservedId`, or create a new one from `NodeId`.
88///
89/// `ReservedId` is consumed once passed into `Graph::insert`.
90#[derive(Debug)]
91pub struct ReservedId(NodeId);
92
93impl ReservedId {
94    pub fn get(&self) -> NodeId {
95        self.0
96    }
97}
98
99/// Merge key used to lookup whether two nodes have been previously
100/// mered, so we can avoid duplicating merges, potentially running into
101/// loops that blow out the stack.
102///
103/// `Merge::new(a, b)` should always equal to `Merge::new(b, a)` to ensure
104/// that node merges are symmetric.
105#[derive(PartialEq, Eq, PartialOrd, Ord, Debug)]
106struct Merge(NodeId, NodeId);
107
108impl Merge {
109    fn new(a: NodeId, b: NodeId) -> Self {
110        if a < b {
111            Merge(a, b)
112        } else {
113            Merge(b, a)
114        }
115    }
116}
117
118/// When attempting to merge two nodes, one of which was not yet created,
119/// we can record such attempt, and execute the merge later on when the
120/// `awaiting` has been `insert`ed into the graph.
121#[derive(Debug)]
122pub struct DeferredMerge {
123    awaiting: NodeId,
124    with: NodeId,
125    into: ReservedId,
126}
127
128impl<Leaf> Graph<Leaf> {
129    pub fn new() -> Self {
130        Graph {
131            // Start with an empty slot so we can start
132            // counting NodeIds from 1 and use NonZero
133            // optimizations
134            nodes: vec![None],
135            merges: Map::new(),
136            hashes: Map::new(),
137            errors: Vec::new(),
138            deferred: Vec::new(),
139        }
140    }
141
142    pub fn errors(&self) -> &[DisambiguationError] {
143        &self.errors
144    }
145
146    fn next_id(&self) -> NodeId {
147        NodeId::new(self.nodes.len())
148    }
149
150    /// Reserve an empty slot for a node on the graph and return an
151    /// id for it. `ReservedId` cannot be cloned, and must be consumed
152    /// by calling `insert` on the graph.
153    pub fn reserve(&mut self) -> ReservedId {
154        let id = self.next_id();
155
156        self.nodes.push(None);
157
158        ReservedId(id)
159    }
160
161    /// Insert a node at a given, previously reserved id. Returns the
162    /// inserted `NodeId`.
163    pub fn insert<N>(&mut self, reserved: ReservedId, node: N) -> NodeId
164    where
165        N: Into<Node<Leaf>>,
166        Leaf: Disambiguate,
167    {
168        let id = reserved.get();
169
170        self.nodes[id.get()] = Some(node.into());
171
172        let mut awaiting = Vec::new();
173
174        // Partition out all `DeferredMerge`s that can be completed
175        // now that this `ReservedId` has a `Node` inserted into it.
176        for idx in (0..self.deferred.len()).rev() {
177            if self.deferred[idx].awaiting == id {
178                awaiting.push(self.deferred.remove(idx));
179            }
180        }
181
182        // Complete deferred merges. We've collected them from the back,
183        // so we must iterate through them from the back as well to restore
184        // proper order of merges in case there is some cascading going on.
185        for DeferredMerge {
186            awaiting,
187            with,
188            into,
189        } in awaiting.into_iter().rev()
190        {
191            self.merge_unchecked(awaiting, with, into);
192        }
193
194        id
195    }
196
197    /// Push a node onto the graph and get an id to it. If an identical
198    /// node has already been pushed on the graph, it will return the id
199    /// of that node instead.
200    pub fn push<B>(&mut self, node: B) -> NodeId
201    where
202        B: Into<Node<Leaf>>,
203    {
204        let node = node.into();
205
206        if let Node::Leaf(_) = node {
207            return self.push_unchecked(node);
208        }
209
210        let mut hasher = FnvHasher::default();
211        node.hash(&mut hasher);
212
213        let next_id = self.next_id();
214
215        match self.hashes.entry(hasher.finish()) {
216            Entry::Occupied(occupied) => {
217                let id = *occupied.get();
218
219                if self[id].eq(&node) {
220                    return id;
221                }
222            }
223            Entry::Vacant(vacant) => {
224                vacant.insert(next_id);
225            }
226        }
227
228        self.push_unchecked(node)
229    }
230
231    fn push_unchecked(&mut self, node: Node<Leaf>) -> NodeId {
232        let id = self.next_id();
233
234        self.nodes.push(Some(node));
235
236        id
237    }
238
239    /// If nodes `a` and `b` have been already merged, return the
240    /// `NodeId` of the node they have been merged into.
241    fn find_merged(&self, a: NodeId, b: NodeId) -> Option<NodeId> {
242        let probe = Merge::new(a, b);
243
244        self.merges.get(&probe).copied()
245    }
246
247    /// Mark that nodes `a` and `b` have been merged into `product`.
248    ///
249    /// This will also mark merging `a` and `product`, as well as
250    /// `b` and `product` into `product`, since those are symmetric
251    /// operations.
252    ///
253    /// This is necessary to break out asymmetric merge loops.
254    fn set_merged(&mut self, a: NodeId, b: NodeId, product: NodeId) {
255        self.merges.insert(Merge::new(a, b), product);
256        self.merges.insert(Merge::new(a, product), product);
257        self.merges.insert(Merge::new(b, product), product);
258    }
259
260    /// Merge the nodes at id `a` and `b`, returning a new id.
261    pub fn merge(&mut self, a: NodeId, b: NodeId) -> NodeId
262    where
263        Leaf: Disambiguate,
264    {
265        if a == b {
266            return a;
267        }
268
269        // If the id pair is already merged (or is being merged), just return the id
270        if let Some(id) = self.find_merged(a, b) {
271            return id;
272        }
273
274        match (self.get(a), self.get(b)) {
275            (None, None) => {
276                panic!(
277                    "Merging two reserved nodes! This is a bug, please report it:\n\
278                    \n\
279                    https://github.com/maciejhirsz/logos/issues"
280                );
281            }
282            (None, Some(_)) => {
283                let reserved = self.reserve();
284                let id = reserved.get();
285                self.deferred.push(DeferredMerge {
286                    awaiting: a,
287                    with: b,
288                    into: reserved,
289                });
290                self.set_merged(a, b, id);
291
292                return id;
293            }
294            (Some(_), None) => {
295                let reserved = self.reserve();
296                let id = reserved.get();
297                self.deferred.push(DeferredMerge {
298                    awaiting: b,
299                    with: a,
300                    into: reserved,
301                });
302                self.set_merged(a, b, id);
303
304                return id;
305            }
306            (Some(Node::Leaf(left)), Some(Node::Leaf(right))) => {
307                return match Disambiguate::cmp(left, right) {
308                    Ordering::Less => b,
309                    Ordering::Greater => a,
310                    Ordering::Equal => {
311                        self.errors.push(DisambiguationError(a, b));
312
313                        a
314                    }
315                };
316            }
317            _ => (),
318        }
319
320        // Reserve the id for the merge and save it. Since the graph can contain loops,
321        // this prevents us from trying to merge the same id pair in a loop, blowing up
322        // the stack.
323        let reserved = self.reserve();
324        self.set_merged(a, b, reserved.get());
325
326        self.merge_unchecked(a, b, reserved)
327    }
328
329    /// Unchecked merge of `a` and `b`. This fn assumes that `a` and `b` are
330    /// not pointing to empty slots.
331    fn merge_unchecked(&mut self, a: NodeId, b: NodeId, reserved: ReservedId) -> NodeId
332    where
333        Leaf: Disambiguate,
334    {
335        let merged_rope = match (self.get(a), self.get(b)) {
336            (Some(Node::Rope(rope)), _) => {
337                let rope = rope.clone();
338
339                self.merge_rope(rope, b)
340            }
341            (_, Some(Node::Rope(rope))) => {
342                let rope = rope.clone();
343
344                self.merge_rope(rope, a)
345            }
346            _ => None,
347        };
348
349        if let Some(rope) = merged_rope {
350            return self.insert(reserved, rope);
351        }
352
353        let mut fork = self.fork_off(a);
354        fork.merge(self.fork_off(b), self);
355
356        let mut stack = vec![reserved.get()];
357
358        // Flatten the fork
359        while let Some(miss) = fork.miss {
360            if stack.contains(&miss) {
361                break;
362            }
363            stack.push(miss);
364
365            let other = match self.get(miss) {
366                Some(Node::Fork(other)) => other.clone(),
367                Some(Node::Rope(other)) => other.clone().into_fork(self),
368                _ => break,
369            };
370            match other.miss {
371                Some(id) if self.get(id).is_none() => break,
372                _ => (),
373            }
374            fork.miss = None;
375            fork.merge(other, self);
376        }
377
378        self.insert(reserved, fork)
379    }
380
381    fn merge_rope(&mut self, rope: Rope, other: NodeId) -> Option<Rope>
382    where
383        Leaf: Disambiguate,
384    {
385        match self.get(other) {
386            Some(Node::Fork(fork)) if rope.miss.is_none() => {
387                // Count how many consecutive ranges in this rope would
388                // branch into the fork that results in a loop.
389                //
390                // e.g.: for rope "foobar" and a looping fork [a-z]: 6
391                let count = rope
392                    .pattern
393                    .iter()
394                    .take_while(|range| fork.contains(**range) == Some(other))
395                    .count();
396
397                let mut rope = rope.split_at(count, self)?.miss_any(other);
398
399                rope.then = self.merge(rope.then, other);
400
401                Some(rope)
402            }
403            Some(Node::Rope(other)) => {
404                let (prefix, miss) = rope.prefix(other)?;
405
406                let (a, b) = (rope, other.clone());
407
408                let a = a.remainder(prefix.len(), self);
409                let b = b.remainder(prefix.len(), self);
410
411                let rope = Rope::new(prefix, self.merge(a, b)).miss(miss);
412
413                Some(rope)
414            }
415            Some(Node::Leaf(_)) | None => {
416                if rope.miss.is_none() {
417                    Some(rope.miss(other))
418                } else {
419                    None
420                }
421            }
422            _ => None,
423        }
424    }
425
426    pub fn fork_off(&mut self, id: NodeId) -> Fork
427    where
428        Leaf: Disambiguate,
429    {
430        match self.get(id) {
431            Some(Node::Fork(fork)) => fork.clone(),
432            Some(Node::Rope(rope)) => rope.clone().into_fork(self),
433            Some(Node::Leaf(_)) | None => Fork::new().miss(id),
434        }
435    }
436
437    pub fn nodes(&self) -> &[Option<Node<Leaf>>] {
438        &self.nodes
439    }
440
441    /// Find all nodes that have no references and remove them.
442    pub fn shake(&mut self, root: NodeId) {
443        let mut filter = vec![false; self.nodes.len()];
444
445        filter[root.get()] = true;
446
447        self[root].shake(self, &mut filter);
448
449        for (id, referenced) in filter.into_iter().enumerate() {
450            if !referenced {
451                self.nodes[id] = None;
452            }
453        }
454    }
455
456    pub fn get(&self, id: NodeId) -> Option<&Node<Leaf>> {
457        self.nodes.get(id.get())?.as_ref()
458    }
459}
460
461impl<Leaf> Index<NodeId> for Graph<Leaf> {
462    type Output = Node<Leaf>;
463
464    fn index(&self, id: NodeId) -> &Node<Leaf> {
465        self.get(id).expect(
466            "Indexing into an empty node. This is a bug, please report it at:\n\
467            \n\
468            https://github.com/maciejhirsz/logos/issues",
469        )
470    }
471}
472
473impl std::fmt::Display for NodeId {
474    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
475        std::fmt::Display::fmt(&self.0, f)
476    }
477}
478
479#[cfg_attr(test, derive(PartialEq))]
480pub enum Node<Leaf> {
481    /// Fork node, can lead to more than one state
482    Fork(Fork),
483    /// Rope node, can lead to one state on match, one state on miss
484    Rope(Rope),
485    /// Leaf node, terminal state
486    Leaf(Leaf),
487}
488
489impl<Leaf> Node<Leaf> {
490    pub fn miss(&self) -> Option<NodeId> {
491        match self {
492            Node::Rope(rope) => rope.miss.first(),
493            Node::Fork(fork) => fork.miss,
494            Node::Leaf(_) => None,
495        }
496    }
497
498    fn eq(&self, other: &Node<Leaf>) -> bool {
499        match (self, other) {
500            (Node::Fork(a), Node::Fork(b)) => a == b,
501            (Node::Rope(a), Node::Rope(b)) => a == b,
502            _ => false,
503        }
504    }
505
506    fn shake(&self, graph: &Graph<Leaf>, filter: &mut [bool]) {
507        match self {
508            Node::Fork(fork) => fork.shake(graph, filter),
509            Node::Rope(rope) => rope.shake(graph, filter),
510            Node::Leaf(_) => (),
511        }
512    }
513
514    pub fn unwrap_leaf(&self) -> &Leaf {
515        match self {
516            Node::Fork(_) => panic!("Internal Error: called unwrap_leaf on a fork"),
517            Node::Rope(_) => panic!("Internal Error: called unwrap_leaf on a rope"),
518            Node::Leaf(leaf) => leaf,
519        }
520    }
521}
522
523#[cfg(test)]
524mod tests {
525    use super::*;
526    use pretty_assertions::assert_eq;
527
528    #[test]
529    fn leaf_stack_size() {
530        use std::mem::size_of;
531
532        const WORD: usize = size_of::<usize>();
533        const NODE: usize = size_of::<Node<()>>();
534
535        assert!(NODE <= 6 * WORD, "Size of Node<()> is {} bytes!", NODE);
536    }
537
538    #[test]
539    fn create_a_loop() {
540        let mut graph = Graph::new();
541
542        let token = graph.push(Node::Leaf("IDENT"));
543        let id = graph.reserve();
544        let fork = Fork::new().branch('a'..='z', id.get()).miss(token);
545        let root = graph.insert(id, fork);
546
547        assert_eq!(graph[token], Node::Leaf("IDENT"));
548        assert_eq!(graph[root], Fork::new().branch('a'..='z', root).miss(token),);
549    }
550
551    #[test]
552    fn fork_off() {
553        let mut graph = Graph::new();
554
555        let leaf = graph.push(Node::Leaf("LEAF"));
556        let rope = graph.push(Rope::new("rope", leaf));
557        let fork = graph.push(Fork::new().branch(b'!', leaf));
558
559        assert_eq!(graph.fork_off(leaf), Fork::new().miss(leaf));
560        assert_eq!(
561            graph.fork_off(rope),
562            Fork::new().branch(b'r', NodeId::new(graph.nodes.len() - 1))
563        );
564        assert_eq!(graph.fork_off(fork), Fork::new().branch(b'!', leaf));
565    }
566}