logos/
source.rs

//! This module contains the traits necessary for reading and slicing lexer input.
//!
//! Most notable are:
//! * `Source` - implemented by default for `&str`, `&[u8]` and wrapper types, used by the `Lexer`.
//! * `Source::Slice` - the associated type a `Source` is sliced into, returned by `Lexer::slice`.
//! * `Chunk` - fixed-size pieces (`u8`, `&[u8; N]`) that can be read from a `Source`.
6
7use core::fmt::Debug;
8use core::ops::{Deref, Range};
9
10/// Trait for types the `Lexer` can read from.
11///
12/// Most notably this is implemented for `&str`. It is unlikely you will
13/// ever want to use this Trait yourself, unless implementing a new `Source`
14/// the `Lexer` can use.
15///
16/// SAFETY: Unless the unsafe functions of this trait are disabled with the `forbid_unsafe`
17/// feature, the correctness of the unsafe functions of this trait depend on the correct
18/// implementation of the `len` and `find_boundary` functions so generated code does not request
19/// out-of-bounds access.
20#[allow(clippy::len_without_is_empty)]
21pub trait Source {
22    /// A type this `Source` can be sliced into.
23    type Slice<'a>: PartialEq + Eq + Debug
24    where
25        Self: 'a;
26
27    /// Length of the source
28    fn len(&self) -> usize;
29
30    /// Read a chunk of bytes into an array. Returns `None` when reading
31    /// out of bounds would occur.
32    ///
33    /// This is very useful for matching fixed-size byte arrays, and tends
34    /// to be very fast at it too, since the compiler knows the byte lengths.
35    ///
36    /// ```rust
37    /// use logos::Source;
38    ///
39    /// let foo = "foo";
40    ///
41    /// assert_eq!(foo.read(0), Some(b"foo"));     // Option<&[u8; 3]>
42    /// assert_eq!(foo.read(0), Some(b"fo"));      // Option<&[u8; 2]>
43    /// assert_eq!(foo.read(2), Some(b'o'));       // Option<u8>
44    /// assert_eq!(foo.read::<&[u8; 4]>(0), None); // Out of bounds
45    /// assert_eq!(foo.read::<&[u8; 2]>(2), None); // Out of bounds
46    /// ```
47    fn read<'a, Chunk>(&'a self, offset: usize) -> Option<Chunk>
48    where
49        Chunk: self::Chunk<'a>;
50
51    /// Read a byte without doing bounds checks.
52    ///
53    /// # Safety
54    ///
55    /// Offset should not exceed bounds.
56    #[cfg(not(feature = "forbid_unsafe"))]
57    unsafe fn read_byte_unchecked(&self, offset: usize) -> u8;
58
59    /// Read a byte with bounds checking.
60    #[cfg(feature = "forbid_unsafe")]
61    fn read_byte(&self, offset: usize) -> u8;
62
63    /// Get a slice of the source at given range. This is analogous to
64    /// `slice::get(range)`.
65    ///
66    /// ```rust
67    /// use logos::Source;
68    ///
69    /// let foo = "It was the year when they finally immanentized the Eschaton.";
70    /// assert_eq!(<str as Source>::slice(&foo, 51..59), Some("Eschaton"));
71    /// ```
72    fn slice(&self, range: Range<usize>) -> Option<Self::Slice<'_>>;
73
74    /// Get a slice of the source at given range. This is analogous to
75    /// `slice::get_unchecked(range)`.
76    ///
77    /// # Safety
78    ///
79    /// Range should not exceed bounds.
80    ///
81    /// ```rust
82    /// use logos::Source;
83    ///
84    /// let foo = "It was the year when they finally immanentized the Eschaton.";
85    ///
86    /// unsafe {
87    ///     assert_eq!(<str as Source>::slice_unchecked(&foo, 51..59), "Eschaton");
88    /// }
89    /// ```
90    #[cfg(not(feature = "forbid_unsafe"))]
91    unsafe fn slice_unchecked(&self, range: Range<usize>) -> Self::Slice<'_>;
92
93    /// For `&str` sources attempts to find the closest `char` boundary at which source
94    /// can be sliced, starting from `index`.
95    ///
96    /// For binary sources (`&[u8]`) this should just return `index` back.
97    #[inline]
98    fn find_boundary(&self, index: usize) -> usize {
99        index
100    }
101
102    /// Check if `index` is valid for this `Source`, that is:
103    ///
104    /// + It's not larger than the byte length of the `Source`.
105    /// + (`str` only) It doesn't land in the middle of a UTF-8 code point.
106    fn is_boundary(&self, index: usize) -> bool;
107}
108
109impl Source for str {
110    type Slice<'a> = &'a str;
111
112    #[inline]
113    fn len(&self) -> usize {
114        self.len()
115    }
116
117    #[inline]
118    fn read<'a, Chunk>(&'a self, offset: usize) -> Option<Chunk>
119    where
120        Chunk: self::Chunk<'a>,
121    {
122        #[cfg(not(feature = "forbid_unsafe"))]
123        if offset + (Chunk::SIZE - 1) < self.len() {
124            // # Safety: we just performed a bound check.
125            Some(unsafe { Chunk::from_ptr(self.as_ptr().add(offset)) })
126        } else {
127            None
128        }
129
130        #[cfg(feature = "forbid_unsafe")]
131        Chunk::from_slice(self.as_bytes().slice(offset..Chunk::SIZE + offset)?)
132    }
133
134    #[inline]
135    #[cfg(not(feature = "forbid_unsafe"))]
136    unsafe fn read_byte_unchecked(&self, offset: usize) -> u8 {
137        Chunk::from_ptr(self.as_ptr().add(offset))
138    }
139
140    #[inline]
141    #[cfg(feature = "forbid_unsafe")]
142    fn read_byte(&self, offset: usize) -> u8 {
143        self.as_bytes()[offset]
144    }
145
146    #[inline]
147    fn slice(&self, range: Range<usize>) -> Option<&str> {
148        self.get(range)
149    }
150
151    #[cfg(not(feature = "forbid_unsafe"))]
152    #[inline]
153    unsafe fn slice_unchecked(&self, range: Range<usize>) -> &str {
154        debug_assert!(
155            range.start <= self.len() && range.end <= self.len(),
156            "Reading out of bounds {:?} for {}!",
157            range,
158            self.len()
159        );
160
161        self.get_unchecked(range)
162    }
163
164    #[inline]
165    fn find_boundary(&self, mut index: usize) -> usize {
166        while !self.is_char_boundary(index) {
167            index += 1;
168        }
169
170        index
171    }
172
173    #[inline]
174    fn is_boundary(&self, index: usize) -> bool {
175        self.is_char_boundary(index)
176    }
177}
178
179impl Source for [u8] {
180    type Slice<'a> = &'a [u8];
181
182    #[inline]
183    fn len(&self) -> usize {
184        self.len()
185    }
186
187    #[inline]
188    fn read<'a, Chunk>(&'a self, offset: usize) -> Option<Chunk>
189    where
190        Chunk: self::Chunk<'a>,
191    {
192        #[cfg(not(feature = "forbid_unsafe"))]
193        if offset + (Chunk::SIZE - 1) < self.len() {
194            Some(unsafe { Chunk::from_ptr(self.as_ptr().add(offset)) })
195        } else {
196            None
197        }
198
199        #[cfg(feature = "forbid_unsafe")]
200        Chunk::from_slice(self.slice(offset..Chunk::SIZE + offset)?)
201    }
202
203    #[inline]
204    #[cfg(not(feature = "forbid_unsafe"))]
205    unsafe fn read_byte_unchecked(&self, offset: usize) -> u8 {
206        Chunk::from_ptr(self.as_ptr().add(offset))
207    }
208
209    #[inline]
210    #[cfg(feature = "forbid_unsafe")]
211    fn read_byte(&self, offset: usize) -> u8 {
212        self[offset]
213    }
214
215    #[inline]
216    fn slice(&self, range: Range<usize>) -> Option<&[u8]> {
217        self.get(range)
218    }
219
220    #[cfg(not(feature = "forbid_unsafe"))]
221    #[inline]
222    unsafe fn slice_unchecked(&self, range: Range<usize>) -> &[u8] {
223        debug_assert!(
224            range.start <= self.len() && range.end <= self.len(),
225            "Reading out of bounds {:?} for {}!",
226            range,
227            self.len()
228        );
229
230        self.get_unchecked(range)
231    }
232
233    #[inline]
234    fn is_boundary(&self, index: usize) -> bool {
235        index <= self.len()
236    }
237}
238
239impl<T> Source for T
240where
241    T: Deref,
242    <T as Deref>::Target: Source,
243{
244    type Slice<'a>
245        = <T::Target as Source>::Slice<'a>
246    where
247        T: 'a;
248
249    fn len(&self) -> usize {
250        self.deref().len()
251    }
252
253    fn read<'a, Chunk>(&'a self, offset: usize) -> Option<Chunk>
254    where
255        Chunk: self::Chunk<'a>,
256    {
257        self.deref().read(offset)
258    }
259
260    #[cfg(not(feature = "forbid_unsafe"))]
261    unsafe fn read_byte_unchecked(&self, offset: usize) -> u8 {
262        self.deref().read_byte_unchecked(offset)
263    }
264
265    #[cfg(feature = "forbid_unsafe")]
266    fn read_byte(&self, offset: usize) -> u8 {
267        self.deref().read_byte(offset)
268    }
269
270    fn slice(&self, range: Range<usize>) -> Option<Self::Slice<'_>> {
271        self.deref().slice(range)
272    }
273
274    #[cfg(not(feature = "forbid_unsafe"))]
275    unsafe fn slice_unchecked(&self, range: Range<usize>) -> Self::Slice<'_> {
276        self.deref().slice_unchecked(range)
277    }
278
279    fn is_boundary(&self, index: usize) -> bool {
280        self.deref().is_boundary(index)
281    }
282
283    fn find_boundary(&self, index: usize) -> usize {
284        self.deref().find_boundary(index)
285    }
286}
287
/// A fixed, statically sized chunk of data that can be read from the `Source`.
///
/// This is implemented for `u8`, as well as for byte array references `&[u8; N]`
/// of any length `N`.
pub trait Chunk<'source>: Sized + Copy + PartialEq + Eq {
    /// Size of the chunk being accessed in bytes.
    const SIZE: usize;

    /// Create a chunk from a raw byte pointer.
    ///
    /// # Safety
    ///
    /// `ptr` must point into the source, with at least `SIZE` readable bytes
    /// available starting at `ptr` for as long as the returned chunk is used.
    #[cfg(not(feature = "forbid_unsafe"))]
    unsafe fn from_ptr(ptr: *const u8) -> Self;

    /// Create a chunk from a slice.
    /// Returns `None` if the slice is not long enough to produce the chunk.
    #[cfg(feature = "forbid_unsafe")]
    fn from_slice(s: &'source [u8]) -> Option<Self>;
}
308
309#[allow(clippy::needless_lifetimes)]
310impl<'source> Chunk<'source> for u8 {
311    const SIZE: usize = 1;
312
313    #[inline]
314    #[cfg(not(feature = "forbid_unsafe"))]
315    unsafe fn from_ptr(ptr: *const u8) -> Self {
316        *ptr
317    }
318
319    #[inline]
320    #[cfg(feature = "forbid_unsafe")]
321    fn from_slice(s: &'source [u8]) -> Option<Self> {
322        s.first().copied()
323    }
324}
325
326impl<'source, const N: usize> Chunk<'source> for &'source [u8; N] {
327    const SIZE: usize = N;
328
329    #[inline]
330    #[cfg(not(feature = "forbid_unsafe"))]
331    unsafe fn from_ptr(ptr: *const u8) -> Self {
332        &*(ptr as *const [u8; N])
333    }
334
335    #[inline]
336    #[cfg(feature = "forbid_unsafe")]
337    fn from_slice(s: &'source [u8]) -> Option<Self> {
338        s.slice(0..Self::SIZE).and_then(|x| x.try_into().ok())
339    }
340}