sophia_api/term/
language_tag.rs

1//! I define the [`LanguageTag`] wrapper type,
2//! which guarantees that the underlying `str`
3//! is a valid [BCP47](https://tools.ietf.org/search/bcp47) language tag.
4
5use lazy_static::lazy_static;
6use regex::Regex;
7use std::borrow::Borrow;
8use std::cmp::{Ordering, PartialOrd};
9use std::fmt::Debug;
10use thiserror::Error;
11
lazy_static! {
    /// Regular expression approximating the grammar defined in BCP47
    /// (it is actually more permissive).
    ///
    /// It accepts one ASCII letter followed by any number of ASCII alphanumerics,
    /// then zero or more subtags, each made of a `-` and at least one ASCII alphanumeric.
    ///
    /// # Captures
    ///
    /// This regular expression matches the whole input (`^...$`),
    /// therefore it cannot be used to capture language tags in an arbitrary string.
    static ref LANG_TAG: Regex = Regex::new(r#"(?x)
      ^
      [A-Za-z][A-Za-z0-9]*
      (-[A-Za-z0-9]+)*
      $
    "#).unwrap();
}
27
/// This wrapper guarantees that the underlying `str`
/// is a valid [BCP47](https://tools.ietf.org/search/bcp47) language tag.
///
/// NB: it is actually more permissive than BCP47.
///
/// Equality, ordering and hashing of [`LanguageTag`]s ignore ASCII case,
/// so for instance `"en-US"` and `"en-us"` are treated as the same tag.
///
/// A [`LanguageTag`] can be combined with a `&str` using the `*` operator,
/// to produce an RDF [language tagged string](https://www.w3.org/TR/rdf11-concepts/#dfn-language-tagged-string)
/// implementing the [`Term`](crate::term::Term) trait:
///
/// ```
/// # use sophia_api::{ns::rdf, term::{LanguageTag, Term}};
/// let fr = LanguageTag::new_unchecked("fr");
/// let message = "Bonjour le monde" * fr;
/// assert!(message.is_literal());
/// assert_eq!(message.lexical_form().unwrap(), "Bonjour le monde");
/// assert_eq!(message.datatype().unwrap(), rdf::langString.iri().unwrap());
/// assert_eq!(message.language_tag().unwrap(), fr);
/// ```
#[derive(Clone, Copy, Debug)]
pub struct LanguageTag<T: Borrow<str>>(T);
48
49impl<T: Borrow<str>> LanguageTag<T> {
50    /// Build a new [`LanguageTag`] from `tag`,
51    /// returning an error if it is not a valid BCP47 language tag.
52    pub fn new(tag: T) -> Result<Self, InvalidLanguageTag> {
53        if LANG_TAG.is_match(tag.borrow()) {
54            Ok(LanguageTag(tag))
55        } else {
56            Err(InvalidLanguageTag(tag.borrow().to_string()))
57        }
58    }
59
60    /// Build a new [`LanguageTag`] from `tag`.
61    /// It does not check that the value returned by the function is valid.
62    /// If it is not, it may result in undefined behaviour.
63    pub fn new_unchecked(tag: T) -> Self {
64        assert!(LANG_TAG.is_match(tag.borrow()));
65        LanguageTag(tag)
66    }
67
68    /// Returns the wrapped value, consuming `self`.
69    pub fn unwrap(self) -> T {
70        self.0
71    }
72
73    /// Gets a reference to the underlying `str`.
74    pub fn as_str(&self) -> &str {
75        self.0.borrow()
76    }
77
78    /// Convert reference to a `LanguageTag<&str>`
79    pub fn as_ref(&self) -> LanguageTag<&str> {
80        LanguageTag(self.0.borrow())
81    }
82
83    /// Map a [`LanguageTag`]`<T>` to a [`LanguageTag`]`<U>`
84    /// by applying a function to the wrapped value.
85    ///
86    /// It does not check that the value returned by the function is valid.
87    /// If it is not, it may result in undefined behaviour.
88    pub fn map_unchecked<F, U>(self, f: F) -> LanguageTag<U>
89    where
90        F: FnOnce(T) -> U,
91        U: Borrow<str>,
92    {
93        LanguageTag(f(self.0))
94    }
95}
96
97impl LanguageTag<&'static str> {
98    /// Construct a `LanguageTag<&'static>`
99    /// without checking that the inner value is valid.
100    /// If it is not, it may result in undefined behaviour.
101    pub const fn new_unchecked_const(inner: &'static str) -> Self {
102        Self(inner)
103    }
104}
105
106impl<T: Borrow<str>> std::ops::Deref for LanguageTag<T> {
107    type Target = T;
108
109    fn deref(&self) -> &T {
110        &self.0
111    }
112}
113
114impl<T: Borrow<str>> AsRef<T> for LanguageTag<T> {
115    fn as_ref(&self) -> &T {
116        &self.0
117    }
118}
119
120impl<T: Borrow<str>> AsRef<str> for LanguageTag<T> {
121    fn as_ref(&self) -> &str {
122        self.0.borrow()
123    }
124}
125
126impl<T: Borrow<str>> Borrow<T> for LanguageTag<T> {
127    fn borrow(&self) -> &T {
128        &self.0
129    }
130}
131
132impl<T: Borrow<str>> Borrow<str> for LanguageTag<T> {
133    fn borrow(&self) -> &str {
134        self.0.borrow()
135    }
136}
137
138impl<T: Borrow<str>, U: Borrow<str>> PartialEq<LanguageTag<T>> for LanguageTag<U> {
139    fn eq(&self, other: &LanguageTag<T>) -> bool {
140        self.as_str().eq_ignore_ascii_case(other.as_str())
141    }
142}
143
144impl<T: Borrow<str>> PartialEq<str> for LanguageTag<T> {
145    fn eq(&self, other: &str) -> bool {
146        self.as_str().eq_ignore_ascii_case(other)
147    }
148}
149
// Marker impl: the `PartialEq` impl between `LanguageTag`s compares texts ignoring ASCII case,
// which is a full equivalence relation.
impl<T: Borrow<str>> Eq for LanguageTag<T> {}
151
152impl<T: Borrow<str>> PartialOrd for LanguageTag<T> {
153    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
154        Some(self.cmp(other))
155    }
156}
157
158impl<T: Borrow<str>> PartialOrd<str> for LanguageTag<T> {
159    fn partial_cmp(&self, other: &str) -> Option<Ordering> {
160        let iter1 = self.as_str().chars().map(|c| c.to_ascii_lowercase());
161        let iter2 = other.chars().map(|c| c.to_ascii_lowercase());
162        iter1.partial_cmp(iter2)
163    }
164}
165
166impl<T: Borrow<str>> Ord for LanguageTag<T> {
167    fn cmp(&self, other: &LanguageTag<T>) -> Ordering {
168        let iter1 = self.as_str().chars().map(|c| c.to_ascii_lowercase());
169        let iter2 = other.as_str().chars().map(|c| c.to_ascii_lowercase());
170        iter1.cmp(iter2)
171    }
172}
173
174impl<T: Borrow<str>> std::hash::Hash for LanguageTag<T> {
175    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
176        self.as_str()
177            .chars()
178            .map(|c| c.to_ascii_lowercase())
179            .for_each(|c| c.hash(state));
180    }
181}
182
/// This error is raised when trying to parse an invalid language tag.
///
/// The wrapped `String` is the text that was rejected;
/// it is interpolated in the error message.
#[derive(Debug, Error)]
#[error("The given language tag '{0}' does not comply with BCP47")]
pub struct InvalidLanguageTag(pub String);
187
188impl<'a> std::ops::Mul<LanguageTag<&'a str>> for &'a str {
189    type Output = super::SimpleTerm<'a>;
190
191    fn mul(self, rhs: LanguageTag<&'a str>) -> Self::Output {
192        super::SimpleTerm::LiteralLanguage(self.into(), rhs.map_unchecked(mownstr::MownStr::from))
193    }
194}
195
#[cfg(test)]
mod test {
    use crate::term::Term;

    use super::*;
    use test_case::test_case;

    // Tags that the checked constructor must accept.
    #[test_case("en")]
    #[test_case("fr")]
    #[test_case("fr-FR")]
    #[test_case("fr-ca")]
    #[test_case("fr-056")]
    #[test_case("ja-Hani")]
    #[test_case("ja-Hira")]
    #[test_case("abc-de-fg-hi")]
    #[test_case("x-abc-de-fg-hi")]
    fn valid(tag: &str) {
        let outcome = LanguageTag::new(tag);
        assert!(outcome.is_ok());
    }

    // Texts that the checked constructor must reject.
    #[test_case(""; "empty")]
    #[test_case(" "; "space")]
    #[test_case("éh")]
    #[test_case("a.")]
    fn invalid(tag: &str) {
        let outcome = LanguageTag::new(tag);
        assert!(outcome.is_err());
    }

    // Equality must ignore ASCII case, between tags and against plain `str`s.
    #[test_case("fr", "fr"; "all_lower")]
    #[test_case("fr-ca", "fr-ca"; "all_lower_with_country")]
    #[test_case("fr", "FR"; "language_differ")]
    #[test_case("en-us", "en-US"; "country_differ")]
    fn case_insensitive_eq(tag1: &str, tag2: &str) {
        let left = LanguageTag::new_unchecked(tag1);
        let right = LanguageTag::new_unchecked(tag2);
        assert_eq!(left, right); // LanguageTag == LanguageTag
        assert_eq!(&left, tag2); // &LanguageTag == &str
    }

    // Ordering must ignore ASCII case, between tags and against plain `str`s.
    #[test_case("EN", "FR"; "all_upper")]
    #[test_case("en", "fr"; "all_lower")]
    #[test_case("en", "FR"; "lower_upper")]
    #[test_case("EN", "fr"; "upper_lower")]
    #[test_case("en-UK", "en-US"; "counry_all_upper")]
    #[test_case("en-uk", "en-us"; "counry_all_lower")]
    #[test_case("en-uk", "en-US"; "counry_lower_upper")]
    #[test_case("en-UK", "en-us"; "counry_upper_lower")]
    fn case_insensitive_cmp(tag1: &str, tag2: &str) {
        let left = LanguageTag::new_unchecked(tag1);
        let right = LanguageTag::new_unchecked(tag2);
        assert!(left <= right); // LanguageTag <= LanguageTag
        assert!(&left <= tag2); // &LanguageTag <= &str
    }

    // `&str * LanguageTag` must produce a language-tagged literal.
    #[test]
    fn test_product() {
        let en = LanguageTag::new("en").unwrap();
        let frfr = LanguageTag::new("fr-FR").unwrap();
        for (lexical, tag) in [("chat", en), ("chat", frfr), ("cat", en)] {
            let literal = lexical * tag;
            assert!(literal.is_literal());
            assert_eq!(literal.lexical_form().unwrap(), lexical);
            assert_eq!(literal.language_tag().unwrap(), tag);
        }
    }
}