// oxilangtag/lib.rs

1#![doc = include_str!("../README.md")]
2#![cfg_attr(docsrs, feature(doc_auto_cfg))]
3#![deny(unsafe_code)]
4#![no_std]
5
6#[cfg(feature = "std")]
7extern crate std;
8
9extern crate alloc;
10
11use alloc::borrow::{Borrow, Cow};
12use alloc::boxed::Box;
13use alloc::fmt;
14use alloc::str::{FromStr, Split};
15use alloc::string::String;
16use core::cmp::Ordering;
17use core::hash::{Hash, Hasher};
18use core::iter::once;
19use core::ops::Deref;
20#[cfg(feature = "serde")]
21use serde::{Deserialize, Deserializer, Serialize, Serializer};
22
/// A [RFC 5646](https://tools.ietf.org/html/rfc5646) language tag.
///
/// The tag text is kept in a value of type `T` together with the positions
/// of its components, which are computed once when the tag is parsed.
///
/// ```
/// use oxilangtag::LanguageTag;
///
/// let language_tag = LanguageTag::parse("en-us").unwrap();
/// assert_eq!(language_tag.into_inner(), "en-us")
/// ```
#[derive(Copy, Clone)]
pub struct LanguageTag<T> {
    // The text of the language tag.
    tag: T,
    // End offsets of the tag components inside `tag`.
    positions: TagElementsPositions,
}
36
37impl<T: Deref<Target = str>> LanguageTag<T> {
38    /// Parses a language tag according to [RFC 5646](https://tools.ietf.org/html/rfc5646).
39    /// and checks if the tag is ["well-formed"](https://tools.ietf.org/html/rfc5646#section-2.2.9).
40    ///
41    /// This operation keeps internally the `tag` parameter and does not allocate on the heap.
42    ///
43    /// ```
44    /// use oxilangtag::LanguageTag;
45    ///
46    /// let language_tag = LanguageTag::parse("en-us").unwrap();
47    /// assert_eq!(language_tag.into_inner(), "en-us")
48    /// ```
49    pub fn parse(tag: T) -> Result<Self, LanguageTagParseError> {
50        let positions = parse_language_tag(&tag, &mut VoidOutputBuffer::default())?;
51        Ok(Self { tag, positions })
52    }
53
54    /// Returns the underlying language tag representation.
55    #[inline]
56    pub fn as_str(&self) -> &str {
57        &self.tag
58    }
59
60    /// Returns the underlying language tag representation.
61    #[inline]
62    pub fn into_inner(self) -> T {
63        self.tag
64    }
65
66    /// Returns the [primary language subtag](https://tools.ietf.org/html/rfc5646#section-2.2.1).
67    ///
68    /// ```
69    /// use oxilangtag::LanguageTag;
70    ///
71    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
72    /// assert_eq!(language_tag.primary_language(), "zh");
73    /// ```
74    #[inline]
75    pub fn primary_language(&self) -> &str {
76        &self.tag[..self.positions.language_end]
77    }
78
79    /// Returns the [extended language subtags](https://tools.ietf.org/html/rfc5646#section-2.2.2).
80    ///
81    /// Valid language tags have at most one extended language.
82    ///
83    /// ```
84    /// use oxilangtag::LanguageTag;
85    ///
86    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
87    /// assert_eq!(language_tag.extended_language(), Some("cmn"));
88    /// ```
89    #[inline]
90    pub fn extended_language(&self) -> Option<&str> {
91        if self.positions.language_end == self.positions.extlang_end {
92            None
93        } else {
94            Some(&self.tag[self.positions.language_end + 1..self.positions.extlang_end])
95        }
96    }
97
98    /// Iterates on the [extended language subtags](https://tools.ietf.org/html/rfc5646#section-2.2.2).
99    ///
100    /// Valid language tags have at most one extended language.
101    ///
102    /// ```
103    /// use oxilangtag::LanguageTag;
104    ///
105    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
106    /// assert_eq!(language_tag.extended_language_subtags().collect::<Vec<_>>(), vec!["cmn"]);
107    /// ```
108    #[inline]
109    pub fn extended_language_subtags(&self) -> impl Iterator<Item = &str> {
110        self.extended_language().unwrap_or("").split_terminator('-')
111    }
112
113    /// Returns the [primary language subtag](https://tools.ietf.org/html/rfc5646#section-2.2.1)
114    /// and its [extended language subtags](https://tools.ietf.org/html/rfc5646#section-2.2.2).
115    ///
116    /// ```
117    /// use oxilangtag::LanguageTag;
118    ///
119    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
120    /// assert_eq!(language_tag.full_language(), "zh-cmn");
121    /// ```
122    #[inline]
123    pub fn full_language(&self) -> &str {
124        &self.tag[..self.positions.extlang_end]
125    }
126
127    /// Returns the [script subtag](https://tools.ietf.org/html/rfc5646#section-2.2.3).
128    ///
129    /// ```
130    /// use oxilangtag::LanguageTag;
131    ///
132    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
133    /// assert_eq!(language_tag.script(), Some("Hans"));
134    /// ```
135    #[inline]
136    pub fn script(&self) -> Option<&str> {
137        if self.positions.extlang_end == self.positions.script_end {
138            None
139        } else {
140            Some(&self.tag[self.positions.extlang_end + 1..self.positions.script_end])
141        }
142    }
143
144    /// Returns the [region subtag](https://tools.ietf.org/html/rfc5646#section-2.2.4).
145    ///
146    /// ```
147    /// use oxilangtag::LanguageTag;
148    ///
149    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
150    /// assert_eq!(language_tag.region(), Some("CN"));
151    /// ```
152    #[inline]
153    pub fn region(&self) -> Option<&str> {
154        if self.positions.script_end == self.positions.region_end {
155            None
156        } else {
157            Some(&self.tag[self.positions.script_end + 1..self.positions.region_end])
158        }
159    }
160
161    /// Returns the [variant subtags](https://tools.ietf.org/html/rfc5646#section-2.2.5).
162    ///
163    /// ```
164    /// use oxilangtag::LanguageTag;
165    ///
166    /// let language_tag = LanguageTag::parse("zh-Latn-TW-pinyin").unwrap();
167    /// assert_eq!(language_tag.variant(), Some("pinyin"));
168    /// ```
169    #[inline]
170    pub fn variant(&self) -> Option<&str> {
171        if self.positions.region_end == self.positions.variant_end {
172            None
173        } else {
174            Some(&self.tag[self.positions.region_end + 1..self.positions.variant_end])
175        }
176    }
177
178    /// Iterates on the [variant subtags](https://tools.ietf.org/html/rfc5646#section-2.2.5).
179    ///
180    /// ```
181    /// use oxilangtag::LanguageTag;
182    ///
183    /// let language_tag = LanguageTag::parse("zh-Latn-TW-pinyin").unwrap();
184    /// assert_eq!(language_tag.variant_subtags().collect::<Vec<_>>(), vec!["pinyin"]);
185    /// ```
186    #[inline]
187    pub fn variant_subtags(&self) -> impl Iterator<Item = &str> {
188        self.variant().unwrap_or("").split_terminator('-')
189    }
190
191    /// Returns the [extension subtags](https://tools.ietf.org/html/rfc5646#section-2.2.6).
192    ///
193    /// ```
194    /// use oxilangtag::LanguageTag;
195    ///
196    /// let language_tag = LanguageTag::parse("de-DE-u-co-phonebk").unwrap();
197    /// assert_eq!(language_tag.extension(), Some("u-co-phonebk"));
198    /// ```
199    #[inline]
200    pub fn extension(&self) -> Option<&str> {
201        if self.positions.variant_end == self.positions.extension_end {
202            None
203        } else {
204            Some(&self.tag[self.positions.variant_end + 1..self.positions.extension_end])
205        }
206    }
207
208    /// Iterates on the [extension subtags](https://tools.ietf.org/html/rfc5646#section-2.2.6).
209    ///
210    /// ```
211    /// use oxilangtag::LanguageTag;
212    ///
213    /// let language_tag = LanguageTag::parse("de-DE-u-co-phonebk").unwrap();
214    /// assert_eq!(language_tag.extension_subtags().collect::<Vec<_>>(), vec![('u', "co-phonebk")]);
215    /// ```
216    #[inline]
217    pub fn extension_subtags(&self) -> impl Iterator<Item = (char, &str)> {
218        match self.extension() {
219            Some(parts) => ExtensionsIterator::new(parts),
220            None => ExtensionsIterator::new(""),
221        }
222    }
223
224    /// Returns the [private use subtags](https://tools.ietf.org/html/rfc5646#section-2.2.7).
225    ///
226    /// ```
227    /// use oxilangtag::LanguageTag;
228    ///
229    /// let language_tag = LanguageTag::parse("de-x-foo-bar").unwrap();
230    /// assert_eq!(language_tag.private_use(), Some("x-foo-bar"));
231    /// ```
232    #[inline]
233    pub fn private_use(&self) -> Option<&str> {
234        if self.tag.starts_with("x-") {
235            Some(&self.tag)
236        } else if self.positions.extension_end == self.tag.len() {
237            None
238        } else {
239            Some(&self.tag[self.positions.extension_end + 1..])
240        }
241    }
242
243    /// Iterates on the [private use subtags](https://tools.ietf.org/html/rfc5646#section-2.2.7).
244    ///
245    /// ```
246    /// use oxilangtag::LanguageTag;
247    ///
248    /// let language_tag = LanguageTag::parse("de-x-foo-bar").unwrap();
249    /// assert_eq!(language_tag.private_use_subtags().collect::<Vec<_>>(), vec!["foo", "bar"]);
250    /// ```
251    #[inline]
252    pub fn private_use_subtags(&self) -> impl Iterator<Item = &str> {
253        self.private_use()
254            .map(|part| &part[2..])
255            .unwrap_or("")
256            .split_terminator('-')
257    }
258}
259
260impl LanguageTag<String> {
261    /// Parses a language tag according to [RFC 5646](https://tools.ietf.org/html/rfc5646)
262    /// and normalizes its case.
263    ///
264    /// This parser accepts the language tags that are "well-formed" according to
265    /// [RFC 5646](https://tools.ietf.org/html/rfc5646#section-2.2.9).
266    ///
267    /// This operation does heap allocation.
268    ///
269    /// ```
270    /// use oxilangtag::LanguageTag;
271    ///
272    /// let language_tag = LanguageTag::parse_and_normalize("en-us").unwrap();
273    /// assert_eq!(language_tag.into_inner(), "en-US")
274    /// ```
275    pub fn parse_and_normalize(tag: &str) -> Result<Self, LanguageTagParseError> {
276        let mut output_buffer = String::with_capacity(tag.len());
277        let positions = parse_language_tag(tag, &mut output_buffer)?;
278        Ok(Self {
279            tag: output_buffer,
280            positions,
281        })
282    }
283}
284
285impl<Lft: PartialEq<Rhs>, Rhs> PartialEq<LanguageTag<Rhs>> for LanguageTag<Lft> {
286    #[inline]
287    fn eq(&self, other: &LanguageTag<Rhs>) -> bool {
288        self.tag.eq(&other.tag)
289    }
290}
291
292impl<T: PartialEq<str>> PartialEq<str> for LanguageTag<T> {
293    #[inline]
294    fn eq(&self, other: &str) -> bool {
295        self.tag.eq(other)
296    }
297}
298
299impl<'a, T: PartialEq<&'a str>> PartialEq<&'a str> for LanguageTag<T> {
300    #[inline]
301    fn eq(&self, other: &&'a str) -> bool {
302        self.tag.eq(other)
303    }
304}
305
306impl<T: PartialEq<String>> PartialEq<String> for LanguageTag<T> {
307    #[inline]
308    fn eq(&self, other: &String) -> bool {
309        self.tag.eq(other)
310    }
311}
312
313impl<'a, T: PartialEq<Cow<'a, str>>> PartialEq<Cow<'a, str>> for LanguageTag<T> {
314    #[inline]
315    fn eq(&self, other: &Cow<'a, str>) -> bool {
316        self.tag.eq(other)
317    }
318}
319
320impl<T: PartialEq<str>> PartialEq<LanguageTag<T>> for str {
321    #[inline]
322    fn eq(&self, other: &LanguageTag<T>) -> bool {
323        other.tag.eq(self)
324    }
325}
326
327impl<'a, T: PartialEq<&'a str>> PartialEq<LanguageTag<T>> for &'a str {
328    #[inline]
329    fn eq(&self, other: &LanguageTag<T>) -> bool {
330        other.tag.eq(self)
331    }
332}
333
334impl<T: PartialEq<String>> PartialEq<LanguageTag<T>> for String {
335    #[inline]
336    fn eq(&self, other: &LanguageTag<T>) -> bool {
337        other.tag.eq(self)
338    }
339}
340
341impl<'a, T: PartialEq<Cow<'a, str>>> PartialEq<LanguageTag<T>> for Cow<'a, str> {
342    #[inline]
343    fn eq(&self, other: &LanguageTag<T>) -> bool {
344        other.tag.eq(self)
345    }
346}
347
// Equality of language tags is the equality of their underlying representations,
// so it is a full equivalence relation whenever it is one for `T`.
impl<T: Eq> Eq for LanguageTag<T> {}
349
350impl<T: Hash> Hash for LanguageTag<T> {
351    #[inline]
352    fn hash<H: Hasher>(&self, state: &mut H) {
353        self.tag.hash(state)
354    }
355}
356
357impl<T: PartialOrd> PartialOrd for LanguageTag<T> {
358    #[inline]
359    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
360        self.tag.partial_cmp(&other.tag)
361    }
362}
363
364impl<T: Ord> Ord for LanguageTag<T> {
365    #[inline]
366    fn cmp(&self, other: &Self) -> Ordering {
367        self.tag.cmp(&other.tag)
368    }
369}
370
371impl<T: Deref<Target = str>> Deref for LanguageTag<T> {
372    type Target = str;
373
374    #[inline]
375    fn deref(&self) -> &str {
376        self.tag.deref()
377    }
378}
379
380impl<T: AsRef<str>> AsRef<str> for LanguageTag<T> {
381    #[inline]
382    fn as_ref(&self) -> &str {
383        self.tag.as_ref()
384    }
385}
386
387impl<T: Borrow<str>> Borrow<str> for LanguageTag<T> {
388    #[inline]
389    fn borrow(&self) -> &str {
390        self.tag.borrow()
391    }
392}
393
394impl<T: fmt::Debug> fmt::Debug for LanguageTag<T> {
395    #[inline]
396    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
397        self.tag.fmt(f)
398    }
399}
400
401impl<T: fmt::Display> fmt::Display for LanguageTag<T> {
402    #[inline]
403    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
404        self.tag.fmt(f)
405    }
406}
407
408impl FromStr for LanguageTag<String> {
409    type Err = LanguageTagParseError;
410
411    #[inline]
412    fn from_str(tag: &str) -> Result<Self, LanguageTagParseError> {
413        Self::parse_and_normalize(tag)
414    }
415}
416
417impl<'a> From<LanguageTag<&'a str>> for LanguageTag<String> {
418    #[inline]
419    fn from(tag: LanguageTag<&'a str>) -> Self {
420        Self {
421            tag: tag.tag.into(),
422            positions: tag.positions,
423        }
424    }
425}
426
427impl<'a> From<LanguageTag<Cow<'a, str>>> for LanguageTag<String> {
428    #[inline]
429    fn from(tag: LanguageTag<Cow<'a, str>>) -> Self {
430        Self {
431            tag: tag.tag.into(),
432            positions: tag.positions,
433        }
434    }
435}
436
437impl From<LanguageTag<Box<str>>> for LanguageTag<String> {
438    #[inline]
439    fn from(tag: LanguageTag<Box<str>>) -> Self {
440        Self {
441            tag: tag.tag.into(),
442            positions: tag.positions,
443        }
444    }
445}
446
447impl<'a> From<LanguageTag<&'a str>> for LanguageTag<Cow<'a, str>> {
448    #[inline]
449    fn from(tag: LanguageTag<&'a str>) -> Self {
450        Self {
451            tag: tag.tag.into(),
452            positions: tag.positions,
453        }
454    }
455}
456
457impl<'a> From<LanguageTag<String>> for LanguageTag<Cow<'a, str>> {
458    #[inline]
459    fn from(tag: LanguageTag<String>) -> Self {
460        Self {
461            tag: tag.tag.into(),
462            positions: tag.positions,
463        }
464    }
465}
466
/// Serializes the language tag like its underlying representation.
#[cfg(feature = "serde")]
impl<T: Serialize> Serialize for LanguageTag<T> {
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        T::serialize(&self.tag, serializer)
    }
}
473
/// Deserializes the underlying representation and validates it with `LanguageTag::parse`.
#[cfg(feature = "serde")]
impl<'de, T: Deref<Target = str> + Deserialize<'de>> Deserialize<'de> for LanguageTag<T> {
    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<LanguageTag<T>, D::Error> {
        use serde::de::Error;

        let raw = T::deserialize(deserializer)?;
        // A tag that is not well-formed is reported as a custom deserialization error.
        LanguageTag::parse(raw).map_err(D::Error::custom)
    }
}
482
/// An error raised during [`LanguageTag`] validation.
///
/// The reason of the failure is available through the `Display` implementation.
#[derive(Debug)]
pub struct LanguageTagParseError {
    // The reason why the parsing failed.
    kind: TagParseErrorKind,
}
488
489impl fmt::Display for LanguageTagParseError {
490    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
491        match self.kind {
492            TagParseErrorKind::EmptyExtension => {
493                write!(f, "If an extension subtag is present, it must not be empty")
494            }
495            TagParseErrorKind::EmptyPrivateUse => {
496                write!(f, "If the `x` subtag is present, it must not be empty")
497            }
498            TagParseErrorKind::ForbiddenChar => {
499                write!(f, "The langtag contains a char not allowed")
500            }
501            TagParseErrorKind::InvalidSubtag => write!(
502                f,
503                "A subtag fails to parse, it does not match any other subtags"
504            ),
505            TagParseErrorKind::InvalidLanguage => write!(f, "The given language subtag is invalid"),
506            TagParseErrorKind::SubtagTooLong => {
507                write!(f, "A subtag may be eight characters in length at maximum")
508            }
509            TagParseErrorKind::EmptySubtag => write!(f, "A subtag should not be empty"),
510            TagParseErrorKind::TooManyExtlangs => {
511                write!(f, "At maximum three extlangs are allowed")
512            }
513        }
514    }
515}
516
// The `Error` trait is only implemented when the `std` feature is enabled.
// NOTE(review): `core::error::Error` has been stable since Rust 1.81, so this
// implementation could move to it and drop the `std` gate once the minimum
// supported Rust version of the crate allows it (to be confirmed), see
// https://github.com/rust-lang/rust/issues/103765
#[cfg(feature = "std")]
impl std::error::Error for LanguageTagParseError {}
521
/// The reasons why the parsing of a language tag may fail.
#[derive(Debug)]
enum TagParseErrorKind {
    /// If an extension subtag is present, it must not be empty.
    EmptyExtension,
    /// If the `x` subtag is present, it must not be empty.
    EmptyPrivateUse,
    /// The langtag contains a char that is not A-Z, a-z, 0-9 or the dash.
    ForbiddenChar,
    /// A subtag fails to parse, it does not match any other subtags.
    InvalidSubtag,
    /// The given language subtag is invalid.
    InvalidLanguage,
    /// A subtag may be eight characters in length at maximum.
    SubtagTooLong,
    /// A subtag should not be empty.
    EmptySubtag,
    /// At maximum three extlangs are allowed, but zero to one extlangs are preferred.
    TooManyExtlangs,
}
541
/// Byte offsets of the end of each component of a language tag.
///
/// A component is absent when its end offset is equal to the end offset of
/// the previous component. Otherwise it starts one byte (the `-` separator)
/// after the end of the previous component.
#[derive(Copy, Clone, Debug)]
struct TagElementsPositions {
    // End of the primary language subtag.
    language_end: usize,
    // End of the extended language subtags.
    extlang_end: usize,
    // End of the script subtag.
    script_end: usize,
    // End of the region subtag.
    region_end: usize,
    // End of the variant subtags.
    variant_end: usize,
    // End of the extension subtags; what follows, if anything, is the private use part.
    extension_end: usize,
}
551
/// Destination of the normalized language tag written by the parser.
trait OutputBuffer: Extend<char> {
    /// Appends a single character.
    fn push(&mut self, c: char);

    /// Appends a string slice.
    fn push_str(&mut self, s: &str);
}

/// An output buffer that drops everything written to it.
///
/// It is used when only the validation of the tag is needed.
#[derive(Default)]
struct VoidOutputBuffer;

impl OutputBuffer for VoidOutputBuffer {
    #[inline]
    fn push(&mut self, _c: char) {}

    #[inline]
    fn push_str(&mut self, _s: &str) {}
}

impl Extend<char> for VoidOutputBuffer {
    #[inline]
    fn extend<I: IntoIterator<Item = char>>(&mut self, _chars: I) {}
}

/// A `String` keeps everything written to it.
impl OutputBuffer for String {
    #[inline]
    fn push(&mut self, c: char) {
        // Calls the inherent `String::push`, not this trait method.
        String::push(self, c);
    }

    #[inline]
    fn push_str(&mut self, s: &str) {
        // Calls the inherent `String::push_str`, not this trait method.
        String::push_str(self, s);
    }
}
585
586/// Parses language tag following [the RFC5646 grammar](https://tools.ietf.org/html/rfc5646#section-2.1)
587fn parse_language_tag(
588    input: &str,
589    output: &mut impl OutputBuffer,
590) -> Result<TagElementsPositions, LanguageTagParseError> {
591    //grandfathered tags
592    if let Some(tag) = GRANDFATHEREDS
593        .iter()
594        .find(|record| record.eq_ignore_ascii_case(input))
595    {
596        output.push_str(tag);
597        Ok(TagElementsPositions {
598            language_end: tag.len(),
599            extlang_end: tag.len(),
600            script_end: tag.len(),
601            region_end: tag.len(),
602            variant_end: tag.len(),
603            extension_end: tag.len(),
604        })
605    } else if input.starts_with("x-") || input.starts_with("X-") {
606        // private use
607        if !is_alphanumeric_or_dash(input) {
608            Err(LanguageTagParseError {
609                kind: TagParseErrorKind::ForbiddenChar,
610            })
611        } else if input.len() == 2 {
612            Err(LanguageTagParseError {
613                kind: TagParseErrorKind::EmptyPrivateUse,
614            })
615        } else {
616            output.extend(input.chars().map(|c| c.to_ascii_lowercase()));
617            Ok(TagElementsPositions {
618                language_end: input.len(),
619                extlang_end: input.len(),
620                script_end: input.len(),
621                region_end: input.len(),
622                variant_end: input.len(),
623                extension_end: input.len(),
624            })
625        }
626    } else {
627        parse_langtag(input, output)
628    }
629}
630
631/// Handles normal tags.
632fn parse_langtag(
633    input: &str,
634    output: &mut impl OutputBuffer,
635) -> Result<TagElementsPositions, LanguageTagParseError> {
636    #[derive(PartialEq, Eq)]
637    enum State {
638        Start,
639        AfterLanguage,
640        AfterExtLang,
641        AfterScript,
642        AfterRegion,
643        InExtension { expected: bool },
644        InPrivateUse { expected: bool },
645    }
646
647    let mut state = State::Start;
648    let mut language_end = 0;
649    let mut extlang_end = 0;
650    let mut script_end = 0;
651    let mut region_end = 0;
652    let mut variant_end = 0;
653    let mut extension_end = 0;
654    let mut extlangs_count = 0;
655    for (subtag, end) in SubTagIterator::new(input) {
656        if subtag.is_empty() {
657            return Err(LanguageTagParseError {
658                kind: TagParseErrorKind::EmptySubtag,
659            });
660        }
661        if subtag.len() > 8 {
662            return Err(LanguageTagParseError {
663                kind: TagParseErrorKind::SubtagTooLong,
664            });
665        }
666        if state == State::Start {
667            // Primary language
668            if subtag.len() < 2 || !is_alphabetic(subtag) {
669                return Err(LanguageTagParseError {
670                    kind: TagParseErrorKind::InvalidLanguage,
671                });
672            }
673            language_end = end;
674            output.extend(to_lowercase(subtag));
675            if subtag.len() < 4 {
676                // extlangs are only allowed for short language tags
677                state = State::AfterLanguage;
678            } else {
679                state = State::AfterExtLang;
680            }
681        } else if let State::InPrivateUse { .. } = state {
682            if !is_alphanumeric(subtag) {
683                return Err(LanguageTagParseError {
684                    kind: TagParseErrorKind::InvalidSubtag,
685                });
686            }
687            output.push('-');
688            output.extend(to_lowercase(subtag));
689            state = State::InPrivateUse { expected: false };
690        } else if subtag == "x" || subtag == "X" {
691            // We make sure extension is found
692            if let State::InExtension { expected: true } = state {
693                return Err(LanguageTagParseError {
694                    kind: TagParseErrorKind::EmptyExtension,
695                });
696            }
697            output.push('-');
698            output.push('x');
699            state = State::InPrivateUse { expected: true };
700        } else if subtag.len() == 1 && is_alphanumeric(subtag) {
701            // We make sure extension is found
702            if let State::InExtension { expected: true } = state {
703                return Err(LanguageTagParseError {
704                    kind: TagParseErrorKind::EmptyExtension,
705                });
706            }
707            let extension_tag = subtag.chars().next().unwrap().to_ascii_lowercase();
708            output.push('-');
709            output.push(extension_tag);
710            state = State::InExtension { expected: true };
711        } else if let State::InExtension { .. } = state {
712            if !is_alphanumeric(subtag) {
713                return Err(LanguageTagParseError {
714                    kind: TagParseErrorKind::InvalidSubtag,
715                });
716            }
717            extension_end = end;
718            output.push('-');
719            output.extend(to_lowercase(subtag));
720            state = State::InExtension { expected: false };
721        } else if state == State::AfterLanguage && subtag.len() == 3 && is_alphabetic(subtag) {
722            extlangs_count += 1;
723            if extlangs_count > 3 {
724                return Err(LanguageTagParseError {
725                    kind: TagParseErrorKind::TooManyExtlangs,
726                });
727            }
728            // valid extlangs
729            extlang_end = end;
730            output.push('-');
731            output.extend(to_lowercase(subtag));
732        } else if (state == State::AfterLanguage || state == State::AfterExtLang)
733            && subtag.len() == 4
734            && is_alphabetic(subtag)
735        {
736            // Script
737            script_end = end;
738            output.push('-');
739            output.extend(to_uppercase_first(subtag));
740            state = State::AfterScript;
741        } else if (state == State::AfterLanguage
742            || state == State::AfterExtLang
743            || state == State::AfterScript)
744            && (subtag.len() == 2 && is_alphabetic(subtag)
745                || subtag.len() == 3 && is_numeric(subtag))
746        {
747            // Region
748            region_end = end;
749            output.push('-');
750            output.extend(to_uppercase(subtag));
751            state = State::AfterRegion;
752        } else if (state == State::AfterLanguage
753            || state == State::AfterExtLang
754            || state == State::AfterScript
755            || state == State::AfterRegion)
756            && is_alphanumeric(subtag)
757            && (subtag.len() >= 5 && is_alphabetic(&subtag[0..1])
758                || subtag.len() >= 4 && is_numeric(&subtag[0..1]))
759        {
760            // Variant
761            variant_end = end;
762            output.push('-');
763            output.extend(to_lowercase(subtag));
764            state = State::AfterRegion;
765        } else {
766            return Err(LanguageTagParseError {
767                kind: TagParseErrorKind::InvalidSubtag,
768            });
769        }
770    }
771
772    //We make sure we are in a correct final state
773    if let State::InExtension { expected: true } = state {
774        return Err(LanguageTagParseError {
775            kind: TagParseErrorKind::EmptyExtension,
776        });
777    }
778    if let State::InPrivateUse { expected: true } = state {
779        return Err(LanguageTagParseError {
780            kind: TagParseErrorKind::EmptyPrivateUse,
781        });
782    }
783
784    //We make sure we have not skipped anyone
785    if extlang_end < language_end {
786        extlang_end = language_end;
787    }
788    if script_end < extlang_end {
789        script_end = extlang_end;
790    }
791    if region_end < script_end {
792        region_end = script_end;
793    }
794    if variant_end < region_end {
795        variant_end = region_end;
796    }
797    if extension_end < variant_end {
798        extension_end = variant_end;
799    }
800
801    Ok(TagElementsPositions {
802        language_end,
803        extlang_end,
804        script_end,
805        region_end,
806        variant_end,
807        extension_end,
808    })
809}
810
/// Iterates on the extensions of a language tag.
///
/// The input is the concatenation of the extensions like `a-foo-b-bar-baz`.
/// Each item is made of the extension singleton and of its content.
struct ExtensionsIterator<'a> {
    // The extensions that have not been returned yet.
    input: &'a str,
}

impl<'a> ExtensionsIterator<'a> {
    fn new(input: &'a str) -> Self {
        ExtensionsIterator { input }
    }
}

impl<'a> Iterator for ExtensionsIterator<'a> {
    type Item = (char, &'a str);

    fn next(&mut self) -> Option<Self::Item> {
        let mut subtags = self.input.split_terminator('-');
        let singleton = subtags.next()?.chars().next().unwrap();
        // Number of bytes read so far: the singleton and its dash,
        // then each content subtag followed by its dash.
        let mut consumed: usize = 2;
        for subtag in subtags {
            if subtag.len() == 1 {
                // A one character subtag is the singleton of the next extension.
                let content = &self.input[2..consumed - 1];
                self.input = &self.input[consumed..];
                return Some((singleton, content));
            }
            consumed += subtag.len() + 1;
        }
        // Last extension: its content is all that remains after the singleton.
        let content = self.input.get(2..);
        self.input = "";
        content.map(|content| (singleton, content))
    }
}
842
/// Iterates on the `-` separated subtags of a language tag.
///
/// Each item is made of the subtag and of the byte offset of its end in the input.
struct SubTagIterator<'a> {
    split: Split<'a, char>,
    // Byte offset of the start of the next subtag.
    position: usize,
}

impl<'a> SubTagIterator<'a> {
    #[inline]
    fn new(input: &'a str) -> Self {
        SubTagIterator {
            position: 0,
            split: input.split('-'),
        }
    }
}

impl<'a> Iterator for SubTagIterator<'a> {
    type Item = (&'a str, usize);

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let subtag = self.split.next()?;
        let subtag_end = self.position + subtag.len();
        // Skips the dash that follows the subtag.
        self.position = subtag_end + 1;
        Some((subtag, subtag_end))
    }
}
869
/// Checks that `s` is only made of ASCII letters (true for the empty string).
#[inline]
fn is_alphabetic(s: &str) -> bool {
    // Bytes of non-ASCII characters are never ASCII letters.
    s.bytes().all(|byte| byte.is_ascii_alphabetic())
}
874
/// Checks that `s` is only made of ASCII digits (true for the empty string).
#[inline]
fn is_numeric(s: &str) -> bool {
    // Bytes of non-ASCII characters are never ASCII digits.
    s.bytes().all(|byte| byte.is_ascii_digit())
}
879
/// Checks that `s` is only made of ASCII letters and digits (true for the empty string).
#[inline]
fn is_alphanumeric(s: &str) -> bool {
    // Bytes of non-ASCII characters are never ASCII letters or digits.
    s.bytes().all(|byte| byte.is_ascii_alphanumeric())
}
884
/// Checks that `s` is only made of ASCII letters, ASCII digits and dashes
/// (true for the empty string).
#[inline]
fn is_alphanumeric_or_dash(s: &str) -> bool {
    // Bytes of non-ASCII characters are never ASCII letters, digits or dashes.
    s.bytes()
        .all(|byte| byte == b'-' || byte.is_ascii_alphanumeric())
}
889
/// Iterates on the characters of `s` with the ASCII letters in uppercase.
#[inline]
fn to_uppercase(s: &str) -> impl Iterator<Item = char> + '_ {
    s.chars().map(|ch| char::to_ascii_uppercase(&ch))
}
894
// Iterates on the characters of `s` with the first one in ASCII uppercase
// and the following ones in ASCII lowercase.
//
// Beware: panics if s.len() == 0 (should never happen in our code)
#[inline]
fn to_uppercase_first(s: &str) -> impl Iterator<Item = char> + '_ {
    let mut rest = s.chars();
    let first = rest.next().unwrap().to_ascii_uppercase();
    once(first).chain(rest.map(|ch| ch.to_ascii_lowercase()))
}
901
/// Iterates on the characters of `s` with the ASCII letters in lowercase.
#[inline]
fn to_lowercase(s: &str) -> impl Iterator<Item = char> + '_ {
    s.chars().map(|ch| char::to_ascii_lowercase(&ch))
}
906
// The grandfathered tags, written with the casing used for the normalized output.
// The parser compares them with the whole input, ignoring the ASCII case,
// before trying the regular grammar.
const GRANDFATHEREDS: [&str; 26] = [
    "art-lojban",
    "cel-gaulish",
    "en-GB-oed",
    "i-ami",
    "i-bnn",
    "i-default",
    "i-enochian",
    "i-hak",
    "i-klingon",
    "i-lux",
    "i-mingo",
    "i-navajo",
    "i-pwn",
    "i-tao",
    "i-tay",
    "i-tsu",
    "no-bok",
    "no-nyn",
    "sgn-BE-FR",
    "sgn-BE-NL",
    "sgn-CH-DE",
    "zh-guoyu",
    "zh-hakka",
    "zh-min",
    "zh-min-nan",
    "zh-xiang",
];
// End of oxilangtag/lib.rs