monorepo/l10n/src/lib.rs

548 lines
18 KiB
Rust

use chrono::{Datelike, NaiveDate, Timelike};
use chrono_tz::Tz;
use fixed_decimal::FixedDecimal;
use fluent::{bundle::FluentBundle, FluentResource};
use icu::{datetime::options::length, decimal::FixedDecimalFormatter, locid::Locale};
use icu_provider::DataLocale;
use std::{fs::File, io::Read, ops::Deref};
use sys_locale::get_locale;
use thiserror::Error;
use unic_langid::LanguageIdentifierError;
// Re-exports. I'm doing these so that clients of this library don't have to go tracking down
// additional structures
pub use fixed_decimal::FloatPrecision;
pub use fluent::{FluentArgs, FluentValue};
/// Error returned when a [`NonEmptyList`] cannot be constructed.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum NonEmptyListError {
    /// The source iterator yielded no elements.
    BuildFromEmptyContainer,
}

impl std::fmt::Display for NonEmptyListError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::BuildFromEmptyContainer => {
                f.write_str("cannot build a non-empty list from an empty container")
            }
        }
    }
}

impl std::error::Error for NonEmptyListError {}

/// A list that always contains at least one element.
///
/// The wrapped `Vec` is only reachable immutably (through `Deref`) and the only mutating method
/// is `push`, so the list can never become empty once constructed.
#[derive(Clone, Debug)]
pub struct NonEmptyList<A>(Vec<A>);

impl<A> NonEmptyList<A> {
    /// Creates a list containing exactly `elem`.
    pub fn new(elem: A) -> Self {
        Self(vec![elem])
    }

    /// Collects `iter` into a list.
    ///
    /// # Errors
    ///
    /// Returns [`NonEmptyListError::BuildFromEmptyContainer`] when `iter` yields nothing.
    // The name mirrors `FromIterator::from_iter`, but construction is fallible, so the std trait
    // cannot be implemented; the name is kept for existing callers.
    #[allow(clippy::should_implement_trait)]
    pub fn from_iter(
        iter: impl IntoIterator<Item = A>,
    ) -> Result<NonEmptyList<A>, NonEmptyListError> {
        let lst: Vec<A> = iter.into_iter().collect();
        if lst.is_empty() {
            Err(NonEmptyListError::BuildFromEmptyContainer)
        } else {
            Ok(NonEmptyList(lst))
        }
    }

    /// Appends `item` to the end of the list.
    pub fn push(&mut self, item: A) {
        self.0.push(item);
    }

    /// Returns the first element for which `f` returns `true`, if any.
    pub fn find(&self, f: impl Fn(&A) -> bool) -> Option<&A> {
        self.0.iter().find(|&item| f(item))
    }

    /// Returns the first element; it always exists because the list is never empty.
    fn first(&self) -> &A {
        &self.0[0]
    }

    /// Iterates over the elements in insertion order.
    fn iter(&self) -> impl Iterator<Item = &A> {
        self.0.iter()
    }
}

/// Read-only access to the underlying `Vec` (length, indexing, slicing, ...).
impl<A> Deref for NonEmptyList<A> {
    type Target = Vec<A>;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
/// Errors reported by [`L10N`] when its locale configuration is changed.
#[derive(Debug, Error)]
pub enum L10NError {
/// A locale identifier could not be parsed as an ICU locale.
#[error("Unparsable Locale")]
UnparsableLocale,
}
impl From<icu::locid::Error> for L10NError {
fn from(_: icu::locid::Error) -> L10NError {
L10NError::UnparsableLocale
}
}
#[derive(Debug, Error)]
pub enum FileLoadError {
#[error("Unparsable Locale")]
UnparsableLocale,
#[error("Source string file not found")]
FileNotFound,
#[error("The Fluent file is malformed")]
FluentParseError(String),
#[error("An unknown IO error was found")]
IOError(std::io::Error),
}
impl From<LanguageIdentifierError> for FileLoadError {
fn from(_: LanguageIdentifierError) -> Self {
Self::UnparsableLocale
}
}
impl From<std::io::Error> for FileLoadError {
fn from(err: std::io::Error) -> Self {
Self::IOError(err)
}
}
// Potential Message structure.
//
// Let's assume the application has an enumeration that implements Message. For each element of the
// enumeration, there should be some boilerplate code that returns the message ID and the arguments
// as a FluentArgs.
//
// Nobody wants to generate all of that code, though I have done so in the past, and manually
// generating that code could be useful for illustration. I think I'm going to want to do code
// generation from the source strings file, and then compile the enumeration into the code.
// However, I have not found a mechanism in Fluent to identify all of the placeholders within a
// message, so I'm not even sure that I can automate this code generation.
/// A translatable message: a Fluent message id together with the arguments for its pattern.
pub trait Message {
/// Identifier used to look the message up in the loaded Fluent bundles.
fn msgid(&self) -> &str;
/// Arguments handed to the pattern when it is formatted, or `None` when there are none.
fn args(&self) -> Option<FluentArgs>;
}
/// Localization context: loaded message bundles plus the locale list and time zone used for
/// formatting dates, times and numbers.
pub struct L10N {
// Directory that holds the `<locale>.ftl` message files.
messages_root: std::path::PathBuf,
// Loaded Fluent bundles in load order; `tr` searches them from the last loaded to the first.
message_bundles: Vec<FluentBundle<FluentResource, intl_memoizer::concurrent::IntlLangMemoizer>>,
// Locale list; the first entry is the one used by the date, time and number formatters.
locales: NonEmptyList<Locale>,
// Time zone applied by `format_date_time_local`.
zone: chrono_tz::Tz,
}
impl L10N {
/// Creates a localization context whose message files live under `messages_root`.
///
/// The initial locale is the system locale, falling back to `en-US` when the system locale is
/// unavailable or cannot be parsed. The time zone starts as UTC. Only the `en-US` message file
/// is loaded here, regardless of the detected locale.
///
/// # Panics
///
/// Panics if `<messages_root>/en-US.ftl` cannot be read or is not valid Fluent.
pub fn new(messages_root: std::path::PathBuf) -> Self {
    let sys_locale = get_locale()
        .and_then(|locale_str| locale_str.parse::<Locale>().ok())
        .unwrap_or_else(|| {
            "en-US"
                .parse::<Locale>()
                .expect("`en-US` is a well-formed locale identifier")
        });
    let mut s = Self {
        messages_root,
        message_bundles: vec![],
        locales: NonEmptyList::new(sys_locale),
        zone: chrono_tz::UTC,
    };
    s.load_messages_from_file("en-US".to_owned())
        .expect("the en-US source strings must be loadable from `messages_root`");
    s
}
/// Loads `<messages_root>/<locale>.ftl` and appends it to the list of message bundles.
///
/// # Errors
///
/// * [`FileLoadError::UnparsableLocale`] if `locale` is not a valid language identifier.
/// * [`FileLoadError::FileNotFound`] if the message file does not exist.
/// * [`FileLoadError::IOError`] for any other read failure, including a file that is not valid
///   UTF-8.
/// * [`FileLoadError::FluentParseError`] if the file is not valid Fluent syntax.
fn load_messages_from_file(&mut self, locale: String) -> Result<(), FileLoadError> {
    let langid: unic_langid::LanguageIdentifier = locale.parse()?;

    let mut path = self.messages_root.clone();
    path.push(locale);
    path.set_extension("ftl");

    // Reading straight into a `String` reports invalid UTF-8 as an I/O error instead of
    // panicking.
    let mut text = String::new();
    File::open(&path)
        .and_then(|mut f| f.read_to_string(&mut text))
        .map_err(|err| match err.kind() {
            std::io::ErrorKind::NotFound => FileLoadError::FileNotFound,
            _ => FileLoadError::IOError(err),
        })?;

    let resource = FluentResource::try_new(text).map_err(|(_, errors)| {
        FileLoadError::FluentParseError(
            errors
                .into_iter()
                .map(|err| err.to_string())
                .collect::<Vec<String>>()
                .join("\n"),
        )
    })?;

    let mut bundle = FluentBundle::new_concurrent(vec![langid]);
    // Errors from `add_resource` are deliberately ignored, as before: whatever the bundle
    // accepted stays usable.
    let _ = bundle.add_resource(resource);
    self.message_bundles.push(bundle);
    Ok(())
}
// Now, whenever the user changes the locales, the list of messages has to change. How do we
// automatically set up the messages? Theoretically they all need to be reloaded, and I've
// already split how the messages get loaded from how the locales are specified.
//
// But, FluentErgo does that, too. It already has the concept of being constructed with a list
// of languages and then having each language bundle manually loaded afterwards.
//
// Problem: be able to change the preferred list of locales and automatically have a new
// FluentBundle which has all relevant translations loaded.
//
// One solution is that all bundles get loaded at startup time, and the bundle list gets
// changed any time the list of locales gets changed. Also, the system can just run through the
// entire list of fallbacks.
pub fn set_locales(&mut self, locales: NonEmptyList<&str>) -> Result<(), L10NError> {
let locales = locales
.iter()
.map(|locale| Locale::try_from_bytes(locale.as_bytes()))
.collect::<Result<Vec<Locale>, icu::locid::Error>>()?;
for locale in locales.iter() {
self.load_messages_from_file(locale.to_string()).unwrap();
}
self.locales = NonEmptyList(locales);
Ok(())
}
/// Sets the time zone that `format_date_time_local` converts timestamps into.
pub fn set_timezone(&mut self, zone: Tz) {
self.zone = zone;
}
// Need to take a message and turn it into a string in the current language. Except I don't
// know yet what form the message should take. Forming an adapter around fluent_ergonomics or
// even around fluent itself. I would want for the message to be statically typed, but then I
// don't know what can be the data type that gets passed in here.
//
// Formatting a message requires identifying the message and passing it any relevant
// parameters. In an ideal world, neither of these can be incorrect. Messages are all checked
// at compile time, as are their parameters. That implies an enumeration, with one element per
// message, and with each element knowing its parameters.
// pub fn messages(&self) -> Vec<FluentBundle<FluentResource>> {
// self.message_bundles.clone()
// }
/// Translates `message` into a string.
///
/// Bundles are searched from the last loaded to the first. The first bundle that defines the
/// message id formats it, so that bundle's own language rules and message references apply.
/// Formatting errors reported by Fluent are discarded and the best-effort output is returned.
///
/// # Panics
///
/// Panics if no loaded bundle defines `message.msgid()`.
pub fn tr(&self, message: impl Message) -> String {
    let msgid = message.msgid();
    for bundle in self.message_bundles.iter().rev() {
        let pattern = bundle.get_message(msgid).and_then(|msg| msg.value());
        if let Some(pattern) = pattern {
            let mut errors = vec![];
            // Format with the bundle that owns the pattern, not with bundle 0.
            return bundle
                .format_pattern(pattern, message.args().as_ref(), &mut errors)
                .to_string();
        }
    }
    unreachable!("The message {} is missing", message.msgid());
}
pub fn format_date_time_utc(
&self,
time: DateTime,
date_style: length::Date,
time_style: length::Time,
) -> String {
let time: DateTime = time.with_timezone(&chrono_tz::UTC).into();
let options = length::Bag::from_date_time_style(date_style, time_style);
let formatter = icu::datetime::DateTimeFormatter::try_new(
&DataLocale::from(self.locales.first()),
options.into(),
)
.unwrap();
let icu_time: icu::calendar::DateTime<icu::calendar::Gregorian> = time.into();
formatter.format_to_string(&icu_time.to_any()).unwrap()
}
pub fn format_date_time_local(
&self,
time: DateTime,
date_style: length::Date,
time_style: length::Time,
) -> String {
let time: DateTime = time.with_timezone(&self.zone).into();
let options = length::Bag::from_date_time_style(date_style, time_style);
let formatter = icu::datetime::DateTimeFormatter::try_new(
&DataLocale::from(self.locales.first()),
options.into(),
)
.unwrap();
let icu_time: icu::calendar::DateTime<icu::calendar::Gregorian> = time.into();
formatter.format_to_string(&icu_time.to_any()).unwrap()
}
/*
* I have been unable to get from a chrono_tz::Tz to an ICU timezone. I have tried a variety of
* parsers on the CustomTimeZone object. I have not researched the data provider to see what is
* available there. The ZoneID for the reference date is US/Mountain, and the abbreviation is
* MST. I'll want to get to a CustomTimeZone so that the formatter can render MST or Mountain
* Standard Time or something similar.
fn format_date_time_tz(
&self,
time: DateTime,
date_style: length::Date,
time_style: length::Time,
) -> String {
let options = length::Bag::from_date_time_style(date_style, time_style);
let formatter = icu::datetime::ZonedDateTimeFormatter::try_new(
&DataLocale::from(&self.locale),
options.into(),
Default::default(),
)
.unwrap();
let icu_time: icu::calendar::DateTime<icu::calendar::Gregorian> = time.into();
let any = icu_time.to_any();
println!("{:?}", time.offset());
let zone_id: String = time.offset().abbreviation().to_owned();
println!("{:?}", zone_id);
let zone_id = icu::timezone::TimeZoneBcp47Id::from_str(&zone_id).unwrap();
let zone: CustomTimeZone = CustomTimeZone {
gmt_offset: None,
time_zone_id: Some(zone_id),
/*
icu::timezone::TimeZoneBcp47Id::from_str(time.offset().tz_id().parse().unwrap())
.unwrap(),
*/
metazone_id: None,
zone_variant: None,
};
formatter.format_to_string(&any, &zone).unwrap()
}
*/
/// Formats a calendar date with the given length, using the first configured locale.
///
/// # Panics
///
/// Panics if the formatter cannot be created for the locale, if the date cannot be converted
/// to an ICU Gregorian date, or if formatting fails.
pub fn format_date(&self, date: NaiveDate, date_style: length::Date) -> String {
    let locale = DataLocale::from(self.locales.first());
    let formatter =
        icu::datetime::DateFormatter::try_new_with_length(&locale, date_style).unwrap();

    let (year, month, day) = (date.year(), date.month(), date.day());
    let gregorian: icu::calendar::Date<icu::calendar::Gregorian> =
        icu::calendar::Date::try_new_gregorian_date(
            year,
            month.try_into().unwrap(),
            day.try_into().unwrap(),
        )
        .unwrap();

    formatter.format_to_string(&gregorian.to_any()).unwrap()
}
pub fn format_f64(&self, value: f64, precision: FloatPrecision) -> String {
let fdf = FixedDecimalFormatter::try_new(
&self.locales.first().clone().into(),
Default::default(),
)
.expect("locale should be present");
let number = FixedDecimal::try_from_f64(value, precision).unwrap();
fdf.format_to_string(&FixedDecimal::try_from_f64(value, precision).unwrap())
}
}
/// Newtype around a time-zone-aware chrono timestamp, used by the date/time formatting methods
/// on `L10N`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct DateTime(chrono::DateTime<Tz>);
/// Exposes the wrapped chrono timestamp so its methods can be called directly on `DateTime`.
impl Deref for DateTime {
type Target = chrono::DateTime<Tz>;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl From<chrono::DateTime<Tz>> for DateTime {
fn from(time: chrono::DateTime<Tz>) -> Self {
Self(time)
}
}
/// Converts the wall-clock fields (year through second) into an ICU Gregorian date-time. The
/// time zone is not carried over.
impl From<DateTime> for icu::calendar::DateTime<icu::calendar::Gregorian> {
    fn from(time: DateTime) -> Self {
        let (year, month, day) = (time.year(), time.month(), time.day());
        let (hour, minute, second) = (time.hour(), time.minute(), time.second());
        // These unwraps are expected to succeed since chrono dates are already valid Gregorian
        // dates.
        Self::try_new_gregorian_datetime(
            year,
            month.try_into().unwrap(),
            day.try_into().unwrap(),
            hour.try_into().unwrap(),
            minute.try_into().unwrap(),
            second.try_into().unwrap(),
        )
        .unwrap()
    }
}
#[cfg(test)]
mod tests {
use super::*;
use fluent::fluent_args;
// Builds the localization context shared by the tests: messages from `./test_files`, locale
// `en-US`, time zone US/Eastern.
fn ref_l10n() -> L10N {
    let mut l10n = L10N::new(std::path::PathBuf::from("./test_files"));
    // Make sure we know the locale before the test begins. Some systems, such as my own, are
    // not actually in English.
    l10n.set_locales(NonEmptyList::from_iter(vec!["en-US"]).unwrap())
        .expect("en-US is a well-formed locale");
    l10n.set_timezone(chrono_tz::US::Eastern);
    l10n
}
// Reference date used by the date-formatting tests: 2 January 2006.
fn ref_date() -> NaiveDate {
NaiveDate::from_ymd_opt(2006, 1, 2).unwrap()
}
fn ref_time() -> DateTime {
NaiveDate::from_ymd_opt(2006, 1, 2)
.unwrap()
.and_hms_opt(3, 4, 5)
.unwrap()
.and_local_timezone(Tz::US__Mountain)
.unwrap()
.into()
}
#[test]
fn it_formats_a_time_in_utc() {
    let mut l10n = ref_l10n();
    let now = ref_time();
    // 202f is the code-point for a narrow non-breaking space. Presumably this is used in
    // particular to ensure that the am/pm marker doesn't get split off from the time
    assert_eq!(
        l10n.format_date_time_utc(now, length::Date::Long, length::Time::Medium),
        "January 2, 2006, 10:04:05\u{202f}AM"
    );
    // Fail here, not in the assertion below, if the locale switch is rejected.
    l10n.set_locales(NonEmptyList::from_iter(vec!["eo-EO", "en-US"]).unwrap())
        .unwrap();
    assert_eq!(
        l10n.format_date_time_utc(now, length::Date::Long, length::Time::Medium),
        "2006-Januaro-02 10:04:05"
    );
}
#[test]
fn it_formats_a_time_in_the_current_zone() {
    let mut l10n = ref_l10n();
    let now = ref_time();
    // 202f is the code-point for a narrow non-breaking space. Presumably this is used in
    // particular to ensure that the am/pm marker doesn't get split off from the time
    assert_eq!(
        l10n.format_date_time_local(now, length::Date::Long, length::Time::Medium),
        "January 2, 2006, 5:04:05\u{202f}AM"
    );
    // Fail here, not in the assertion below, if the locale switch is rejected.
    l10n.set_locales(NonEmptyList::from_iter(vec!["eo-EO", "en-US"]).unwrap())
        .unwrap();
    assert_eq!(
        l10n.format_date_time_local(now, length::Date::Long, length::Time::Medium),
        "2006-Januaro-02 05:04:05"
    );
}
#[test]
fn it_formats_dates() {
    let mut l10n = ref_l10n();
    let today = ref_date();
    assert_eq!(
        l10n.format_date(today, length::Date::Long),
        "January 2, 2006"
    );
    // Fail here, not in the assertion below, if the locale switch is rejected.
    l10n.set_locales(NonEmptyList::from_iter(vec!["eo-EO", "en-US"]).unwrap())
        .unwrap();
    assert_eq!(
        l10n.format_date(today, length::Date::Long),
        "2006-Januaro-02"
    );
}
#[test]
fn it_formats_a_number_according_to_locale() {
    let mut l10n = ref_l10n();
    assert_eq!(l10n.format_f64(100.4, FloatPrecision::Floating), "100.4");
    assert_eq!(
        l10n.format_f64(15000.4, FloatPrecision::Floating),
        "15,000.4"
    );
    // Fail here, not in the assertions below, if the locale switch is rejected.
    l10n.set_locales(NonEmptyList::from_iter(vec!["de-DE", "en-US"]).unwrap())
        .unwrap();
    assert_eq!(l10n.format_f64(100.4, FloatPrecision::Floating), "100,4");
    assert_eq!(
        l10n.format_f64(15000.4, FloatPrecision::Floating),
        "15.000,4"
    );
}
#[test]
fn it_can_load_message_files() {
    // Minimal `Message` implementation covering the two messages this test needs.
    enum TestMessage {
        Welcome(&'static str),
        GamesInDatabase(i32),
    }

    impl Message for TestMessage {
        fn msgid(&self) -> &str {
            match self {
                TestMessage::Welcome(_) => "welcome",
                TestMessage::GamesInDatabase(_) => "games-in-database",
            }
        }

        fn args(&self) -> Option<FluentArgs> {
            Some(match self {
                TestMessage::Welcome(name) => fluent_args!["name" => *name],
                TestMessage::GamesInDatabase(count) => fluent_args!["count" => *count],
            })
        }
    }

    // Fluent may wrap interpolated values in Unicode bidi isolation marks (U+2068 / U+2069).
    // Strip them so the assertions compare only the visible text.
    fn plain(text: String) -> String {
        text.chars()
            .filter(|c| *c != '\u{2068}' && *c != '\u{2069}')
            .collect()
    }

    let l10n = ref_l10n();
    assert_eq!(
        plain(l10n.tr(TestMessage::Welcome("Savanni"))),
        "Hello, Savanni"
    );
    assert_eq!(
        plain(l10n.tr(TestMessage::GamesInDatabase(1))),
        "There is one game in the database"
    );
    assert_eq!(
        plain(l10n.tr(TestMessage::GamesInDatabase(2))),
        "There are 2 games in the database"
    );
}
/*
#[test]
fn it_can_change_languages_on_locale_change() {
}
#[test]
fn phrases_can_be_translated() {
}
#[test]
fn phrases_can_fall_back() {
}
*/
/* Not really a unit test, more of a test to see what I could introspect within a fluent
* message. I was hoping that attributes would give me placeholder names, but that doesn't seem
* to be the case.
#[test]
fn messages() {
let langid_en = "en-US".parse().expect("Parsing failed.");
let resource = FluentResource::try_new(MESSAGES.to_owned()).unwrap();
let mut bundle = FluentBundle::new(vec![langid_en]);
bundle.add_resource(&resource).unwrap();
let msg = bundle.get_message("welcome").expect("message should exist");
for attr in msg.attributes() {
println!("attr: {:?}", attr);
}
assert!(false);
}
*/
}