From 5cf63680fa7c52b7bb14e901bfdb2c08c140b2a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Kr=C3=B6ning?= Date: Thu, 6 Jun 2024 16:56:50 +0200 Subject: [PATCH] feat: add `char`-based API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Martin Kröning --- src/lib.rs | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/string.rs | 128 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 289 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 29185cb..2fcf52f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,6 +8,8 @@ include!(concat!(env!("OUT_DIR"), "/code_table.rs")); #[cfg(feature = "alloc")] mod string; +use core::fmt; + #[cfg(feature = "alloc")] pub use string::*; @@ -29,3 +31,162 @@ pub mod code_table_type { Incomplete(&'static [Option; 128]), } } + +#[derive(Debug)] +pub struct TryFromCharError; + +impl fmt::Display for TryFromCharError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("unicode code point out of range") + } +} + +#[derive(Debug)] +pub struct TryFromU8Error; + +impl fmt::Display for TryFromU8Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("code point out of range") + } +} + +pub trait IncompleteCp: + Clone + + Copy + + fmt::Debug + + fmt::Display + + TryFrom + + TryFrom + + Into + + Into + + PartialEq +{ + fn from_char_lossy(c: char) -> Self; + fn from_u8_lossy(cp: u8) -> Self; +} + +pub trait CompleteCp: IncompleteCp + From {} + +const REPLACEMENT: u8 = b'?'; + +macro_rules! cp_impl { + ($Cp:ident(Common, $ENCODING_TABLE:ident, $DECODING_TABLE:ident)) => { + #[derive(Clone, Copy, PartialEq, Eq, Debug)] + #[repr(transparent)] + pub struct $Cp(pub u8); + + impl IncompleteCp for $Cp { + fn from_char_lossy(c: char) -> Self { + Self::try_from(c).unwrap_or(Self(REPLACEMENT)) + } + + fn from_u8_lossy(cp: u8) -> Self { + Self::try_from(cp).unwrap_or(Self(REPLACEMENT)) + } + } + + impl PartialEq for $Cp { + fn eq(&self, other: &u8) -> bool { + self.0.eq(other) + } + } + + impl TryFrom for $Cp { + type Error = TryFromCharError; + + fn try_from(value: char) -> Result { + if (value as u32) < 128 { + Ok(Self(value as u8)) + } else { + code_table::$ENCODING_TABLE + .get(&value) + .copied() + .ok_or(TryFromCharError) + .map(Self) + } + } + } + + impl From<$Cp> for u8 { + fn from(value: $Cp) -> Self { + value.0 + } + } + + impl fmt::Display for $Cp { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + char::from(*self).fmt(f) + } + } + }; + ($Cp:ident(Complete, $ENCODING_TABLE:ident, $DECODING_TABLE:ident)) => { + cp_impl! { $Cp(Common, $ENCODING_TABLE, $DECODING_TABLE) } + + impl CompleteCp for $Cp {} + + impl From for $Cp { + fn from(value: u8) -> Self { + Self(value) + } + } + + impl From<$Cp> for char { + fn from(value: $Cp) -> Self { + if value.0 < 128 { + value.0 as char + } else { + code_table::$DECODING_TABLE[usize::from(value.0 - 128)] + } + } + } + }; + ($Cp:ident(Incomplete, $ENCODING_TABLE:ident, $DECODING_TABLE:ident)) => { + cp_impl! { $Cp(Common, $ENCODING_TABLE, $DECODING_TABLE) } + + impl TryFrom for $Cp { + type Error = TryFromU8Error; + + fn try_from(value: u8) -> Result { + if value < 128 || code_table::$DECODING_TABLE[usize::from(value - 128)].is_some() { + Ok(Self(value)) + } else { + Err(TryFromU8Error) + } + } + } + + impl From<$Cp> for char { + fn from(value: $Cp) -> Self { + if value.0 < 128 { + value.0 as char + } else { + code_table::$DECODING_TABLE[usize::from(value.0 - 128)].unwrap() + } + } + } + }; + ($($Cp:ident($Type:ident, $ENCODING_TABLE:ident, $DECODING_TABLE:ident),)*) => { + $(cp_impl! { $Cp($Type, $ENCODING_TABLE, $DECODING_TABLE) })* + }; +} + +cp_impl! { + Cp437(Complete, ENCODING_TABLE_CP437, DECODING_TABLE_CP437), + Cp720(Complete, ENCODING_TABLE_CP720, DECODING_TABLE_CP720), + Cp737(Complete, ENCODING_TABLE_CP737, DECODING_TABLE_CP737), + Cp775(Complete, ENCODING_TABLE_CP775, DECODING_TABLE_CP775), + Cp850(Complete, ENCODING_TABLE_CP850, DECODING_TABLE_CP850), + Cp852(Complete, ENCODING_TABLE_CP852, DECODING_TABLE_CP852), + Cp855(Complete, ENCODING_TABLE_CP855, DECODING_TABLE_CP855), + Cp857(Incomplete, ENCODING_TABLE_CP857, DECODING_TABLE_CP857), + Cp858(Complete, ENCODING_TABLE_CP858, DECODING_TABLE_CP858), + Cp860(Complete, ENCODING_TABLE_CP860, DECODING_TABLE_CP860), + Cp861(Complete, ENCODING_TABLE_CP861, DECODING_TABLE_CP861), + Cp862(Complete, ENCODING_TABLE_CP862, DECODING_TABLE_CP862), + Cp863(Complete, ENCODING_TABLE_CP863, DECODING_TABLE_CP863), + Cp864(Incomplete, ENCODING_TABLE_CP864, DECODING_TABLE_CP864), + Cp865(Complete, ENCODING_TABLE_CP865, DECODING_TABLE_CP865), + Cp866(Complete, ENCODING_TABLE_CP866, DECODING_TABLE_CP866), + Cp869(Complete, ENCODING_TABLE_CP869, DECODING_TABLE_CP869), + Cp874(Incomplete, ENCODING_TABLE_CP874, DECODING_TABLE_CP874), +} diff --git a/src/string.rs b/src/string.rs index 2a391a0..f324f2c 100644 --- a/src/string.rs +++ b/src/string.rs @@ -1,11 +1,139 @@ use alloc::string::String; use alloc::vec::Vec; +use crate::{CompleteCp, IncompleteCp, TryFromU8Error}; + use super::code_table_type::TableType; use super::OEMCPHashMap; use TableType::*; +pub trait StrExt { + /// ``` + /// use oem_cp::{Cp437, Cp737, StrExt}; + /// + /// assert_eq!("π≈22/7".to_cp::().unwrap(), vec![0xE3, 0xF7, 0x32, 0x32, 0x2F, 0x37]); + /// // Archimedes in Greek + /// assert_eq!("Αρχιμήδης".to_cp::().unwrap(), vec![0x80, 0xA8, 0xAE, 0xA0, 0xA3, 0xE3, 0x9B, 0x9E, 0xAA]); + /// // Japanese characters are not defined in CP437 + /// assert!("日本語ja_jp".to_cp::().is_err()); + /// ``` + fn to_cp(&self) -> Result, >::Error> + where + u8: From, + char: From; + + /// ``` + /// use oem_cp::{Cp437, Cp737, StrExt}; + /// + /// assert_eq!("π≈22/7".to_cp_lossy::(), vec![0xE3, 0xF7, 0x32, 0x32, 0x2F, 0x37]); + /// // Archimedes in Greek + /// assert_eq!("Αρχιμήδης".to_cp_lossy::(), vec![0x80, 0xA8, 0xAE, 0xA0, 0xA3, 0xE3, 0x9B, 0x9E, 0xAA]); + /// // Japanese characters are not defined in CP437 and replaced with `?` (0x3F) + /// // "日本語ja_jp" => "???ja_jp" + /// assert_eq!("日本語ja_jp".to_cp_lossy::(), vec![0x3F, 0x3F, 0x3F, 0x6A, 0x61, 0x5F, 0x6A, 0x70]); + /// ``` + fn to_cp_lossy(&self) -> Vec + where + u8: From, + char: From; +} + +impl StrExt for str { + fn to_cp(&self) -> Result, >::Error> + where + u8: From, + char: From, + { + self.chars().map(T::try_from).collect() + } + + fn to_cp_lossy(&self) -> Vec + where + u8: From, + char: From, + { + self.chars().map(T::from_char_lossy).collect() + } +} + +pub trait StringExt: Sized { + /// ``` + /// use oem_cp::{Cp874, StringExt}; + /// + /// // means shrimp in Thai (U+E49 => 0xE9) + /// assert_eq!(String::try_from_cp::(&[0xA1, 0xD8, 0xE9, 0xA7]).unwrap(), "กุ้ง"); + /// // 0xDB-0xDE,0xFC-0xFF is invalid in CP874 in Windows + /// assert!(String::try_from_cp::(&[0x30, 0xDB]).is_err()); + /// ``` + fn try_from_cp(v: &[u8]) -> Result + where + u8: From, + char: From, + TryFromU8Error: From<>::Error>; + + /// ``` + /// use oem_cp::{Cp874, StringExt}; + /// + /// // means shrimp in Thai (U+E49 => 0xE9) + /// assert_eq!(String::from_cp_lossy::(&[0xA1, 0xD8, 0xE9, 0xA7]), "กุ้ง"); + /// // 0xDB-0xDE,0xFC-0xFF is invalid in CP874 in Windows + /// assert_eq!(String::from_cp_lossy::(&[0x30, 0xDB]), "0\u{FFFD}"); + /// ``` + fn from_cp_lossy(v: &[u8]) -> Self + where + u8: From, + char: From; + + /// ``` + /// use oem_cp::{Cp437, StringExt}; + /// + /// assert_eq!(String::from_cp::(&[0xFB, 0xAC, 0x3D, 0xAB]), "√¼=½"); + /// ``` + fn from_cp(v: &[u8]) -> Self + where + u8: From, + char: From; +} + +impl StringExt for String { + fn from_cp_lossy(v: &[u8]) -> Self + where + u8: From, + char: From, + { + const REPLACEMENT: char = '\u{FFFD}'; + v.iter() + .copied() + .map(|cp| T::try_from(cp).map(char::from).unwrap_or(REPLACEMENT)) + .collect() + } + + fn from_cp(v: &[u8]) -> Self + where + u8: From, + char: From, + { + v.iter().copied().map(T::from).map(char::from).collect() + } + + fn try_from_cp(v: &[u8]) -> Result + where + u8: From, + char: From, + TryFromU8Error: From<>::Error>, + { + v.iter() + .copied() + .map(|cp| { + T::try_from(cp) + .map(char::from) + .map_err(TryFromU8Error::from) + }) + .collect() + } +} + impl TableType { /// Wrapper function for decoding bytes encoded in SBCSs ///