Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add char-based API #13

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 161 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ include!(concat!(env!("OUT_DIR"), "/code_table.rs"));
#[cfg(feature = "alloc")]
mod string;

use core::fmt;

#[cfg(feature = "alloc")]
pub use string::*;

Expand All @@ -29,3 +31,162 @@ pub mod code_table_type {
Incomplete(&'static [Option<char>; 128]),
}
}

/// Error returned when a `char` cannot be converted into a code page value.
///
/// The type carries no data; it is a unit marker, so it is cheap to copy and
/// can be compared directly (for example inside `assert_eq!` on a `Result`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TryFromCharError;

/// Human-readable description of a failed `char` conversion.
impl fmt::Display for TryFromCharError {
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Written verbatim: width/fill flags on the formatter are not applied.
        const MESSAGE: &str = "unicode code point out of range";
        formatter.write_str(MESSAGE)
    }
}

/// Error returned when a byte cannot be converted into a code page value.
///
/// The type carries no data; it is a unit marker, so it is cheap to copy and
/// can be compared directly (for example inside `assert_eq!` on a `Result`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TryFromU8Error;

/// Lets infallible byte conversions satisfy a `TryFromU8Error: From<E>` bound.
///
/// A type that implements `From<u8>` gets `TryFrom<u8>` from the standard
/// library with `Error = Infallible`. Without this impl such a type can never
/// be used with an API that converts the conversion error into
/// `TryFromU8Error`.
impl From<core::convert::Infallible> for TryFromU8Error {
    fn from(never: core::convert::Infallible) -> Self {
        // `Infallible` has no values, so this function can never be called.
        match never {}
    }
}

/// Human-readable description of a failed byte conversion.
impl fmt::Display for TryFromU8Error {
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Written verbatim: width/fill flags on the formatter are not applied.
        const MESSAGE: &str = "code point out of range";
        formatter.write_str(MESSAGE)
    }
}

/// A value of a single-byte code page.
///
/// Implementors can be built fallibly from a `char` or a `u8`, converted back
/// into either of them, compared against a raw `u8`, and formatted.
pub trait IncompleteCp:
    Copy
    + Clone
    + PartialEq<u8>
    + TryFrom<char>
    + TryFrom<u8>
    + Into<char>
    + Into<u8>
    + fmt::Display
    + fmt::Debug
{
    /// Converts a character into the code page without reporting failure.
    fn from_char_lossy(c: char) -> Self;

    /// Converts a raw byte into the code page without reporting failure.
    fn from_u8_lossy(cp: u8) -> Self;
}

/// Marker for code page types that can additionally be built from any `u8`
/// through an infallible `From<u8>` conversion.
pub trait CompleteCp: IncompleteCp + From<u8> {}

// Byte (`?`) that the lossy constructors fall back to when the fallible
// conversion of a character or byte fails.
const REPLACEMENT: u8 = b'?';

// Generates the code page newtypes (`#[repr(transparent)]` wrappers around a
// `u8`) together with their conversions.
//
// Arms, in matching order:
// * `Name(Common, ENC, DEC)`     - the struct and everything shared by both
//   kinds of code page: lossy constructors, `PartialEq<u8>`, `TryFrom<char>`,
//   `From<Name> for u8` and `Display`.
// * `Name(Complete, ENC, DEC)`   - `Common` plus an infallible `From<u8>`;
//   the decoding table is indexed directly.
// * `Name(Incomplete, ENC, DEC)` - `Common` plus a fallible `TryFrom<u8>`;
//   the decoding table holds `Option<char>` entries.
// * `Name(Kind, ENC, DEC), ...`  - comma-terminated list that dispatches each
//   entry to one of the arms above.
macro_rules! cp_impl {
    ($Cp:ident(Common, $ENCODING_TABLE:ident, $DECODING_TABLE:ident)) => {
        #[derive(Clone, Copy, PartialEq, Eq, Debug)]
        #[repr(transparent)]
        pub struct $Cp(pub u8);

        impl IncompleteCp for $Cp {
            fn from_char_lossy(c: char) -> Self {
                Self::try_from(c).unwrap_or(Self(REPLACEMENT))
            }

            fn from_u8_lossy(cp: u8) -> Self {
                Self::try_from(cp).unwrap_or(Self(REPLACEMENT))
            }
        }

        impl PartialEq<u8> for $Cp {
            fn eq(&self, other: &u8) -> bool {
                self.0.eq(other)
            }
        }

        impl TryFrom<char> for $Cp {
            type Error = TryFromCharError;

            fn try_from(value: char) -> Result<Self, Self::Error> {
                if value.is_ascii() {
                    // ASCII is passed through unchanged; the cast cannot
                    // truncate because the value is below 128.
                    Ok(Self(value as u8))
                } else {
                    code_table::$ENCODING_TABLE
                        .get(&value)
                        .copied()
                        .map(Self)
                        .ok_or(TryFromCharError)
                }
            }
        }

        impl From<$Cp> for u8 {
            fn from(value: $Cp) -> Self {
                value.0
            }
        }

        impl fmt::Display for $Cp {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                // Fully qualified so the call neither requires `Display` to be
                // imported at the expansion site nor becomes ambiguous when
                // several `fmt` traits are in scope.
                fmt::Display::fmt(&char::from(*self), f)
            }
        }
    };
    ($Cp:ident(Complete, $ENCODING_TABLE:ident, $DECODING_TABLE:ident)) => {
        cp_impl! { $Cp(Common, $ENCODING_TABLE, $DECODING_TABLE) }

        impl CompleteCp for $Cp {}

        impl From<u8> for $Cp {
            fn from(value: u8) -> Self {
                Self(value)
            }
        }

        impl From<$Cp> for char {
            fn from(value: $Cp) -> Self {
                if value.0 < 128 {
                    char::from(value.0)
                } else {
                    // The table is indexed by the byte's offset from 128.
                    code_table::$DECODING_TABLE[usize::from(value.0 - 128)]
                }
            }
        }
    };
    ($Cp:ident(Incomplete, $ENCODING_TABLE:ident, $DECODING_TABLE:ident)) => {
        cp_impl! { $Cp(Common, $ENCODING_TABLE, $DECODING_TABLE) }

        impl TryFrom<u8> for $Cp {
            type Error = TryFromU8Error;

            fn try_from(value: u8) -> Result<Self, Self::Error> {
                if value < 128 || code_table::$DECODING_TABLE[usize::from(value - 128)].is_some() {
                    Ok(Self(value))
                } else {
                    Err(TryFromU8Error)
                }
            }
        }

        impl From<$Cp> for char {
            fn from(value: $Cp) -> Self {
                if value.0 < 128 {
                    char::from(value.0)
                } else {
                    // The inner byte is a public field, so a value holding an
                    // unassigned byte can be built without going through
                    // `TryFrom<u8>`. `From` must not fail, so such a byte is
                    // shown as U+FFFD instead of panicking.
                    code_table::$DECODING_TABLE[usize::from(value.0 - 128)].unwrap_or('\u{FFFD}')
                }
            }
        }
    };
    ($($Cp:ident($Type:ident, $ENCODING_TABLE:ident, $DECODING_TABLE:ident),)*) => {
        $(cp_impl! { $Cp($Type, $ENCODING_TABLE, $DECODING_TABLE) })*
    };
}

cp_impl! {
Cp437(Complete, ENCODING_TABLE_CP437, DECODING_TABLE_CP437),
Cp720(Complete, ENCODING_TABLE_CP720, DECODING_TABLE_CP720),
Cp737(Complete, ENCODING_TABLE_CP737, DECODING_TABLE_CP737),
Cp775(Complete, ENCODING_TABLE_CP775, DECODING_TABLE_CP775),
Cp850(Complete, ENCODING_TABLE_CP850, DECODING_TABLE_CP850),
Cp852(Complete, ENCODING_TABLE_CP852, DECODING_TABLE_CP852),
Cp855(Complete, ENCODING_TABLE_CP855, DECODING_TABLE_CP855),
Cp857(Incomplete, ENCODING_TABLE_CP857, DECODING_TABLE_CP857),
Cp858(Complete, ENCODING_TABLE_CP858, DECODING_TABLE_CP858),
Cp860(Complete, ENCODING_TABLE_CP860, DECODING_TABLE_CP860),
Cp861(Complete, ENCODING_TABLE_CP861, DECODING_TABLE_CP861),
Cp862(Complete, ENCODING_TABLE_CP862, DECODING_TABLE_CP862),
Cp863(Complete, ENCODING_TABLE_CP863, DECODING_TABLE_CP863),
Cp864(Incomplete, ENCODING_TABLE_CP864, DECODING_TABLE_CP864),
Cp865(Complete, ENCODING_TABLE_CP865, DECODING_TABLE_CP865),
Cp866(Complete, ENCODING_TABLE_CP866, DECODING_TABLE_CP866),
Cp869(Complete, ENCODING_TABLE_CP869, DECODING_TABLE_CP869),
Cp874(Incomplete, ENCODING_TABLE_CP874, DECODING_TABLE_CP874),
Comment on lines +174 to +191
Copy link
Owner

@tats-u tats-u Jun 6, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would rather not maintain these hand-written Complete/Incomplete and C[Pp]* entries if possible.
build.rs automatically generates ENCODING_TABLE_* and DECODING_TABLE_*.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, so we might be able to move this macro call into build.rs. Is that what you had in mind?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is that what you had in mind?

Right. The write_* functions are for code generation, so you only have to modify (some of) them and maybe generate_tables.
I wonder if that's possible.

}
128 changes: 128 additions & 0 deletions src/string.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,139 @@
use alloc::string::String;
use alloc::vec::Vec;

use crate::{CompleteCp, IncompleteCp, TryFromU8Error};

use super::code_table_type::TableType;
use super::OEMCPHashMap;

use TableType::*;

/// Extension methods that encode a string slice into a single-byte code page.
pub trait StrExt {
    /// Converts the characters of the string into values of the code page `T`.
    ///
    /// On failure the error is the one produced by `T`'s `TryFrom<char>`
    /// conversion.
    ///
    /// ```
    /// use oem_cp::{Cp437, Cp737, StrExt};
    ///
    /// assert_eq!("π≈22/7".to_cp::<Cp437>().unwrap(), vec![0xE3, 0xF7, 0x32, 0x32, 0x2F, 0x37]);
    /// // Archimedes in Greek
    /// assert_eq!("Αρχιμήδης".to_cp::<Cp737>().unwrap(), vec![0x80, 0xA8, 0xAE, 0xA0, 0xA3, 0xE3, 0x9B, 0x9E, 0xAA]);
    /// // Japanese characters are not defined in CP437
    /// assert!("日本語ja_jp".to_cp::<Cp437>().is_err());
    /// ```
    fn to_cp<T>(&self) -> Result<Vec<T>, <T as TryFrom<char>>::Error>
    where
        T: IncompleteCp,
        u8: From<T>,
        char: From<T>;

    /// Converts the characters of the string into values of the code page `T`
    /// without ever failing.
    ///
    /// ```
    /// use oem_cp::{Cp437, Cp737, StrExt};
    ///
    /// assert_eq!("π≈22/7".to_cp_lossy::<Cp437>(), vec![0xE3, 0xF7, 0x32, 0x32, 0x2F, 0x37]);
    /// // Archimedes in Greek
    /// assert_eq!("Αρχιμήδης".to_cp_lossy::<Cp737>(), vec![0x80, 0xA8, 0xAE, 0xA0, 0xA3, 0xE3, 0x9B, 0x9E, 0xAA]);
    /// // Japanese characters are not defined in CP437 and replaced with `?` (0x3F)
    /// // "日本語ja_jp" => "???ja_jp"
    /// assert_eq!("日本語ja_jp".to_cp_lossy::<Cp437>(), vec![0x3F, 0x3F, 0x3F, 0x6A, 0x61, 0x5F, 0x6A, 0x70]);
    /// ```
    fn to_cp_lossy<T>(&self) -> Vec<T>
    where
        T: IncompleteCp,
        u8: From<T>,
        char: From<T>;
}

impl StrExt for str {
    /// Converts each character in turn and stops at the first one whose
    /// `TryFrom<char>` conversion fails, returning that error.
    fn to_cp<T: IncompleteCp>(&self) -> Result<Vec<T>, <T as TryFrom<char>>::Error>
    where
        u8: From<T>,
        char: From<T>,
    {
        let mut encoded = Vec::new();
        for ch in self.chars() {
            match <T as TryFrom<char>>::try_from(ch) {
                Ok(cp) => encoded.push(cp),
                Err(error) => return Err(error),
            }
        }
        Ok(encoded)
    }

    /// Converts each character with `T::from_char_lossy`, so the result has
    /// one element per character of the input.
    fn to_cp_lossy<T: IncompleteCp>(&self) -> Vec<T>
    where
        u8: From<T>,
        char: From<T>,
    {
        let converted = self.chars().map(|ch| T::from_char_lossy(ch));
        converted.collect::<Vec<T>>()
    }
}

/// Extension constructors that decode code page bytes into a string type.
pub trait StringExt: Sized {
    /// Decodes `v` as bytes of the code page `T`, returning an error if a
    /// byte cannot be converted.
    ///
    /// ```
    /// use oem_cp::{Cp874, StringExt};
    ///
    /// // means shrimp in Thai (U+E49 => 0xE9)
    /// assert_eq!(String::try_from_cp::<Cp874>(&[0xA1, 0xD8, 0xE9, 0xA7]).unwrap(), "กุ้ง");
    /// // 0xDB-0xDE,0xFC-0xFF is invalid in CP874 in Windows
    /// assert!(String::try_from_cp::<Cp874>(&[0x30, 0xDB]).is_err());
    /// ```
    fn try_from_cp<T>(v: &[u8]) -> Result<Self, TryFromU8Error>
    where
        T: IncompleteCp,
        u8: From<T>,
        char: From<T>,
        TryFromU8Error: From<<T as TryFrom<u8>>::Error>;

    /// Decodes `v` as bytes of the code page `T`, substituting a replacement
    /// character for bytes that cannot be converted.
    ///
    /// ```
    /// use oem_cp::{Cp874, StringExt};
    ///
    /// // means shrimp in Thai (U+E49 => 0xE9)
    /// assert_eq!(String::from_cp_lossy::<Cp874>(&[0xA1, 0xD8, 0xE9, 0xA7]), "กุ้ง");
    /// // 0xDB-0xDE,0xFC-0xFF is invalid in CP874 in Windows
    /// assert_eq!(String::from_cp_lossy::<Cp874>(&[0x30, 0xDB]), "0\u{FFFD}");
    /// ```
    fn from_cp_lossy<T>(v: &[u8]) -> Self
    where
        T: IncompleteCp,
        u8: From<T>,
        char: From<T>;

    /// Decodes `v` as bytes of the complete code page `T`; this cannot fail.
    ///
    /// ```
    /// use oem_cp::{Cp437, StringExt};
    ///
    /// assert_eq!(String::from_cp::<Cp437>(&[0xFB, 0xAC, 0x3D, 0xAB]), "√¼=½");
    /// ```
    fn from_cp<T>(v: &[u8]) -> Self
    where
        T: CompleteCp,
        u8: From<T>,
        char: From<T>;
}

impl StringExt for String {
fn from_cp_lossy<T: IncompleteCp>(v: &[u8]) -> Self
where
u8: From<T>,
char: From<T>,
{
const REPLACEMENT: char = '\u{FFFD}';
v.iter()
.copied()
.map(|cp| T::try_from(cp).map(char::from).unwrap_or(REPLACEMENT))
.collect()
}

fn from_cp<T: CompleteCp>(v: &[u8]) -> Self
where
u8: From<T>,
char: From<T>,
{
v.iter().copied().map(T::from).map(char::from).collect()
}

fn try_from_cp<T: IncompleteCp>(v: &[u8]) -> Result<Self, TryFromU8Error>
where
u8: From<T>,
char: From<T>,
TryFromU8Error: From<<T as TryFrom<u8>>::Error>,
{
v.iter()
.copied()
.map(|cp| {
T::try_from(cp)
.map(char::from)
.map_err(TryFromU8Error::from)
})
.collect()
}
}

impl TableType {
/// Wrapper function for decoding bytes encoded in SBCSs
///
Expand Down
Loading