Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unicode 15.1 support #253

Merged
merged 7 commits into from
Oct 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ endif()
if(UTF8PROC_ENABLE_TESTING)
enable_testing()
file(MAKE_DIRECTORY data)
set(UNICODE_VERSION 15.0.0)
set(UNICODE_VERSION 15.1.0)
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/NormalizationTest.txt ${CMAKE_BINARY_DIR}/data/NormalizationTest.txt SHOW_PROGRESS)
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/auxiliary/GraphemeBreakTest.txt ${CMAKE_BINARY_DIR}/data/GraphemeBreakTest.txt SHOW_PROGRESS)
add_executable(case test/tests.h test/tests.c utf8proc.h test/case.c)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ The C library is found in this directory after successful compilation
and is named `libutf8proc.a` (for the static library) and
`libutf8proc.so` (for the dynamic library).

The Unicode version supported is 15.0.0.
The Unicode version supported is 15.1.0.

For Unicode normalizations, the following options are used:

Expand Down
2 changes: 1 addition & 1 deletion data/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ CharWidths.txt: charwidths.jl EastAsianWidth.txt
$(JULIA) charwidths.jl > $@

# Unicode data version (must also update utf8proc_unicode_version function)
UNICODE_VERSION=15.0.0
UNICODE_VERSION=15.1.0

UnicodeData.txt:
$(CURL) $(CURLFLAGS) -o $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
Expand Down
33 changes: 30 additions & 3 deletions data/data_generator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,32 @@
end
end

$icb_linker_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Linker.*?# Total code points:/m]
$icb = Hash.new("UTF8PROC_INDIC_CONJUNCT_BREAK_NONE")
$icb_linker_list.each_line do |entry|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
$1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER" }
elsif entry =~ /^[0-9A-F]+/
$icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER"
end
end
$icb_consonant_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Consonant.*?# Total code points:/m]
$icb_consonant_list.each_line do |entry|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
$1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT" }
elsif entry =~ /^[0-9A-F]+/
$icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT"
end
end
$icb_extend_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Extend.*?# Total code points:/m]
$icb_extend_list.each_line do |entry|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
$1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND" }
elsif entry =~ /^[0-9A-F]+/
$icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND"
end
end

$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt", :encoding => 'utf-8')
$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
$grapheme_boundclass_list.each_line do |entry|
Expand Down Expand Up @@ -174,7 +200,7 @@ def cpary2c(array)
return "UINT16_MAX" if array.nil? || array.length == 0
lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
array = cpary2utf16encoded(array)
if lencode >= 3 #we have only 2 bits for the length
if lencode >= 3 #we have only 2 bits for the length
array = [lencode] + array
lencode = 3
end
Expand Down Expand Up @@ -249,7 +275,8 @@ def c_entry(comb_indicies)
"#{$ignorable.include?(code)}, " <<
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
"#{$charwidth[code]}, 0, " <<
"#{$grapheme_boundclass[code]}},\n"
"#{$grapheme_boundclass[code]}, " <<
"#{$icb[code]}},\n"
end
end

Expand Down Expand Up @@ -415,7 +442,7 @@ def c_entry(comb_indicies)
$stdout << "};\n\n"

$stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},\n"
properties.each { |line|
$stdout << line
}
Expand Down
7 changes: 7 additions & 0 deletions test/graphemetest.c
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,13 @@ int main(int argc, char **argv)
checkline("/ 1f926 1f3fc 200d 2642 fe0f /", true); /* facepalm + pale skin + zwj + male sign + FE0F */
checkline("/ 1f468 1f3fb 200d 1f91d 200d 1f468 1f3fd /", true); /* man face + pale skin + zwj + hand holding + zwj + man face + dark skin */

/* more GB9c tests */
checkline("/ 0915 0300 094d 0300 0924 / 0915 /", true);
checkline("/ 0915 0300 094d 0300 094d 0924 / 0915 /", true);
checkline("/ 0915 0300 0300 / 0924 / 0915 /", true);
checkline("/ 0915 0300 094d 0300 / 0078 /", true);
checkline("/ 0300 094d 0300 / 0924 / 0915 /", true);

check(utf8proc_grapheme_break(0x03b1, 0x03b2), "failed 03b1 / 03b2 test");
check(!utf8proc_grapheme_break(0x03b1, 0x0302), "failed 03b1 0302 test");

Expand Down
2 changes: 2 additions & 0 deletions test/printproperty.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ int main(int argc, char **argv)
" ignorable = %d\n"
" control_boundary = %d\n"
" boundclass = %d\n"
" indic_conjunct_break = %d\n"
" charwidth = %d\n",
argv[i], (char*) cstr,
utf8proc_category_string(c),
Expand All @@ -55,6 +56,7 @@ int main(int argc, char **argv)
p->ignorable,
p->control_boundary,
p->boundclass,
p->indic_conjunct_break,
utf8proc_charwidth(c));
free(map);
}
Expand Down
61 changes: 42 additions & 19 deletions utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
}

UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
return "15.0.0";
return "15.1.0";
}

UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
Expand Down Expand Up @@ -288,35 +288,54 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
true; // GB999
}

static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int ticb, utf8proc_int32_t *state)
{
if (state) {
int lbc_override;
if (*state == UTF8PROC_BOUNDCLASS_START)
*state = lbc_override = lbc;
else
lbc_override = *state;
utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
int state_bc, state_icb; /* boundclass and indic_conjunct_break state */
if (*state == 0) { /* state initialization */
state_bc = lbc;
state_icb = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE;
}
else { /* lbc and licb are already encoded in *state */
state_bc = *state & 0xff; // 1st byte of state is bound class
state_icb = *state >> 8; // 2nd byte of state is indic conjunct break
}

utf8proc_bool break_permitted = grapheme_break_simple(state_bc, tbc) &&
!(state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER
&& ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT); // GB9c

// Special support for GB9c. Don't break between two consonants
// separated 1+ linker characters and 0+ extend characters in any order.
// After a consonant, we enter LINKER state after at least one linker.
if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
|| state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
|| state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND)
state_icb = ticb;
else if (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER)
state_icb = ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ?
UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER : ticb;

// Special support for GB 12/13 made possible by GB999. After two RI
// class codepoints we want to force a break. Do this by resetting the
// second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
// after that character according to GB999 (unless of course such a break is
// forbidden by a different rule such as GB9).
if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
*state = UTF8PROC_BOUNDCLASS_OTHER;
if (state_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
state_bc = UTF8PROC_BOUNDCLASS_OTHER;
// Special support for GB11 (emoji extend* zwj / emoji)
else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
else if (state_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
*state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
state_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
*state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
state_bc = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
else
*state = tbc;
state_bc = tbc;
}
else
*state = tbc;
state_bc = tbc;

*state = state_bc + (state_icb << 8);
return break_permitted;
}
else
Expand All @@ -326,8 +345,12 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {

return grapheme_break_extended(utf8proc_get_property(c1)->boundclass,
utf8proc_get_property(c2)->boundclass,
const utf8proc_property_t *p1 = utf8proc_get_property(c1);
const utf8proc_property_t *p2 = utf8proc_get_property(c2);
return grapheme_break_extended(p1->boundclass,
p2->boundclass,
p1->indic_conjunct_break,
p2->indic_conjunct_break,
state);
}

Expand Down Expand Up @@ -498,8 +521,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
}
if (options & UTF8PROC_CHARBOUND) {
utf8proc_bool boundary;
int tbc = property->boundclass;
boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
boundary = grapheme_break_extended(0, property->boundclass, 0, property->indic_conjunct_break,
last_boundclass);
if (boundary) {
if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
if (bufsize >= 2) dst[1] = uc;
Expand Down
16 changes: 13 additions & 3 deletions utf8proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,8 @@ typedef struct utf8proc_property_struct {
* Boundclass.
* @see utf8proc_boundclass_t.
*/
unsigned boundclass:8;
unsigned boundclass:6;
unsigned indic_conjunct_break:2;
} utf8proc_property_t;

/** Unicode categories. */
Expand Down Expand Up @@ -388,6 +389,14 @@ typedef enum {
UTF8PROC_BOUNDCLASS_E_ZWG = 20, /* UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + ZWJ */
} utf8proc_boundclass_t;

/** Indic_Conjunct_Break property. (TR44) */
typedef enum {
UTF8PROC_INDIC_CONJUNCT_BREAK_NONE = 0,
UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER = 1,
UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT = 2,
UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND = 3,
} utf8proc_indic_conjunct_break_t;

/**
* Function pointer type passed to @ref utf8proc_map_custom and
* @ref utf8proc_decompose_custom, which is used to specify a user-defined
Expand Down Expand Up @@ -481,8 +490,9 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
* - @ref UTF8PROC_STRIPNA - remove unassigned codepoints
* @param last_boundclass
* Pointer to an integer variable containing
* the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND
* option is used. Otherwise, this parameter is ignored.
* the previous codepoint's (boundclass + indic_conjunct_break << 1) if the @ref UTF8PROC_CHARBOUND
* option is used. If the string is being processed in order, this can be initialized to 0 for
* the beginning of the string, and is thereafter updated automatically. Otherwise, this parameter is ignored.
*
* @return
* In case of success, the number of codepoints written is returned; in case
Expand Down
Loading
Loading