Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sketch for UTF-16 support #95

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions include/ada/checkers.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ namespace ada::checkers {
// safe if input.size() >=2. See has_hex_prefix.
inline bool has_hex_prefix_unsafe(std::string_view input) {
// This is actually efficient code, see has_hex_prefix for the assembly.
uint32_t value = 1;
bool is_little_endian = (static_cast<uint8_t>(value) == 1);
uint32_t value_one = 1;
bool is_little_endian = (reinterpret_cast<char*>(&value_one)[0] == 1);
Comment on lines +23 to +24
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code is already included in the main branch (with your latest commit)

uint16_t word0x{};
std::memcpy(&word0x, "0x", 2); // we would use bit_cast in C++20 and the function could be constexpr.
uint16_t two_first_bytes{};
Expand Down
1 change: 0 additions & 1 deletion include/ada/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
namespace ada::parser {
url parse_url(std::string_view user_input,
std::optional<ada::url> base_url = std::nullopt,
ada::encoding_type encoding = ada::encoding_type::UTF8,
std::optional<ada::url> optional_url = std::nullopt);

} // namespace ada
Expand Down
2 changes: 1 addition & 1 deletion include/ada/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ namespace ada::unicode {
// first_percent should be = plain.find('%')
std::string percent_decode(const std::string_view input, size_t first_percent);
std::string percent_encode(const std::string_view input, const uint8_t character_set[]);

// Converts `len` UTF-16 code units read from `buf` into UTF-8 bytes written to
// `utf8_output`. `type` selects the byte order of the input (UTF_16LE or UTF_16BE).
// Returns the number of bytes written, or 0 when a surrogate is unpaired or
// out of order. A single code unit expands to at most three UTF-8 bytes, so
// the caller must provide room for at least 3 * len bytes.
size_t utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output, encoding_type type);
} // namespace ada::unicode

#endif // ADA_UNICODE_H
66 changes: 60 additions & 6 deletions src/implementation.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include <charconv>
#include <iostream>
#include <memory>
#include <vector>
#include <string_view>
#include <utility>

Expand All @@ -26,10 +28,24 @@ namespace ada {
std::optional<ada::url> base_url,
ada::encoding_type encoding) {
if(encoding != encoding_type::UTF8) {
// todo: unsupported !
// If there is a BOM, prune it out.
if(input.size() >= 2) {
if((uint8_t(input[0]) == 0xff) && (uint8_t(input[1]) == 0xfe) && encoding == encoding_type::UTF_16LE) {
input.remove_prefix(2);
} else if ((uint8_t(input[0]) == 0xfe) && (uint8_t(input[1]) == 0xff) && encoding == encoding_type::UTF_16BE) {
input.remove_prefix(2);
}
}
if(!input.empty()) {
std::unique_ptr<char[]> utf8buffer(new char[input.size() * 2]);
size_t utf8_length = unicode::utf16_to_utf8(reinterpret_cast<const char16_t*>(input.data()), input.size()/2,utf8buffer.get(), encoding);
if((input.size() % 2) != 0) { utf8_length = 0; }
std::string_view utf8_input(utf8buffer.get(), utf8_length); // in case of error utf8_length == 0
return ada::parser::parse_url(utf8_input, std::move(base_url));
}
}
// TODO std::move(base_url) might be unwise. Check.
return ada::parser::parse_url(input, std::move(base_url), encoding);
return ada::parser::parse_url(input, std::move(base_url));
}

/*
Expand All @@ -44,7 +60,20 @@ namespace ada {
*/
bool set_scheme(ada::url& base, std::string input, ada::encoding_type encoding) noexcept {
if(encoding != encoding_type::UTF8) {
return false; // unsupported !
std::string_view initial_input = input;
// If there is a BOM, prune it out.
if(initial_input.size() >= 2) {
if((uint8_t(initial_input[0]) == 0xff) && (uint8_t(initial_input[1]) == 0xfe) && encoding == encoding_type::UTF_16LE) {
initial_input.remove_prefix(2);
} else if ((uint8_t(input[0]) == 0xfe) && (uint8_t(initial_input[1]) == 0xff) && encoding == encoding_type::UTF_16BE) {
initial_input.remove_prefix(2);
}
}
std::unique_ptr<char[]> utf8buffer(new char[input.size() * 2]);
size_t utf8_length = unicode::utf16_to_utf8(reinterpret_cast<const char16_t*>(initial_input.data()), initial_input.size()/2,utf8buffer.get(), encoding);
if((input.size() % 2) != 0) { utf8_length = 0; }
std::string_view utf8_input(utf8buffer.get(), utf8_length); // in case of error utf8_length == 0
return set_scheme(base, std::string(utf8_input), encoding_type::UTF8);
}
if (!input.empty()) {
input.append(":");
Expand Down Expand Up @@ -110,7 +139,20 @@ namespace ada {
*/
bool set_host(ada::url& base, std::string_view input, ada::encoding_type encoding) noexcept {
if(encoding != encoding_type::UTF8) {
return false; // unsupported !
std::string_view initial_input = input;
// If there is a BOM, prune it out.
if(initial_input.size() >= 2) {
if((uint8_t(initial_input[0]) == 0xff) && (uint8_t(initial_input[1]) == 0xfe) && encoding == encoding_type::UTF_16LE) {
initial_input.remove_prefix(2);
} else if ((uint8_t(input[0]) == 0xfe) && (uint8_t(initial_input[1]) == 0xff) && encoding == encoding_type::UTF_16BE) {
initial_input.remove_prefix(2);
}
}
std::unique_ptr<char[]> utf8buffer(new char[input.size() * 2]);
size_t utf8_length = unicode::utf16_to_utf8(reinterpret_cast<const char16_t*>(initial_input.data()), initial_input.size()/2,utf8buffer.get(), encoding);
if((input.size() % 2) != 0) { utf8_length = 0; }
std::string_view utf8_input(utf8buffer.get(), utf8_length); // in case of error utf8_length == 0
return set_host(base, utf8_input, encoding_type::UTF8);
}
// If this’s URL has an opaque path, then return.
if (base.has_opaque_path) {
Expand Down Expand Up @@ -199,9 +241,21 @@ namespace ada {
* @see https://url.spec.whatwg.org/#dom-url-pathname
*/
bool set_pathname(ada::url& base, std::string_view input, ada::encoding_type encoding) noexcept {

if(encoding != encoding_type::UTF8) {
return false; // unsupported !
std::string_view initial_input = input;
// If there is a BOM, prune it out.
if(initial_input.size() >= 2) {
if((uint8_t(initial_input[0]) == 0xff) && (uint8_t(initial_input[1]) == 0xfe) && encoding == encoding_type::UTF_16LE) {
initial_input.remove_prefix(2);
} else if ((uint8_t(input[0]) == 0xfe) && (uint8_t(initial_input[1]) == 0xff) && encoding == encoding_type::UTF_16BE) {
initial_input.remove_prefix(2);
}
}
std::unique_ptr<char[]> utf8buffer(new char[input.size() * 2]);
size_t utf8_length = unicode::utf16_to_utf8(reinterpret_cast<const char16_t*>(initial_input.data()), initial_input.size()/2,utf8buffer.get(), encoding);
if((input.size() % 2) != 0) { utf8_length = 0; }
std::string_view utf8_input(utf8buffer.get(), utf8_length); // in case of error utf8_length == 0
return set_pathname(base, utf8_input, encoding_type::UTF8);
}
// If this’s URL has an opaque path, then return.
if (base.has_opaque_path) {
Expand Down
16 changes: 9 additions & 7 deletions src/parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ namespace ada::parser {

url parse_url(std::string_view user_input,
std::optional<ada::url> base_url,
ada::encoding_type encoding,
std::optional<ada::url> optional_url) {
// Let state be state override if given, or scheme start state otherwise.
ada::state state = ada::state::SCHEME_START;
Expand Down Expand Up @@ -395,12 +394,15 @@ namespace ada::parser {
// If encoding is not UTF-8 and one of the following is true:
// - url is not special
// - url’s scheme is "ws" or "wss"
if (encoding != ada::encoding_type::UTF8) {
if (!url.is_special() || url.get_scheme_type() == ada::scheme::type::WS || url.get_scheme_type() == ada::scheme::type::WSS) {
// then set encoding to UTF-8.
encoding = ada::encoding_type::UTF8;
}
}
//////////////
// All of the processing *requires* UTF-8. So we would never get here:
/////////////
//if (encoding != ada::encoding_type::UTF8) {
// if (!url.is_special() || url.get_scheme_type() == ada::scheme::type::WS || url.get_scheme_type() == ada::scheme::type::WSS) {
// // then set encoding to UTF-8.
// encoding = ada::encoding_type::UTF8;
// }
//}

// Let queryPercentEncodeSet be the special-query percent-encode set if url is special;
// otherwise the query percent-encode set.
Expand Down
80 changes: 80 additions & 0 deletions src/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -412,4 +412,84 @@ constexpr static bool is_forbidden_domain_code_point_table[] = {
return true;
}

// Lower-cases the ASCII letters of the string held by `out`, in place.
// Returns false without modifying the string if it contains a forbidden domain
// code point. After lower-casing, the string is searched for the substring
// "xn-"; when it is present (unlikely), the result of the expensive `to_ascii`
// is returned, otherwise the function returns true.
ada_really_inline bool to_lower_ascii_string(std::optional<std::string>& out, size_t first_percent) noexcept {
#if ADA_DEVELOP_MODE
  if(!out.has_value()) { abort(); }
#endif
  std::string& host = out.value();

  // Reject the input up front if any character is forbidden in a domain.
  const bool has_forbidden_code_point =
      std::any_of(host.begin(), host.end(), ada::unicode::is_forbidden_domain_code_point);
  if(has_forbidden_code_point) { return false; }

  // Branch-free ASCII fold: setting bit 0x20 maps 'A'..'Z' onto 'a'..'z'; the
  // folded value is kept only when it lands inside 'a'..'z'.
  const auto ascii_lower = [](char c) -> char {
    const int folded = c | 0x20;
    return (uint8_t(folded - 0x61) <= 25) ? char(folded) : c;
  };
  std::transform(host.begin(), host.end(), host.begin(), ascii_lower);

  // Only strings containing "xn-" are handed to to_ascii.
  if(host.find("xn-") != std::string_view::npos) {
    return to_ascii(out, out.value(), false, first_percent);
  }
  return true;
}


// Converts UTF-16 text to UTF-8.
//
// buf          start of the UTF-16 input. The pointer is only used as a byte
//              address: every code unit is loaded with memcpy, so the input
//              does not need to be 2-byte aligned (callers typically obtain it
//              by casting the data() pointer of a byte string).
// len          number of 16-bit code units (not bytes) available at buf.
// utf8_output  destination buffer. One code unit produces at most three UTF-8
//              bytes (a surrogate pair is two units for four bytes), so
//              3 * len bytes is always sufficient.
// type         byte order of the input. UTF_16LE / UTF_16BE are byte-swapped
//              when they differ from the host order; any other value is read
//              in host order.
//
// Returns the number of bytes written to utf8_output, or 0 if the input
// contains an unpaired or out-of-order surrogate. On error, utf8_output may
// already hold a partial conversion.
size_t utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output, encoding_type type) {
  // Run-time detection of the host byte order.
  const uint32_t value_one = 1;
  const bool is_little_endian = (reinterpret_cast<const char*>(&value_one)[0] == 1);
  const bool need_flip = is_little_endian ? (type == encoding_type::UTF_16BE)
                                          : (type == encoding_type::UTF_16LE);

  // All reads go through this byte pointer + memcpy: dereferencing buf as
  // uint16_t/char16_t would be undefined behaviour for a misaligned input.
  const char* bytes = reinterpret_cast<const char*>(buf);
  auto load_unit = [bytes, need_flip](size_t index) -> uint16_t {
    uint16_t word;
    ::memcpy(&word, bytes + 2 * index, sizeof(word));
    return need_flip ? uint16_t((word >> 8) | (word << 8)) : word;
  };

  // Per-16-bit-lane mask of the bits that must be zero for an ASCII code unit.
  // In host order that is 0xFF80; when the units are byte-swapped the two
  // bytes of every lane trade places, hence 0x80FF.
  const uint64_t non_ascii_mask = need_flip ? 0x80FF80FF80FF80FF : 0xFF80FF80FF80FF80;

  char* const start = utf8_output;
  size_t pos = 0;
  while (pos < len) {
    // Fast path: try to convert the next block of 4 ASCII code units (8 bytes).
    if (len - pos >= 4) { // pos < len here, so the subtraction cannot wrap
      uint64_t v;
      ::memcpy(&v, bytes + 2 * pos, sizeof(v));
      if ((v & non_ascii_mask) == 0) {
        const size_t final_pos = pos + 4;
        while (pos < final_pos) {
          *utf8_output++ = char(load_unit(pos));
          pos++;
        }
        continue;
      }
    }
    const uint16_t word = load_unit(pos);
    if ((word & 0xFF80) == 0) {
      // will generate one UTF-8 byte
      *utf8_output++ = char(word);
      pos++;
    } else if ((word & 0xF800) == 0) {
      // will generate two UTF-8 bytes
      // we have 0b110XXXXX 0b10XXXXXX
      *utf8_output++ = char((word >> 6) | 0b11000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
      pos++;
    } else if ((word & 0xF800) != 0xD800) {
      // will generate three UTF-8 bytes
      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
      *utf8_output++ = char((word >> 12) | 0b11100000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
      pos++;
    } else {
      // must be a surrogate pair: a high surrogate (0xD800..0xDBFF) followed
      // by a low surrogate (0xDC00..0xDFFF).
      if (pos + 1 >= len) { return 0; } // truncated pair
      const uint16_t diff = uint16_t(word - 0xD800);
      if (diff > 0x3FF) { return 0; } // low surrogate in leading position
      const uint16_t next_word = load_unit(pos + 1);
      const uint16_t diff2 = uint16_t(next_word - 0xDC00);
      if (diff2 > 0x3FF) { return 0; } // high surrogate not followed by a low one
      const uint32_t value = (uint32_t(diff) << 10) + diff2 + 0x10000;
      // will generate four UTF-8 bytes
      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
      *utf8_output++ = char((value >> 18) | 0b11110000);
      *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
      *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((value & 0b111111) | 0b10000000);
      pos += 2;
    }
  }
  return size_t(utf8_output - start);
}
} // namespace ada::unicode