Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make the BERT protocol encoding aware #24

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 50 additions & 6 deletions ext/bert/c/decode.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "ruby.h"
#include "ruby/encoding.h"
#include <stdint.h>
#include <netinet/in.h>

Expand All @@ -14,9 +15,17 @@
#define ERL_BIN 109
#define ERL_SMALL_BIGNUM 110
#define ERL_LARGE_BIGNUM 111

/* These two types are specific to version 2 of the protocol. They diverge
* from Erlang, but allow us to pass string encodings across the wire. */
#define ERLEXT_ENC_STRING 112
#define ERLEXT_UNICODE_STRING 113

/* Protocol version constants. */
#define ERL_VERSION 131
#define ERL_VERSION2 132

#define BERT_VALID_TYPE(t) ((t) >= ERL_SMALL_INT && (t) <= ERL_LARGE_BIGNUM)
#define BERT_VALID_TYPE(t) ((t) >= ERL_SMALL_INT && (t) <= ERLEXT_UNICODE_STRING)
#define BERT_TYPE_OFFSET (ERL_SMALL_INT)

static VALUE rb_mBERT;
Expand All @@ -40,6 +49,8 @@ static VALUE bert_read_nil(struct bert_buf *buf);
static VALUE bert_read_string(struct bert_buf *buf);
static VALUE bert_read_list(struct bert_buf *buf);
static VALUE bert_read_bin(struct bert_buf *buf);
static VALUE bert_read_enc_string(struct bert_buf *buf);
static VALUE bert_read_unicode_string(struct bert_buf *buf);
static VALUE bert_read_sbignum(struct bert_buf *buf);
static VALUE bert_read_lbignum(struct bert_buf *buf);

Expand All @@ -59,7 +70,9 @@ static bert_ptr bert_callbacks[] = {
&bert_read_list,
&bert_read_bin,
&bert_read_sbignum,
&bert_read_lbignum
&bert_read_lbignum,
&bert_read_enc_string,
&bert_read_unicode_string
};

static inline uint8_t bert_buf_read8(struct bert_buf *buf)
Expand Down Expand Up @@ -293,6 +306,34 @@ static VALUE bert_read_bin(struct bert_buf *buf)
return rb_bin;
}

/* Decode an ERLEXT_UNICODE_STRING term: the payload is read with
 * bert_read_bin() and the resulting Ruby string is tagged as UTF-8. */
static VALUE bert_read_unicode_string(struct bert_buf *buf)
{
    VALUE utf8_str = bert_read_bin(buf);

    /* rb_enc_associate() hands back the object it was given. */
    return rb_enc_associate(utf8_str, rb_utf8_encoding());
}

/* Decode an ERLEXT_ENC_STRING term.
 *
 * Reads the string payload with bert_read_bin(), then expects a one-byte
 * ERL_BIN tag followed by a second bert_read_bin() payload that holds the
 * name of the encoding to associate with the string.
 *
 * Raises RuntimeError when the byte after the payload is not ERL_BIN, and
 * ArgumentError (via rb_to_encoding) when the encoding name is not known to
 * this Ruby. */
static VALUE bert_read_enc_string(struct bert_buf *buf)
{
    uint8_t type;
    VALUE rb_bin, enc;

    rb_bin = bert_read_bin(buf);

    bert_buf_ensure(buf, 1);
    type = bert_buf_read8(buf);
    if (ERL_BIN != type)
        rb_raise(rb_eRuntimeError, "Invalid tag '%d' for term", type);

    enc = bert_read_bin(buf);

    /* rb_find_encoding() yields NULL for an unknown name, which would leave
     * the string silently tagged with the default (binary) encoding.
     * rb_to_encoding() raises instead, matching what String#force_encoding
     * does in the pure-Ruby decoder. */
    rb_enc_associate(rb_bin, rb_to_encoding(enc));

    return rb_bin;
}

static VALUE bert_read_string(struct bert_buf *buf)
{
uint16_t i, length;
Expand Down Expand Up @@ -467,17 +508,20 @@ static VALUE bert_read_invalid(struct bert_buf *buf)
/* Decode a BERT-encoded Ruby string into a Ruby object.
 *
 * rb_string must be a String (checked with Check_Type). Its first byte is the
 * protocol version and must be ERL_VERSION or ERL_VERSION2; anything else
 * raises TypeError. The remaining bytes are handed to bert_read(). */
static VALUE rb_bert_decode(VALUE klass, VALUE rb_string)
{
    struct bert_buf buf;
    uint8_t proto_version;

    Check_Type(rb_string, T_STRING);
    buf.data = (uint8_t *)RSTRING_PTR(rb_string);
    buf.end = buf.data + RSTRING_LEN(rb_string);

    bert_buf_ensure(&buf, 1);

    /* Read the version byte exactly once. A leftover v1-only check here
     * would consume the byte and reject version 2 payloads before the
     * two-version check could run. */
    proto_version = bert_buf_read8(&buf);
    if (proto_version != ERL_VERSION && proto_version != ERL_VERSION2)
        rb_raise(rb_eTypeError, "Invalid magic value for BERT string");

    return bert_read(&buf);
}

static VALUE rb_bert_impl(VALUE klass)
Expand Down
2 changes: 1 addition & 1 deletion lib/bert.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@
# Global method for specifying that an array should be encoded as a tuple.
def t
BERT::Tuple
end
end
32 changes: 30 additions & 2 deletions lib/bert/decode.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,13 @@ def self.impl
# Decodes a complete BERT payload held in +string+.
#
# The first byte is the protocol header and must be MAGIC or VERSION_2;
# anything else (including empty input, where getbyte returns nil) raises
# "Bad Magic". The header is consumed here exactly once, so the term that
# follows is parsed with read_any_raw, which starts directly at a term tag.
def self.decode(string)
  io = StringIO.new(string)
  io.set_encoding('binary') if io.respond_to?(:set_encoding)

  header = io.getbyte
  fail("Bad Magic") unless header == MAGIC || header == VERSION_2

  new(io).read_any_raw
end

def initialize(ins)
Expand All @@ -19,7 +25,6 @@ def initialize(ins)
end

# Reads one term from the stream, header included. As written, the first
# byte read must equal MAGIC (otherwise "Bad Magic" is raised); the term
# itself is then parsed by read_any_raw.
# NOTE(review): only MAGIC is accepted here, not VERSION_2 - confirm whether
# callers are expected to have stripped the header byte before calling this.
def read_any
fail("Bad Magic") unless read_1 == MAGIC
read_any_raw
end

Expand All @@ -37,6 +42,8 @@ def read_any_raw
when STRING then read_erl_string
when LIST then read_list
when BIN then read_bin
when ENC_STRING then read_enc_string
when UNICODE_STRING then read_unicode_string
else
fail("Unknown term tag: #{peek_1}")
end
Expand Down Expand Up @@ -223,6 +230,14 @@ def read_nil
[]
end

# Reads a UNICODE_STRING term: the tag byte, a length obtained from read_4,
# then that many bytes via read_string. The result is tagged as UTF-8.
# Raises if the tag byte is not UNICODE_STRING.
def read_unicode_string
  tag = read_1
  fail("Invalid Type, not a unicode string") unless tag == UNICODE_STRING

  byte_count = read_4
  # force_encoding retags in place and returns the same string.
  read_string(byte_count).force_encoding("UTF-8")
end

def read_erl_string
fail("Invalid Type, not an erlang string") unless read_1 == STRING
length = read_2
Expand All @@ -246,5 +261,18 @@ def read_bin
# Raises +str+ through Kernel#raise. Callers in this class pass a String
# message, which Ruby raises as a RuntimeError with that message.
def fail(str)
raise str
end

private

# Reads an ENC_STRING term: the tag byte, a length-prefixed payload, then a
# BIN-tagged, length-prefixed encoding name. Returns the payload tagged with
# that encoding.
#
# Raises if the leading tag is not ENC_STRING or if the encoding name is not
# introduced by a BIN tag. String#force_encoding raises ArgumentError when
# the encoding name is unknown.
def read_enc_string
  fail("Invalid Type, not an encoded string") unless read_1 == ENC_STRING
  payload = read_string(read_4)

  fail("Invalid Type, not an erlang binary") unless read_1 == BIN
  encoding_name = read_string(read_4)

  payload.force_encoding(encoding_name)
end
end
end
55 changes: 53 additions & 2 deletions lib/bert/encode.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,47 @@ module BERT
class Encode
include Types

# Encoder for version 2 of the protocol. It differs from the base encoder in
# how strings are written (their encoding is carried on the wire) and in the
# header byte it reports.
class V2 < Encode
  # Chooses the wire form from the string's encoding:
  # * ASCII-8BIT      -> the base encoder's binary form
  # * UTF-8, US-ASCII -> UNICODE_STRING
  # * anything else   -> ENC_STRING followed by the encoding name
  def write_binary(data)
    case data.encoding
    when ::Encoding::ASCII_8BIT
      super
    when ::Encoding::UTF_8, ::Encoding::US_ASCII
      write_unicode_string(data)
    else
      write_enc_string(data)
    end
  end

  private

  # Tag, byte length, raw bytes.
  def write_unicode_string(data)
    write_1(UNICODE_STRING)
    write_4(data.bytesize)
    write_string(data)
  end

  # Tag, byte length, raw bytes, then the encoding name written as a BIN.
  def write_enc_string(data)
    encoding_name = data.encoding.name

    write_1(ENC_STRING)
    write_4(data.bytesize)
    write_string(data)

    write_1(BIN)
    write_4(encoding_name.bytesize)
    write_string(encoding_name)
  end

  # Header byte written at the start of a version 2 payload.
  def version_header
    VERSION_2
  end
end

# Class-level protocol version switch (reader and writer); starts out as :v1.
class << self
  attr_accessor :version
end
# Same storage the accessor above reads and writes.
@version = :v1

attr_accessor :out

def initialize(out)
Expand All @@ -11,12 +52,18 @@ def initialize(out)
# Encodes +data+ and returns the resulting binary string.
#
# When the class-level version is :v2 the work is delegated to Encode::V2;
# otherwise the receiving class itself is instantiated. The data is written
# exactly once - encoding it with both encoders into the same IO would
# concatenate two payloads.
def self.encode(data)
  io = StringIO.new
  io.set_encoding('binary') if io.respond_to?(:set_encoding)

  encoder_class = version == :v2 ? Encode::V2 : self
  encoder_class.new(io).write_any(data)

  io.string
end

# Writes a complete payload for +obj+: a single header byte taken from
# version_header (so subclasses can announce their own protocol version),
# followed by the encoded term.
def write_any(obj)
  write_1 version_header
  write_any_raw obj
end

Expand Down Expand Up @@ -132,6 +179,10 @@ def write_binary(data)

private

# Header byte for this encoder: MAGIC. Encode::V2 overrides this to return
# VERSION_2 instead.
def version_header
MAGIC
end

# Raises a RuntimeError explaining that +obj+ has no representation in the
# erlang external format; the message embeds obj.inspect.
def fail(obj)
  message = "Cannot encode to erlang external format: #{obj.inspect}"
  raise message
end
Expand Down
6 changes: 4 additions & 2 deletions lib/bert/types.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@ module Types
STRING = 107
LIST = 108
BIN = 109
# Tags used by the version 2 encoder/decoder for strings that carry an
# encoding on the wire.
# NOTE(review): ENC_STRING has the same value (112) as NEW_FUN below, so the
# two cannot be told apart by tag alone - confirm this overlap is intended.
ENC_STRING = 112
UNICODE_STRING = 113
FUN = 117
NEW_FUN = 112
# Header bytes accepted at the start of a payload: MAGIC is written by the
# base encoder, VERSION_2 by Encode::V2.
MAGIC = 131
VERSION_2 = 132
MAX_INT = (1 << 27) -1
MIN_INT = -(1 << 27)
end
end
end
49 changes: 42 additions & 7 deletions test/bert_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,55 @@ class BertTest < Test::Unit::TestCase
setup do
time = Time.at(1254976067)
@ruby = t[:user, {:name => 'TPW'}, [/cat/i, 9.9], time, nil, true, false, :true, :false]
@bert = "\203h\td\000\004userh\003d\000\004bertd\000\004dictl\000\000\000\001h\002d\000\004namem\000\000\000\003TPWjl\000\000\000\002h\004d\000\004bertd\000\005regexm\000\000\000\003catl\000\000\000\001d\000\bcaselessjc9.900000000000000e+00\000\000\000\000\000\000\000\000\000\000jh\005d\000\004bertd\000\004timeb\000\000\004\346b\000\016\344\303a\000h\002d\000\004bertd\000\003nilh\002d\000\004bertd\000\004trueh\002d\000\004bertd\000\005falsed\000\004trued\000\005false"
@ebin = "<<131,104,9,100,0,4,117,115,101,114,104,3,100,0,4,98,101,114,116,100,0,4,100,105,99,116,108,0,0,0,1,104,2,100,0,4,110,97,109,101,109,0,0,0,3,84,80,87,106,108,0,0,0,2,104,4,100,0,4,98,101,114,116,100,0,5,114,101,103,101,120,109,0,0,0,3,99,97,116,108,0,0,0,1,100,0,8,99,97,115,101,108,101,115,115,106,99,57,46,57,48,48,48,48,48,48,48,48,48,48,48,48,48,48,101,43,48,48,0,0,0,0,0,0,0,0,0,0,106,104,5,100,0,4,98,101,114,116,100,0,4,116,105,109,101,98,0,0,4,230,98,0,14,228,195,97,0,104,2,100,0,4,98,101,114,116,100,0,3,110,105,108,104,2,100,0,4,98,101,114,116,100,0,4,116,114,117,101,104,2,100,0,4,98,101,114,116,100,0,5,102,97,108,115,101,100,0,4,116,114,117,101,100,0,5,102,97,108,115,101>>"
@bert_old = "\203h\td\000\004userh\003d\000\004bertd\000\004dictl\000\000\000\001h\002d\000\004namem\000\000\000\003TPWjl\000\000\000\002h\004d\000\004bertd\000\005regexm\000\000\000\003catl\000\000\000\001d\000\bcaselessjc9.900000000000000e+00\000\000\000\000\000\000\000\000\000\000jh\005d\000\004bertd\000\004timeb\000\000\004\346b\000\016\344\303a\000h\002d\000\004bertd\000\003nilh\002d\000\004bertd\000\004trueh\002d\000\004bertd\000\005falsed\000\004trued\000\005false".b
@ebin_old = "<<131,104,9,100,0,4,117,115,101,114,104,3,100,0,4,98,101,114,116,100,0,4,100,105,99,116,108,0,0,0,1,104,2,100,0,4,110,97,109,101,109,0,0,0,3,84,80,87,106,108,0,0,0,2,104,4,100,0,4,98,101,114,116,100,0,5,114,101,103,101,120,109,0,0,0,3,99,97,116,108,0,0,0,1,100,0,8,99,97,115,101,108,101,115,115,106,99,57,46,57,48,48,48,48,48,48,48,48,48,48,48,48,48,48,101,43,48,48,0,0,0,0,0,0,0,0,0,0,106,104,5,100,0,4,98,101,114,116,100,0,4,116,105,109,101,98,0,0,4,230,98,0,14,228,195,97,0,104,2,100,0,4,98,101,114,116,100,0,3,110,105,108,104,2,100,0,4,98,101,114,116,100,0,4,116,114,117,101,104,2,100,0,4,98,101,114,116,100,0,5,102,97,108,115,101,100,0,4,116,114,117,101,100,0,5,102,97,108,115,101>>"
end

should "encode" do
assert_equal @bert, BERT.encode(@ruby)
context "v2 encoder" do
setup do
@old_version = BERT::Encode.version
BERT::Encode.version = :v2
@bert = "\x84h\td\x00\x04userh\x03d\x00\x04bertd\x00\x04dictl\x00\x00\x00\x01h\x02d\x00\x04nameq\x00\x00\x00\x03TPWjl\x00\x00\x00\x02h\x04d\x00\x04bertd\x00\x05regexq\x00\x00\x00\x03catl\x00\x00\x00\x01d\x00\bcaselessjc9.900000000000000e+00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00jh\x05d\x00\x04bertd\x00\x04timeb\x00\x00\x04\xE6b\x00\x0E\xE4\xC3a\x00h\x02d\x00\x04bertd\x00\x03nilh\x02d\x00\x04bertd\x00\x04trueh\x02d\x00\x04bertd\x00\x05falsed\x00\x04trued\x00\x05false".b
@ebin = "<<132,104,9,100,0,4,117,115,101,114,104,3,100,0,4,98,101,114,116,100,0,4,100,105,99,116,108,0,0,0,1,104,2,100,0,4,110,97,109,101,113,0,0,0,3,84,80,87,106,108,0,0,0,2,104,4,100,0,4,98,101,114,116,100,0,5,114,101,103,101,120,113,0,0,0,3,99,97,116,108,0,0,0,1,100,0,8,99,97,115,101,108,101,115,115,106,99,57,46,57,48,48,48,48,48,48,48,48,48,48,48,48,48,48,101,43,48,48,0,0,0,0,0,0,0,0,0,0,106,104,5,100,0,4,98,101,114,116,100,0,4,116,105,109,101,98,0,0,4,230,98,0,14,228,195,97,0,104,2,100,0,4,98,101,114,116,100,0,3,110,105,108,104,2,100,0,4,98,101,114,116,100,0,4,116,114,117,101,104,2,100,0,4,98,101,114,116,100,0,5,102,97,108,115,101,100,0,4,116,114,117,101,100,0,5,102,97,108,115,101>>"
end

teardown do
BERT::Encode.version = @old_version
end

should "decode new format" do
assert_equal @ruby, BERT.decode(@bert)
end

should "roundtrip string and maintain encoding" do
str = "日本語".encode 'EUC-JP'
round = BERT.decode(BERT.encode(str))
assert_equal str, round
assert_equal str.encoding, round.encoding
end

should "roundtrip binary string" do
str = "日本語".b
round = BERT.decode(BERT.encode(str))
assert_equal str, round
assert_equal str.encoding, round.encoding
end

should "encode" do
assert_equal @bert, BERT.encode(@ruby)
end

should "ebin" do
assert_equal @ebin, BERT.ebin(@bert)
end
end

should "decode" do
assert_equal @ruby, BERT.decode(@bert)
should "decode the old format" do
assert_equal @ruby, BERT.decode(@bert_old)
end

should "ebin" do
assert_equal @ebin, BERT.ebin(@bert)
assert_equal @ebin_old, BERT.ebin(@bert_old)
end

should "do roundtrips" do
Expand Down
31 changes: 31 additions & 0 deletions test/encoder_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ class EncoderTest < Test::Unit::TestCase
end

should 'handle utf8 strings' do
  # Encode the explicitly UTF-8 string built here rather than a second
  # literal, so the test does not depend on the source file's encoding.
  str = "été".encode 'UTF-8'
  bert = [131, 109, 0, 0, 0, 5, 195, 169, 116, 195, 169].pack('C*')
  assert_equal bert, BERT::Encoder.encode(str)
end
Expand All @@ -99,6 +100,36 @@ class EncoderTest < Test::Unit::TestCase
assert_equal bert, BERT::Encoder.encode(-10_000_000_000_000_000_000)
end

# Same encoder expectations as above, but with the class-level version
# switched to :v2: payloads start with 132 and UTF-8 strings use tag 113.
context "v2" do
  setup do
    @old_version = BERT::Encode.version
    BERT::Encode.version = :v2
  end

  # Restore the previous version so other tests are unaffected.
  teardown do
    BERT::Encode.version = @old_version
  end

  should 'handle utf8 strings' do
    # Encode the explicitly UTF-8 string built here rather than a second
    # literal, so the test does not depend on the source file's encoding.
    str = "été".encode 'UTF-8'
    bert = [132, 113, 0, 0, 0, 5, 195, 169, 116, 195, 169].pack('C*')
    assert_equal bert, BERT::Encoder.encode(str)
  end

  should 'handle utf8 symbols' do
    bert = [132, 100, 0, 5, 195, 169, 116, 195, 169].pack('C*')
    assert_equal bert, BERT::Encoder.encode(:'été')
  end

  should "handle bignums" do
    # 'C*' (unsigned bytes) matches the fixture values, several of which
    # exceed 127; the packed bytes are the same as with 'c*'.
    bert = [132,110,8,0,0,0,232,137,4,35,199,138].pack('C*')
    assert_equal bert, BERT::Encoder.encode(10_000_000_000_000_000_000)

    bert = [132,110,8,1,0,0,232,137,4,35,199,138].pack('C*')
    assert_equal bert, BERT::Encoder.encode(-10_000_000_000_000_000_000)
  end
end

should "leave other stuff alone" do
before = [1, 2.0, [:foo, 'bar']]
assert_equal before, BERT::Encoder.convert(before)
Expand Down