From c6a36f20e7b331e7427963c087d015da2acf408d Mon Sep 17 00:00:00 2001 From: John Viega Date: Fri, 5 Jul 2024 16:24:07 -0400 Subject: [PATCH] Make Slice Syntax Pythonic (#80) Fixed slicing semantics to be Pythonic in how they treat negative numbers. Also, for naming specific tests, respect C4M_TEST_DIR when there isn't an absolute path provided. --- include/compiler/datatypes/parse.h | 1 + include/con4m/path.h | 16 +++++++- src/con4m/compiler/check_pass.c | 20 +++++++++ src/con4m/compiler/codegen.c | 66 +++++++++++++++++++++++++++++- src/con4m/compiler/parse.c | 20 ++++++++- src/con4m/grid.c | 1 - src/con4m/hatlists.c | 4 +- src/con4m/list.c | 6 +-- src/con4m/string.c | 4 +- src/con4m/vm.c | 4 +- src/tests/test.c | 13 +++--- tests/list.c4m | 6 +-- tests/slice.c4m | 29 +++++++++++++ 13 files changed, 166 insertions(+), 24 deletions(-) create mode 100644 tests/slice.c4m diff --git a/include/compiler/datatypes/parse.h b/include/compiler/datatypes/parse.h index 775867fb..050cc8d4 100644 --- a/include/compiler/datatypes/parse.h +++ b/include/compiler/datatypes/parse.h @@ -79,6 +79,7 @@ typedef enum { c4m_nt_field_prop, c4m_nt_expression, c4m_nt_extern_box, + c4m_nt_elided, #ifdef C4M_DEV c4m_nt_print, #endif diff --git a/include/con4m/path.h b/include/con4m/path.h index 023b8c80..e21c8ef2 100644 --- a/include/con4m/path.h +++ b/include/con4m/path.h @@ -20,7 +20,7 @@ c4m_utf8_t *c4m_get_user_dir(c4m_utf8_t *); c4m_utf8_t *c4m_get_current_directory(c4m_utf8_t *); c4m_utf8_t *c4m_path_join(c4m_list_t *); c4m_file_kind c4m_get_file_kind(c4m_utf8_t *); -c4m_list_t *_c4m_path_walk(c4m_utf8_t *, ...); +c4m_list_t *_c4m_path_walk(c4m_utf8_t *, ...); #define c4m_path_walk(x, ...) 
_c4m_path_walk(x, C4M_VA(__VA_ARGS__)) @@ -40,3 +40,17 @@ c4m_get_user_name() return c4m_new_utf8(pw->pw_name); } + +static inline c4m_utf8_t * +c4m_path_simple_join(c4m_utf8_t *p1, c4m_utf8_t *p2) +{ + if (c4m_str_starts_with(p2, c4m_get_slash_const())) { + return p2; + } + + c4m_list_t *x = c4m_list(c4m_type_utf8()); + c4m_list_append(x, p1); + c4m_list_append(x, p2); + + return c4m_path_join(x); +} diff --git a/src/con4m/compiler/check_pass.c b/src/con4m/compiler/check_pass.c index 37a89813..3dee9d30 100644 --- a/src/con4m/compiler/check_pass.c +++ b/src/con4m/compiler/check_pass.c @@ -741,6 +741,21 @@ lookup_or_add(pass2_ctx *ctx, c4m_utf8_t *name) return result; } +static void +handle_elision(pass2_ctx *ctx) +{ + c4m_pnode_t *cur = c4m_get_pnode(ctx->node); + c4m_pnode_t *parent = c4m_get_pnode(ctx->node->parent); + + switch (parent->kind) { + case c4m_nt_range: + cur->type = c4m_type_int(); + return; + default: + c4m_unreachable(); + } +} + static void handle_index(pass2_ctx *ctx) { @@ -756,6 +771,7 @@ handle_index(pass2_ctx *ctx) use_context_enter(ctx); process_child(ctx, 1); + ix1_type = c4m_type_resolve(get_pnode_type(ctx->node->children[1])); pnode->type = node_type; @@ -2270,6 +2286,10 @@ base_check_pass_dispatch(pass2_ctx *ctx) handle_return(ctx); break; + case c4m_nt_elided: + handle_elision(ctx); + break; + #ifdef C4M_DEV case c4m_nt_print: c4m_list_append(ctx->file_ctx->print_nodes, ctx->node); diff --git a/src/con4m/compiler/codegen.c b/src/con4m/compiler/codegen.c index 2c498509..2eeeb7c7 100644 --- a/src/con4m/compiler/codegen.c +++ b/src/con4m/compiler/codegen.c @@ -21,6 +21,7 @@ typedef enum { assign_to_mem_slot, assign_via_index_set_call, assign_via_slice_set_call, + assign_via_len_then_slice_set_call, } assign_type_t; typedef struct { @@ -1650,6 +1651,28 @@ gen_assign(gen_ctx *ctx) case assign_via_slice_set_call: gen_tcall(ctx, C4M_BI_SLICE_SET, ctx->cur_pnode->type); break; + case assign_via_len_then_slice_set_call: + // Need to call len() on 
the object for the 2nd slice + // param. The 2nd slice parameter is supposed to get + // pushed on first though. + // + // Stash the value in R0. + emit(ctx, C4M_ZPopToR0); + // stash start ix + emit(ctx, C4M_ZPopToR1); + // Dupe the container. + emit(ctx, C4M_ZDupTop); + // Call len on the non-popped version. + gen_tcall(ctx, C4M_BI_LEN, ctx->cur_pnode->type); + // Push the index back. + emit(ctx, C4M_ZPushFromR1); + // Swap the two indices to be in the proper order. + emit(ctx, C4M_ZSwap); + // Push the value back. + emit(ctx, C4M_ZPushFromR0); + // Slice! + gen_tcall(ctx, C4M_BI_SLICE_SET, ctx->cur_pnode->type); + break; case assign_via_index_set_call: emit(ctx, C4M_ZSwap); gen_tcall(ctx, C4M_BI_INDEX_SET, ctx->cur_pnode->type); @@ -1739,7 +1762,12 @@ gen_index_or_slice(gen_ctx *ctx) if (lvalue) { if (slice) { - ctx->assign_method = assign_via_slice_set_call; + if (pnode->extra_info == (void *)1) { + ctx->assign_method = assign_via_len_then_slice_set_call; + } + else { + ctx->assign_method = assign_via_slice_set_call; + } } else { ctx->assign_method = assign_via_index_set_call; @@ -1747,8 +1775,23 @@ gen_index_or_slice(gen_ctx *ctx) ctx->lvalue = true; return; } - if (slice) { + if (pnode->extra_info == (void *)1) { + // Need to call len() on the object for the 2nd slice + // param. The 2nd slice parameter is supposed to get + // pushed on first though. + // + // Stash the other index. + emit(ctx, C4M_ZPopToR0); + // Dupe the copy. + emit(ctx, C4M_ZDupTop); + // Call len on the dupe. + gen_tcall(ctx, C4M_BI_LEN, ctx->cur_pnode->type); + // Push the index back. + emit(ctx, C4M_ZPushFromR0); + // Swap positions. + emit(ctx, C4M_ZSwap); + } gen_tcall(ctx, C4M_BI_SLICE_GET, ctx->cur_pnode->type); } else { @@ -1756,6 +1799,22 @@ gen_index_or_slice(gen_ctx *ctx) } } +static inline void +gen_elision(gen_ctx *ctx) +{ + // Right now, this is only for indexes on slices. If we're on the + // LHS, life is easy; we just emit an actual 0. 
+ if (ctx->cur_node->parent->children[0] == ctx->cur_node) { + gen_load_immediate(ctx, 0); + return; + } + + // Otherwise, we cheat a little bit here, and signal to + // gen_index_or_slice through the range pnode. + c4m_pnode_t *range_pnode = c4m_get_pnode(ctx->cur_node->parent); + range_pnode->extra_info = (void *)1; +} + static inline void gen_sym_decl(gen_ctx *ctx) { @@ -1932,6 +1991,9 @@ gen_one_node(gen_ctx *ctx) case c4m_nt_use: gen_use(ctx); break; + case c4m_nt_elided: + gen_elision(ctx); + break; // The following list is still TODO: case c4m_nt_varargs_param: // These should always be passthrough. diff --git a/src/con4m/compiler/parse.c b/src/con4m/compiler/parse.c index 0dc4f3df..757dd98a 100644 --- a/src/con4m/compiler/parse.c +++ b/src/con4m/compiler/parse.c @@ -229,6 +229,7 @@ static const node_type_info_t node_type_info[] = { { "nt_field_prop", 1, 0, 0, 0, 0, }, { "nt_expression", 0, 0, 0, 0, 0, }, { "nt_extern_box", 0, 0, 0, 0, 0, }, + { "nt_elided", 0, 0, 0, 0, 0, }, #ifdef C4M_DEV { "nt_print", 0, 0, 0, 0, 0, }, #endif @@ -1602,7 +1603,13 @@ optional_range(parse_ctx *ctx, c4m_tree_node_t *lhs) start_node(ctx, c4m_nt_range, true); adopt_kid(ctx, lhs); - adopt_kid(ctx, expression(ctx)); + if (tok_kind(ctx) == c4m_tt_rbracket) { + start_node(ctx, c4m_nt_elided, false); + end_node(ctx); + } + else { + adopt_kid(ctx, expression(ctx)); + } end_node(ctx); return true; } @@ -3116,7 +3123,16 @@ index_expr(parse_ctx *ctx, c4m_tree_node_t *lhs) temporary_tree(ctx, c4m_nt_index); adopt_kid(ctx, lhs); expect(ctx, c4m_tt_lbracket); - c4m_tree_node_t *item = expression(ctx); + + c4m_tree_node_t *item; + + if (tok_kind(ctx) == c4m_tt_colon) { + temporary_tree(ctx, c4m_nt_elided); + item = restore_tree(ctx); + } + else { + item = expression(ctx); + } if (!optional_range(ctx, item)) { adopt_kid(ctx, item); diff --git a/src/con4m/grid.c b/src/con4m/grid.c index 42ed3bea..246f25cd 100644 --- a/src/con4m/grid.c +++ b/src/con4m/grid.c @@ -57,7 +57,6 @@ 
styled_repeat(c4m_codepoint_t c, uint32_t width, c4m_style_t style) static inline c4m_utf32_t * get_styled_pad(uint32_t width, c4m_style_t style) { - assert(width < 200); return styled_repeat(' ', width, style); } diff --git a/src/con4m/hatlists.c b/src/con4m/hatlists.c index 5943a5f4..d6e4bfac 100644 --- a/src/con4m/hatlists.c +++ b/src/con4m/hatlists.c @@ -272,7 +272,7 @@ c4m_flexarray_get_slice(flexarray_t *list, int64_t start, int64_t end) } } if (end < 0) { - end += len + 1; + end += len; } else { if (end > len) { @@ -315,7 +315,7 @@ c4m_flexarray_set_slice(flexarray_t *list, int64_t start, int64_t end, flexarray } } if (end < 0) { - end += len1 + 1; + end += len1; } else { if (end > len1) { diff --git a/src/con4m/list.c b/src/con4m/list.c index 462489be..c68067de 100644 --- a/src/con4m/list.c +++ b/src/con4m/list.c @@ -1,5 +1,3 @@ -// "Exclusive" array, meaning not shared across threads. It's dynamic, -// and supports resizing. #include "con4m.h" static void @@ -469,7 +467,7 @@ c4m_list_get_slice(c4m_list_t *list, int64_t start, int64_t end) } } if (end < 0) { - end += len + 1; + end += len; } else { if (end > len) { @@ -516,7 +514,7 @@ c4m_list_set_slice(c4m_list_t *list, } } if (end < 0) { - end += len1 + 1; + end += len1; } else { if (end > len1) { diff --git a/src/con4m/string.c b/src/con4m/string.c index 5643dfba..3e4c52e1 100644 --- a/src/con4m/string.c +++ b/src/con4m/string.c @@ -65,7 +65,7 @@ c4m_str_slice(const c4m_str_t *instr, int64_t start, int64_t end) } } if (end < 0) { - end += len + 1; + end += len; } else { if (end > len) { @@ -909,7 +909,7 @@ c4m_str_ends_with(const c4m_str_t *s1, const c4m_str_t *s2) c4m_utf32_t *u1 = c4m_to_utf32(s1); c4m_utf32_t *u2 = c4m_to_utf32(s2); - u1 = c4m_str_slice(u1, l1 - l2, -1); + u1 = c4m_str_slice(u1, l1 - l2, l1); return c4m_str_eq(u1, u2); } diff --git a/src/con4m/vm.c b/src/con4m/vm.c index 5169ed79..0603684b 100644 --- a/src/con4m/vm.c +++ b/src/con4m/vm.c @@ -532,9 +532,9 @@ c4m_vm_tcall(c4m_vmthread_t 
*tstate, c4m_zinstruction_t *i) return; case C4M_BI_SLICE_SET: STACK_REQUIRE_VALUES(4); - // endIx = sp[3] + // container = sp[3] + // endIx = sp[2] // startIx = sp[1] - // container = sp[2] // value = sp[0] c4m_slice_set(tstate->sp[3].rvalue.obj, diff --git a/src/tests/test.c b/src/tests/test.c index 2f2c1512..052e1d2c 100644 --- a/src/tests/test.c +++ b/src/tests/test.c @@ -77,7 +77,7 @@ c4m_parse_kat(c4m_str_t *path, c4m_str_t *s) err_basic_usage(path); return NULL; } - extract_errors(result, s, 9, -1); + extract_errors(result, s, 9, c4m_str_codepoint_len(s)); result->ignore_output = 1; return result; } @@ -87,7 +87,7 @@ c4m_parse_kat(c4m_str_t *path, c4m_str_t *s) err_basic_usage(path); return NULL; } - extract_output(result, s, 9, -1); + extract_output(result, s, 9, c4m_str_codepoint_len(s)); return result; } @@ -98,11 +98,11 @@ c4m_parse_kat(c4m_str_t *path, c4m_str_t *s) if (errix != 0) { extract_output(result, s, 9, errix); - extract_errors(result, s, errix + 9, -1); + extract_errors(result, s, errix + 9, c4m_str_codepoint_len(s)); } else { extract_errors(result, s, 9, outix); - extract_output(result, s, outix + 9, -1); + extract_output(result, s, outix + 9, c4m_str_codepoint_len(s)); } return result; @@ -230,7 +230,8 @@ build_file_list() for (int i = 0; i < n; i++) { c4m_utf8_t *s = c4m_to_utf8(c4m_list_get(argv, i, NULL)); - s = c4m_resolve_path(s); + s = c4m_resolve_path(c4m_path_simple_join(test_dir, s)); + switch (c4m_get_file_kind(s)) { case C4M_FK_IS_REG_FILE: case C4M_FK_IS_FLINK: @@ -259,6 +260,7 @@ build_file_list() } n = c4m_list_len(to_recurse); + for (int i = 0; i < n; i++) { int num_hits = 0; c4m_utf8_t *path = c4m_list_get(to_recurse, i, NULL); @@ -269,6 +271,7 @@ build_file_list() int walk_len = c4m_list_len(files); for (int j = 0; j < walk_len; j++) { c4m_utf8_t *one = c4m_list_get(files, j, NULL); + if (c4m_str_ends_with(one, ext)) { kat = c4m_extract_kat(one); // When scanning dirs, if we have test cases that span diff --git 
a/tests/list.c4m b/tests/list.c4m index 01128e86..5669db70 100644 --- a/tests/list.c4m +++ b/tests/list.c4m @@ -26,9 +26,9 @@ $output: 30 40 40 -90 -[10, 100] -110 +50 +[10, 100, 40] +150 """ x = [1, 2, 3, 4] diff --git a/tests/slice.c4m b/tests/slice.c4m new file mode 100644 index 00000000..2a81113d --- /dev/null +++ b/tests/slice.c4m @@ -0,0 +1,29 @@ +""" +Ensure slice semantics match Python's. +""" +""" +$output: +[0, 1, 2, 3, 4, 5] +[0, 1, 2] +[0, 1, 2] +[3, 4, 5] +[3, 4] +[0, 1, 2, 3] +[4, 3, 2, 1, 5] +foob + +""" +x = [0, 1, 2, 3, 4, 5] + +print(x[:]) +print(x[0:3]) +print(x[:3]) +print(x[3:]) +print(x[-3:-1]) +print(x[:-2]) +x[:-1] = [4, 3, 2, 1] +print(x) + + +s = "foobar" +print(s[:-2]) \ No newline at end of file