Skip to content

Commit

Permalink
Merge pull request #1445 from linas/empty-zzz
Browse files Browse the repository at this point in the history
Revised ZZZ-connector handling.
  • Loading branch information
linas authored Feb 23, 2023
2 parents 01b836d + 34fc152 commit 4672e07
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 20 deletions.
4 changes: 4 additions & 0 deletions data/en/4.0.dict
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@
% This is mostly fixed, except that some uses of <noun-main-m>
% remain, below.

% empty-connector is used for hard-coded handling of unpaired
% quote marks in the C code.
#define empty-connector ZZZ;

% Capitalization handling (null effect for now- behave as empty words).
<1stCAP>: ZZZ-;
<nonCAP>: ZZZ-;
Expand Down
4 changes: 4 additions & 0 deletions data/en/4.0.dict.m4
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ changecom(`%')
% This is mostly fixed, except that some uses of <noun-main-m>
% remain, below.

% empty-connector is used for hard-coded handling of unpaired
% quote marks in the C code.
#define empty-connector ZZZ;

% Capitalization handling (null effect for now- behave as empty words).
<1stCAP>: ZZZ-;
<nonCAP>: ZZZ-;
Expand Down
9 changes: 6 additions & 3 deletions link-grammar/dict-common/dict-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@
#include "memory-pool.h" // Pool_desc
#include "utilities.h" // locale_t

// EMPTY_CONNECTOR is the *name* of a dictionary define, not a connector
// string. A dict may have `#define empty-connector ZZZ` in it; the value
// of that define (if present) is the connector used for empty words.
#define EMPTY_CONNECTOR "empty-connector"

// Dictionary word names for the unlimited/limited connector-length entries.
#define UNLIMITED_CONNECTORS_WORD ("UNLIMITED-CONNECTORS")
#define LIMITED_CONNECTORS_WORD ("LENGTH-LIMIT-")

// True when the dictionary has a non-NULL `category` field.
#define IS_GENERATION(dict) (dict->category != NULL)
Expand Down Expand Up @@ -123,10 +123,13 @@ struct Dictionary_s
const char * version;
const char * locale; /* Locale name */
locale_t lctype; /* Locale argument for the *_l() functions */

int num_entries;
float default_max_disjunct_cost;
dfine_s dfine; /* Name-value definitions */

const char * zzz_connector;

bool use_unknown_word;
bool unknown_word_defined;
bool left_wall_defined;
Expand Down
5 changes: 5 additions & 0 deletions link-grammar/dict-common/dict-locale.c
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,11 @@ bool dictionary_setup_defines(Dictionary dict)

dict->shuffle_linkages = false;

// Used for unattached quote marks, in the English dict only.
dict->zzz_connector = linkgrammar_get_dict_define(dict, EMPTY_CONNECTOR);
if (NULL != dict->zzz_connector)
dict->zzz_connector = string_set_add(dict->zzz_connector, dict->string_set);

dictionary_setup_locale(dict);

if (!dictionary_setup_max_disjunct_cost(dict)) return false;
Expand Down
7 changes: 3 additions & 4 deletions link-grammar/dict-common/dict-utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ static bool exp_has_connector(const Exp * e, int depth,
}

/**
* Find if an expression has a connector ZZZ- (that an empty-word has).
* Check if an expression has a connector ZZZ- (that an empty-word has).
* This is a costly way to find it. To reduce the overhead, the
* exp_has_connector() "depth" argument limits the expression depth check,
* supposing the ZZZ- connectors are not deep in the word expression.
Expand All @@ -297,9 +297,8 @@ static bool exp_has_connector(const Exp * e, int depth,
**/
bool is_exp_like_empty_word(Dictionary dict, Exp *exp)
{
	/* dict->zzz_connector is filled in by dictionary_setup_defines() from
	 * the dictionary's "empty-connector" define. It is NULL when the
	 * dictionary has no such define, in which case no expression can
	 * carry the empty-word connector.
	 *
	 * Do not look up EMPTY_CONNECTOR in the string set here: that macro
	 * is the name of the define, not the connector string itself. */
	if (NULL == dict->zzz_connector) return false;

	/* Search to depth 2 only, for the connector with direction '-'. */
	return exp_has_connector(exp, 2, dict->zzz_connector, '-');
}

/* ======================================================== */
Expand Down
11 changes: 7 additions & 4 deletions link-grammar/linkage/linkage.c
Original file line number Diff line number Diff line change
Expand Up @@ -692,12 +692,15 @@ static void compute_chosen_words(Sentence sent, Linkage linkage,
* to facilitate using diff on sentence batch runs. */
if (test_enabled("removeZZZ"))
{
for (i=0; i<linkage->num_links; i++)
if (sent->dict->zzz_connector)
{
Link *lnk = &(linkage->link_array[i]);
for (i=0; i<linkage->num_links; i++)
{
Link *lnk = &(linkage->link_array[i]);

if (0 == strcmp("ZZZ", lnk->link_name))
chosen_words[lnk->rw] = NULL;
if (0 == strcmp(sent->dict->zzz_connector, lnk->link_name))
chosen_words[lnk->rw] = NULL;
}
}
}

Expand Down
29 changes: 20 additions & 9 deletions link-grammar/tokenize/lookup-exprs.c
Original file line number Diff line number Diff line change
Expand Up @@ -161,30 +161,31 @@ GNUC_UNUSED static void print_x_node(X_node *x)
* This function was mainly used to support using empty-words, a concept
* that has been eliminated. However, it is still used to support linking of
* quotes that don't get the QUc/QUd links.
*
* This function is called only if ZZZ is defined in the dictionary.
* This is currently used only by the English dict, to allow quotes to
* appear anywhere in the sentence.
*/
static void add_empty_word(Sentence sent, X_node *x)
{
Exp *zn, *an;
const char *ZZZ = string_set_lookup(EMPTY_CONNECTOR, sent->dict->string_set);
/* This function is called only if ZZZ is in the dictionary. */

/* The left-wall already has ZZZ-. The right-wall will not arrive here. */
if (MT_WALL == x->word->morpheme_type) return;

/* Replace plain-word-exp by {ZZZ+} & (plain-word-exp) in each X_node. */
for(; NULL != x; x = x->next)
{
/* Ignore stems for now, decreases a little the overhead for
* stem-suffix languages. */
/* Ignore stems for now; this decreases the overhead a little
* for stem-suffix languages. */
if (is_stem(x->string)) continue; /* Avoid an unneeded overhead. */
//lgdebug(+0, "Processing '%s'\n", x->string);

/* zn points at {ZZZ+} */
zn = make_connector_node(sent->dict, sent->Exp_pool, ZZZ, '+', false);
Exp *zn = make_connector_node(sent->dict,
sent->Exp_pool, sent->dict->zzz_connector, '+', false);
zn = make_optional_node(sent->Exp_pool, zn);

/* an will be {ZZZ+} & (plain-word-exp) */
an = make_and_node(sent->Exp_pool, zn, x->exp);
Exp *an = make_and_node(sent->Exp_pool, zn, x->exp);

x->exp = an;
}
Expand Down Expand Up @@ -257,7 +258,17 @@ static bool determine_word_expressions(Sentence sent, Gword *w,
* supposing that the word has it in all of its dict entries
* (in any case, currently there is only 1 entry for each such word).
* Note that ZZZ_added starts at 0, as does wordpos, and that the
* first sentence word (usually LEFT-WALL) doesn't need a check. */
* first sentence word (usually LEFT-WALL) doesn't need a check.
*
* At this time, the empty-connector device is used only by the
* English dict, to allow quotation marks to appear in random
* locations in sentences. Rather than writing the English dict
* so that *every word* has an optional {ZZZ-} & connector on it,
* which would double the size of the dict, we instead add it here,
* dynamically, on-the-fly, as needed. This whole thing feels
* half-baked to me. It works, but it is a weird exception being
* made for one language.
*/
if ((wordpos != *ZZZ_added) && is_exp_like_empty_word(dict, we->exp))
{
lgdebug(D_DWE, " (has ZZZ-)");
Expand Down

0 comments on commit 4672e07

Please sign in to comment.