Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse_set / Parse_choice shrinking #1440

Merged
merged 5 commits into from
Feb 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions link-grammar/connectors.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@
*/
#define MAX_SENTENCE 254 /* Maximum number of words in a sentence */

/* Since tracon IDs are unique per sentence, for convenience NULL
* connectors (zero-length tracons) have tracon IDs equal to the word
* number on which their disjunct resides. To that end an initial block
* of IDs is reserved. It is the maximum number of words in a sentence
* rounded up to a power of 2.
*/
#define NULL_TRACON_BLOCK 256

/* For faster comparisons, the connector lc part is encoded into a number
* and a mask. Each letter is encoded using LC_BITS bits. With 7 bits, it
* is possible to encode up to 9 letters in an uint64_t. */
Expand Down
8 changes: 1 addition & 7 deletions link-grammar/disjunct-utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -806,12 +806,6 @@ static Disjunct *pack_disjuncts(Sentence sent, Tracon_sharing *ts,

#define TLSZ 8192 /* Initial size of the tracon list table */

/* Reserved tracon ID space for NULL connectors (zero-length tracons).
* Currently, tracons are unique per word. So this is actually the max.
* number of words in a sentence rounded up to a power of 2.
* FIXME: Derive it from MAX_SENTENCE. */
#define WORD_OFFSET 256

/** Create a context descriptor for disjuncts & connector memory "packing".
* Allocate a memory block for all the disjuncts & connectors.
* The current Connector struct size is 32 bytes, and the intention is
Expand Down Expand Up @@ -873,7 +867,7 @@ static Tracon_sharing *pack_sentence_init(Sentence sent, bool is_pruning)
ts->dblock = dblock;
ts->num_connectors = ccnt;
ts->num_disjuncts = dcnt;
ts->word_offset = is_pruning ? 1 : WORD_OFFSET;
ts->word_offset = is_pruning ? 1 : NULL_TRACON_BLOCK;
ts->is_pruning = is_pruning;
ts->next_id[0] = ts->next_id[1] = ts->word_offset;
ts->last_token = (uintptr_t)-1;
Expand Down
127 changes: 73 additions & 54 deletions link-grammar/parse/extract-links.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,19 @@ struct Parse_choice_struct
Parse_choice * next;
Parse_set * set[2];
Disjunct *md; /* the chosen disjunct for the middle word */
Connector *lc, *rc; /* the connectors on the middle word */
int32_t l_id, r_id; /* the tracon IDs used in this disjunct */
};

/* Parse_set serves as a header of Parse_choice chained elements, that
* describe the possible parses with the specified null_count, using
* tracons l_id and r_id on words lw and rw, correspondingly. */
struct Parse_set_struct
{
Parse_choice *first;
Connector *le, *re;
Parse_choice *first;
unsigned int num_pc; /* number of Parse_choice elements */
uint8_t lw, rw; /* left and right word index */
uint8_t null_count; /* number of island words */
int32_t l_id, r_id; /* tracons on words lw, rw */

count_t count; /* The number of ways to parse. */
#ifdef RECOUNT
Expand Down Expand Up @@ -105,13 +105,12 @@ make_choice(Parse_set *lset, Connector * lrc,
Parse_set *rset, Connector * rlc,
Disjunct *md, extractor_t* pex)
{
Parse_choice *pc;
pc = pool_alloc(pex->Parse_choice_pool);
Parse_choice *pc = pool_alloc(pex->Parse_choice_pool);
pc->next = NULL;
pc->set[0] = lset;
pc->set[1] = rset;
pc->lc = lrc;
pc->rc = rlc;
pc->l_id = (lrc == NULL) ? -1 : lrc->tracon_id;
pc->r_id = (rlc == NULL) ? -1 : rlc->tracon_id;
pc->md = md;
return pc;
}
Expand All @@ -126,6 +125,7 @@ static void record_choice(
// Chain it into the parse set.
pc->next = s->first;
s->first = pc;
s->num_pc++;
}

/**
Expand Down Expand Up @@ -201,8 +201,8 @@ extractor_t * extractor_new(Sentence sent)
pex->x_table_size = (1 << log2_table_size);

#ifdef DEBUG_X_TABLE
printf("Allocating x_table of size %u (nwords %d)\n",
pex->x_table_size, nwords);
printf("Allocating x_table of size %u (log2 %d)\n",
pex->x_table_size, log2_table_size);
#endif /* DEBUG_X_TABLE */

pex->x_table = (Pset_bucket**) xalloc(pex->x_table_size * sizeof(Pset_bucket*));
Expand Down Expand Up @@ -267,23 +267,40 @@ void free_extractor(extractor_t * pex)
xfree((void *) pex, sizeof(extractor_t));
}

/**
* Return a dummy connector that represents a null tracon for word \p w.
* Its purpose is to greatly simplify the condition in x_table_pointer().
* \p w may be in the range [-1,sentence length].
* We assume here is that an integer check and assignment is thread-safe.
*/
static Connector *dummy_null_tracon(int w)
{
/* +1 for w+1 (see below).
* +1 for invocations with w equal to MAX_SENTENCE. */
static Connector dnt[MAX_SENTENCE+1+1];

/* w+1 supports invocations with w==-1. */
if (dnt[w+1].tracon_id != w) dnt[w+1].tracon_id = w;
return &dnt[w+1];
}

/**
* Returns the pointer to this info, NULL if not there.
* Note that there is no need to use (lw, rw) as keys because tracon_id
* values are not shared between words.
*/
static Pset_bucket * x_table_pointer(int lw, int rw,
Connector *le, Connector *re,
unsigned int null_count, extractor_t * pex)
static Pset_bucket *x_table_pointer(int lw, int rw,
Connector *le, Connector *re,
unsigned int null_count, extractor_t * pex)
{
Pset_bucket *t;
int l_id = (NULL != le) ? le->tracon_id : lw;
int r_id = (NULL != re) ? re->tracon_id : rw;
unsigned int hash = pair_hash(lw, rw, l_id, r_id, null_count);
t = pex->x_table[hash & (pex->x_table_size-1)];
Pset_bucket *t = pex->x_table[hash & (pex->x_table_size-1)];

for (; t != NULL; t = t->next) {
if ((t->set.l_id == l_id) && (t->set.r_id == r_id) &&
for (; t != NULL; t = t->next)
{
if ((t->set.le->tracon_id == l_id) && (t->set.re->tracon_id == r_id) &&
(t->set.null_count == null_count)) return t;
}
return NULL;
Expand All @@ -296,22 +313,21 @@ static Pset_bucket * x_table_store(int lw, int rw,
Connector *le, Connector *re,
unsigned int null_count, extractor_t * pex)
{
Pset_bucket **t, *n;
unsigned int h;
int32_t l_id = (NULL != le) ? le->tracon_id : lw;
int32_t r_id = (NULL != re) ? re->tracon_id : rw;
unsigned int h = pair_hash(lw, rw, l_id, r_id, null_count);
Pset_bucket **t = &pex->x_table[h & (pex->x_table_size -1)];
Pset_bucket *n = pool_alloc(pex->Pset_bucket_pool);

n = pool_alloc(pex->Pset_bucket_pool);
n->set.lw = lw;
n->set.rw = rw;
n->set.null_count = null_count;
n->set.l_id = (NULL != le) ? le->tracon_id : lw;
n->set.r_id = (NULL != re) ? re->tracon_id : rw;
n->set.le = le;
n->set.re = re;
n->set.le = (NULL != le) ? le : dummy_null_tracon(lw);
n->set.re = (NULL != re) ? re : dummy_null_tracon(rw);
n->set.count = 0;
n->set.first = NULL;
n->set.num_pc = 0;

h = pair_hash(lw, rw, n->set.l_id, n->set.r_id, null_count);
t = &pex->x_table[h & (pex->x_table_size -1)];
n->next = *t;
*t = n;
return n;
Expand Down Expand Up @@ -716,19 +732,35 @@ bool build_parse_set(extractor_t* pex, Sentence sent,
return set_overflowed(pex);
}

static Connector *get_tracon_by_id(const Disjunct *d, int32_t tracon_id,
int dir)
{
if (tracon_id < 0) return NULL; /* See make_choice() */
for (Connector *c = dir ? d->right : d->left; c != NULL; c = c->next)
if (tracon_id == c->tracon_id) return c;

assert(0, "tracon_id %d not found on disjunct %p in direction %d\n",
tracon_id, d, dir);
}

static bool is_zero_tracon(Connector *c)
{
return (c == NULL) || (c->tracon_id < NULL_TRACON_BLOCK);
}

/**
* Assemble the link array and the chosen_disjuncts of a linkage.
*/
static void issue_link(Linkage lkg, int lr, Parse_choice *pc,
const Parse_set *set)
{
Connector *lc = lr ? pc->rc : set->le;
if (lc == NULL) return; /* No choice to record. */
Connector *lc = lr ? get_tracon_by_id(pc->md, pc->r_id, 1) : set->le;
if (is_zero_tracon(lc)) return; /* No choice to record. */

lkg->chosen_disjuncts[lr ? pc->set[1]->lw : pc->set[0]->rw] = pc->md;

Connector *rc = lr ? set->re : pc->lc;
if (rc == NULL) return; /* No link to generate. */
Connector *rc = lr ? set->re : get_tracon_by_id(pc->md, pc->l_id, 0);
if (is_zero_tracon(rc)) return; /* No choice to record. */

assert(lkg->num_links < lkg->lasz, "Linkage array too small!");
Link *link = &lkg->link_array[lkg->num_links];
Expand Down Expand Up @@ -792,12 +824,13 @@ static void issue_links_for_choice(Linkage lkg, Parse_choice *pc,
* For S0: (Nindex % pc->set[0]->count) ranges from 0 to (S0ₘ-1).
* For S1: (Nindex / pc->set[0]->count) ranges from 0 to (S1ₘ-1).
*/
static void list_links(Linkage lkg, const Parse_set * set, int index)
static void list_links(Linkage lkg, Parse_set * set, int index)
{
Parse_choice *pc;
count_t n; /* No overflow - see extract_links() and process_linkages() */

if (set == NULL || set->first == NULL) return;
assert(set != NULL, "Unexpected NULL Parse_set");
if (set->first == NULL) return;
for (pc = set->first; pc != NULL; pc = pc->next) {
n = pc->set[0]->count * pc->set[1]->count;
if (index < n) break;
Expand All @@ -810,32 +843,18 @@ static void list_links(Linkage lkg, const Parse_set * set, int index)
}

static void list_random_links(Linkage lkg, unsigned int *rand_state,
const Parse_set * set)
Parse_set * set)
{
Parse_choice *pc;
int num_pc, new_index;
assert(set != NULL, "Unexpected NULL Parse_set");
if (set->first == NULL) return;

if (set == NULL || set->first == NULL) return;
/* Avoid calling rand_r() for the common case of a single element. */
unsigned int new_index = (set->num_pc == 1) ? 0 :
rand_r(rand_state) % set->num_pc;

/* Most of the time, there is only one list element. */
if (set->first->next == NULL)
{
pc = set->first;
}
else
{
num_pc = 0;
for (pc = set->first; pc != NULL; pc = pc->next) {
num_pc++;
}

new_index = rand_r(rand_state) % num_pc;

num_pc = 0;
for (pc = set->first; new_index != num_pc; pc = pc->next) {
num_pc++;
}
}
Parse_choice *pc;
for (pc = set->first; new_index > 0; pc = pc->next)
new_index--;

issue_links_for_choice(lkg, pc, set);
list_random_links(lkg, rand_state, pc->set[0]);
Expand Down