49#include "EST_simplestats.h"
65static double find_score_if_split(
EST_WFST &wfst,
90 if (
ts.open(filename) == -1)
91 EST_error(
"wfst_train: failed to read data from \"%s\"",
92 (
const char *)filename);
105 cerr <<
"wfst_train: data contains unknown symbol \"" <<
108 s = cons(flocons(
id),s);
111 while (!
ts.eoln() && !
ts.eof());
113 ss = cons(reverse(s),
ss);
116 printf(
"wfst_train: loaded %d lines of %d tokens\n",
139 for (i=0; i < wfst.num_states(); i++)
148 for (i=0,d=data; d; d=cdr(d),i++)
150 s = wfst.start_state();
151 for (w=car(d); w; w=cdr(w))
154 id = get_c_int(car(w));
158 printf(
"sentence %d not in language, skipping\n",i);
163 trans->set_weight(trans->weight()+1);
179 w = s->transitions(
tp)->weight();
203 for (i=0; i < wfst.num_states(); i++)
222 printf(
"No new max_entropy state\n");
227 printf(
"No best_trans in max_entropy state\n");
241 printf(
"reached cycle end %d\n",c);
260static int me_compare_function(
const void *a,
const void *b)
267 float fa = get_c_float(car(
la));
268 float fb = get_c_float(car(
lb));
290 for (i=0; i < wfst.num_states(); i++)
301 qsort(
slist,wfst.num_states(),
sizeof(
LISP),me_compare_function);
324 pdf_all.cumulate(get_c_int(car(car(
dd))));
326 if (siod_llength(
splits) < 2)
344 score = score_pdf_combine(*
a_pdf,*pdf(car(cdr(cdr(
ssplits[b])))),
366 for (i=
b_pdf->item_start(); !
b_pdf->item_end(i);
367 i =
b_pdf->item_next(i))
404 for (i=
ab.item_start(); !
ab.item_end(i);
411 score = (
ab.entropy() *
ab.samples()) +
431 for (i=0; i < wfst.num_states(); i++)
434 for (
tp=s->transitions.head();
tp != 0;
tp =
tp->next())
437 && (s->transitions(
tp)->weight() > 0))
439 in = s->transitions(
tp)->in_symbol();
442 for (
dd = data[i];
dd;
dd = cdr(
dd))
444 id = get_c_int(car(car(
dd)));
452 value = score_pdf_combine(*pdf,empty,
pdf_all);
456 t = siod(s->transitions(
tp));
458 ttt = cons(flocons(value),
485 for (i=1; i < wfst.num_states(); i++)
488 for (
tp=s->transitions.head();
tp != 0;
tp =
tp->next())
490 if ((wfst.
state(s->transitions(
tp)->state()) == split_state) &&
491 (s->transitions(
tp)->weight() > 0))
493 bb = find_score_if_split(wfst,i,s->transitions(
tp),data);
515static double find_score_if_split(
EST_WFST &wfst,
541 in = trans->in_symbol();
544 id = get_c_int(car(car(
dd)));
589 int ostate = trans->state();
621 trans(car(t))->set_state(
nstate);
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index
EST_Litem * item_start() const
Used for iterating through members of the distribution.
double samples(void) const
Total number of example found.
void cumulate(const EST_String &s, double count=1)
Add this observation, may specify number of occurrences.
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
void start_cumulate()
Clear and start cumulation.
int add_state(enum wfst_state_type state_type)
Add a new state, returns new name.
EST_WFST_State * state_non_const(int i)
Return internal state information (non-const)
EST_WFST_Transition * find_transition(int state, int in, int out) const
Find (first) transition given in and out symbols.
int in_symbol(const EST_String &s) const
Map input symbol to input alphabet index.
EST_write_status save(const EST_String &filename, const EST_String type="ascii")
?
const EST_WFST_State * state(int i) const
Return internal state information.
void stop_cumulate()
Stop cumulation and calculate probabilities on transitions.
const EST_Discrete & in_symbols() const
Accessing the input alphabet.