// Forward declarations (remaining parameters elided in this listing)
static void load_wstream(const EST_String &filename, /* ... */);
static void load_given(const EST_String &filename, /* ... */);
static double find_gram_prob(EST_VTPath *p, int *state);
static double find_extra_gram_prob(EST_VTPath *p, int *state, int time);
static int is_a_special(const EST_String &s, int &val);

// Global options and their defaults
static int max_history = 0;
static EST_String pstring = SENTENCE_START_MARKER;
static EST_String ppstring = SENTENCE_END_MARKER;
static float lm_scale = 1.0;
static float ob_scale = 1.0;
static float ob_scale2 = 1.0;
static float ob_beam = -1;
static int n_beam = -1;
static bool trace_on = FALSE;
static double ob_log_prob_floor = SAFE_LOG_ZERO;
static double ob_log_prob_floor2 = SAFE_LOG_ZERO;
static double lm_log_prob_floor = SAFE_LOG_ZERO;
int btest_debug = FALSE;
int using_given = FALSE;
int take_logs = FALSE;
    // Usage / help text for the command-line options
    EST_String("[observations file] -o [output file]\n")+
    "Summary: find the most likely path through a sequence of\n"+
    "         observations, constrained by a language model.\n"+
    "-ngram <string>     Grammar file, required\n"+
    "-given <string>     ngram left contexts, per frame\n"+
    "-vocab <string>     File with the names of the vocabulary; this must have\n"+
    "                    the same number of entries as the width of the observations, required\n"+
    "-ob_type <string>   Observation type: \"probs\" or \"logs\" (default is \"logs\")\n"+
    "\nFloor values and scaling (scaling is applied after the floor value)\n"+
    "-lm_floor <float>   LM floor probability\n"+
    "-lm_scale <float>   LM scale factor (applied to the log prob)\n"+
    "-ob_floor <float>   Observation floor probability\n"+
    "-ob_scale <float>   Observation scale factor (applied to prob or log prob, depending on -ob_type)\n\n"+
    "-prev_tag <string>\n"+
    "                    tag before sentence start\n"+
    "-prev_prev_tag <string>\n"+
    "                    all words before 'prev_tag'\n"+
    "-last_tag <string>\n"+
    "                    tag after sentence end\n"+
    "-default_tags       use default tags of "+SENTENCE_START_MARKER+","+
    SENTENCE_END_MARKER+" and "+SENTENCE_END_MARKER+"\n"+
    "-observes2 <string> second observations (overlays first, ob_type must be the same)\n"+
    "-ob_floor2 <float>  floor probability for the second observations\n"+
    "-ob_scale2 <float>  scale factor for the second observations\n\n"+
    "-ob_prune <float>   observation pruning beam width (log probability)\n"+
    "-n_prune <int>      top-n pruning of observations\n"+
    "-prune <float>      pruning beam width (log probability)\n"+
    "-trace              show details of the search as it proceeds\n",
    // Command-line sanity checks
    if (files.length() != 1)
    {
        cerr << ": you must give exactly one observations file on the command line";
        cerr << " (use -observes2 for optional second observations)" << endl;
        exit(-1);   // assumed: the elided lines abort on this error
    }
    if (al.present("-ngram"))
        ngram.load(al.val("-ngram"));

    if (!al.present("-vocab"))
    {
        cerr << "You must provide a vocabulary file !" << endl;
        exit(-1);   // assumed: the elided lines abort on this error
    }
    load_wstream(files.first(), al.val("-vocab"), wstream, observations);

    if (al.present("-observes2"))
        load_wstream(al.val("-observes2"), al.val("-vocab"), wstream, observations2);
    if (al.present("-given"))
        load_given(al.val("-given"), ngram.order());

    if (al.present("-lm_scale"))
        lm_scale = al.fval("-lm_scale");

    if (al.present("-ob_scale"))
        ob_scale = al.fval("-ob_scale");

    if (al.present("-ob_scale2"))
        ob_scale2 = al.fval("-ob_scale2");

    if (al.present("-prev_tag"))
        pstring = al.val("-prev_tag");
    if (al.present("-prev_prev_tag"))
        ppstring = al.val("-prev_prev_tag");

    // Pruning options
    if (al.present("-prune"))
        beam = al.fval("-prune");

    if (al.present("-ob_prune"))
        ob_beam = al.fval("-ob_prune");

    if (al.present("-n_prune"))
    {
        n_beam = al.ival("-n_prune");
        if (n_beam <= 0)    // guard reconstructed: warn about a useless top-n value
        {
            cerr << "WARNING : " << n_beam;
            cerr << " is not a reasonable value for -n_prune !" << endl;
        }
    }
    if (al.present("-trace"))
        trace_on = TRUE;    // assumed: the elided body switches tracing on

    // Floor options: each probability is range-checked and then converted to a
    // log floor (assignments and range checks reconstructed from the error messages)
    if (al.present("-lm_floor"))
    {
        floor = al.fval("-lm_floor");
        if (floor < 0)
        {
            cerr << "Error : LM floor probability is negative !" << endl;
            exit(-1);
        }
        else if (floor > 1)
        {
            cerr << "Error : LM floor probability > 1 " << endl;
            exit(-1);
        }
        lm_log_prob_floor = safe_log(floor);
    }

    if (al.present("-ob_floor"))
    {
        floor = al.fval("-ob_floor");
        if (floor < 0)
        {
            cerr << "Error : Observation floor probability is negative !" << endl;
            exit(-1);
        }
        else if (floor > 1)
        {
            cerr << "Error : Observation floor probability > 1 " << endl;
            exit(-1);
        }
        ob_log_prob_floor = safe_log(floor);
    }

    if (al.present("-ob_floor2"))
    {
        floor = al.fval("-ob_floor2");
        if (floor < 0)
        {
            cerr << "Error : Observation2 floor probability is negative !" << endl;
            exit(-1);
        }
        else if (floor > 1)
        {
            cerr << "Error : Observation2 floor probability > 1 " << endl;
            exit(-1);
        }
        ob_log_prob_floor2 = safe_log(floor);
    }
    if (al.present("-ob_type"))
    {
        if (al.val("-ob_type") == "logs")
            take_logs = FALSE;  // assumed from the defaults: values are already log probs
        else if (al.val("-ob_type") == "probs")
            take_logs = TRUE;   // assumed: logs are taken when scoring
        else
        {
            cerr << "\"" << al.val("-ob_type")
                 << "\" is not a valid ob_type : try \"logs\" or \"probs\"" << endl;
            exit(-1);
        }
    }

    // ... (the Viterbi search is run here; see the search driver below)

    // reported when the search cannot find any complete path:
    cerr << "No path could be found." << endl;
    // Output: open the output file (the matching "if" branch is elided here)
    else if ((fd = fopen(out_file, "wb")) == NULL)
    {
        cerr << "can't open \"" << out_file << "\" for output" << endl;
        exit(-1);
    }

    // write the best word and its score for every frame
    for (s = wstream.head(); s != 0; s = inext(s))
    {
        predict = s->f("best").string();
        pscore = s->f("best_score");
        // ... (each prediction/score pair is written to fd)
    }
    // Search driver
    states = ngram.num_states();

    // vc is presumably the Viterbi decoder built from the candidate and path
    // functions below (its declaration is elided in this listing)
    if ((beam > 0) || (ob_beam > 0))
        vc.set_pruning_parameters(beam, ob_beam);

    if (trace_on)   // guard assumed: message only under -trace
        cerr << "Starting Viterbi search..." << endl;

    // ... (the decoder is initialised and run)
    return vc.result("best");
static void load_wstream(const EST_String &filename, /* ... */)
{
    // ...
    if (obs.load(filename, 0.10) != 0)
    {
        cerr << "can't find observations file \"" << filename << "\"" << endl;
        exit(-1);
    }

    if (vocab.length() != obs.num_channels())
    {
        cerr << "Number in vocab (" << vocab.length()
             << ") not equal to observation's width ("
             << obs.num_channels() << ")" << endl;
        exit(-1);
    }

    // one word item per observation frame, named by its frame index
    for (i = 0; i < obs.num_frames(); i++)
        add_word(w, itoString(i), i);
}
static void load_given(const EST_String &filename, /* ... */)
{
    // ...
    if (load_TList_of_StrVector(given, filename, ngram_order - 1) != 0)
    {
        cerr << "can't load given file \"" << filename << "\"" << endl;
        exit(-1);
    }

    // record the deepest history required by any special (negative index) entry
    for (p = given.head(); p; p = p->next())
        for (i = 0; i < given(p).length(); i++)
            if (is_a_special(given(p)(i), j) && (-j > max_history))
                max_history = -j;   // update assumed from the condition above
}
// From the vocabulary loader: one entry per token in the vocab file
        vocab.append(ts.get().string());

// From add_word: each item is named by its word and records its frame position
    item->set_name(word);
    item->set("pos", pos);
// Candidate scoring: one candidate per vocabulary entry for each frame
    double prob = 1.0, prob2 = 1.0;
    // ...
    for (i = 0, p = vocab.head(); i < observations.num_channels(); i++, p = p->next())
    {
        prob = observations.a(observe, i);
        // ...
        // -ob_type "probs": take logs, then floor
        prob = safe_log10(prob);
        if (prob < ob_log_prob_floor)
            prob = ob_log_prob_floor;
        // ...
        if (prob2 < ob_log_prob_floor2)
            prob2 = ob_log_prob_floor2;
        // ...
        // -ob_type "logs": the values are already log probs, just floor them
        if (prob < ob_log_prob_floor)
            prob = ob_log_prob_floor;
        if ((num_obs == 2) && (prob2 < ob_log_prob_floor2))
            prob2 = ob_log_prob_floor2;
        // ...
        c->score = prob + prob2;
    }
    // ...
    top_n_candidates(all_c);    // apply -n_prune to the candidate list
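// The floors applied above are paired with the -ob_scale factors elsewhere in
// this file (the scaling lines fall outside this listing); the usage text
// states that scaling is applied after the floor.  A minimal sketch of that
// order of operations for the "logs" case -- the helper name is hypothetical:

#include <algorithm>

static double floor_then_scale(double log_prob,   // already a log probability
                               double log_floor,  // e.g. ob_log_prob_floor
                               double scale)      // e.g. ob_scale
{
    double lp = std::max(log_prob, log_floor);    // clamp to the floor first ...
    return lp * scale;                            // ... then apply the scale
}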
// Path extension: combine the candidate's observation score with the LM score
    if (using_given)    // condition assumed: -given supplies per-frame contexts
        prob = find_extra_gram_prob(np, &np->state, c->s->f("pos"));
    else
        prob = find_gram_prob(np, &np->state);

    lprob = safe_log10(prob);
    if (lprob < lm_log_prob_floor)
        lprob = lm_log_prob_floor;
    // ...
    np->f.set("lscore", (c->score + lprob));
    // ...
    np->score = (c->score + lprob) + p->score;
static double find_gram_prob(EST_VTPath *p, int *state)
{
    double prob = 0.0, nprob;
    // ...
    // walk back along the path to fill the (order-1)-word history window
    for (pp = p->from, i = ngram.order() - 2; i >= 0; i--)
    {
        // ...
    }
    // ...
    // shift the window along by one word for the next prediction
    for (i = 0; i < ngram.order() - 1; i++)
    {
        // ...
    }
}
static double find_extra_gram_prob(EST_VTPath *p, int *state, int time)
{
    double prob = 0.0, nprob;
    // ...
    for (i = history.length() - 1; i > 0; i--)
    {
        // ...
    }
    // ...
    for (pp = p->from, i = 0; i < history.length(); i++)
    {
        // ...
    }
    // ...
    if (time >= given.length())
    {
        // ... (no given context for this frame)
    }
    // ...
    for (i = 0; i < ngram.order() - 1; i++)
    {
        // ...
        if (is_a_special((*this_g)(i), j))
        {
            // ...
        }
    }
}
static int is_a_special(const EST_String &s, int &val)
{
    // ... (body elided in this listing)
}

// From top_n_candidates: prune the candidate list to the n_beam best entries
    for (i = 0; i < n_beam; i++)
    {
        // ...
    }
    for (p = all_c; p != NULL; q = p, p = p->next)
    {
        // ...
    }