Edinburgh Speech Tools 2.4-release
 
Loading...
Searching...
No Matches
sig2fv_main.cc
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1995,1996 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Authors: Paul Taylor and Simon King */
34/* Date : April 1995 */
35/*-----------------------------------------------------------------------*/
36/* Generate feature vectors */
37/* */
38/*=======================================================================*/
39
40#include <cstdlib>
41#include "EST_speech_class.h"
42#include "EST_string_aux.h"
43#include "EST_cmd_line.h"
44#include "EST_cmd_line_options.h"
45#include "sigpr/EST_sigpr_utt.h"
46#include "sigpr/EST_filter.h"
47
48#define EPSILON (0.0001)
49
50#define DEFAULT_FRAME_SIZE 0.01
51#define DEFAULT_FRAME_FACTOR 2.0
52#define DEFAULT_LPC_ORDER 16
53#define DEFAULT_REF_ORDER 16
54#define DEFAULT_CEP_ORDER 12
55#define DEFAULT_FBANK_ORDER 20
56#define DEFAULT_MELCEP_ORDER 12
57#define DEFAULT_WINDOW "hamming"
58#define DEFAULT_PREEMPH 0
59#define DEFAULT_LIFTER 0
60
61
62// sane values for pitchmarks (in seconds)
63
64#define MINIMUM_PITCH_PERIOD (0.0033) // 300 hz
65#define MAXIMUM_PITCH_PERIOD (0.02) // 50 Hz
66#define DEFAULT_PITCH_PERIOD (0.01) // 100 Hz
67
68void calculate_orders(EST_StrList &clist, EST_IList &olist,
69 EST_Option &op);
70
71void add_channels_to_map(EST_StrList &map, EST_StrList &types,
72 EST_Features &op, int order);
73
74void set_options(EST_Features &op, EST_Option &al);
75
76EST_String sigpr_options_supported(void)
77{
78 return
79 EST_String("")+
80 " lpc linear predictive coding\n"
81 " cep cepstrum coding from lpc coefficients\n"
82 " melcep Mel scale cepstrum coding via fbank\n"
83 " fbank Mel scale log filterbank analysis\n"
84 " lsf line spectral frequencies\n"
85 " ref Linear prediction reflection coefficients\n"
86 " power\n"
87 " f0\n"
88 " energy: root mean square energy\n";
89};
90
91
92
93/** @name <command>sig2fv</command> <emphasis>Generate signal processing coefficients from waveforms</emphasis>
94 * @id sigfv-manual
95 * @toc
96 */
97
98//@{
99
100/**@name Synopsis
101 */
102//@{
103
104//@synopsis
105
106/**
sig2fv performs signal processing analysis on speech waveforms and
produces feature vectors.
109The following types of analysis are provided:
110
111<itemizedlist>
112<listitem><para>Linear prediction (LPC)</para></listitem>
113<listitem><para>Cepstrum coding from lpc coefficients</para></listitem>
114<listitem><para>Mel scale cepstrum coding via fbank</para></listitem>
115<listitem><para>Mel scale log filterbank analysis</para></listitem>
116<listitem><para>Line spectral frequencies</para></listitem>
117<listitem><para>Linear prediction reflection coefficients</para></listitem>
118<listitem><para>Root mean square energy</para></listitem>
119<listitem><para>Power</para></listitem>
120<listitem><para>fundamental frequency (pitch)</para></listitem>
121<listitem><para>calculation of delta and acceleration coefficients of all of the
122above</para></listitem>
123</itemizedlist>
124
125The -coefs option is used to specify a list of the names of what sort
126of basic processing is required, and -delta and -acc are used for
127delta and acceleration coefficients respectively.
128
129*/
130
131//@}
132
133/**@name Options
134 */
135//@{
136
137//@options
138
139//@}
140
141
142int main(int argc, char *argv[])
143{
144 EST_String out_file("-");
147 EST_Features op;
152
153 parse_command_line
154 (argc, argv,
155 EST_String("[input file] -o [output file]\n")+
156 "Summary: generate acoustic feature vectors for a waveform file \n"
157 "use \"-\" to make input and output files stdin/out \n"
158 "-h Options help \n\n" +
159 options_wave_input() +
160 options_track_output() + " \n"
161 "-shift <float> frame spacing in seconds for fixed frame analysis. This \n"
162 " doesn't have to be the same as the output file spacing - the \n"
163 " S option can be used to resample the track before saving \n"
164 " default: "+ftoString(DEFAULT_FRAME_SIZE) +"\n\n"
165 "-factor <float> Frames lengths will be FACTOR times the \n"
166 " local pitch period. \n"
167 " default: "+ftoString(DEFAULT_FRAME_FACTOR) +"\n\n"
168 "-pm <ifile> Pitch mark file name. This is used to \n"
169 " specify the positions of the analysis frames for pitch \n"
170 " synchronous analysis. Pitchmark files are just standard \n"
171 " track files, but the channel information is ignored and \n"
172 " only the time positions are used\n"
173 "-size <float> If specified with pm, size is used as the \n"
174 " fixed window size (times factor) rather than size within \n"
175 " each the pms.\n\n"
176
177 "-coefs <string> list of basic types of processing required. \n"
178 " Permissible types are: \n" + sigpr_options_supported()+" \n"
179 "-delta <string> list of delta types of processing required. Basic \n"
180 " processing does not need to be specified for this option to work. \n"
181 " Permissible types are: \n" + sigpr_options_supported()+" \n"
182 "-acc <string> list of acceleration (delta delta) processing \n"
183 " required. Basic processing does not need to be specified for \n"
184 " this option to work. \n"
185 " Permissible types are: \n"
186 + sigpr_options_supported()+"\n"
187 "-window_type <string> Type of window used on waveform. \n"
188 " Permissible types are: \n" +
190 " default: \"DEFAULT_WINDOW\"\n\n"
191 "-lpc_order <int> Order of lpc analysis. \n\n"
192 "-ref_order <int> Order of lpc reflection coefficient analysis. \n\n"
193 "-cep_order <int> Order of lpc cepstral analysis.\n\n"
194 "-melcep_order <int> Order of Mel cepstral analysis.\n\n"
195 "-fbank_order <int> Order of filter bank analysis.\n\n"
196 "-preemph <float> Perform pre-emphasis with this factor.\n\n"
197 "-lifter <float> lifter coefficient.\n\n"
198 "-usepower use power rather than energy in filter bank \n"
199 " analysis\n\n"+
200 "-include_c0 include cepstral coefficient 0\n\n"
201 "-order <string> order of analyses\n", files, al);
202
203 out_file = al.present("-o") ? al.val("-o") : (EST_String)"-";
204 set_options(op, al);
205
206 StringtoStrList(al.val("-coefs"), coef_list);
207 StringtoStrList(al.val("-delta"), delta_list);
208 StringtoStrList(al.val("-acc"), acc_list);
209
210 StringtoStrList(al.val("-order"), tlist);
211 StrListtoIList(tlist, olist);
212
213 if (read_wave(sig, files.first(), al) != read_ok)
214 exit(-1);
215
216 // allocate and fill time axis
217 if (al.present("-pm"))
218 {
219 if (read_track(full, al.val("-pm"), al))
220 exit(1);
221 }
222 else
223 {
224 full.resize((int)ceil(sig.end() / op.F("frame_shift")), 0);
225 full.fill_time(op.F("frame_shift"));
226 }
227
228 // allocate channels
229 add_channels_to_map(map, coef_list, op, 0);
230 add_channels_to_map(map, delta_list, op, 1);
231 add_channels_to_map(map, acc_list, op, 2);
232
233 //cerr << "MAP " << map << endl;
234
235 full.resize(EST_CURRENT, map);
236
237 if (al.present("-preemph"))
238 pre_emphasis(sig, al.fval("-preemph"));
239
240 if(al.present("-usepower"))
241 cerr << "sig2fv: -usepower currently not supported" << endl;
242
243 sigpr_base(sig, full, op, coef_list);
244 sigpr_delta(sig, full, op, delta_list);
245 sigpr_acc(sig, full, op, acc_list);
246
247 if (al.present("-S"))
248 {
249 cout << "-S " << al.fval("-S") << endl;
250 full.sample(al.fval("-S"));
251 }
252
253 if (full.save(out_file, al.val("-otype", 0)) != write_ok)
254 {
255 cerr << "sig2fv: failed to write output to \"" << out_file
256 << "\"" << endl;
257 exit(-1);
258 }
259 return 0;
260}
261
262
263
264void calculate_orders(EST_StrList &clist, EST_IList &olist,
265 EST_Option &op)
266{
267 EST_Litem *c, *o;
268 EST_String k;
269 int v;
270
271 for (c = clist.head(), o = olist.head(); c && o; c= c->next(), o = o->next())
272 {
273 k = clist(c) + "_order";
274 v = olist(o);
275 op.override_ival(k, v);
276 }
277}
278
279void set_options(EST_Features &op, EST_Option &al)
280{
281 op.set("frame_shift", DEFAULT_FRAME_SIZE);
282 op.set("frame_factor", DEFAULT_FRAME_FACTOR);
283 op.set("window_type", DEFAULT_WINDOW);
284
285 op.set("preemph", DEFAULT_PREEMPH);
286 op.set("lifter", DEFAULT_LIFTER);
287
288 op.set("lpc_order", DEFAULT_LPC_ORDER);
289 op.set("ref_order", DEFAULT_REF_ORDER);
290 op.set("cep_order", DEFAULT_CEP_ORDER);
291 op.set("fbank_order", DEFAULT_FBANK_ORDER);
292 op.set("melcep_order", DEFAULT_MELCEP_ORDER);
293
294 op.set("max_period", MAXIMUM_PITCH_PERIOD);
295 op.set("min_period", MINIMUM_PITCH_PERIOD);
296 op.set("def_period", DEFAULT_PITCH_PERIOD);
297
298 if (al.present("-max_period"))
299 op.set("max_period", al.fval("-max_period", 0));
300 if (al.present("-min_period"))
301 op.set("min_period", al.fval("-min_period", 0));
302 if (al.present("-def_period"))
303 op.set("def_period", al.fval("-def_period", 0));
304
305 if (al.present("-window_type"))
306 op.set("window_type", al.sval("-window_type", 1));
307
308 if (al.present("-shift"))
309 op.set("frame_shift", al.fval("-shift", 1));
310 if (al.present("-factor"))
311 op.set("frame_factor", al.fval("-factor", 1));
312 if (al.present("-size"))
313 op.set("frame_factor", op.F("frame_factor")*-1.0*al.fval("-size"));
314 if (al.present("-length"))
315 op.set("frame_factor",
316 al.fval("-length", est_errors_allowed)/op.F("frame_shift",est_errors_allowed));
317
318 if (al.present("-preemph"))
319 op.set("preemph", al.fval("-preemph", 1));
320 if (al.present("-lifter"))
321 op.set("lifter", al.fval("-lifter", 1));
322
323 if (al.present("-lpc_order"))
324 op.set("lpc_order", al.ival("-lpc_order", 1));
325 if (al.present("-ref_order"))
326 op.set("ref_order", al.ival("-ref_order", 1));
327 if (al.present("-cep_order"))
328 op.set("cep_order", al.ival("-cep_order", 1));
329 if (al.present("-fbank_order"))
330 op.set("fbank_order", al.ival("-fbank_order", 1));
331 if (al.present("-melcep_order"))
332 op.set("melcep_order", al.ival("-melcep_order", 1));
333
334 if (al.present("-usepower"))
335 op.set("usepower", al.val("-usepower", 1));
336
337 if (al.present("-include_c0"))
338 op.set("include_c0", al.val("-include_c0", 1));
339
340}
341
342/**@name Examples
343
344
345Fixed frame basic linear prediction:
346
347To produce a set of linear prediction coefficients at every 10ms, using
348pre-emphasis and saving in EST format:
349
350<para>
351<screen>
352$ sig2fv kdt_010.wav -o kdt_010.lpc -coefs "lpc" -otype est -shift 0.01 -preemph 0.5
353</screen>
354</para>
355<formalpara><title>
Pitch Synchronous linear prediction</title><para>The following uses the set of pitchmarks
in kdt_010.pm as the centres of the analysis windows.
358</para>
359</formalpara>
360
361<para>
362<screen>
363$ sig2fv kdt_010.wav -pm kdt_010.pm -o kdt_010.lpc -coefs "lpc" -otype est -shift 0.01 -preemph 0.5
364</screen>
365</para>
366
367<para>
368F0, Linear prediction and cepstral coefficients:
369
370<screen>
371$ sig2fv kdt_010.wav -o kdt_010.lpc -coefs "f0 lpc cep" -otype est -shift 0.01
372</screen>
373
374Note that pitchtracking can also be done with the
375<command>pda</command> program. Both use the same underlying
376technique, but the pda program offers much finer control over the
377pitch track specific processing parameters.
378
379</para>
380
<para>Energy, Linear Prediction and Cepstral coefficients, with a 10ms frame shift
during analysis but a 5ms frame shift in the output file:
</para>
<para>
385<screen>
386$ sig2fv kdt_010.wav -o kdt_010.lpc -coefs "f0 lpc cep" -otype est -S 0.005
387 -shift 0.01
388</screen>
389</para>
390
<para>Delta and acc coefficients can be calculated even if their base form is not
required. This produces normal energy coefficients and cepstral delta coefficients:
</para>
<para>
395<screen>
396$ sig2fv ../kdt_010.wav -o kdt_010.lpc -coefs "energy" -delta "cep" -otype est
397</screen>
398</para>
399
<para>Mel-scaled cepstra, Delta and acc coefficients, as is common in speech
recognition:
</para>
<para>
<screen>
$ sig2fv ../kdt_010.wav -o kdt_010.lpc -coefs "melcep" -delta "melcep" -acc "melcep" -otype est -preemph 0.96
</screen>
</para>
407*/
408//@{
409//@}
410
411
412
413//@}
void set(const EST_String &name, int ival)
const float F(const EST_String &path) const
int override_ival(const EST_String rkey, const int rval)
add to end of list or overwrite. If rval is empty, do nothing
Definition EST_Option.cc:66
static EST_String options_supported(void)
Return a paragraph describing the available windows.