Skip to content

Commit 0ddf4bb

Browse files
committed
trunk: updating online-nnet2-decoding setup to allow for downweighting of silence in the stats for iVector estimation.
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4972 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
1 parent 9045a18 commit 0ddf4bb

25 files changed

Lines changed: 813 additions & 279 deletions

egs/librispeech/s5/local/online/run_nnet2_ms.sh

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ if [ $stage -le 13 ]; then
168168
done
169169
fi
170170

171-
#exit 0;
171+
exit 0;
172172
###### Comment out the "exit 0" above to run the multi-threaded decoding. #####
173173

174174
if [ $stage -le 14 ]; then
@@ -189,4 +189,14 @@ if [ $stage -le 15 ]; then
189189
${dir}_online/decode_pp_${test}_tgsmall_utt_threaded_ep || exit 1;
190190
fi
191191

192+
if [ $stage -le 16 ]; then
193+
# Demonstrate the multi-threaded decoding with silence excluded
194+
# from iVector estimation.
195+
test=dev_clean
196+
steps/online/nnet2/decode.sh --threaded true --silence-weight 0.0 \
197+
--config conf/decode.config --cmd "$decode_cmd" --nj 30 \
198+
--per-utt true exp/tri6b/graph_pp_tgsmall data/$test \
199+
${dir}_online/decode_pp_${test}_tgsmall_utt_threaded_sil0.0 || exit 1;
200+
fi
201+
192202
exit 0;

egs/wsj/s5/steps/nnet2/decode.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do
6666
done
6767

6868
sdata=$data/split$nj;
69-
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
69+
cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1;
7070
thread_string=
7171
[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
7272

egs/wsj/s5/steps/online/nnet2/decode.sh

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ do_endpointing=false
2020
do_speex_compressing=false
2121
scoring_opts=
2222
skip_scoring=false
23+
silence_weight=1.0 # set this to a value less than 1 (e.g. 0) to enable silence weighting.
24+
max_state_duration=40 # This only has an effect if you are doing silence
25+
# weighting. This default is probably reasonable. transition-ids repeated
26+
# more than this many times in an alignment are treated as silence.
2327
iter=final
2428
# End configuration section.
2529

@@ -94,6 +98,12 @@ if $do_endpointing; then
9498
wav_rspecifier="$wav_rspecifier extend-wav-with-silence ark:- ark:- |"
9599
fi
96100

101+
if [ "$silence_weight" != "1.0" ]; then
102+
silphones=$(cat $graphdir/phones/silence.csl) || exit 1
103+
silence_weighting_opts="--ivector-silence-weighting.max-state-duration=$max_state_duration --ivector-silence-weighting.silence_phones=$silphones --ivector-silence-weighting.silence-weight=$silence_weight"
104+
else
105+
silence_weighting_opts=
106+
fi
97107

98108

99109
if $threaded; then
@@ -110,7 +120,7 @@ fi
110120

111121
if [ $stage -le 0 ]; then
112122
$cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
113-
$decoder $opts --do-endpointing=$do_endpointing \
123+
$decoder $opts $silence_weighting_opts --do-endpointing=$do_endpointing \
114124
--config=$srcdir/conf/online_nnet2_decoding.conf \
115125
--max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
116126
--acoustic-scale=$acwt --word-symbol-table=$graphdir/words.txt \

egs/wsj/s5/utils/split_data.sh

Lines changed: 39 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,10 @@ if [ "$1" == "--per-utt" ]; then
2222
fi
2323

2424
if [ $# != 2 ]; then
25-
echo "Usage: split_data.sh <data-dir> <num-to-split>"
25+
echo "Usage: split_data.sh [--per-utt] <data-dir> <num-to-split>"
2626
echo "This script will not split the data-dir if it detects that the output is newer than the input."
27+
echo "By default it splits per speaker (so each speaker is in only one split dir),"
28+
echo "but with the --per-utt option it will ignore the speaker information while splitting."
2729
exit 1
2830
fi
2931

@@ -45,13 +47,11 @@ nu=`cat $data/utt2spk | wc -l`
4547
nf=`cat $data/feats.scp 2>/dev/null | wc -l`
4648
nt=`cat $data/text 2>/dev/null | wc -l` # take it as zero if no such file
4749
if [ -f $data/feats.scp ] && [ $nu -ne $nf ]; then
48-
echo "** split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf); this script "
49-
echo "** may produce incorrectly split data."
50+
echo "** split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf); you can "
5051
echo "** use utils/fix_data_dir.sh $data to fix this."
5152
fi
5253
if [ -f $data/text ] && [ $nu -ne $nt ]; then
53-
echo "** split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt); this script "
54-
echo "** may produce incorrectly split data."
54+
echo "** split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt); you can "
5555
echo "** use utils/fix_data_dir.sh to fix this."
5656
fi
5757

@@ -74,11 +74,7 @@ fi
7474

7575
for n in `seq $numsplit`; do
7676
mkdir -p $data/split$numsplit/$n
77-
feats="$feats $data/split$numsplit/$n/feats.scp"
78-
vads="$vads $data/split$numsplit/$n/vad.scp"
79-
texts="$texts $data/split$numsplit/$n/text"
8077
utt2spks="$utt2spks $data/split$numsplit/$n/utt2spk"
81-
utt2langs="$utt2langs $data/split$numsplit/$n/utt2lang"
8278
done
8379

8480
if $split_per_spk; then
@@ -87,37 +83,51 @@ else
8783
utt2spk_opt=
8884
fi
8985

90-
utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1
86+
# If lockfile is not installed, just don't lock it. It's not a big deal.
87+
which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock
9188

92-
[ -f $data/feats.scp ] && utils/split_scp.pl $utt2spk_opt $data/feats.scp $feats
89+
utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1
9390

94-
[ -f $data/text ] && utils/split_scp.pl $utt2spk_opt $data/text $texts
91+
for n in `seq $numsplit`; do
92+
dsn=$data/split$numsplit/$n
93+
utils/utt2spk_to_spk2utt.pl $dsn/utt2spk > $dsn/spk2utt || exit 1;
94+
done
9595

96-
[ -f $data/vad.scp ] && utils/split_scp.pl $utt2spk_opt $data/vad.scp $vads
96+
maybe_wav_scp=
97+
if [ ! -f $data/segments ]; then
98+
maybe_wav_scp=wav.scp # If there is no segments file, then wav file is
99+
# indexed per utt.
100+
fi
97101

98-
[ -f $data/utt2lang ] && utils/split_scp.pl $utt2spk_opt $data/utt2lang $utt2langs
102+
# split some things that are indexed by utterance.
103+
for f in feats.scp text vad.scp utt2lang $maybe_wav_scp; do
104+
if [ -f $data/$f ]; then
105+
utils/filter_scps.pl JOB=1:$numsplit \
106+
$data/split$numsplit/JOB/utt2spk $data/$f $data/split$numsplit/JOB/$f || exit 1;
107+
fi
108+
done
99109

100-
# If lockfile is not installed, just don't lock it. It's not a big deal.
101-
which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock
110+
# split some things that are indexed by speaker
111+
for f in spk2gender spk2warp cmvn.scp; do
112+
if [ -f $data/$f ]; then
113+
utils/filter_scps.pl JOB=1:$numsplit \
114+
$data/split$numsplit/JOB/spk2utt $data/$f $data/split$numsplit/JOB/$f || exit 1;
115+
fi
116+
done
102117

103118
for n in `seq $numsplit`; do
104119
dsn=$data/split$numsplit/$n
105-
utils/utt2spk_to_spk2utt.pl $dsn/utt2spk > $dsn/spk2utt || exit 1;
106-
for f in spk2gender spk2warp cmvn.scp; do
107-
[ -f $data/$f ] && \
108-
utils/filter_scp.pl $dsn/spk2utt $data/$f > $dsn/$f
109-
done
110120
if [ -f $data/segments ]; then
111121
utils/filter_scp.pl $dsn/utt2spk $data/segments > $dsn/segments
112-
awk '{print $2;}' $dsn/segments |sort|uniq > $data/tmp.reco # recording-ids.
113-
[ -f $data/reco2file_and_channel ] &&
114-
utils/filter_scp.pl $data/tmp.reco $data/reco2file_and_channel > $dsn/reco2file_and_channel
115-
[ -f $data/wav.scp ] && utils/filter_scp.pl $data/tmp.reco $data/wav.scp > $dsn/wav.scp
122+
awk '{print $2;}' $dsn/segments | sort | uniq > $data/tmp.reco # recording-ids.
123+
if [ -f $data/reco2file_and_channel ]; then
124+
utils/filter_scp.pl $data/tmp.reco $data/reco2file_and_channel > $dsn/reco2file_and_channel
125+
fi
126+
if [ -f $data/wav.scp ]; then
127+
utils/filter_scp.pl $data/tmp.reco $data/wav.scp >$dsn/wav.scp
128+
fi
116129
rm $data/tmp.reco
117-
else # else wav indexed by utterance -> filter on this.
118-
[ -f $data/wav.scp ] &&
119-
utils/filter_scp.pl $dsn/utt2spk $data/wav.scp > $dsn/wav.scp
120-
fi
130+
fi # else wav.scp is indexed by utterance and was already split above (see maybe_wav_scp).
121131
done
122132

123133
rm -f $data/.split_lock

src/Doxyfile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -669,13 +669,13 @@ HTML_HEADER = doc/header.html
669669
# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
670670
# The allowed range is 0 to 359.
671671

672-
HTML_COLORSTYLE_HUE = 26
672+
HTML_COLORSTYLE_HUE = 31
673673

674674
# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
675675
# the colors in the HTML output. For a value of 0 the output will use
676676
# grayscales only. A value of 255 will produce the most vivid colors.
677677

678-
HTML_COLORSTYLE_SAT = 80
678+
HTML_COLORSTYLE_SAT = 115
679679

680680
# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
681681
# the luminance component of the colors in the HTML output. Values below
@@ -684,7 +684,7 @@ HTML_COLORSTYLE_SAT = 80
684684
# so 80 represents a gamma of 0.8. The value 220 represents a gamma of 2.2,
685685
# and 100 does not change the gamma.
686686

687-
HTML_COLORSTYLE_GAMMA = 90
687+
HTML_COLORSTYLE_GAMMA = 80
688688

689689

690690

src/decoder/lattice-faster-online-decoder.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,15 @@ class LatticeFasterOnlineDecoder {
5050
typedef Arc::Label Label;
5151
typedef Arc::StateId StateId;
5252
typedef Arc::Weight Weight;
53+
5354
struct BestPathIterator {
5455
void *tok;
5556
int32 frame;
57+
// note, "frame" is the frame-index of the frame you'll get the
58+
// transition-id for next time, if you call TraceBackBestPath on this
59+
// iterator (assuming it's not an epsilon transition). Note that this
60+
// is one less than you might reasonably expect, e.g. it's -1 for
61+
// the nonemitting transitions before the first frame.
5662
BestPathIterator(void *t, int32 f): tok(t), frame(f) { }
5763
bool Done() { return tok == NULL; }
5864
};

src/doc/README

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,4 +56,6 @@ fi
5656
# moved the header.html to doc/ and edited it to include the following snippet,
5757
# and added it to the repo.
5858
#<link rel="icon" type="image/png" href="http://kaldi.sf.net/favicon.ico">
59+
# Also did the same with the stylesheet.
60+
5961

src/doc/dnn.dox

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,9 @@ namespace kaldi {
4949
be run in order to build the systems used for alignment.
5050

5151
Regarding which of the two setups you should use:
52-
- Karel's setup (nnet1) supports training on a single GPU card, which allows
52+
- Karel's setup (\ref dnn1 "nnet1") supports training on a single GPU card, which allows
5353
the implementation to be simpler and relatively easy to modify.
54-
- Dan's setup (nnet2) is more flexible in how
54+
- Dan's setup (\ref dnn2 "nnet2") is more flexible in how
5555
you can train: it supports using multiple GPUs, or multiple CPUs each with
5656
multiple threads. Using multiple GPUs is the recommended setup.
5757
They don't have to all be on the same machine. Both setups give commensurate results.

src/doc/header.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
<tbody>
2424
<tr style="height: 56px;">
2525
<!--BEGIN PROJECT_LOGO-->
26-
<td id="projectlogo"><img alt="Logo" src="$relpath$$projectlogo"/ style="padding: 4px 5px 1px 5px"></td>
26+
<td id="projectlogo"><img alt="Logo" src="$relpath$$projectlogo"/ style="padding: 3px 5px 1px 5px"></td>
2727
<!--END PROJECT_LOGO-->
2828
<!--BEGIN PROJECT_NAME-->
2929
<td style="padding-left: 0.5em;">

src/ivector/ivector-extractor.cc

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -534,7 +534,9 @@ void OnlineIvectorEstimationStats::AccStats(
534534
for (size_t idx = 0; idx < gauss_post.size(); idx++) {
535535
int32 g = gauss_post[idx].first;
536536
double weight = gauss_post[idx].second;
537-
KALDI_ASSERT(weight >= 0.0);
537+
// allow negative weights; it's needed in the online iVector extraction
538+
// with speech-silence detection based on decoder traceback (we subtract
539+
// stuff we previously added if the traceback changes).
538540
if (weight == 0.0)
539541
continue;
540542
linear_term_.AddMatVec(weight, extractor.Sigma_inv_M_[g], kTrans,
@@ -543,8 +545,9 @@ void OnlineIvectorEstimationStats::AccStats(
543545
quadratic_term_vec.AddVec(weight, U_g);
544546
tot_weight += weight;
545547
}
546-
if (max_count_ != 0.0) {
547-
// see comments in header RE max_count for explanation.
548+
if (max_count_ > 0.0) {
549+
// see comments in header RE max_count for explanation. It relates to
550+
// prior scaling when the count exceeds max_count_
548551
double old_num_frames = num_frames_,
549552
new_num_frames = num_frames_ + tot_weight;
550553
double old_prior_scale = std::max(old_num_frames, max_count_) / max_count_,

0 commit comments

Comments
 (0)