@@ -22,8 +22,10 @@ if [ "$1" == "--per-utt" ]; then
2222fi
2323
2424if [ $# != 2 ]; then
25- echo " Usage: split_data.sh <data-dir> <num-to-split>"
25+ echo " Usage: split_data.sh [--per-utt] <data-dir> <num-to-split>"
2626 echo " This script will not split the data-dir if it detects that the output is newer than the input."
27+ echo " By default it splits per speaker (so each speaker is in only one split dir),"
28+ echo " but with the --per-utt option it will ignore the speaker information while splitting."
2729 exit 1
2830fi
2931
@@ -45,13 +47,11 @@ nu=`cat $data/utt2spk | wc -l`
4547nf=` cat $data /feats.scp 2> /dev/null | wc -l`
4648nt=` cat $data /text 2> /dev/null | wc -l` # take it as zero if no such file
4749if [ -f $data /feats.scp ] && [ $nu -ne $nf ]; then
48- echo " ** split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu ,$nf ); this script "
49- echo " ** may produce incorrectly split data."
50+ echo " ** split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu ,$nf ); you can "
5051 echo " ** use utils/fix_data_dir.sh $data to fix this."
5152fi
5253if [ -f $data /text ] && [ $nu -ne $nt ]; then
53- echo " ** split_data.sh: warning, #lines is (utt2spk,text) is ($nu ,$nt ); this script "
54- echo " ** may produce incorrectly split data."
54+ echo " ** split_data.sh: warning, #lines is (utt2spk,text) is ($nu ,$nt ); you can "
5555 echo " ** use utils/fix_data_dir.sh to fix this."
5656fi
5757
7474
7575for n in ` seq $numsplit ` ; do
7676 mkdir -p $data /split$numsplit /$n
77- feats=" $feats $data /split$numsplit /$n /feats.scp"
78- vads=" $vads $data /split$numsplit /$n /vad.scp"
79- texts=" $texts $data /split$numsplit /$n /text"
8077 utt2spks=" $utt2spks $data /split$numsplit /$n /utt2spk"
81- utt2langs=" $utt2langs $data /split$numsplit /$n /utt2lang"
8278done
8379
8480if $split_per_spk ; then
8783 utt2spk_opt=
8884fi
8985
90- utils/split_scp.pl $utt2spk_opt $data /utt2spk $utt2spks || exit 1
86+ # If lockfile is not installed, just don't lock it. It's not a big deal.
87+ which lockfile >& /dev/null && lockfile -l 60 $data /.split_lock
9188
92- [ -f $data /feats.scp ] && utils/split_scp.pl $utt2spk_opt $data /feats.scp $feats
89+ utils/split_scp.pl $utt2spk_opt $data /utt2spk $utt2spks || exit 1
9390
94- [ -f $data /text ] && utils/split_scp.pl $utt2spk_opt $data /text $texts
91+ for n in ` seq $numsplit ` ; do
92+ dsn=$data /split$numsplit /$n
93+ utils/utt2spk_to_spk2utt.pl $dsn /utt2spk > $dsn /spk2utt || exit 1;
94+ done
9595
96- [ -f $data /vad.scp ] && utils/split_scp.pl $utt2spk_opt $data /vad.scp $vads
96+ maybe_wav_scp=
97+ if [ ! -f $data /segments ]; then
98+ maybe_wav_scp=wav.scp # If there is no segments file, then wav file is
99+ # indexed per utt.
100+ fi
97101
98- [ -f $data /utt2lang ] && utils/split_scp.pl $utt2spk_opt $data /utt2lang $utt2langs
102+ # split some things that are indexed by utterance.
103+ for f in feats.scp text vad.scp utt2lang $maybe_wav_scp ; do
104+ if [ -f $data /$f ]; then
105+ utils/filter_scps.pl JOB=1:$numsplit \
106+ $data /split$numsplit /JOB/utt2spk $data /$f $data /split$numsplit /JOB/$f || exit 1;
107+ fi
108+ done
99109
100- # If lockfile is not installed, just don't lock it. It's not a big deal.
101- which lockfile >& /dev/null && lockfile -l 60 $data /.split_lock
110+ # split some things that are indexed by speaker
111+ for f in spk2gender spk2warp cmvn.scp; do
112+ if [ -f $data /$f ]; then
113+ utils/filter_scps.pl JOB=1:$numsplit \
114+ $data /split$numsplit /JOB/spk2utt $data /$f $data /split$numsplit /JOB/$f || exit 1;
115+ fi
116+ done
102117
103118for n in ` seq $numsplit ` ; do
104119 dsn=$data /split$numsplit /$n
105- utils/utt2spk_to_spk2utt.pl $dsn /utt2spk > $dsn /spk2utt || exit 1;
106- for f in spk2gender spk2warp cmvn.scp; do
107- [ -f $data /$f ] && \
108- utils/filter_scp.pl $dsn /spk2utt $data /$f > $dsn /$f
109- done
110120 if [ -f $data /segments ]; then
111121 utils/filter_scp.pl $dsn /utt2spk $data /segments > $dsn /segments
112- awk ' {print $2;}' $dsn /segments | sort| uniq > $data /tmp.reco # recording-ids.
113- [ -f $data /reco2file_and_channel ] &&
114- utils/filter_scp.pl $data /tmp.reco $data /reco2file_and_channel > $dsn /reco2file_and_channel
115- [ -f $data /wav.scp ] && utils/filter_scp.pl $data /tmp.reco $data /wav.scp > $dsn /wav.scp
122+ awk ' {print $2;}' $dsn /segments | sort | uniq > $data /tmp.reco # recording-ids.
123+ if [ -f $data /reco2file_and_channel ]; then
124+ utils/filter_scp.pl $data /tmp.reco $data /reco2file_and_channel > $dsn /reco2file_and_channel
125+ fi
126+ if [ -f $data /wav.scp ]; then
127+ utils/filter_scp.pl $data /tmp.reco $data /wav.scp > $dsn /wav.scp
128+ fi
116129 rm $data /tmp.reco
117- else # else wav indexed by utterance -> filter on this.
118- [ -f $data /wav.scp ] &&
119- utils/filter_scp.pl $dsn /utt2spk $data /wav.scp > $dsn /wav.scp
120- fi
130+ fi # else it would have been handled above, see maybe_wav.
121131done
122132
123133rm -f $data /.split_lock
0 commit comments