First-Place Speech Recognition Recipe of the 2019 MagicSpeech Challenge: the Kaldi Pipeline

Key features of the baseline recipe:
1. Standard Kaldi GMM-HMM / DNN-HMM / LF-MMI training pipeline
2. A DNN-HMM (cross-entropy) model is used to generate alignments
Reference 1: https://www.bilibili.com/read/cv5090561/
Reference 2: https://github.com/pigzach/MagicSpeechASR
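
The full run.sh follows. A minimal invocation sketch (the corpus path below is an assumption; point it at the unpacked MagicSpeech/MagicData release):

  bash run.sh /path/to/magicdata

All stages run in order by default; pass --stage N (handled by utils/parse_options.sh) to resume from a given stage.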

#!/bin/bash
stage=0
train_nj=64
dict=conf/lexicon.txt

set -e
[ -f ./cmd.sh ] && . ./cmd.sh
[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh
if [ $# -ne 1 ];then
  echo "$0 <data-base>"
  exit 1;
fi

corpus=$1

echo "[ Stage 0] run.sh: Clone Text-norm ..."
if [ $stage -le 0 ];then
  [ -d chinese_text_normalization ] || \
    git clone https://github.com/speech-io/chinese_text_normalization.git
fi

echo "[ Stage 1 ] run.sh: Prepare lexicon ..."
if [ $stage -le 1 ];then
  local/prepare_dict.sh $dict data/local/dict
  # Phone Sets, questions, L compilation
  utils/prepare_lang.sh --position-dependent-phones true data/local/dict \
    "<UNK>" data/local/lang data/lang || exit 1;
fi

echo "[ Stage 2 ] run.sh: Prepare data ..."
if [ $stage -le 2 ];then
  local/make_scp.sh $corpus data/tmp || exit 1;
  local/prepare_data.sh data/tmp/train data/local/dict data/local/train data/train || exit 1;
  local/prepare_data.sh data/tmp/dev data/local/dict data/local/dev data/dev || exit 1;
  local/prepare_data.sh data/tmp/test data/local/dict data/local/test data/test || exit 1;
  rm -rf data/tmp
fi

# nj for dev and test
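# the number of decoding jobs cannot exceed the number of speakers, so use the
# spk2utt line count (one line per speaker) for the dev and test sets.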
dev_nj=$(wc -l data/dev/spk2utt | awk '{print $1}') || exit 1;
test_nj=$(wc -l data/test/spk2utt | awk '{print $1}') || exit 1;
# Now make MFCC features.
echo "[ Stage 3 ] run.sh: Make mfcc feature ..."
if [ $stage -le 3 ]; then
  # mfccdir should be some place with a largish disk where you
  # want to store MFCC features.
  for x in train dev; do
    steps/make_mfcc_pitch.sh --pitch-config conf/pitch.conf \
      --cmd "$train_cmd" --nj $train_nj \
      data/$x exp/make_mfcc/$x mfcc || exit 1;
    steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x mfcc || exit 1;
    utils/fix_data_dir.sh data/$x || exit 1;
  done
fi

echo "[ Stage 4 ] run.sh: Subset training data dir ..."
if [ $stage -le 4 ]; then
  # subset the training data for fast startup
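  # train_50k (50k utterances) is used for monophone training in stage 5,
  # train_100k for the first triphone pass in stage 6.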
  for x in 50 100; do
    utils/subset_data_dir.sh data/train ${x}000 data/train_${x}k
  done
fi

# mono
echo "[ Stage 5 ] run.sh: Monophone training ..."
if [ $stage -le 5 ]; then
  # training
  steps/train_mono.sh --cmd "$train_cmd" --nj $train_nj \
    data/train_50k data/lang exp/mono || exit 1;

  # alignment
  steps/align_si.sh --cmd "$train_cmd" --nj $train_nj \
    data/train_100k data/lang exp/mono exp/mono_ali || exit 1;
fi

# tri1
echo "[ Stage 6 ] run.sh: Triphone training ..."
if [ $stage -le 6 ]; then
  # training
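  # positional args of train_deltas.sh: <num-leaves> <tot-gauss>; the later
  # triphone stages grow both numbers.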
  steps/train_deltas.sh --cmd "$train_cmd" \
   4000 32000 data/train_100k data/lang exp/mono_ali exp/tri1 || exit 1;

  # alignment
  steps/align_si.sh --cmd "$train_cmd" --nj $train_nj \
    data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
fi

# tri2
echo "[ Stage 7 ] run.sh: Triphone training (more data) ..."
if [ $stage -le 7 ]; then
  # training
  steps/train_deltas.sh --cmd "$train_cmd" \
   7000 56000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1;

  # alignment
  steps/align_si.sh --cmd "$train_cmd" --nj $train_nj \
    data/train data/lang exp/tri2 exp/tri2_ali || exit 1;
fi

# tri3
echo "[ Stage 8 ] run.sh: Triphone lda+mllt training ..."
if [ $stage -le 8 ]; then
  # training [LDA+MLLT]
  steps/train_lda_mllt.sh --cmd "$train_cmd" \
   10000 80000 data/train data/lang exp/tri2_ali exp/tri3 || exit 1;

  # alignment
  steps/align_si.sh --cmd "$train_cmd" --nj $train_nj \
    data/train data/lang exp/tri3 exp/tri3_ali || exit 1;
fi


# tri4
echo "[ Stage 9 ] run.sh: Triphone sat+fmllr training ..."
if [ $stage -le 9 ];then
  steps/train_sat.sh  --cmd "$train_cmd" \
                      11500 120000 data/train data/lang exp/tri3_ali exp/tri4
  steps/align_fmllr.sh --cmd "$train_cmd" --nj $train_nj \
                       data/train data/lang exp/tri4 exp/tri4_ali
fi

# tri5
echo "[ Stage 10 ] run.sh: Triphone sat+fmllr training(more gaussian and leaves) ..."
if [ $stage -le 10 ];then
  steps/train_sat.sh  --cmd "$train_cmd" \
                      12000 200000 data/train data/lang exp/tri4_ali exp/tri5
  steps/align_fmllr.sh --cmd "$train_cmd" --nj $train_nj \
                       data/train data/lang exp/tri5 exp/tri5_ali
fi

echo "[ Stage 11 ] run.sh: Adjust lexicon probabilities ..."
if [ $stage -le 11 ]; then
  # Now we compute the pronunciation and silence probabilities from training data,
  # and re-create the lang directory.
  steps/get_prons.sh --cmd "$train_cmd" data/train data/lang exp/tri5
  utils/dict_dir_add_pronprobs.sh --max-normalize true \
                                  data/local/dict exp/tri5/pron_counts_nowb.txt exp/tri5/sil_counts_nowb.txt \
                                  exp/tri5/pron_bigram_counts_nowb.txt data/local/dict_sp

  utils/prepare_lang.sh --position-dependent-phones true data/local/dict_sp "<UNK>" data/local/lang_sp data/lang_sp
fi

echo "[ Stage 12 ] run.sh: Train language model and test our GMM model on dev set ..."
if [ $stage -le 12 ];then
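  # Train a 4-gram (mincount) LM on the training transcripts, compile G and the
  # decoding graph with the pron-prob lang dir from stage 11, and decode the dev
  # set with fMLLR (SAT) decoding as a GMM baseline.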
  local/train_lms.sh \
      data/local/dict/lexicon.txt \
      data/train/text \
      data/local/lm || exit 1;
  # G compilation, check LG composition
  utils/format_lm.sh data/lang_sp data/local/lm/4gram-mincount/lm_unpruned.gz \
    data/local/dict_sp/lexicon.txt data/lang_test_sp || exit 1;
  utils/mkgraph.sh data/lang_test_sp exp/tri5 exp/tri5/graph_sp
  steps/decode_fmllr.sh --nj $dev_nj --cmd "$decode_cmd" \
                        exp/tri5/graph_sp data/dev exp/tri5/decode_dev_sp
fi

echo "[ Stage 13 ] run.sh: Blance the data of training set and dev set, augment data by perturb speed and volume ..."
if [ $stage -le 13 ];then
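  # train_data_copyx3.sh (local script, not shown here) presumably duplicates the
  # clean training set three times so it balances the three speed-perturbed copies;
  # perturb_data_dir_speed_3way.sh adds 0.9x/1.0x/1.1x speed versions and
  # perturb_data_dir_volume.sh applies random volume scaling.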
  local/train_data_copyx3.sh data/train data/train_copyx3
  utils/data/perturb_data_dir_speed_3way.sh --always-include-prefix true data/train data/train_speed_hires
  utils/data/perturb_data_dir_volume.sh data/train_speed_hires
  utils/combine_data.sh data/train_aug data/train_copyx3 data/train_speed_hires

  utils/data/perturb_data_dir_speed_3way.sh --always-include-prefix true data/dev data/dev_aug
  utils/data/perturb_data_dir_volume.sh data/dev_aug
fi

echo "[ Stage 14 ] run.sh: Extract 40dims mfcc + 3dim pithc on the augment data ..."
if [ $stage -le 14 ];then
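  # 40-dim hires MFCC + 3-dim pitch = 43-dim features (matching input dim=43 in the
  # network configs below); the *_nopitch copies keep dims 0:39 for iVector
  # extraction, and train_dev_aug_16dims re-extracts the default 13-dim MFCC +
  # 3-dim pitch so the GMM (tri5) can align the augmented data in stage 16.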
  for x in train_aug dev_aug test; do
    steps/make_mfcc_pitch.sh --mfcc-config conf/mfcc_hires.conf --pitch-config conf/pitch.conf \
      --cmd "$train_cmd" --nj $train_nj \
      data/$x exp/make_mfcc_hires/$x mfcc_hires || exit 1;
    steps/compute_cmvn_stats.sh data/$x exp/make_mfcc_hires/$x mfcc_hires || exit 1;
    utils/fix_data_dir.sh data/$x || exit 1;
  done

  utils/combine_data.sh data/train_dev_aug data/train_aug data/dev_aug
  # create MFCC data dir without pitch to extract iVector
  utils/data/limit_feature_dim.sh 0:39 data/train_dev_aug data/train_dev_aug_nopitch || exit 1;
  steps/compute_cmvn_stats.sh data/train_dev_aug_nopitch exp/make_mfcc_hires/train_dev_aug_nopitch mfcc_hires || exit 1;

  utils/data/limit_feature_dim.sh 0:39 data/test data/test_nopitch || exit 1;
  steps/compute_cmvn_stats.sh data/test_nopitch exp/make_mfcc_hires/test_nopitch mfcc_hires || exit 1;

  cp -r data/train_dev_aug data/train_dev_aug_16dims
  rm -rf data/train_dev_aug_16dims/{feats.scp,cmvn.scp}
  steps/make_mfcc_pitch.sh --pitch-config conf/pitch.conf \
      --cmd "$train_cmd" --nj $train_nj \
      data/train_dev_aug_16dims exp/make_mfcc_hires/train_dev_aug_16dims mfcc_hires || exit 1;
  steps/compute_cmvn_stats.sh data/train_dev_aug_16dims exp/make_mfcc_hires/train_dev_aug_16dims mfcc_hires || exit 1;
  utils/fix_data_dir.sh data/train_dev_aug_16dims || exit 1;
fi

echo "[ Stage 15 ] run.sh: Train ivector extractor and extract ivector of the training set, dev set and test set ..."
if [ $stage -le 15 ];then
  echo "$0: computing a subset of data to train the diagonal UBM."
  # We'll use about a quarter of the data.
  mkdir -p exp/ivector/diag_ubm
  temp_data_root=exp/ivector/diag_ubm

  echo "$0: computing a PCA transform from the hires data."
  steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
      --splice-opts "--left-context=3 --right-context=3" \
      --max-utts 10000 --subsample 2 \
      data/train_dev_aug_nopitch exp/ivector/pca_transform

  echo "$0: training the diagonal UBM."
  # Use 512 Gaussians in the UBM.
  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj $train_nj \
    --num-frames 700000 \
    --num-threads 32 \
    data/train_dev_aug_nopitch 512 \
    exp/ivector/pca_transform exp/ivector/diag_ubm

  # Train the iVector extractor.  Use all of the speed-perturbed data since iVector extractors
  # can be sensitive to the amount of data.  The script defaults to an iVector dimension of
  # 100.
  echo "$0: training the iVector extractor"
  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj $train_nj \
     data/train_dev_aug_nopitch exp/ivector/diag_ubm \
     exp/ivector/extractor || exit 1;

  # We extract iVectors on the speed-perturbed training data after combining
  # short segments, which will be what we train the system on.  With
  # --utts-per-spk-max 2, the script pairs the utterances into twos, and treats
  # each of these pairs as one speaker; this gives more diversity in iVectors..
  # Note that these are extracted 'online'.

  # note, we don't encode the 'max2' in the name of the ivectordir even though
  # that's the data we extract the ivectors from, as it's still going to be
  # valid for the non-'max2' data, the utterance list is the same.

  # having a larger number of speakers is helpful for generalization, and to
  # handle per-utterance decoding well (iVector starts at zero).
  utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \
    data/train_dev_aug_nopitch data/train_dev_aug_nopitch_max2

  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $train_nj \
    data/train_dev_aug_nopitch_max2 \
    exp/ivector/extractor exp/ivector/ivector_train_dev_aug

  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $test_nj \
      data/test_nopitch exp/ivector/extractor \
      exp/ivector/ivector_test

fi

echo "[ Stage 16 ] run.sh: Train nnet3 CE model on the train+dev augment data set, for better alignment  ..."
if [ $stage -le 16 ];then
  steps/align_fmllr.sh --cmd "$train_cmd" --nj $train_nj \
                       data/train_dev_aug_16dims data/lang_sp exp/tri5 exp/tri5_ali_train_dev_aug
  echo "$0: creating neural net configs";

  train_data_dir=data/train_dev_aug
  train_ivector_dir=exp/ivector/ivector_train_dev_aug
  ali_dir=exp/tri5_ali_train_dev_aug
  dir=exp/nnet3/tdnn

  train_nnet=true
  if $train_nnet;then
    num_targets=$(tree-info $ali_dir/tree |grep num-pdfs|awk '{print $2}')
    opts="l2-regularize=0.002"
    output_opts="l2-regularize=0.0005 bottleneck-dim=256"

    mkdir -p $dir/configs
    cat <<EOF > $dir/configs/network.xconfig
  input dim=100 name=ivector
  input dim=43 name=input
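  # input dim 43 = 40-dim hires MFCC + 3-dim pitch (stage 14); the 100-dim
  # iVector comes from the extractor trained in stage 15.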

  # please note that it is important to have input layer with the name=input
  # as the layer immediately preceding the fixed-affine-layer to enable
  # the use of short notation for the descriptor
  fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat

  # the first splicing is moved before the lda layer, so no splicing here
  relu-batchnorm-dropout-layer name=tdnn1 dim=1024 $opts
  relu-batchnorm-dropout-layer name=tdnn2 dim=1024 input=Append(-1,0,2) $opts
  relu-batchnorm-dropout-layer name=tdnn3 dim=1024 input=Append(-3,0,3) $opts
  relu-batchnorm-dropout-layer name=tdnn4 dim=1024 input=Append(-7,0,2) $opts
  relu-batchnorm-dropout-layer name=tdnn5 dim=1024 input=Append(-3,0,3) $opts
  relu-batchnorm-dropout-layer name=tdnn6 dim=1024 $opts
  output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 $output_opts

EOF
    steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
    steps/nnet3/train_dnn.py --stage -10 \
      --cmd="$cuda_cmd" \
      --feat.online-ivector-dir $train_ivector_dir \
      --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
      --trainer.num-epochs 4 \
      --trainer.optimization.num-jobs-initial 2 \
      --trainer.optimization.num-jobs-final 12 \
      --trainer.optimization.initial-effective-lrate 0.0015 \
      --trainer.optimization.final-effective-lrate 0.00015 \
      --cleanup.remove-egs true \
      --use-gpu true \
      --cleanup.preserve-model-interval 100 \
      --feat-dir=$train_data_dir \
      --ali-dir $ali_dir \
      --lang data/lang_sp \
      --dir=$dir  || exit 1;
  fi
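  # With the CE model trained, generate lattice-style alignments from it;
  # exp/nnet3/tdnn_ali (created below as ${dir}_ali) provides the numerator
  # lattices (latdir) for the chain training in stage 17.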

  steps/nnet3/align_lats.sh --cmd "$train_cmd" --nj $train_nj --generate-ali-from-lats true \
    --online-ivector-dir $train_ivector_dir $train_data_dir data/lang_sp $dir ${dir}_ali

fi

echo "[ Stage 17 ] run.sh: Train nnet3 chain model on the train+dev augment data set ..."
if [ $stage -le 17 ];then
  # Create a version of the lang/ directory that has one state per phone in the
  # topo file. [note, it really has two states.. the first one is only repeated
  # once, the second one has zero or more repeats.]
  train_data_dir=data/train_dev_aug
  train_ivector_dir=exp/ivector/ivector_train_dev_aug

  lang=exp/chain/lang
  treedir=exp/chain/tree
  graphdir=exp/chain/graph
  latdir=exp/nnet3/tdnn_ali
  dir=exp/chain/cnn-tdnnf

  train_stage=-10
  get_egs_stage=-10
  xent_regularize=0.1

  mkdir -p exp/chain
  cp -r data/lang_sp $lang
  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
  # Use our special topology... note that later on may have to tune this
  # topology.
  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo

  # Build a tree using our new topology. This is the critically different
  # step compared with other recipes.
  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
      --context-opts "--context-width=2 --central-position=1" \
      --cmd "$train_cmd" 7000 $train_data_dir $lang $latdir $treedir

  echo "$0: creating neural net configs using the xconfig parser";

  num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
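  # the xent output layer gets learning-rate factor 0.5/xent_regularize (= 5 here)
  # to compensate for the cross-entropy objective being scaled down by xent_regularize.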

  cnn_opts="l2-regularize=0.01"
  ivector_affine_opts="l2-regularize=0.01"
  tdnnf_first_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.0"
  tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66"
  linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0"
  prefinal_opts="l2-regularize=0.01"
  output_opts="l2-regularize=0.002"

  mkdir -p $dir/configs
  cat <<EOF > $dir/configs/network.xconfig
  input dim=100 name=ivector
  input dim=43 name=input

  # MFCC to filterbank
  dim-range-component name=mfcc input=input dim=40 dim-offset=0
  dim-range-component name=pitch input=input dim=3 dim-offset=40
  # this takes the MFCCs and generates filterbank coefficients.  The MFCCs
  # are more compressible so we prefer to dump the MFCCs to disk rather
  # than filterbanks.
  idct-layer name=idct input=mfcc dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat
  linear-component name=ivector-linear l2-regularize=0.01 dim=197 input=ReplaceIndex(ivector, t, 0)
  batchnorm-component name=ivector-batchnorm target-rms=0.025
  batchnorm-component name=idct-batchnorm input=idct
  batchnorm-component name=pitch-batchnorm input=pitch
  spec-augment-layer name=idct-spec-augment input=idct-batchnorm freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=20

  combine-feature-maps-layer name=combine_inputs input=Append(idct-spec-augment, ivector-batchnorm, pitch-batchnorm) num-filters1=1 num-filters2=5 height=40
  conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 height-offsets=-1,0,1 num-filters-out=64
  conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64
  conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128
  conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128
  conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=256
  conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=10  time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=256
  # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the
  # information bottleneck doesn't become a problem.  (we use time-stride=0 so no splicing, to
  # limit the num-parameters).
  tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=1536 bottleneck-dim=256 time-stride=0
  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
  tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
  tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0
  tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
  tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
  tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
  tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
  tdnnf-layer name=tdnnf16 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
  tdnnf-layer name=tdnnf17 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
  tdnnf-layer name=tdnnf18 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
  tdnnf-layer name=tdnnf19 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
  tdnnf-layer name=tdnnf20 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
  tdnnf-layer name=tdnnf21 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
  tdnnf-layer name=tdnnf22 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
  tdnnf-layer name=tdnnf23 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
  linear-component name=prefinal-l dim=256 $linear_opts
  ## adding the layers for chain branch
  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=256 big-dim=1536
  output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
  # adding the layers for xent branch
  prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=256 big-dim=1536
  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
EOF
  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/

  steps/nnet3/chain/train.py --stage $train_stage \
    --cmd "$train_cmd" \
    --feat.online-ivector-dir $train_ivector_dir \
    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
    --chain.xent-regularize $xent_regularize \
    --chain.leaky-hmm-coefficient 0.1 \
    --chain.l2-regularize 0.0 \
    --chain.apply-deriv-weights false \
    --chain.lm-opts="--num-extra-lm-states=2000" \
    --trainer.dropout-schedule "0,0@0.20,0.3@0.50,0" \
    --trainer.add-option="--optimization.memory-compression-level=2" \
    --egs.stage $get_egs_stage \
    --egs.opts "--frames-overlap-per-eg 0 --constrained false" \
    --egs.chunk-width "150,110,90,30" \
    --trainer.num-chunk-per-minibatch 64 \
    --trainer.frames-per-iter 1500000 \
    --trainer.num-epochs 6 \
    --trainer.optimization.num-jobs-initial 2 \
    --trainer.optimization.num-jobs-final 8 \
    --trainer.optimization.initial-effective-lrate 0.00015 \
    --trainer.optimization.final-effective-lrate 0.000015 \
    --trainer.max-param-change 2.0 \
    --cleanup.remove-egs false \
    --feat-dir $train_data_dir \
    --tree-dir $treedir \
    --lat-dir $latdir \
    --dir $dir  || exit 1;

  mkdir -p data/local/lm_train_dev
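  # Build a 4-gram LM on the training transcripts plus the Android-channel part of
  # the dev transcripts (presumably to match the test-set recording channel), then
  # compile the decoding graph and decode the test set.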
  cat data/train/text <(grep Android data/dev/text) > data/local/lm_train_dev/text
  local/train_lms.sh \
      data/local/dict_sp/lexicon.txt \
      data/local/lm_train_dev/text \
      data/local/lm_train_dev || exit 1;

  utils/format_lm.sh $lang data/local/lm_train_dev/4gram-mincount/lm_unpruned.gz \
    data/local/dict_sp/lexicon.txt exp/chain/lang_test || exit 1;
  utils/mkgraph.sh --self-loop-scale 1.0 exp/chain/lang_test $treedir $graphdir
  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
    --nj $test_nj --cmd "$decode_cmd" \
    --online-ivector-dir exp/ivector/ivector_test \
    $graphdir data/test \
    $dir/decode_test
fi

echo "[ Stage 18 ] run.sh: Fine tuning the chain model with dev_aug data ..."
dir=exp/chain/cnn-tdnnf-finetune
if [ $stage -le 18 ];then
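  # Fine-tuning: copy the stage-17 chain model as the input model and continue
  # LF-MMI training on the augmented dev data for one epoch at a very low learning rate.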
  mkdir -p $dir
  src_dir=exp/chain/cnn-tdnnf   # the chain model trained in stage 17
  $train_cmd $dir/log/generate_input_model.log \
    nnet3-am-copy --raw=true $src_dir/final.mdl $dir/input.raw
    
  utils/data/limit_feature_dim.sh 0:39 data/dev_aug data/dev_aug_nopitch || exit 1;
  steps/compute_cmvn_stats.sh data/dev_aug_nopitch exp/make_mfcc_hires/dev_aug_nopitch mfcc_hires || exit 1;
  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $train_nj \
        data/dev_aug_nopitch exp/ivector/extractor \
        exp/ivector/ivector_dev_aug

  # Create a version of the lang/ directory that has one state per phone in the
  # topo file. [note, it really has two states.. the first one is only repeated
  # once, the second one has zero or more repeats.]
  
  train_data_dir=data/dev_aug
  train_ivector_dir=exp/ivector/ivector_dev_aug

  lang=exp/chain/lang
  treedir=exp/chain/tree
  latdir=exp/nnet3/tdnn_ali_dev_aug
  
  if ! [ -d $latdir ];then
    steps/nnet3/align_lats.sh --cmd "$train_cmd" --nj $train_nj --generate-ali-from-lats true \
      --online-ivector-dir $train_ivector_dir $train_data_dir data/lang_sp exp/nnet3/tdnn $latdir
  fi

  train_stage=-10
  get_egs_stage=-10
  xent_regularize=0.1

  steps/nnet3/chain/train.py --stage $train_stage \
    --cmd "$train_cmd" \
    --feat.online-ivector-dir $train_ivector_dir \
    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
    --chain.xent-regularize $xent_regularize \
    --chain.leaky-hmm-coefficient 0.1 \
    --chain.l2-regularize 0.0 \
    --chain.apply-deriv-weights false \
    --chain.lm-opts="--num-extra-lm-states=2000" \
    --trainer.input-model $dir/input.raw \
    --trainer.dropout-schedule "0,0@0.20,0.3@0.50,0" \
    --trainer.add-option="--optimization.memory-compression-level=2" \
    --egs.stage $get_egs_stage \
    --egs.opts "--frames-overlap-per-eg 0 --constrained false" \
    --egs.chunk-width "150,110,90,30" \
    --trainer.num-chunk-per-minibatch 64 \
    --trainer.frames-per-iter 1500000 \
    --trainer.num-epochs 1 \
    --trainer.optimization.num-jobs-initial 8 \
    --trainer.optimization.num-jobs-final 8 \
    --trainer.optimization.initial-effective-lrate 0.000015 \
    --trainer.optimization.final-effective-lrate 0.000010 \
    --trainer.max-param-change 2.0 \
    --cleanup.remove-egs false \
    --feat-dir $train_data_dir \
    --tree-dir $treedir \
    --lat-dir $latdir \
    --dir $dir  || exit 1;

  graphdir=exp/chain/graph
  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
    --nj $test_nj --cmd "$decode_cmd" \
    --online-ivector-dir exp/ivector/ivector_test \
    $graphdir data/test \
    $dir/decode_test
fi

echo "[ Stage 19 ] run.sh: Train forward rnnlm model with pretrain Tencent word embedding and rescoring the fine-tune chain model decode result ..."
if [ $stage -le 19 ];then
  tencent_embedding=data/wordembedding/Tencent_AILab_ChineseEmbedding.txt
  if ! [ -e $tencent_embedding ];then
    echo "Cannot find $tencent_embedding, please download it from: https://ai.tencent.com/ailab/nlp/data/Tencent_AILab_ChineseEmbedding.tar.gz"
    echo "tar -zxvf Tencent_AILab_ChineseEmbedding.tar.gz and copy Tencent_AILab_ChineseEmbedding.txt to data/wordembedding,"
    echo "then re-run this script as: $0 --stage 19."
    exit 1;
  fi
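  # Train a forward LSTM/TDNN RNNLM initialized from the pretrained Tencent embeddings
  # (local script, not shown here), then rescore the fine-tuned chain model's lattices.
  # --weight 0.45 is the RNNLM interpolation weight against the n-gram LM, and
  # --max-ngram-order 4 limits the history length used during lattice rescoring.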
  local/train_rnnlm_with_pretrain_embedding.sh 
  rnnlm/lmrescore_pruned.sh \
    --cmd "$decode_cmd" \
    --weight 0.45 --max-ngram-order 4 \
    exp/chain/lang_test exp/rnnlm_lstm_tdnn_pretrain \
    data/test exp/chain/cnn-tdnnf-finetune/decode_test \
    exp/rnnlm_lstm_tdnn_pretrain/decode_test

fi

echo "[ Stage 20 ] run.sh: Train backward rnnlm model with pretrain Tencent word embedding and rescoring the forward rnnlm decode result ..."
if [ $stage -le 20 ];then
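  # Train a backward (right-to-left) RNNLM with the same pretrained embeddings and use
  # rnnlm/lmrescore_back.sh to rescore the lattices produced by the forward RNNLM pass.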
  local/train_rnnlm_with_pretrain_embedding_back.sh
  rnnlm/lmrescore_back.sh \
    --cmd "$decode_cmd" \
    --weight 0.3 --max-ngram-order 4 \
    exp/chain/lang_test exp/rnnlm_lstm_tdnn_pretrain_back \
    data/test exp/rnnlm_lstm_tdnn_pretrain/decode_test \
    exp/rnnlm_lstm_tdnn_pretrain_back/decode_test
fi

echo "[ Stage 21 ] run.sh: Combine lattices and generate the final submission ..."
if [ $stage -le 21 ];then
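  # Combine the rescored lattices and produce the final submission (local script, not
  # shown here); scoring uses MBR decoding with word insertion penalty 1.0 and a
  # lattice-depth limit of 150.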
  local/combine_lattice_and_generate_submit_limit_depth.sh --cmd "$decode_cmd" \
    --wip 1.0 --ac-scale 11 --lattice-depth 150 \
    --scoring-opts "--word-ins-penalty 1.0 --min-lmwt 10 --max-lmwt 12  --decode-mbr true" \
    data/test exp/chain/lang_test exp/rnnlm_lstm_tdnn_pretrain_back/decode_test exp/final-submit
fi

echo "Finish."
exit 0;