From 106891f443e7e60cca23e2dac7e8e5a208a6bafd Mon Sep 17 00:00:00 2001 From: TianYuan Date: Sat, 21 Aug 2021 08:35:25 +0000 Subject: [PATCH 1/2] add sclite test for text_frontend --- examples/text_frontend/run.sh | 31 +++++++++++++++++++++---- examples/text_frontend/test_g2p.py | 4 ++-- examples/text_frontend/test_textnorm.py | 4 ++-- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/examples/text_frontend/run.sh b/examples/text_frontend/run.sh index 65ae8eb..b84325c 100755 --- a/examples/text_frontend/run.sh +++ b/examples/text_frontend/run.sh @@ -1,14 +1,37 @@ #!/bin/bash +USE_SCLITE=true +if [ "$USE_SCLITE" = true ];then + if [ ! -d "./SCTK" ];then + echo "Clone SCTK ..." + git clone https://github.com/usnistgov/SCTK + echo "Clone SCTK done!" + fi + + if [ ! -d "./SCTK/bin" ];then + echo "Start make SCTK ..." + pushd SCTK && make config && make all && make check && make install && make doc && popd + echo "SCTK make done!" + fi +fi # test g2p -echo "Start get g2p test data." +echo "Start get g2p test data ..." python3 get_g2p_data.py --root-dir=~/datasets/BZNSYP --output-dir=data/g2p -echo "Start test g2p." +echo "Start test g2p ..." python3 test_g2p.py --input-dir=data/g2p --output-dir=exp/g2p # test text normalization -echo "Start get text normalization test data." +echo "Start get text normalization test data ..." python3 get_textnorm_data.py --test-file=data/textnorm_test_cases.txt --output-dir=data/textnorm -echo "Start test text normalization." +echo "Start test text normalization ..." python3 test_textnorm.py --input-dir=data/textnorm --output-dir=exp/textnorm +# whether use sclite to get more detail information of WER +if [ "$USE_SCLITE" = true ];then + echo "Start sclite g2p ..." + ./SCTK/bin/sclite -i wsj -r ./exp/g2p/text.ref.clean -h ./exp/g2p/text.g2p -e utf-8 -o all + echo + + echo "Start sclite textnorm ..." 
+ ./SCTK/bin/sclite -i wsj -r ./exp/textnorm/text.ref.clean -h ./exp/textnorm/text.tn -e utf-8 -o all +fi \ No newline at end of file diff --git a/examples/text_frontend/test_g2p.py b/examples/text_frontend/test_g2p.py index ba456e9..45d6c44 100644 --- a/examples/text_frontend/test_g2p.py +++ b/examples/text_frontend/test_g2p.py @@ -47,8 +47,8 @@ def get_avg_wer(raw_dict, ref_dict, frontend, output_dir): gt_phones = [phn for phn in gt_phones if phn not in SILENCE_TOKENS] gt_phones = " ".join(gt_phones) g2p_phones = " ".join(g2p_phones) - wf_ref.write(utt_id + " " + gt_phones + "\n") - wf_g2p.write(utt_id + " " + g2p_phones + "\n") + wf_ref.write(gt_phones + "(baker_" + utt_id + ")" + "\n") + wf_g2p.write(g2p_phones + "(baker_" + utt_id + ")" + "\n") edit_distance, ref_len = word_errors(gt_phones, g2p_phones) edit_distances.append(edit_distance) ref_lens.append(ref_len) diff --git a/examples/text_frontend/test_textnorm.py b/examples/text_frontend/test_textnorm.py index 1732c47..0de3c5c 100644 --- a/examples/text_frontend/test_textnorm.py +++ b/examples/text_frontend/test_textnorm.py @@ -43,8 +43,8 @@ def get_avg_cer(raw_dict, ref_dict, text_normalizer, output_dir): gt_text = del_en_add_space(gt_text) textnorm_text = del_en_add_space(textnorm_text) - wf_ref.write(text_id + " " + gt_text + "\n") - wf_tn.write(text_id + " " + textnorm_text + "\n") + wf_ref.write(gt_text + "(" + text_id + ")" + "\n") + wf_tn.write(textnorm_text + "(" + text_id + ")" + "\n") edit_distance, ref_len = char_errors(gt_text, textnorm_text) edit_distances.append(edit_distance) ref_lens.append(ref_len) From 36604b4e413d9dfa734f2fda9d563dcd3f144a3f Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 23 Aug 2021 03:27:54 +0000 Subject: [PATCH 2/2] add make_sclite.sh --- examples/text_frontend/README.md | 4 ++++ examples/text_frontend/make_sclite.sh | 13 +++++++++++++ examples/text_frontend/run.sh | 18 +++--------------- 3 files changed, 20 insertions(+), 15 deletions(-) create mode 100755 
examples/text_frontend/make_sclite.sh diff --git a/examples/text_frontend/README.md b/examples/text_frontend/README.md index d41e428..168df46 100644 --- a/examples/text_frontend/README.md +++ b/examples/text_frontend/README.md @@ -11,6 +11,10 @@ For text normalization, the test data is `data/textnorm_test_cases.txt`, we use We use `CER` as evaluation criterion. ## Start +If you want to use sclite to get more detailed information about WER, you should first run the command below to build sclite. +```bash +./make_sclite.sh +``` Run the command below to get the results of test. ```bash ./run.sh diff --git a/examples/text_frontend/make_sclite.sh b/examples/text_frontend/make_sclite.sh new file mode 100755 index 0000000..db8c921 --- /dev/null +++ b/examples/text_frontend/make_sclite.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +if [ ! -d "./SCTK" ];then + echo "Clone SCTK ..." + git clone https://github.com/usnistgov/SCTK + echo "Clone SCTK done!" +fi + +if [ ! -d "./SCTK/bin" ];then + echo "Start make SCTK ..." + pushd SCTK && make config && make all && make check && make install && make doc && popd + echo "SCTK make done!" +fi diff --git a/examples/text_frontend/run.sh b/examples/text_frontend/run.sh index b84325c..9882b05 100755 --- a/examples/text_frontend/run.sh +++ b/examples/text_frontend/run.sh @@ -1,19 +1,7 @@ #!/bin/bash + USE_SCLITE=true -if [ "$USE_SCLITE" = true ];then - if [ ! -d "./SCTK" ];then - echo "Clone SCTK ..." - git clone https://github.com/usnistgov/SCTK - echo "Clone SCTK done!" - fi - - if [ ! -d "./SCTK/bin" ];then - echo "Start make SCTK ..." - pushd SCTK && make config && make all && make check && make install && make doc && popd - echo "SCTK make done!" - fi -fi # test g2p echo "Start get g2p test data ..." 
python3 get_g2p_data.py --root-dir=~/datasets/BZNSYP --output-dir=data/g2p @@ -29,9 +17,9 @@ python3 test_textnorm.py --input-dir=data/textnorm --output-dir=exp/textnorm # whether use sclite to get more detail information of WER if [ "$USE_SCLITE" = true ];then echo "Start sclite g2p ..." - ./SCTK/bin/sclite -i wsj -r ./exp/g2p/text.ref.clean -h ./exp/g2p/text.g2p -e utf-8 -o all + ./SCTK/bin/sclite -i wsj -r ./exp/g2p/text.ref.clean trn -h ./exp/g2p/text.g2p trn -e utf-8 -o all echo echo "Start sclite textnorm ..." - ./SCTK/bin/sclite -i wsj -r ./exp/textnorm/text.ref.clean -h ./exp/textnorm/text.tn -e utf-8 -o all + ./SCTK/bin/sclite -i wsj -r ./exp/textnorm/text.ref.clean trn -h ./exp/textnorm/text.tn trn -e utf-8 -o all fi \ No newline at end of file