From 106891f443e7e60cca23e2dac7e8e5a208a6bafd Mon Sep 17 00:00:00 2001
From: TianYuan <white-sky@qq.com>
Date: Sat, 21 Aug 2021 08:35:25 +0000
Subject: [PATCH 1/2] add sclite test for text_frontend

---
 examples/text_frontend/run.sh           | 31 +++++++++++++++++++++----
 examples/text_frontend/test_g2p.py      |  4 ++--
 examples/text_frontend/test_textnorm.py |  4 ++--
 3 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/examples/text_frontend/run.sh b/examples/text_frontend/run.sh
index 65ae8eb..b84325c 100755
--- a/examples/text_frontend/run.sh
+++ b/examples/text_frontend/run.sh
@@ -1,14 +1,37 @@
 #!/bin/bash
+USE_SCLITE=true
 
+if [ "$USE_SCLITE" = true ];then
+    if [ ! -d "./SCTK" ];then
+        echo "Clone SCTK ..."
+        git clone https://github.com/usnistgov/SCTK
+        echo "Clone SCTK done!"
+    fi
+
+    if [ ! -d "./SCTK/bin" ];then
+        echo "Start make SCTK ..."
+        pushd SCTK && make config && make all && make check && make install && make doc && popd
+        echo "SCTK make done!"
+    fi
+fi
 # test g2p
-echo "Start get g2p test data."
+echo "Start get g2p test data ..."
 python3 get_g2p_data.py --root-dir=~/datasets/BZNSYP --output-dir=data/g2p
-echo "Start test g2p."
+echo "Start test g2p ..."
 python3 test_g2p.py --input-dir=data/g2p --output-dir=exp/g2p
 
 # test text normalization
-echo "Start get text normalization test data."
+echo "Start get text normalization test data ..."
 python3 get_textnorm_data.py --test-file=data/textnorm_test_cases.txt --output-dir=data/textnorm
-echo "Start test text normalization."
+echo "Start test text normalization ..."
 python3 test_textnorm.py --input-dir=data/textnorm --output-dir=exp/textnorm
 
+# whether use sclite to get more detail information of WER
+if [ "$USE_SCLITE" = true ];then
+    echo "Start sclite g2p ..."
+    ./SCTK/bin/sclite -i wsj -r ./exp/g2p/text.ref.clean -h ./exp/g2p/text.g2p -e utf-8 -o all
+    echo
+
+    echo "Start sclite textnorm ..."
+    ./SCTK/bin/sclite -i wsj -r ./exp/textnorm/text.ref.clean -h ./exp/textnorm/text.tn -e utf-8 -o all
+fi
\ No newline at end of file
diff --git a/examples/text_frontend/test_g2p.py b/examples/text_frontend/test_g2p.py
index ba456e9..45d6c44 100644
--- a/examples/text_frontend/test_g2p.py
+++ b/examples/text_frontend/test_g2p.py
@@ -47,8 +47,8 @@ def get_avg_wer(raw_dict, ref_dict, frontend, output_dir):
         gt_phones = [phn for phn in gt_phones if phn not in SILENCE_TOKENS]
         gt_phones = " ".join(gt_phones)
         g2p_phones = " ".join(g2p_phones)
-        wf_ref.write(utt_id + " " + gt_phones + "\n")
-        wf_g2p.write(utt_id + " " + g2p_phones + "\n")
+        wf_ref.write(gt_phones + "(baker_" + utt_id + ")" + "\n")
+        wf_g2p.write(g2p_phones + "(baker_" + utt_id + ")" + "\n")
         edit_distance, ref_len = word_errors(gt_phones, g2p_phones)
         edit_distances.append(edit_distance)
         ref_lens.append(ref_len)
diff --git a/examples/text_frontend/test_textnorm.py b/examples/text_frontend/test_textnorm.py
index 1732c47..0de3c5c 100644
--- a/examples/text_frontend/test_textnorm.py
+++ b/examples/text_frontend/test_textnorm.py
@@ -43,8 +43,8 @@ def get_avg_cer(raw_dict, ref_dict, text_normalizer, output_dir):
 
         gt_text = del_en_add_space(gt_text)
         textnorm_text = del_en_add_space(textnorm_text)
-        wf_ref.write(text_id + " " + gt_text + "\n")
-        wf_tn.write(text_id + " " + textnorm_text + "\n")
+        wf_ref.write(gt_text + "(" + text_id + ")" + "\n")
+        wf_tn.write(textnorm_text + "(" + text_id + ")" + "\n")
         edit_distance, ref_len = char_errors(gt_text, textnorm_text)
         edit_distances.append(edit_distance)
         ref_lens.append(ref_len)

From 36604b4e413d9dfa734f2fda9d563dcd3f144a3f Mon Sep 17 00:00:00 2001
From: TianYuan <white-sky@qq.com>
Date: Mon, 23 Aug 2021 03:27:54 +0000
Subject: [PATCH 2/2] add make_sclite.sh

---
 examples/text_frontend/README.md      |  4 ++++
 examples/text_frontend/make_sclite.sh | 13 +++++++++++++
 examples/text_frontend/run.sh         | 18 +++---------------
 3 files changed, 20 insertions(+), 15 deletions(-)
 create mode 100755 examples/text_frontend/make_sclite.sh

diff --git a/examples/text_frontend/README.md b/examples/text_frontend/README.md
index d41e428..168df46 100644
--- a/examples/text_frontend/README.md
+++ b/examples/text_frontend/README.md
@@ -11,6 +11,10 @@ For text normalization, the test data is  `data/textnorm_test_cases.txt`, we use
 
 We use `CER` as evaluation criterion.
 ## Start
+If you want to use sclite to get more detailed WER information, run the command below to build sclite first.
+```bash
+./make_sclite.sh
+```
 Run the command below to get the results of test.
 ```bash
 ./run.sh
diff --git a/examples/text_frontend/make_sclite.sh b/examples/text_frontend/make_sclite.sh
new file mode 100755
index 0000000..db8c921
--- /dev/null
+++ b/examples/text_frontend/make_sclite.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Clone and build NIST SCTK so ./SCTK/bin/sclite (used by run.sh) exists; exits 1 on failure.
+if [ ! -d "./SCTK" ];then
+    echo "Clone SCTK ..."
+    git clone https://github.com/usnistgov/SCTK || { echo "Clone SCTK failed!" >&2; exit 1; }
+    echo "Clone SCTK done!"
+fi
+
+if [ ! -d "./SCTK/bin" ];then
+    echo "Start make SCTK ..."
+    (cd SCTK && make config && make all && make check && make install && make doc) || { echo "SCTK make failed!" >&2; exit 1; }
+    echo "SCTK make done!"
+fi
diff --git a/examples/text_frontend/run.sh b/examples/text_frontend/run.sh
index b84325c..9882b05 100755
--- a/examples/text_frontend/run.sh
+++ b/examples/text_frontend/run.sh
@@ -1,19 +1,7 @@
 #!/bin/bash
+
 USE_SCLITE=true
 
-if [ "$USE_SCLITE" = true ];then
-    if [ ! -d "./SCTK" ];then
-        echo "Clone SCTK ..."
-        git clone https://github.com/usnistgov/SCTK
-        echo "Clone SCTK done!"
-    fi
-
-    if [ ! -d "./SCTK/bin" ];then
-        echo "Start make SCTK ..."
-        pushd SCTK && make config && make all && make check && make install && make doc && popd
-        echo "SCTK make done!"
-    fi
-fi
 # test g2p
 echo "Start get g2p test data ..."
 python3 get_g2p_data.py --root-dir=~/datasets/BZNSYP --output-dir=data/g2p
@@ -29,9 +17,9 @@ python3 test_textnorm.py --input-dir=data/textnorm --output-dir=exp/textnorm
 # whether use sclite to get more detail information of WER
 if [ "$USE_SCLITE" = true ];then
     echo "Start sclite g2p ..."
-    ./SCTK/bin/sclite -i wsj -r ./exp/g2p/text.ref.clean -h ./exp/g2p/text.g2p -e utf-8 -o all
+    ./SCTK/bin/sclite -i wsj -r ./exp/g2p/text.ref.clean trn -h ./exp/g2p/text.g2p trn -e utf-8 -o all
     echo
 
     echo "Start sclite textnorm ..."
-    ./SCTK/bin/sclite -i wsj -r ./exp/textnorm/text.ref.clean -h ./exp/textnorm/text.tn -e utf-8 -o all
+    ./SCTK/bin/sclite -i wsj -r ./exp/textnorm/text.ref.clean trn -h ./exp/textnorm/text.tn trn -e utf-8 -o all
 fi
\ No newline at end of file