Spanish Gigaword text based POCOLM and RNNLM training recipe #3136

Open

wants to merge 58 commits into kaldi-asr:master from GoVivace:feature/Spanish_gigaword_LM

Commits (58)
801ab93
Merge pull request #1 from kaldi-asr/master
saikiranvalluri Oct 30, 2018
04c4a03
Merge remote-tracking branch 'upstream/master'
GoVivace Dec 8, 2018
ea699b0
Merge remote-tracking branch 'upstream/master'
GoVivace Jan 17, 2019
cbc8eeb
Spanish Gigaword LM recipe
Feb 19, 2019
e8aecbb
Some bug fixes
saikiranvalluri Feb 19, 2019
ece34bd
Update rnnlm.sh
saikiranvalluri Feb 19, 2019
0c4fe47
Combining lexicon words with pocolm wordslist for RNNLM training
Feb 19, 2019
92e241b
merge conflict resolved
Feb 19, 2019
1439b0d
Integrated the 2 stage scientific method POCOLM training for Gigaword…
saikiranvalluri Feb 24, 2019
8ad0e01
Update train_pocolm.sh
saikiranvalluri Feb 26, 2019
f856ac2
Update run.sh
saikiranvalluri Feb 27, 2019
684f029
Text cleaning script for splitting Abbreviation words added
saikiranvalluri Feb 28, 2019
185da3a
Update clean_txt_dir.sh
saikiranvalluri Feb 28, 2019
cb393c8
Update clean_txt_dir.sh
saikiranvalluri Feb 28, 2019
18a9cb6
Update train_pocolm.sh
saikiranvalluri Feb 28, 2019
b023638
Update pocolm_cust.sh
saikiranvalluri Feb 28, 2019
46550f0
Cosmetic fixes
saikiranvalluri Feb 28, 2019
ce3c7d7
Update path.sh
saikiranvalluri Feb 28, 2019
deeaaa7
Bug fix in text normalisation script for gigaword corpus
saikiranvalluri Mar 1, 2019
633f21d
small Fix path.sh
saikiranvalluri Mar 1, 2019
8d6b14d
Update clean_abbrevs_text.py
saikiranvalluri Mar 1, 2019
8c9c37b
Added sparrowhawk installation script for text normalisation
saikiranvalluri Mar 1, 2019
c6b05d1
G2P training stage added into Spanish gigaword recipe
saikiranvalluri Mar 2, 2019
8c226cc
G2P seq2seq scripts added in steps/
saikiranvalluri Mar 2, 2019
7b67fc2
RNNLM scripts updated to UTF8 encoding
saikiranvalluri Mar 2, 2019
4767c7c
Update pocolm_cust.sh
saikiranvalluri Mar 8, 2019
2cd5948
Update run.sh
saikiranvalluri Mar 8, 2019
6595b42
Added steps for generating POCOLM ARPA file
saikiranvalluri Mar 18, 2019
0902c9e
Update run.sh
saikiranvalluri Mar 24, 2019
d8a90ec
Merge branch 'master' into feature/Spanish_gigaword_LM
saikiranvalluri Mar 24, 2019
c10b0fe
Apply g2p part added to get extended lexicon
saikiranvalluri Mar 24, 2019
15a34e8
Merge branch 'feature/Spanish_gigaword_LM' of https://github.com/GoVi…
saikiranvalluri Mar 24, 2019
3df45ae
Small fix in run.sh rnnlm_wordlist
saikiranvalluri Mar 24, 2019
7e47695
Added sanity chack for Sparrowhawk normalizer in cleanup script
saikiranvalluri Mar 25, 2019
91a4611
Data preparation fixes
saikiranvalluri Mar 25, 2019
5f45dd1
Cosmetic options for gigaword textclean
saikiranvalluri Mar 26, 2019
e711d30
Some fixes in rnnlm training
saikiranvalluri Apr 1, 2019
8d521c6
Moved s5_gigaword directory to s5
saikiranvalluri Apr 1, 2019
c57ed95
Merge branch 'master' into feature/Spanish_gigaword_LM
saikiranvalluri Apr 2, 2019
f610470
removed s5_gigaword folder
saikiranvalluri Apr 2, 2019
f810119
Small cleanup for scripts format
saikiranvalluri Apr 2, 2019
dc8a56e
Cosmetic fix
saikiranvalluri Apr 5, 2019
ec0edc5
Merge branch 'master' into feature/Spanish_gigaword_LM
saikiranvalluri Apr 12, 2019
8b8222e
Remove virtenv dependency
saikiranvalluri Apr 18, 2019
0e7afa8
Update path.sh
saikiranvalluri Apr 19, 2019
56d2db9
Update install_sparrowhawk.sh
saikiranvalluri Apr 19, 2019
fb6693e
Set lang to ESP
saikiranvalluri Apr 20, 2019
ce0f420
Set pocolm option - --limit-unk-history=true
saikiranvalluri Apr 23, 2019
9487ce1
Removed unused code
saikiranvalluri Apr 23, 2019
25609c5
Fix in checking for empty space lines in lexicon
saikiranvalluri Apr 23, 2019
510db0f
Fix in RNNLM rescoring decode stage
saikiranvalluri Apr 25, 2019
9894f4c
Update run.sh
saikiranvalluri Apr 26, 2019
3bdb541
Update clean_txt_dir.sh
saikiranvalluri May 20, 2019
6636557
Update run.sh
saikiranvalluri Jun 9, 2019
69b1bca
Merge branch 'master' into feature/Spanish_gigaword_LM
saikiranvalluri Jun 9, 2019
36499a7
Update run.sh
saikiranvalluri Jul 7, 2019
8da5c3e
Reverse the order of Abbreviation process after punct syms
saikiranvalluri Jul 13, 2019
510b415
Update run_norm.sh
saikiranvalluri Aug 21, 2019
38 changes: 0 additions & 38 deletions egs/fisher_callhome_spanish/s5/RESULTS

This file was deleted.

13 changes: 6 additions & 7 deletions egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh
@@ -27,6 +27,7 @@ nnet3_affix=  # affix for exp dirs, e.g. it was _cleaned in tedlium.
 affix=1g  # affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration.
 common_egs_dir=
 reporting_email=
+gigaword_workdir=
 
 # LSTM/chain options
 train_stage=-10
@@ -254,11 +255,6 @@ if [ $stage -le 21 ]; then
 
 fi
 
-rnnlmdir=exp/rnnlm_lstm_tdnn_1b
-if [ $stage -le 22 ]; then
-  local/rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1;
-fi
-
 if [ $stage -le 23 ]; then
   frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
   rm $dir/.error 2>/dev/null || true
@@ -277,8 +273,11 @@ if [ $stage -le 23 ]; then
         --online-ivector-dir exp/nnet3/ivectors_${data}_hires \
         $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data} || exit 1;
       done
-      bash local/rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $rnnlmdir data/${data}_hires/ \
-        ${dir}/decode_${lmtype}_${data} $dir/decode_rnnLM_${lmtype}_${data} || exit 1;
+      if [ $gigaword_workdir ]; then
+        lmtype=fsp_train
+        bash rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $gigaword_workdir/rnnlm data/${data}_hires/ \
+          ${dir}/decode_${lmtype}_${data} $dir/decode_gigaword_RNNLM_${lmtype}_${data} || exit 1;
+      fi
     ) || touch $dir/.error &
   done
   wait
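The new rescoring branch only fires when the script is invoked with a Gigaword work directory. A minimal sketch of the intended call, assuming the usual utils/parse_options.sh option handling and a hypothetical directory name:

# Hypothetical invocation; exp/gigaword_lm stands in for whatever directory
# the Gigaword LM recipe produced (it must contain an rnnlm/ subdirectory
# for the rescoring stage to pick up).
local/chain/run_tdnn_1g.sh --gigaword-workdir exp/gigaword_lm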
35 changes: 35 additions & 0 deletions egs/fisher_callhome_spanish/s5/local/clean_abbrevs_text.py
@@ -0,0 +1,35 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# 2018 Saikiran Valluri, GoVivace inc.

import os, sys
import re
import codecs

if len(sys.argv) < 3:
    print("Usage : python clean_abbrevs_text.py <Input text> <output text>")
    print("        Processes the text before text normalisation to convert uppercase words to space-separated letters")
    sys.exit()

inputfile = codecs.open(sys.argv[1], encoding='utf-8')
outputfile = codecs.open(sys.argv[2], encoding='utf-8', mode='w')

for line in inputfile:
    words = line.split()
    textout = ""
    wordcnt = 0
    for word in words:
        # Split all-uppercase abbreviations, but leave the first
        # alphabetic word of the line intact.
        if re.match(r"\b([A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ])+[']?s?\b", word):
            if wordcnt > 0:
                word = re.sub(r"'?s", "s", word)
                textout = textout + " ".join(word) + " "
            else:
                textout = textout + word + " "
        else:
            textout = textout + word + " "
            if word.isalpha(): wordcnt = wordcnt + 1
    outputfile.write(textout.strip() + '\n')

inputfile.close()
outputfile.close()
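A quick sketch of the behavior on a made-up line (file names are placeholders): an all-uppercase token that is not the first alphabetic word is split into space-separated letters, while a sentence-initial one is left alone.

# Hypothetical session:
#   $ echo "ella trabaja en NASA" > in.txt
#   $ python3 local/clean_abbrevs_text.py in.txt out.txt
#   $ cat out.txt
#   ella trabaja en N A S A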
57 changes: 57 additions & 0 deletions egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh
@@ -0,0 +1,57 @@
#!/bin/bash

# Script to clean up Gigaword LM text.
# Removes punctuation and does case normalization via the Sparrowhawk normalizer.

stage=0
nj=500

. ./path.sh
. ./cmd.sh
. ./utils/parse_options.sh

set -euo pipefail

if [ $# -ne 2 ]; then
  echo "Usage: $0 <textdir> <outdir>"
  exit 1;
fi

# `which` prints nothing when the binary is absent, so test its exit status
# rather than the (possibly empty) string it prints.
if ! which normalizer_main >/dev/null 2>&1; then
  echo "Sparrowhawk normalizer is not installed!"
  echo "Go to $KALDI_ROOT/tools, execute install_sparrowhawk.sh and try again!"
  exit 1
fi

txtdir=$1
textdir=$(realpath $txtdir)
outdir=$(realpath $2)

workdir=$outdir/tmp
if [ $stage -le 0 ]; then
  rm -rf $outdir
  mkdir -p $workdir
  mkdir -p $textdir/splits
  mkdir -p $outdir/data
  split -l 1000000 $textdir/in.txt $textdir/splits/out
  numsplits=0
  for x in $textdir/splits/*; do
    numsplits=$((numsplits+1))
    ln -s $x $outdir/data/$numsplits
  done
  echo $numsplits
  cp $SPARROWHAWK_ROOT/documentation/grammars/sentence_boundary_exceptions.txt .
  $train_cmd --max_jobs_run 100 JOB=1:$numsplits $outdir/sparrowhawk/log/JOB.log \
    local/run_norm.sh \
      sparrowhawk_configuration.ascii_proto \
      $SPARROWHAWK_ROOT/language-resources/esp/sparrowhawk/ \
      $outdir/data \
      JOB \
      $outdir/sparrowhawk/
  cat $outdir/sparrowhawk/*.txt | sed "/^$/d" > $outdir/text_normalized

  # Check whether any digits survived normalization.
  awk '{for(i=1;i<=NF;i++) {if (!seen[$i]) {print $i; seen[$i]=1} }}' \
    $outdir/text_normalized > $outdir/unique_words
  grep "[0-9]" $outdir/unique_words | sort -u > $outdir/numbers
fi
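A sketch of the expected invocation, under the assumption that the flattened corpus has been written to <textdir>/in.txt (the script hard-codes that name) and that Sparrowhawk was installed via tools/install_sparrowhawk.sh:

# Directory names are illustrative only.
local/clean_txt_dir.sh data/gigaword_flat data/gigaword_clean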
6 changes: 3 additions & 3 deletions egs/fisher_callhome_spanish/s5/local/ctm.sh
@@ -19,9 +19,9 @@ fi
 steps/get_ctm.sh $data_dir $lang_dir $decode_dir
 
 # Make sure that channel markers match
-#perl -i -pe "s:\s.*_fsp-([AB]): \1:g" data/dev/stm
-#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s1\s:fsp A :g' {}
-#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s2\s:fsp B :g' {}
+#sed -i "s:\s.*_fsp-([AB]): \1:g" data/dev/stm
+#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s1\s:fsp A :g' {}
+#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s2\s:fsp B :g' {}
 
 # Get the environment variables
 . /export/babel/data/software/env.sh
15 changes: 15 additions & 0 deletions egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
set -e

# Path to Gigaword corpus with all data files decompressed.
export GIGAWORDDIR=$1
# The directory to write output to
export OUTPUTDIR=$2
# The number of jobs to run at once
export NUMJOBS=$3

echo "Flattening Gigaword with ${NUMJOBS} processes..."
mkdir -p $OUTPUTDIR
find ${GIGAWORDDIR}/data/*/* -type f -print -exec local/flatten_gigaword/run_flat.sh {} ${OUTPUTDIR} \;
echo "Combining the flattened files into one..."
cat ${OUTPUTDIR}/*.flat > ${OUTPUTDIR}/flattened_gigaword.txt
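A hypothetical invocation (paths are illustrative): the script takes the decompressed Gigaword root, an output directory, and a job count. Note that NUMJOBS is exported and echoed, but the find -exec loop processes files one at a time, so the count is informational here.

# /path/to/gigaword_es is a placeholder for the decompressed corpus root.
local/flatten_gigaword/flatten_all_gigaword.sh /path/to/gigaword_es data/gigaword_flat 8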
61 changes: 61 additions & 0 deletions egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py
@@ -0,0 +1,61 @@
# -*- coding: utf-8 -*-

import logging
import os
import re
import spacy
import gzip

from argparse import ArgumentParser
from bs4 import BeautifulSoup

en_nlp = spacy.load("es")


def flatten_one_gigaword_file(file_path):
    f = gzip.open(file_path)
    html = f.read()
    # Parse the text with BeautifulSoup
    soup = BeautifulSoup(html, "html.parser")

    # Iterate over all <p> items and get the text for each.
    all_paragraphs = []
    for paragraph in soup("p"):
        # Turn inter-paragraph newlines into spaces
        paragraph = paragraph.get_text()
        paragraph = re.sub(r"\n+", "\n", paragraph)
        paragraph = paragraph.replace("\n", " ")
        # Tokenize the paragraph into words
        tokens = en_nlp.tokenizer(paragraph)
        words = [str(token) for token in tokens if not
                 str(token).isspace()]
        if len(words) < 3:
            continue
        all_paragraphs.append(words)
    # Return a list of strings, where each string is a
    # space-tokenized paragraph.
    return [" ".join(paragraph) for paragraph in all_paragraphs]


if __name__ == "__main__":
    log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_fmt)
    logger = logging.getLogger(__name__)

    parser = ArgumentParser(description=("Flatten a gigaword data file for "
                                         "use in language modeling."))
    parser.add_argument("--gigaword-path", required=True,
                        metavar="<gigaword_path>", type=str,
                        help=("Path to Gigaword directory, with "
                              "all .gz files unzipped."))
    parser.add_argument("--output-dir", required=True, metavar="<output_dir>",
                        type=str, help=("Directory to write final flattened "
                                        "Gigaword file."))

    A = parser.parse_args()
    all_paragraphs = flatten_one_gigaword_file(A.gigaword_path)
    output_path = os.path.join(A.output_dir,
                               os.path.basename(A.gigaword_path) + ".flat")
    with open(output_path, "w") as output_file:
        for paragraph in all_paragraphs:
            output_file.write("{}\n".format(paragraph))
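Two environment assumptions worth flagging: the variable is named en_nlp (apparently carried over from an English-corpus original) but actually loads the Spanish pipeline, and spacy.load("es") fails unless the Spanish model has been downloaded first. Under spaCy 2.x that one-time setup would be:

# Run once in the Python environment used by run_flat.sh.
python -m spacy download es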
17 changes: 17 additions & 0 deletions egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
set -e

. ./path_venv.sh

# Path to Gigaword corpus with all data files decompressed.
GIGAWORDPATH=$1
# The directory to write output to
OUTPUTDIR=$2
file=$(basename ${GIGAWORDPATH})
if [ ! -e ${OUTPUTDIR}/${file}.flat ]; then
  echo "flattening to ${OUTPUTDIR}/${file}.flat"
  python local/flatten_gigaword/flatten_one_gigaword.py --gigaword-path ${GIGAWORDPATH} --output-dir ${OUTPUTDIR}
else
  echo "skipping ${file}.flat"
fi

1 change: 1 addition & 0 deletions egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh
@@ -133,6 +133,7 @@ if [ $stage -le 2 ]; then
   sed 's:</b::g' | \
   sed 's:<foreign langengullís>::g' | \
   sed 's:foreign>::g' | \
+  sed 's:\[noise\]:[noise] :g' | \
   sed 's:>::g' | \
   #How do you handle numbers?
   grep -v '()' | \
5 changes: 3 additions & 2 deletions egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh
@@ -105,8 +105,9 @@ if [ $stage -le 4 ]; then
   cp "$tmpdir/lexicon.1" "$tmpdir/lexicon.2"
 
   # Add prons for laughter, noise, oov
-  w=$(grep -v sil $dir/silence_phones.txt | tr '\n' '|')
-  perl -i -ne "print unless /\[(${w%?})\]/" $tmpdir/lexicon.2
+  for w in `grep -v sil $dir/silence_phones.txt`; do
+    sed -i "/\[$w\]/d" $tmpdir/lexicon.2
+  done
 
   for w in `grep -v sil $dir/silence_phones.txt`; do
     echo "[$w] $w"
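For readers skimming the diff: the new loop is behaviorally equivalent to the deleted perl one-liner. Assuming silence_phones.txt holds sil, laughter, noise and oov (typical for this recipe, but an assumption here), it expands to roughly:

# Drop any existing lexicon entries for the non-sil silence "words";
# the loop that follows re-adds one canonical pronunciation for each.
sed -i "/\[laughter\]/d" $tmpdir/lexicon.2
sed -i "/\[noise\]/d" $tmpdir/lexicon.2
sed -i "/\[oov\]/d" $tmpdir/lexicon.2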
39 changes: 39 additions & 0 deletions egs/fisher_callhome_spanish/s5/local/get_data_weights.pl
@@ -0,0 +1,39 @@
#!/usr/bin/env perl

# Nagendra Kumar Goel

# This takes two arguments:
# 1) Pocolm training output folder
# 2) rnnlm weights file name (for output)

use POSIX;
use List::Util qw[min max];

if (@ARGV != 2) {
  die "Usage: get_data_weights.pl <pocolm-folder> <output-file>\n";
}

$pdir = shift @ARGV;
$out = shift @ARGV;

open(P, "<$pdir/metaparameters") || die "Could not open $pdir/metaparameters";
open(N, "<$pdir/names") || die "Could not open $pdir/names";
open(O, ">$out") || die "Could not open $out for writing";

my %scores = ();

while (<N>) {
  @n = split(/\s/, $_);
  $name = $n[1];
  $w = <P>;
  @w = split(/\s/, $w);
  $weight = $w[1];
  $scores{$name} = $weight;
}

$min = min(values %scores);

for (keys %scores) {
  $weightout = POSIX::ceil($scores{$_} / $min);
  print O "$_\t1\t$weightout\n";
}
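A worked sketch with made-up numbers: names maps integer ids to corpus names, and the script pairs each name with the corresponding line of metaparameters, so the two files must be line-aligned. Each weight is divided by the smallest weight and rounded up, giving integer multiplicities for RNNLM training.

# Hypothetical inputs (field names and values are illustrative):
#   $pdir/names:           1 fisher
#                          2 gigaword
#   $pdir/metaparameters:  count_scale_1 0.8
#                          count_scale_2 0.2
# Resulting output file:
#   fisher    1   4    # ceil(0.8 / 0.2)
#   gigaword  1   1    # ceil(0.2 / 0.2)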
34 changes: 34 additions & 0 deletions egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py
@@ -0,0 +1,34 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# 2018 Saikiran Valluri, GoVivace inc.

import os, sys

if len(sys.argv) < 5:
    print("Usage: python get_rnnlm_wordlist.py <ASR lexicon words> <POCOLM wordslist> <RNNLM wordslist output> <OOV wordlist>")
    sys.exit()

lexicon_words = open(sys.argv[1], 'r', encoding="utf-8")
pocolm_words = open(sys.argv[2], 'r', encoding="utf-8")
rnnlm_wordsout = open(sys.argv[3], 'w', encoding="utf-8")
oov_wordlist = open(sys.argv[4], 'w', encoding="utf-8")

line_count = 0
lexicon = []

for line in lexicon_words:
    lexicon.append(line.split()[0])
    rnnlm_wordsout.write(line.split()[0] + " " + str(line_count) + '\n')
    line_count = line_count + 1

for line in pocolm_words:
    if not line.split()[0] in lexicon:
        oov_wordlist.write(line.split()[0] + '\n')
        rnnlm_wordsout.write(line.split()[0] + " " + str(line_count) + '\n')
        line_count = line_count + 1

lexicon_words.close()
pocolm_words.close()
rnnlm_wordsout.close()
oov_wordlist.close()
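A sketch of the intended call (paths are illustrative): the ASR lexicon words come first so their indices stay stable, and every pocolm word missing from the lexicon is appended to the RNNLM wordlist and also logged as an OOV, presumably for the later G2P stage. Note that lexicon is a plain list, so the membership test is linear; for a large pocolm vocabulary a set would be noticeably faster.

python3 local/get_rnnlm_wordlist.py data/lang/words.txt \
    gigaword_work/pocolm/wordlist gigaword_work/rnnlm_wordlist \
    gigaword_work/oov_pocolm_words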