#!/bin/bash -e

####################################################################################### 
#              REVERB  CHALLENGE -  automatic speech recognition                      # 
#                                                                                     # 
# scripts and tools written by:                                                       # 
# - Volker Leutnant,                                                                  # 
# - Marc Puels,                                                                       # 
# - Reinhold Haeb-Umbach                                                              # 
#                                                                                     # 
# Department of Communications Engineering, University of Paderborn, Germany          # 
#                                                                                     # 
# support: reverb-asr@lab.ntt.co.jp                                                   #
#######################################################################################

# Abort on the first failing command.  The "-e" on the shebang line is only
# honoured when the script is executed directly; setting it here keeps the
# same behaviour when the script is started as "bash <script>".
set -e

# Output helpers; presumably the source of print_header, print_subsec and
# print_msg used throughout this script.
. printlib

print_header "$0"

print_subsec "Configuration"

# Site-specific settings (paths to corpora, dictionaries and the working
# directory) are kept in LOCAL_CONFIG in the current directory.
# It is sourced with an explicit "./": a bare file name is searched for in
# PATH first, so a different file than the one tested by "-e" could be loaded.
if [ -e LOCAL_CONFIG ] ; then
    . ./LOCAL_CONFIG
else
    print_msg 'Copy LOCAL_CONFIG.template to LOCAL_CONFIG and adapt paths as needed.'
    exit 1
fi

#######################################
# Delete the files left behind by a previous run of this script.
# Globals:   WSJLIB (read) - root of the working directory tree
# Arguments: none
# Outputs:   a section heading via print_subsec; nothing else
# Returns:   exit status of the last rm invocation
#######################################
function clean_up
{
    print_subsec "Removing old files."

    local dicts_dir="$WSJLIB/dicts"
    local name

    # Every file is addressed by its full path instead of changing into the
    # dictionary directory first.  That way the caller's working directory and
    # directory stack are never touched, popd prints nothing, and a failing
    # "cd" can no longer cause same-named files in the current directory to
    # be deleted.
    for name in \
        cmu.dictionary \
        specialDictionary.txt \
        special.dict \
        mono.dict \
        sortedcmu.dict \
        makedict.ded \
        si_tr_mono.dict \
        5cnvpmono.dict
    do
        rm -f -- "$dicts_dir/$name"
    done

    # Generated word lists, phone list and HLEd edit script.
    rm -f -- \
        "$WSJLIB/wlists/5cnvp.wlist" \
        "$WSJLIB/mlists/mono0.list" \
        "$WSJLIB/wlabs/sent.led" \
        "$WSJLIB/wlists/si_tr.wlist"
}

# Create the working directories below $WSJLIB.  mlists, wlists and dicts are
# written to by this script; mlabs and nets are not used here and are
# presumably needed by later stages.
# The paths are quoted so that a $WSJLIB containing white space or glob
# characters is handled correctly.
mkdir -p -- \
    "$WSJLIB/mlists" \
    "$WSJLIB/mlabs" \
    "$WSJLIB/wlists" \
    "$WSJLIB/nets" \
    "$WSJLIB/dicts"

# Remove all files that are generated by this script.
clean_up

print_subsec "Prepare Monophone Dictionary"

# Work on a copy of the training-data MLF (created by the transcription
# preparation step).  The copy is the input from which HLEd later writes
# si_tr_words.mlf.
cp -- "$WSJLIB/wlabs/si_tr.mlf" "$WSJLIB/wlabs/si_tr_words.mlf.tmp"


# All dictionary files below are addressed relative to $WSJLIB/dicts.  The
# previous directory is kept on the directory stack for the popd at the end
# of the script.  Stop right away if the directory cannot be entered, so that
# nothing is written to an unintended location.
pushd "$WSJLIB/dicts" > /dev/null || exit 1

# Get word list from language model --> this is the same as the one
# contained in the original WSJ database!

# Find the wlist5c.nvp file.
# It is in the etc directory; the DVD version has just one, the CD version
# has one per CD, all with the same content.  Only the first hit is used.
wlistFile=$(find "$WSJCAM0" -type f -name wlist5c.nvp | head -n 1)
if [ -z "$wlistFile" ] ; then
    # Without this check mapsym would be started without an input file.
    print_msg "Could not find wlist5c.nvp below '$WSJCAM0'."
    exit 1
fi

# Map the symbols of the word list, drop every line containing '#' and store
# the sorted result (sorted input is required by join further down).
mapsym -e "$wlistFile" \
  | grep -v '#' \
  | sort \
  > "$WSJLIB/wlists/5cnvp.wlist"

expandedBeepDict=$WSJLIB/dicts/beep.dict
# convert tabs to spaces
expand "$BEEP_DICT" > "$expandedBeepDict"

# Produce a sorted version of the beep dict with fixes to remove errors and
# multiple pronunciation notation.
print_msg "MAPSYM and sorting!"
mapsym "$expandedBeepDict" > sorted.dict.tmp
# - drop every line containing '#'
# - sed: strip "(<digit>)" markers and the literal "(20", rewrite E21 to EH2
# - sort on the first space-separated field (the word) and drop duplicate
#   lines
grep -v '#' sorted.dict.tmp \
  | sed 's/([0-9])//g;s/(20//g;s/E21/EH2/g' \
  | sort -t " " -k 1,1 \
  | uniq \
  > sorted.dict
# remove tmp
rm -f sorted.dict.tmp


print_msg "Retrieving words with missing pronuncitiation in BEEP" \
    " dictionary using the 5cnvp word list!"
# "-v 1" prints only those lines of the first file whose join field (the
# word) has no partner in the second file, i.e. the words of the 5cnvp list
# that sorted.dict does not contain.  Both inputs were sorted above.
# The option is given before the file operands, as required by join
# implementations that do not permute their arguments.
join -v 1 "$WSJLIB/wlists/5cnvp.wlist" sorted.dict \
  > missingWords_wlist.txt

# Create file sent.led to put sentence limiters to transcriptions.
echo 'IS !SENT_START !SENT_END' > "$WSJLIB/wlabs/sent.led"
# Add sent start and end: HLEd applies the edit script sent.led to the copied
# training MLF and writes the result to si_tr_words.mlf (-i).
# -A, -D and -T 1 are the usual HTK options for echoing the command line,
# displaying configuration settings and basic tracing.
HLEd \
    -A -D -T 1 \
    -l '*' \
    -i "$WSJLIB/wlabs/si_tr_words.mlf" \
    "$WSJLIB/wlabs/sent.led" \
    "$WSJLIB/wlabs/si_tr_words.mlf.tmp"
# remove tmp: the copy is no longer read by this script once HLEd has
# written si_tr_words.mlf
rm -f -- "$WSJLIB/wlabs/si_tr_words.mlf.tmp"

print_subsec 'Generate si training monophone dictionary'

# Extract a word list from the mlf: drop header/comment lines (leading '#'),
# the label file names (lines ending in .lab") and the "." lines that
# terminate each transcription; what remains are the words, which are stored
# sorted and without duplicates.
# One grep with several -e patterns removes the same lines as a chain of
# "grep -v" calls; the double quote needs no backslash inside the pattern.
grep -v -e '^#' -e '\.lab"$' -e '^\.$' "$WSJLIB/wlabs/si_tr_words.mlf" \
  | sort -u  \
  > "$WSJLIB/wlists/si_tr.wlist"

print_msg "Retrieving words with missing pronuncitiation in BEEP" \
    " dictionary using the word list of the training data!"
# "-v 1": keep only the training words that have no entry in sorted.dict.
# The option precedes the file operands for join implementations that do not
# permute their arguments.
join -v 1 "$WSJLIB/wlists/si_tr.wlist" sorted.dict \
  > missingWords_tr.txt

# Combine the two lists of words without a BEEP pronunciation (training data
# first, 5cnvp word list second) in a scratch file, write the sorted,
# duplicate-free union to missingWords.txt and discard the scratch file.
cat missingWords_tr.txt missingWords_wlist.txt > missingWords.txt.tmp
sort -u -o missingWords.txt missingWords.txt.tmp
rm -f missingWords.txt.tmp
 
print_msg "Retrieving missing words' pronunciation from the CMU" \
    "dictionary! Note: pronunciation is AE not BE!"
mapsym "${CMU_DICT}" \
  > cmu.tmp

# Same clean-up as for the BEEP dictionary: drop lines containing '#', strip
# "(<digit>)" markers and the literal "(20", rewrite E21 to EH2, then sort on
# the word and drop duplicate lines.
grep -v '#' cmu.tmp \
  | sed 's/([0-9])//g;s/(20//g;s/E21/EH2/g' \
  | sort -t " " -k 1,1 \
  | uniq \
  > cmu.dictionary
# remove tmp
rm -f cmu.tmp

# Look the missing words up in the CMU dictionary:
# - specialDictionary.txt receives the entries that were found,
# - stillMissingWords.txt the words found in neither dictionary ("-v 1",
#   given before the file operands for join implementations that do not
#   permute their arguments).
join missingWords.txt cmu.dictionary \
  > specialDictionary.txt
join -v 1 missingWords.txt cmu.dictionary \
  > stillMissingWords.txt

# the still missing words should be !SENT_END and !SENT_START;
# both are given the pronunciation "sil"
cat << "EOF" >> specialDictionary.txt
!SENT_END sil
!SENT_START sil
EOF

# Clean and sort the combined special dictionary the same way as above.
grep -v '#' specialDictionary.txt \
  | sed 's/([0-9])//g;s/(20//g;s/E21/EH2/g' \
  | sort -t " " -k 1,1 \
  | uniq \
  > special.dict

# Global HDMan edit script: make all phones lower case and remove multiple
# sils and sps.  Per the HDMan edit command reference: LP lower-cases the
# phones, RS removes stress marks (cmu convention), AS appends the given
# models to each pronunciation and "MP x a b" merges the sequence "a b"
# into x.
cat << "EOF" > makedict.ded
LP
RS cmu
AS sp sil
MP sil sil sp
MP sp sp sp
MP sp sp sil
MP sil sil sil
EOF

print_msg "HDMAN - merging dicts!"
# Merge (-m) sorted.dict and special.dict into mono.dict using the edit
# script above (-g); the list of phones encountered goes to mono1.list (-n).
HDMan \
    -A -D -T 1 \
    -m \
    -g makedict.ded \
    -n "${WSJLIB}/mlists/mono1.list" \
    mono.dict \
    sorted.dict \
    special.dict

# Produce monophones list without silence models (needed elsewhere).
# "grep -E" replaces the obsolescent egrep, which prints a warning with
# current GNU grep; the pattern and therefore the selected lines are the
# same (every line containing "sil" or "sp" is dropped).
grep -E -v 'sil|sp' "${WSJLIB}/mlists/mono1.list" \
  | sort \
  > "${WSJLIB}/mlists/mono0.list"


# Get pronunciations for words in si training set: restrict mono.dict to the
# words of si_tr.wlist (-w); HDMan writes its log to si_tr_mono.dict.log (-l).
# The word list paths are quoted so that a $WSJLIB containing white space is
# passed as a single argument.
HDMan \
    -A -D -T 1 \
    -m \
    -w "$WSJLIB/wlists/si_tr.wlist" \
    -l si_tr_mono.dict.log \
    si_tr_mono.dict \
    mono.dict

print_subsec 'Generate 5cnvp monophone dictionary'

# Same extraction for the words of the 5cnvp word list.
HDMan \
    -A -D -T 1 \
    -m \
    -w "$WSJLIB/wlists/5cnvp.wlist" \
    -l 5cnvpmono.dict.log \
    5cnvpmono.dict \
    mono.dict

# Back to the directory the script was in before entering $WSJLIB/dicts.
popd > /dev/null
