#!/bin/bash -e

####################################################################################### 
#              REVERB  CHALLENGE -  automatic speech recognition                      # 
#                                                                                     # 
# scripts and tools written by:                                                       # 
# - Volker Leutnant,                                                                  # 
# - Marc Puels,                                                                       # 
# - Reinhold Haeb-Umbach                                                              # 
#                                                                                     # 
# Department of Communications Engineering, University of Paderborn, Germany          # 
#                                                                                     # 
# support: reverb-asr@lab.ntt.co.jp                                                   #
#######################################################################################

. printlib


# Abort the whole script when a required file is missing, so that follow-up
# commands do not flood the screen with error messages.
#
# Arguments: $1 - path that must refer to an existing regular file
# Exits:     with status 1 (after reporting through print_msg) when $1 is
#            empty or does not name a regular file; returns normally otherwise.
function stop_on_missing_file
{
  # "$1" must be quoted: unquoted, an empty argument or a path containing
  # whitespace makes the test degenerate and the check silently passes.
  if [ ! -f "$1" ]; then
    print_msg "Cannot find $1! Aborting"
    exit 1
  fi
}

print_header "$0"

print_subsec 'Configuration'
# Site-specific paths are kept in LOCAL_CONFIG. Without that file nothing
# below can work, so tell the user how to create it and stop right away.
if [ ! -e LOCAL_CONFIG ]; then
    print_msg 'Copy LOCAL_CONFIG.template to LOCAL_CONFIG and adapt paths as needed.'
    exit 1
fi
# Pull the configured variables into this shell.
. LOCAL_CONFIG

# tmpDir=`mktemp -d --suffix .$(basename $0) --tmpdir=${WORKPATH}`
# pushd . > /dev/null
# cd $tmpDir


# there is one reference MLF in the WSJLIB folder; created by mcwsjav_prepare_transcriptions
refMLF=${WSJLIB}/wlabs/mcwsjav/MC_WSJ_AV.mlf

# specify the training condition
trainCondition="multi_cond"
# trainCondition="clean"

# Parameters for recognition
beamPruningThreshold=250.0
insertionPenalty=0.0
languageModelWeight=14.0
# the number of Gaussians differs between the training conditions;
# initialised empty here and set in the case statement below
numberOfGaussians=
wordEndPruningThreshold=200.0

# parameters for adaptation
NUMBEROFREGRESSIONCLASSES="256"
OCCUPANCY="10000"

# select the results folders and the number of Gaussians for the chosen
# training condition
case "${trainCondition}" in
  multi_cond)
    RESULTSDIR_CMLLR=${MCWSJAVRESULTS_MC}_cmllr
    RESULTSDIR=${MCWSJAVRESULTS_MC}
    numberOfGaussians=12
    ;;
  clean)
    RESULTSDIR_CMLLR=${MCWSJAVRESULTS}_cmllr
    RESULTSDIR=${MCWSJAVRESULTS}
    numberOfGaussians=10
    ;;
  *)
    # Without a valid condition the result directories and the number of
    # Gaussians stay empty, so every later step would operate on bogus
    # paths; stop here instead of only printing a message.
    echo "Unknown training condition: '$trainCondition'; choose either 'clean' or 'multi_cond'" >&2
    exit 1
    ;;
esac

# time stamp appended to the name of the backup directory
# (year.month.day_hour.minute.second)
backupTimeStamp=$(date +%y.%m.%d_%H.%M.%S)
# The CMLLR results directory is written by this script. If it already
# exists, ask before touching it and keep a copy of the old content.
if [[ -d "${RESULTSDIR_CMLLR}" ]]; then
   print_msg "The results directory\n" \
	     "${RESULTSDIR_CMLLR}\n" \
	     "already exists!\n " \
	     "If you decide to proceed, its content will possibly be overwritten!"
   # -r keeps backslashes in the answer from being interpreted
   read -r -p "Continue? (Y/N)? " REPLY
   if [[ "${REPLY}" == [Yy] ]]; then
     print_msg "Continuing!"
     print_msg "Placing a backup at: ${RESULTSDIR_CMLLR}_${backupTimeStamp} !"
     cp -r -- "${RESULTSDIR_CMLLR}" "${RESULTSDIR_CMLLR}_${backupTimeStamp}"
   else
     print_msg "Exiting!"
     exit 1
   fi
else
  mkdir -p -- "${RESULTSDIR_CMLLR}"
fi

# Check for an existing baseline results directory. Unlike the CMLLR
# results directory, no confirmation is requested and no backup is made
# here: an MLF already present in this directory is reused further below as
# the starting point for the adaptation reference.
if [[ -d "${RESULTSDIR}" ]]; then
   print_msg "The results directory\n" \
	     "${RESULTSDIR}\n" \
	     "already exists!"
else
  mkdir -p -- "${RESULTSDIR}"
fi

# language model network and monophone dictionary
LM="${WSJLIB}/nets/5cnvp.net"
VOCAB="${WSJLIB}/dicts/5cnvpmono.dict"

# Only the base directory of the HMM set depends on the training condition;
# anything other than "multi_cond" selects the clean-condition models.
if [ "${trainCondition}" = "multi_cond" ]; then
  HMMBASE="${REVERBWSJHMMS}/W_reverb"
else
  HMMBASE="${WSJCAM0HMMS}/W2"
fi

# model directory name: "hmm" + number of Gaussians + "4" (e.g. hmm124)
HMMDIR="hmm${numberOfGaussians}4"
# the master macro file
MMF="${HMMBASE}/${HMMDIR}/MMF"
# the stats file
STATS="${HMMBASE}/${HMMDIR}/stats"
# the word internal triphone list after decision tree based clustering
HMMLIST="${HMMBASE}/winttree.list"


# List of recognition tasks
tasks="near far"
# List of test sets, taken from TASKSET (e.g. "dt et")
testSets=$TASKSET

# For every task / test set combination:
#   1. write the HTK configuration files for the adaptation,
#   2. obtain a "reference" MLF for the adaptation (reuse, align or recognise),
#   3. build the regression class tree and estimate the CMLLR transforms,
#   4. recognise with the transforms applied and score the result.
# $tasks and $testSets are whitespace separated lists; they are left
# unquoted on purpose so that word splitting yields the single items.
for task in $tasks; do
  for testSet in $testSets; do

    print_msg "Performing speaker independent CMLLR adaptation for task <$task> and test set <$testSet>!"

    # the task name
    taskName=RealData_${testSet}_for_1ch_${task}_room1_A
    # specify file list to be used - use only the 1ch task
    featureFilesList="${WSJLIB}/flists/mcwsjav/${taskName}.lst"
    # specify directory to store the transforms
    ADAPTATIONDIR="${HMMBASE}/${HMMDIR}/xforms_MCWSJAV/xforms_${taskName}"
    # start from scratch: drop whatever a previous run left behind
    rm -rf -- "${ADAPTATIONDIR}"

    # create subdirectory for xforms
    mkdir -p -- "${ADAPTATIONDIR}"
    # and subdirectory for regression class trees
    mkdir -p -- "${ADAPTATIONDIR}/classes"

# the maximum number of Gaussian mixtures per state is given by the silence model
# where the number of Gaussians is twice the number of Gaussians in all the other models
cat << EOF > "${ADAPTATIONDIR}/classes/global"
~b "global"
<MMFIDMASK> *
<PARAMETERS> MIXBASE
<NUMCLASSES> 1
<CLASS> 1 {*.state[2-4].mix[1-$((2*numberOfGaussians))]}
EOF

# HHEd script used below to build the regression class tree
cat << EOF > "${ADAPTATIONDIR}/regtree.hed"
RN "models"
LS "${STATS}"
RC ${NUMBEROFREGRESSIONCLASSES} "rtree"
EOF

# HLEd script used below to strip one label level before scoring
cat << EOF > "${ADAPTATIONDIR}/config.hled"
DL 1
EOF

# additional HVite configuration for the recognition with transforms
cat << EOF > "${ADAPTATIONDIR}/config.hvite"
HADAPT:TRACE = 61
HMODEL:TRACE = 512
EOF

# HERest configuration for the global (base class) CMLLR transform
cat << EOF > "${ADAPTATIONDIR}/config.cmllr_global"
HADAPT:TRANSKIND         = CMLLR
HADAPT:USEBIAS           = TRUE
HADAPT:BASECLASS         = global
HADAPT:ADAPTKIND         = BASE
HADAPT:KEEPXFORMDISTINCT = TRUE
HADAPT:TRACE = 61
HMODEL:TRACE = 512
EOF

# HERest configuration for the regression class tree based CMLLR transforms
cat << EOF > "${ADAPTATIONDIR}/config.cmllr_rc"
HADAPT:TRANSKIND         = CMLLR
HADAPT:USEBIAS           = TRUE
HADAPT:REGTREE           = rtree.tree
HADAPT:ADAPTKIND         = TREE
HADAPT:KEEPXFORMDISTINCT = TRUE
HADAPT:SPLITTHRESH       = ${OCCUPANCY}

HADAPT:TRACE = 61
HMODEL:TRACE = 512
EOF

    # all result files share the name of the feature file list without ".lst"
    listStem=$(basename "${featureFilesList}" .lst)

    # assign the MLF+LOG to keep the recognizer output
    resMLF="${RESULTSDIR}/${listStem}.mlf"
    resLog="${RESULTSDIR}/${listStem}.log"

    # assign the MLF+LOG to keep the recognizer output when CMLLR has been used
    resMLFCMLLR="${RESULTSDIR_CMLLR}/${listStem}.mlf"
    resLogCMLLR="${RESULTSDIR_CMLLR}/${listStem}.log"

    # assign the MLF to keep the "reference" for the adaptation
    refMLFforAdaptation="${RESULTSDIR_CMLLR}/${listStem}.aligned.mlf"


    print_subsub "Using feature file list:\n ${featureFilesList}"

    # check if the "reference" MLF already exists!
    if [ ! -f "${refMLFforAdaptation}" ]; then
      # check if the results MLF has already been created by a call to mcwsjav_recognize[_multicond]
      if [ -f "${resMLF}" ]; then
        # Try to infer the label level of the existing MLF by counting the
        # lines that contain a triphone-like label, i.e. "-<lowercase letters>+".
        # The pattern is quoted so that the shell cannot glob-expand it;
        # '|| true' keeps a count of zero (grep exit status 1) from aborting
        # the script, which runs with -e.
        triphoneLineCount=$(grep -E -c -- '-[a-z]+\+' "${resMLF}" || true)
        # if more than, e.g., 10 (to be absolutely sure) of those lines are available,
        # the MLF contains a triphone transcription, too
        if [ "${triphoneLineCount:-0}" -gt 10 ]; then
          # the MLF contains triphones (level 2 MLF)
          # proceed by copying the MLF to the one to be used for adaptation
          print_msg "Reference MLF to be used for CMLLR adaptation already exists!"
          cp -- "${resMLF}" "${refMLFforAdaptation}"
        else
          # perform a forced alignment with the results MLF to get triphone transcription
          print_msg "Performing aligment to create reference MLF to be used for CMLLR adaptation!"
          parallelHTK "${NBPROC}" HVite \
            -C "${WSJLIB}/configs/config.align" \
            -A -D -T 1 \
            -t 250.0 150.0 2000.0 \
            -I "${resMLF}" \
            -i "${refMLFforAdaptation}" \
            -S "${featureFilesList}" \
            -H "${MMF}" \
            -m \
            -o SWT \
            -a \
            -X rec \
            "${VOCAB}" \
            "${HMMLIST}"
        fi
      else
        print_msg "Performing recognition to create reference MLF to be used for CMLLR adaptation!"

        # call HVite with the appropriate parameters to generate word- and sub-word-level alignment
        parallelHTK "${NBPROC}" HVite \
          -A -D -T 1 \
          -C "${WSJLIB}/configs/config.align" \
          -t "${beamPruningThreshold}" \
          -v "${wordEndPruningThreshold}" \
          -s "${languageModelWeight}" \
          -p "${insertionPenalty}" \
          -i "${resMLF}" \
          -w "${LM}" \
          -m \
          -S "${featureFilesList}" \
          -H "${MMF}" \
          "${VOCAB}" \
          "${HMMLIST}"

        # evaluate the MLF output by the recognizer by comparing it against the reference MLF
        # since the reference is level 1 mlf and the recognized part is level 2 mlf, we need to
        # delete the sub-word level (HLEd command "DL 1") from the generated label file in $resMLF.
        HLEd -i "${resMLF}.lvl2.mlf" -X rec "${ADAPTATIONDIR}/config.hled" "${resMLF}"

        # the -k mask is quoted so that the shell passes it on literally
        # instead of expanding it against file names
        HResults \
          -A -D -T 1 \
          -h \
          -n \
          -k '*_T%%*' \
          -z \!NULL \
          -e \!NULL \!SENT_START \
          -e \!NULL \!SENT_END \
          -I "${refMLF}" \
          "${VOCAB}" \
          "${resMLF}.lvl2.mlf" \
          > "${resLog}"

        # check
        stop_on_missing_file "${resLog}"
        # remove the temporary MLF that was only needed for scoring
        rm -f -- "${resMLF}.lvl2.mlf"

        # the resMLF will be used as the true MLF to perform adaptation
        cp -- "${resMLF}" "${refMLFforAdaptation}"
      fi
    else
      print_msg "Reference MLF to be used for CMLLR adaptation already exists, proceeding!"
    fi

    # double check
    stop_on_missing_file "${refMLFforAdaptation}"

    # Create regression class tree
    HHEd -A -D -T 1 \
      -H "${MMF}" \
      -M "${ADAPTATIONDIR}/classes" \
      "${ADAPTATIONDIR}/regtree.hed" \
      "${HMMLIST}"

    # HERest cannot be "distributed" for adaptation
    # (cannot use parallelHTK)
    # estimate base transform
    HERest \
      -T 1 \
      -A \
      -D \
      -I "${refMLFforAdaptation}" \
      -H "${MMF}" \
      -u a \
      -J "${ADAPTATIONDIR}/classes" \
      -K "${ADAPTATIONDIR}" cmllr1 \
      -S "${featureFilesList}" \
      -C "${WSJLIB}/configs/config.herest" \
      -C "${ADAPTATIONDIR}/config.cmllr_global" \
      -X rec \
      "${HMMLIST}"

    # check for created xform
    stop_on_missing_file "${ADAPTATIONDIR}/mfc.cmllr1"

    # estimate the regression class specific transform based on the base transform
    HERest \
      -T 1 \
      -A \
      -D \
      -a \
      -I "${refMLFforAdaptation}" \
      -H "${MMF}" \
      -u a \
      -J "${ADAPTATIONDIR}" cmllr1 \
      -J "${ADAPTATIONDIR}/classes" \
      -K "${ADAPTATIONDIR}" cmllr2 \
      -S "${featureFilesList}" \
      -C "${WSJLIB}/configs/config.herest" \
      -C "${ADAPTATIONDIR}/config.cmllr_rc" \
      -X rec \
      "${HMMLIST}"

    # check for created xform
    stop_on_missing_file "${ADAPTATIONDIR}/mfc.cmllr2"

    # Recognize test data with xform applied to MMF
    parallelHTK "${NBPROC}" HVite \
      -A -D -T 1 \
      -C "${WSJLIB}/configs/config.align" \
      -C "${ADAPTATIONDIR}/config.hvite" \
      -l '*' \
      -t "${beamPruningThreshold}" \
      -v "${wordEndPruningThreshold}" \
      -s "${languageModelWeight}" \
      -p "${insertionPenalty}" \
      -i "${resMLFCMLLR}" \
      -k \
      -w "${LM}" \
      -J "${ADAPTATIONDIR}" cmllr2 \
      -J "${ADAPTATIONDIR}/classes" \
      -H "${MMF}" \
      -S "${featureFilesList}" \
      "${VOCAB}" \
      "${HMMLIST}"

    # Compute results using the adapted model
    HResults \
      -A -D -T 1 \
      -h \
      -n \
      -k '*_T%%*' \
      -z \!NULL \
      -e \!NULL \!SENT_START \
      -e \!NULL \!SENT_END \
      -I "${refMLF}" \
      "${VOCAB}" \
      "${resMLFCMLLR}" \
      > "${resLogCMLLR}"

    stop_on_missing_file "${resLogCMLLR}"
  done
done

# popd > /dev/null
# rm -rf ${tmpDir}
