#!/usr/bin/perl
# dot2mlf by V.Valtchev, Jun 1993
# modified from dot2lsn by Doug Paul, May 1993
# ammended by Will Bradley CUED August 2001
# adapted to suit MCWSJAV/WSJCAM0 needs by Marc Puels UPB 2013

$trnndx = '';
$dotdir = '.';         #changed from '' to '.' WJB
$putsil = 0;
$corpus = 'wsjcam0';
$argdotfile = '';
$putesc = 0;
$dflg=0;	# deletion test mode flag

%processed_sentids = ();

if ($#ARGV == -1) {
  &print_help;
  exit(1);
}
&process_opt;
print "#!MLF!#\n";
&process_files;
exit(0);

sub print_help
{
  print "dot2mlf usage: dot2mlf <options> index_file\n";
  print "Options\n";
  print "-d dir     directory to find dot files                    current\n";
  print "-e         print \\ before punctuation chars              disabled\n";
  print "-s         generate silence models                        disabled\n";
  print "-c corpus  Either 'mcwsjav' or 'wsjcam0'. Corpora\n";
  print "           have differing utterance id formats.           wsjcam0\n";
  print "-f dotfile additional dot file\n";
}

sub getcm
{
  if ($_[0] > $#ARGV){
    &perr("Missing argument");
  }
  $ARGV[$_[0]];
}

sub process_opt
{
  $i = 0;
  while ($ARGV[$i] =~ /^-/) {
    $flag = $ARGV[$i];
    ifcase: {
      ($flag =~ /^-d/) && do { $dotdir = &getcm(++$i); last ifcase; };
      ($flag =~ /^-e/) && do { $putesc = 1; last ifcase; };
      ($flag =~ /^-s/) && do { $putsil = 1; last ifcase; };
      ($flag =~ /^-c/) && do { $corpus = &getcm(++$i); last ifcase; };
      ($flag =~ /^-f/) && do { $argdotfile = &getcm(++$i); last ifcase; };
#      { print "unknown switch $flag\n"; exit(1) };
    }
    &getcm(++$i);
  }

  if(! $corpus =~ /^wsjcam0$|^mcwsjav$/)
  {
      &perr("Unknown corpus $corpus.\n");
  }

  $trnndx = &getcm($i);
  if ($i != $#ARGV){
    print "Extra command arguments left unprocessed\n";
  }
  @ARGV=();
}

sub process_files
{
  if (!open(INFILE, "$trnndx"))
    { &perr("Cannot open file $trnndx"); }
  while(<INFILE>){
    if(/^;/) { next; }


    # get sentence id
    if ($corpus eq "wsjcam0")
    {
      # (/.+\/(\w{8})\.wvs\s+$/) && do { $sentid = $1; };     #WJB
        (/.+\/(\w{8})\....\s+$/) && do { $sentid = $1; };
        (/.+\/(\w{8})\s+$/)      && do { $sentid = $1; };
      # (/.+\/(\w{8})\.dot\s+$/) && do { $sentid = $1; };      #WJB
    } 
    elsif ($corpus eq "mcwsjav")
    {
        (/^.+_([a-zA-Z0-9]+)\....\s*$/) && do { $sentid = $1; };
    }
    else 
    {
        &perr("process_files: Unknown corpus: $corpus\n");
    }

    if (! $sentid) {                                       #WJB
	die "Invalid index file";                          #WJB
    }

    if ($processed_sentids{$sentid} eq "t") { next; }

    $sentid =~ /((...)...)../;
    $spkses = $1;
    $spkrid = $2;

    @dotfiles = ( "$dotdir" . '/' . "$spkrid" . '/' . "$spkses" . "00.dot",
                  "$dotdir" . '/' . "$spkses" . "00.dot",
                  $argdotfile );

    $dotfile = "";
    foreach (@dotfiles) {
        if(open(DOTFILE, $_)) {
            $dotfile = $_;
            last;
        }
    }

    if ($dotfile eq "") {
        $dotfiles_str = "";
        foreach (@dotfiles) {
            $dotfiles_str = $dotfiles_str . "\n" . $_;
        }
        &perr("Failed to open any of these dot files:\n$dotfiles_str\n");
    }

    $found = 0;
    dot: while(<DOTFILE>){
      $re_uid_match = "";
      if ($corpus eq "wsjcam0") {
        /^.+\((\w{8})\)\s+$/;
        $re_uid_match = $1;
      } elsif ($corpus eq "mcwsjav") {
        /\((\w+)\)$/;
        $re_uid_match = $1;
      }

      if ($re_uid_match eq $sentid){
        print "\"*/$sentid.lab\"\n";
        if ($putsil){
          print "sil\n";     # print begin of sentence silence
        }
        &process_line;
        while (/\s(\S+)\s/g){
          print "$1\n";
        }
        if ($putsil){
          print "sil\n";     # print end of sentence silence
        }
        print ".\n";
        $found = 1;
        $processed_sentids{$sentid} = "t";
        last dot;
      }
    }
    if (!$found)
      { &perr("Could not find sentence $sentid in dotfile $dotfile"); }
    close(DOTFILE);
  }
}

sub process_line		# process 1 line: dot -> lsn
{
  tr/a-z/A-Z/;		# xyz -> XYZ
  s/\n//;			# rm CR
  s/\s/  /g;		# double internal SP: removes need to loop
			# where successive replacements are needed
  s/^/ /;		# add init SP
  s/$/ /;		# add final SP

  # Many of the following steps assume that all real punctuation marks are
  # escaped by \, as per the spec.  (The \ is removed in the final step.)

  # false starts and incompletely spoken fragments
## correct fragments
#   s/\((\S+)\)-/$1/g;				# correct x(yz)- --> xyz
#   s/-\((\S+)\)/$1/g;				# correct -(xy)z --> xyz
#   s/(\S+)-/$1/g;				# correct xyz- --> xyz
#   s/-(\S+)/$1/g;				# correct -xyz --> xyz
## remove fragments
  s/\s\S+-\s/ /g;				# rm xyz- and x(yz)-
  s/\s-\S+\s/ /g;				# rm -xyz and -(xy)z

  # preserve some frequent non-speech events
  # s/\s\[LOUD_BREATH\]\s/ sil /g;
  # s/\s\[LIP_SMACK\]\s/ lip_smack /g;
  # s/\s\[TONGUE_CLICK\]\s/ tongue_click /g;

  # remove all other nse's
  s/\s\[\S+\]\s/ /g;			# rm [xyz]

  # mark unnatural pauses
  s/\s\.\s/ /g;			# rm . . . # ammended WJB 29/08/2001

#moved to this point by WJB from after waveform statement
  if ($dflg) {				# deletion test mode
    while(/\s(<\S+>)\s/g)		# find and print only <xyz>
    {  $wd=$1;
       /(\(\w{8}\))\s*$/;
       $id=$1;
       print "deleted $id: $wd\n";
    }
  } else { s/\s<(\S+)>\s/ $1 /g; }	# <xyz> -> xyz (default)

  # mispronounciations
  s/\s\*(\S+)\*\s/ $1 /g;		# *xyz* -> xyz

  # handle lengthening
  s/(\w):(\w)/$1$2/g;			# x:y -> xy
  s/\s:(\w)/ $1/g;			# :xyz -> xyz
  s/(\w):\s/$1 /g;			# xyz: -> xyz

  # emphatic stress
  s/\s!(\w)/ $1/g;			# !xyz -> xyz
  s/(\w)!(\w)/$1$2/g;                   # x!yz -> xyz occurs in WSJ1 WJB

  # waveform truncation
  s/\s~\s/ /g;				# ~ -> ' '

  # remove sentence id
  if($corpus eq "wsjcam0") {
      s/\s\(\w{8}\)\s/ /;
  } elsif ($corpus eq "mcwsjav") {
      s/\s\(\w{7,9}\)\s$/ /;
  }

  if (!$putesc){
    s/\\(\W)/$1/g;                      # rm \ before punct
  }
}

sub perr
{
    print STDERR "dot2mlf: $_[0]\n";    #WJB
  exit(1);
}
