#!/usr/bin/perl
# mapsym by V.Valtchev, Jun 1993
# this program converts words starting with meta-characters to the HTK compliant !<word>
# format. The list of words used in the s/ statements was collected by running "findesc"
# on the wfl-64 word list
#
# additional mapped words added for wsj1 training - pcw 2/10/93
# additional command line options - vv1 22/10/93
# additional mapped words for WSJ0 training - WJB 24/08/2001
# spaces removed from ends of output words -WJB
# additional mapped words for BEEP training - VL 01/04/2013


# Global option state.  process_opt overwrites these from the command line:
#   $file    - path of the input file to convert
#   $semicom - when set, lines starting with ';' are treated as comments
#   $maphyph - when set, hyphens inside words are mapped to underscores
#   $addsent - when set, !SENT_START / !SENT_END are appended to the output
#   $keepcom - when set (the default), comment lines are copied to the output
($file, $semicom, $maphyph, $addsent, $keepcom) = ('', 0, 0, 0, 1);

# Called with no arguments at all: show the usage text and fail.
unless (@ARGV) {
  print_help();
  exit(1);
}

process_opt();
process_file();
exit(0);

# Print the usage summary to the currently selected output handle.
# The trailing "disabled" on each option line is that option's default state.
sub print_help
{
  my @usage = (
    "mapsym usage: mapsym <options> file\n",
    "Options\n",
    "-e   add sentence start/end disabled\n",
    "-h   map - into _           disabled\n",
    "-s   treat ; as comment     disabled\n",
    "-w   wipe comments          disabled\n",
  );
  foreach my $line (@usage) {
    print $line;
  }
}

# Return the command-line argument at the given index of @ARGV.
# If the index is past the last argument, report "Missing argument"
# through perr (which terminates the program).
sub getcm
{
  my ($index) = @_;
  perr("Missing argument") if $index > $#ARGV;
  return $ARGV[$index];
}

# Parse the command line held in @ARGV.
#
# Leading arguments that start with '-' are option switches and set the
# global flags $addsent (-e), $semicom (-s), $maphyph (-h) and clear
# $keepcom (-w).  Switches are matched on their prefix only.  The first
# argument that is not a switch is stored in $file.  An unknown switch
# terminates the program with exit status 1; a missing file name is
# reported by getcm.  Any arguments after the file name are ignored with
# a warning.  @ARGV is emptied on return.
#
# Diagnostics are written to STDERR so that they cannot end up mixed into
# the converted text that process_file writes to STDOUT.
sub process_opt
{
  my $i = 0;
  while ($ARGV[$i] =~ /^-/) {
    my $flag = $ARGV[$i];
    if    ($flag =~ /^-e/) { $addsent = 1; }
    elsif ($flag =~ /^-s/) { $semicom = 1; }
    elsif ($flag =~ /^-h/) { $maphyph = 1; }
    elsif ($flag =~ /^-w/) { $keepcom = 0; }
    else {
      print STDERR "unknown switch $flag\n";
      exit(1);
    }
    # Advance to the next argument; getcm aborts if there is none, which
    # also guarantees $ARGV[$i] is defined for the loop test above.
    &getcm(++$i);
  }
  $file = &getcm($i);
  if ($i != $#ARGV) {
    print STDERR "Extra command arguments left unprocessed\n";
  }
  @ARGV = ();
}

# Convert the file named by the global $file and print the result to the
# currently selected output handle.
#
# Each input line is handled as follows:
#   * Lines starting with '#' (and, when $semicom is set, lines starting
#     with ';') are comments: they are copied through unchanged when
#     $keepcom is set and dropped otherwise.  Note that this test runs
#     before any mapping, so a line that begins with "#SHARP-SIGN" is
#     treated as a comment.
#   * Every other line goes through the ordered list of substitutions
#     below, which map words starting with a meta-character onto the
#     "!WORD" form and correct some known transcription mistakes.
#   * When $maphyph is set, every hyphen that sits between two
#     non-whitespace characters is turned into an underscore.
# After the last line, !SENT_START and !SENT_END are appended when $addsent
# is set.  If the file cannot be opened the error is reported through perr.
#
# The order of the substitutions matters: many later rules rely on the
# output of earlier ones.
sub process_file
{
  local $_;
  my $fh;
  # Three-argument open: the file name is never interpreted as a mode or pipe.
  if (!open($fh, '<', $file))
    { &perr("Cannot open file $file"); }
  while (<$fh>) {
    if (/^#/ || ($semicom && /^;/)) {
      if ($keepcom) { print $_; }
      next;
    }

    s/\n//g;  #remove the line's newline
    s/$/ /;   #insert extra SP so that rules ending in \s match the last word

    s/<UNK>/!UNKNOWN/g;
    s/<s>/!SENT_START/g;
    s/<\/s>/!SENT_END/g;
    s/,COMMA/!COMMA/g;
    s/\.PERIOD/!PERIOD/g;
    s/\"DOUBLE-QUOTE/!DOUBLE_QUOTE/g;
    s/\-HYPHEN/!HYPHEN/g;
    s/\.POINT/!POINT/g;
    s/\%PERCENT/!PERCENT/g;
    s/&AMPERSAND/!AMPERSAND/g;

    s/\-\-DASH/!DASH/g;
    s/:COLON/!COLON/g;
    s/\)RIGHT-PAREN/!RIGHT_PAREN/g;
    s/\(LEFT-PAREN/!LEFT_PAREN/g;
    s/;SEMI-COLON/!SEMI_COLON/g;
    s/\?QUESTION-MARK/!QUESTION_MARK/g;
    s/\'SINGLE-QUOTE/!SINGLE_QUOTE/g;
    s/\.\.\.ELLIPSIS/!ELLIPSIS/g;
    s/\/SLASH/!SLASH/g;
    s/\}RIGHT-BRACE/!RIGHT_BRACE/g;
    s/\{LEFT-BRACE/!LEFT_BRACE/g;
    s/!EXCLAMATION-POINT/!EXCLAMATION_POINT/g;
    s/\'N\s/!N /g;
    s/\'EM\s/!EM /g;
    s/\+PLUS/!PLUS/g;
    s/\=EQUALS/!EQUALS/g;
    s/\'CAUSE\s/!CAUSE /g;
    s/#SHARP-SIGN/!SHARP_SIGN/g;
    s/\'TIL\s/!TIL /g;
    s/\(PARENTHESES\s/!PARENTHESES /g;
    s/\)UN-PARENTHESES\s/!UN_PARENTHESES /g;
    s/\)PAREN\s/!PAREN /g;
    s/\"END-OF-QUOTE\s/!END_OF_QUOTE /g;
    s/\)END-THE-PAREN\s/!END_THE_PAREN /g;
    s/\)CLOSE-PAREN\s/!CLOSE_PAREN /g;
    s/\(PARENTHETICALLY\s/!PARENTHETICALLY /g;
    s/\{BRACE\s/!BRACE /g;
    s/\}CLOSE-BRACE\s/!CLOSE_BRACE /g;
    s/\"IN-QUOTES\s/!IN_QUOTES /g;
    s/\(BEGIN-PARENS\s/!BEGIN_PARENS /g;
    s/\)END-PARENS\s/!END_PARENS /g;
    s/\(PAREN\s/!PAREN /g;
    s/\"CLOSE-QUOTE\s/!CLOSE_QUOTE /g;
    s/\"END-QUOTE\s/!END_QUOTE /g;
    s/\"QUOTE\s/!QUOTE /g;
    s/\"UNQUOTE\s/!UNQUOTE /g;

    # BEEP
    # Several rules in this group can never match because an earlier rule
    # with a shorter prefix has already rewritten the leading character
    # ('EMPTY after 'EM, 'NEATH after 'N, 'TILDA after 'TIL, 'TWEEN-DECKS
    # after 'TWEEN, and the repeated 'SINGLE-QUOTE and (LEFT-PAREN rules).
    # They are kept so the list still mirrors the original word lists.
    s/&EM/!EM/g;
    s/&FLU/!FLU/g;
    s/&NEATH/!NEATH/g;
    s/&SBLOOD/!SBLOOD/g;
    s/&SDEATH/!SDEATH/g;
    s/&TIS/!TIS/g;
    s/&TWAS/!TWAS/g;
    s/&TWEEN/!TWEEN/g;
    s/&TWERE/!TWERE/g;
    s/&TWIXT/!TWIXT/g;
    s/\'AVE/!AVE/g;
    s/\'CAUSE/!CAUSE/g;
    s/\'COS/!COS/g;
    s/\'EM/!EM/g;
    s/\'EMPTY/!EMPTY/g;
    s/\'N/!N/g;
    s/\'NEATH/!NEATH/g;
    s/\'SHUN/!SHUN/g;
    s/\'SINGLE-QUOTE/!SINGLE_QUOTE/g;
    s/\'TIL/!TIL/g;
    s/\'TILDA/!TILDA/g;
    s/\'TWAS/!TWAS/g;
    s/\'TWEEN/!TWEEN/g;
    s/\'TWEEN-DECKS/!TWEEN-DECK/g;
    s/\'TWERE/!TWERE/g;
    s/\'TWILL/!TWILL/g;
    s/\'TWIXT/!TWIXT/g;
    s/\'TWOULD/!TWOULD/g;
    s/\'UN/!UN/g;
    s/\(LEFT-PAREN/!LEFT-PAREN/g;
    s/\-SHIRE/!SHIRE/g;
    s/<PAUSE>/!PAUSE /g;
    s/<SIL>/!SIL /g;
    s/\\\'//g;
    s/\\!//g;
    s/\\\^//g;
    s/\\\`//g;

#further conversions for CMUdict WJB 25/09/2001
    s/\"!EXCLAMATION_POINT\s/!EXCLAMATION_POINT /g;
    s/\'COURSE\s/!COURSE /g;
    s/\'END-INNER-QUOTE\s/!END_INNER_QUOTE /g;
    s/\'END-QUOTE\s/!END_QUOTE /g;
    s/\'INNER-QUOTE\s/!INNER_QUOTE /g;
    # The leading space keeps this from matching inside words such as I'M;
    # it is put back so that !M is not glued onto the preceding word.
    s/ \'M\s/ !M /g;
    s/\'QUOTE\s/!QUOTE /g;
    s/^\'S\s/!S /g;
    s/\'TIS\s/!TIS /g;
    s/\'TWAS\s/!TWAS /g;
    s/\(IN-PARENTHESES\s/!IN_PARENTHESES /g;
    s/\(OPEN-PARENTHESES\s/!OPEN_PARENTHESES /g;
    s/\(PARENS\s/!PARENS /g;
    s/\)CLOSE-PARENTHESES\s/!CLOSE_PARENTHESES /g;
    s/\)END-PAREN\s/!END_PAREN /g;
    s/\)END-PARENTHESES\s/!END_PARENTHESES /g;
    s/\)PARENS\s/!PARENS /g;
    s/\-DASH/!DASH/g;
    s/\.FULL-STOP\s/!FULL-STOP /g;


#further conversions WSJ1 WJB
    s/\)END-OF-PAREN\s/!END_OF_PAREN /g;
    s/\.DECIMAL\s/!DECIMAL /g;
    s/\.DOT\s/!DOT /g;
    s/\(IN-PARENTHESIS\s/!IN_PARENTHESES /g;
    s/\(BRACE\s/!BRACE /g;
    s/\)CLOSE-BRACE\s/!CLOSE_BRACE /g;
    s/\)CLOSE_PAREN\s/!CLOSE_PAREN /g;


#spelling corrections in si_tr_s wsj0 training transcriptions WJB 23/08/2001
    s/EXISITING/EXISTING/g;
    s/FORCAST/FORECAST/g;
    s/GILBRALTAR/GIBRALTAR/g;
    s/GOVERMENT\'S/GOVERNMENT\'S/g;
    s/OPPORTUNITES/OPPORTUNITIES/g;
    s/THRESHHOLD/THRESHOLD/g;
    s/N\.A\.S\.A\./NASA/g;
    s/\.PERI/!PERIOD/g;
#corrections to WSJ1
    s/\.PERCENT\s/!PERCENT /g;
    # A literal "CORP;" - the former "\C" escape is rejected by perl 5.24+.
    s/CORP;\s/CORP. /g;
    s/NI:NETY\s/NINETY /g;              #because of n:i:nety in dot file
    s/REPUBLI-\s/REPUBLICANS /g;        #bit of a bodge this
    s/CORP\.`S\s/CORP.'S /g;
    s/\*KINSLEY\*'S\s/KINSLEY'S /g;
#correction to WSJCAM0
    s/OFFICALS\s/OFFICIALS /g;
    s/OFFICAL\s/OFFICIAL /g;


    if ($maphyph) {
      # Map every hyphen that has a non-space character on both sides, so
      # words with several hyphens are converted completely.
      s/(?<=\S)-(?=\S)/_/g;
    }
    s/\s$/\n/;  #remove final SP
    print $_;
  }
  if ($addsent) { print "!SENT_START\n"; print "!SENT_END\n" }
  close($fh);
}

# Report a fatal error: write the message, prefixed with the program name,
# to STDERR and terminate with exit status 1.
sub perr
{
  my $message = "mapsym: $_[0]\n";
  print STDERR $message;
  exit(1);
}
