#!/usr/bin/perl -sw
#
# Copyright CRIM 2014 - Gilles Boulianne
#
# Reads a standard lattice file format .slf
# and writes a transducer file (AT&T text format).
# The lattice file can have its labels on nodes or arcs.
# The transducer file will have numbered symbols.
#
# Note 1: epsilon or null symbol must have number 0 in symbol file.
#
# Note 2: the FSM library minimizes costs, and lattice files contain
#         log-probabilities to be maximized, so by default
#         $costScale is -1.
#

# Usage: slf2fsm.pl symbol_file [silence_symbol] < slf_file > fsm_file
#
# 
# Open the symbol table named by the first argument, or print the usage
# line and exit when no argument was given.
# The length test (rather than plain truth) lets a file literally named
# "0" be accepted.
if (defined($ARGV[0]) && length($ARGV[0])) {
    # Three-argument open: the file name can never be interpreted as a
    # mode or a command. A failure is reported instead of silently
    # producing an empty symbol table.
    if (!open(SYMBOLES, '<', $ARGV[0])) {
        print STDERR "slf2fsm: Cannot open symbol file $ARGV[0]: $!\n";
        exit 1;
    }
} else {
    print STDERR
        "Usage: slf2fsm.pl symbol_file [silence_symbol] < slf_file > fsm_file\n";
    exit 1;
}

# Words that mark sentence boundaries in the lattice. Nodes carrying
# $finalSymbol become final states; arcs that go from a $finalSymbol node
# back to a $startSymbol node are dropped while reading the lattice.
$startSymbol = "!SENT_START";
$finalSymbol = "!SENT_END";
print STDERR "slf2fsm: Will mark $finalSymbol nodes as final\n";
print STDERR "slf2fsm: Will ignore arcs from $finalSymbol to $startSymbol\n";

# Load the symbol table: one "symbol number" pair per line.
# %numsymb maps a symbol to its number; $maxnum / $maxsymb remember the
# highest-numbered symbol, which is later used for unknown words.
$maxnum = 0;
while (<SYMBOLES>) {
    chomp;
    ($symb,$num) = split(' ');
    # Blank or one-field lines carry no usable mapping.
    next if (!defined($symb) || !defined($num));
    $numsymb{$symb} = $num;
    # The second test gives $maxsymb a name even when the highest number
    # in the table is 0, without changing $maxnum.
    if ($num > $maxnum || (!defined($maxsymb) && $num == $maxnum)) {
        $maxnum = $num;
        $maxsymb = $symb;
    }
}
close(SYMBOLES);

# An optional second argument names the silence symbol. When present it
# must exist in the symbol table; its number is kept in $nsilsymb.
$ADDSILENCE = $ARGV[1] ? 1 : 0;
if ($ADDSILENCE) {
    $silsymb = $ARGV[1];
    $nsilsymb = $numsymb{$silsymb};
    unless (defined $nsilsymb) {
        print STDERR "slf2fsm: Silence symbol $silsymb not in symbol file\n";
        exit 1;
    }
}

# The sentence-end word must be in the symbol table as well.
$ifinalSymbol = $numsymb{$finalSymbol};
unless (defined $ifinalSymbol) {
    print STDERR "slf2fsm: Symbol $finalSymbol not defined in symbol file\n";
    exit 1;
}

# Every arc score is written as  $cost * $costScale + $costOffset.
($costScale, $costOffset) = (-1.000, 0.000);

# Report the settings in effect.
print STDERR "slf2fsm: Multiplying all costs by $costScale\n",
             "slf2fsm: Augmenting  all costs by $costOffset\n",
             "slf2fsm: Unknown symbols will be mapped to $maxsymb\n";
if ($ADDSILENCE) {
    print STDERR "slf2fsm: Adding loops with symbol $silsymb\n";
}

# Lattice header values, overwritten when the lattice supplies them.
($lmscale, $wdpenalty, $acoustics) = (1, 0, 0);

# Counters: arcs and nodes seen, and the largest node / arc ids.
($narcs, $nnodes, $maxnodes, $maxarcs) = (0, 0, 0, 0);

# Parse the lattice from standard input, one line at a time.
#
# Every "=" is turned into a blank so that "I=3 W=foo" splits into the
# fields (I, 3, W, foo). This is done in Perl, so no external sed process
# is needed. Three kinds of lines are recognized; anything else is ignored:
#   lmscale <v> [wdpenalty <v>]   values stored in $lmscale / $wdpenalty
#   I <id> ...                    a node
#   J <id> S <src> E <dst> ...    an arc
#
# For each kept arc the output line is stored in @arcPrint and its source
# node in @arcSrc (both indexed by $narcs); @out / @inp count the arcs
# leaving / entering each node.
while (<STDIN>) {
    undef $cost;
    # chomp, not chop: a last line without a newline must keep its final
    # character.
    chomp;
    tr/=/ /;
    ($f0,$f1,$f2,$f3,$f4,$f5,$f6,$f7,$f8,$f9) = split;
    # blank line
    next if (!defined($f0));
    if ($f0 eq "lmscale") {
        $lmscale = $f1;
        if (defined($f2) && $f2 eq "wdpenalty") {
            $wdpenalty = $f3;
        }
        next;
    }
    if ($f0 eq "I") {
        # a node: initialize arc count
        $out[$f1]  = $inp[$f1] = 0;
        # labels on node, no times
        if (defined($f2) && $f2 eq "W") {
            $symb[$f1] = $f3;
            $isFinalNode{$f1} = 1 if ($f3 eq $finalSymbol);
            $isStartNode{$f1} = 1 if ($f3 eq $startSymbol);
        }
        # labels on nodes, with times
        if (defined($f4) && $f4 eq "W") {
            $symb[$f1] = $f5;
            $isFinalNode{$f1} = 1 if ($f5 eq $finalSymbol);
            $isStartNode{$f1} = 1 if ($f5 eq $startSymbol);
        }
        if ($f1 > $maxnodes) {
            $maxnodes = $f1;
        }
        $nnodes++;
        next;
    }
    if ($f0 eq "J") {
        # this is an arc
        if (!defined($f3) || !defined($f5)) {
            # A truncated arc would otherwise be written as an arc on
            # node 0.
            print STDERR "slf2fsm: Skipping malformed arc at input line $.\n";
            next;
        }
        $src = $f3;
        $dst = $f5;
        next if ($isFinalNode{$src} && $isStartNode{$dst});
        if (defined($f6) && $f6 eq "W") {
            # case 1: labels on arcs
            $cost = $f9;
            $symbole = $f7;
        } else {
            # case 2: labels on node, with acoustic scores
            $symbole = $symb[$dst];
            $lmScore = 0;
            if (defined($f6) && $f6 eq "a") {
                $acoustics = $f7;
                if (defined($f8) && $f8 eq "l") {
                    $lmScore = $f9;
                }
            }
            # with LM scores only
            if (defined($f6) && $f6 eq "l") {
                $acoustics = 0;
                $lmScore = $f7;
            }
            $cost = $lmScore;
        }
        # An arc without a score costs nothing.
        $cost = 0 if (!defined($cost));
        # A destination node without a word is reported with an empty
        # name and, like any unknown word, mapped to $maxnum.
        $symbole = "" if (!defined($symbole));
        if (!defined($numsymb{$symbole})) {
            print STDERR "slf2fsm: Unknown $symbole mapped to $maxsymb\n";
            $ns = $maxnum;
        } else {
            $ns = $numsymb{$symbole};
        }
        $arcSrc[$narcs] = $src;
        $arcPrint[$narcs] =
            sprintf("%d %d %d %5.2f\n",
                    $src, $dst, $ns,$cost*$costScale+$costOffset);

        # count output arcs
        $out[$src]++;
        $inp[$dst]++;
        # remember the largest arc id seen
        if ($f1 > $maxarcs) {
            $maxarcs = $f1;
        }
        $narcs++;
        next;
    }
}

# Classify every declared node:
#   - no outgoing arc  -> final node (recorded in %isFinalNode)
#   - no incoming arc  -> initial node (recorded in $noeudInitial; if
#                         several qualify, the highest-numbered one wins)
# When a silence symbol was requested, every node that has both incoming
# and outgoing arcs also gets a zero-cost silence self-loop appended to
# the arc list.
for ($i=0; $i <= $maxnodes; $i++) {
    $addsil = 1;
    # node numbers that never appeared in the lattice
    next if (!defined($out[$i]));
    if ($out[$i]==0) {
        $isFinalNode{$i} = 1;
        $addsil = 0;
    }
    next if (!defined($inp[$i]));
    if ($inp[$i]==0) {
        # Nodes have no word when the lattice puts its labels on arcs.
        $word = defined($symb[$i]) ? $symb[$i] : "";
        print STDERR "slf2fsm: Node $i is initial (word $word)\n";
        $noeudInitial = $i;
        $addsil = 0;
    }
    if ($ADDSILENCE && $addsil) {
        # not an initial or final node
        $arcSrc[$narcs] = $i;
        # add a silence loop with no cost
        $arcPrint[$narcs] = sprintf("%d %d %s %5.2f\n",
                                    $i,$i,$nsilsymb,0);
        $narcs++;
    }
}

print STDERR "slf2fsm: $nnodes nodes / $narcs arcs\n";

# Identify the first arc leaving the initial node.
# @arcSrc holds exactly $narcs entries (lattice arcs that were kept plus
# any silence loops), so the search is bounded by $narcs and not by the
# largest arc id found in the lattice.
undef $arcInitial;
if (defined($noeudInitial)) {
    for ($i=0; $i < $narcs; $i++) {
        if ($arcSrc[$i] == $noeudInitial) {
            $arcInitial = $i;
            last;
        }
    }
}
if (!defined($arcInitial)) {
    print STDERR "slf2fsm: Warning: no arc leaves an initial node, ",
                 "arcs are written in input order\n";
}

# print arcs, beginning with the one out of initial node
#printf "%d %d %d %5.2f\n",
#    0, $noeudInitial+1, $numsymb{$symb[$noeudInitial]}, 0;
print $arcPrint[$arcInitial] if (defined($arcInitial));
for ($i=0; $i < $narcs; $i++) {
    next if (defined($arcInitial) && $i == $arcInitial);
    print $arcPrint[$i];
}

# print final node(s), each as "<node> 0" (a final state with cost 0).
# Nodes are written in increasing numeric order so that two runs on the
# same lattice give identical output.
foreach $node (sort { $a <=> $b } keys %isFinalNode) {
    # Nodes have no word when the lattice puts its labels on arcs.
    $word = defined($symb[$node]) ? $symb[$node] : "";
    print STDERR "slf2fsm: Node $node is final (word $word)\n";
    print "$node 0\n";
}

