# italian.pl
# Same as hepburn8.pl, but for italian files
# (Modified a bit to handle more complex changes involving backreferences)
$number_of_trials = 1;

$input_file = "Italian-ToConvert.txt";
open (INFILE, $input_file) or die "Warning! Can't open input file: $!\n";
$check_file = "Italian-Converted.txt";
open (CHECKFILE, $check_file) or die "Warning! Can't open check file: $!\n";

$rules_file = "ItalianRules.txt";
open (RULESFILE, $rules_file) or die "Warning! Can't open rule file: $!\n";

open (LOGFILE, ">italian3.log") or die "Warning! Can't create log file: $!\n";

# Read in the file and store each line in the rules array of arrays
while ($line = <RULESFILE>) {
    chomp($line);
    unless ($line =~ /^\s*$/) { # skip lines that are just whitespace    
	($orthographic, $phonemic, @rest) = split("\t", $line);
	# Now, place this pair onto the end of the @rules array
	push(@rules, [ $orthographic, $phonemic ]);
    }
}
$number_of_rules = $#rules;

# Now read in the input forms, and store them so we can learn from them
while ($line = <INFILE>) {
    chomp($line);      
    push (@inputs, $line);    
    
    $check_line = <CHECKFILE>;    
    chomp($check_line);        
    push (@answers, $check_line);        
}

for ($t = 1; $t <= $number_of_trials; $t++) {    
    # For each trial, we start at the start state and try solving it again
    # Rather than manipulating the rules array directly, we'll just keep track of
    #   the currently hypothesized order, using an "indexing array"
    # We start by assuming that we'll use the rules in the given order
    #   (that is, the initial ordering is 1, 2, 3, 4, ...)
    for (my $i = 0; $i <= $number_of_rules; $i++) {
	$ordering[$i] = $i;           
    }
    $known_orderings = undef; 
    $explored_orderings = undef;        

    # Start by checking the accuracy of the grammar using the initial ordering:
    $current_number_correct = check_accuracy(@ordering);
    
    $iterations = 0;
    while ($current_number_correct != ($#inputs + 1)) {
	$iterations++;
	
	# We try to improve performance by swapping rules that might feed or bleed each other
	# There are various ways to explore the space of possible orderings, 
	#   but one way is to start at the end of the grammar and try to "promote" rules
	#   by moving them up in the grammar
	$swap1 = -1;	
	$swap2 = -1;		
	FINDSWAPPAIR:
	for (my $r1 = $#rules; $r1 >= 1; $r1--) {
	     for (my $r2 = $r1 - 1; $r2 >= 0; $r2--) {
		 # Check if we already know that r2 crucially precedes r1
		 next if ($known_orderings =~ /$ordering[$r2]<$ordering[$r1]/ or
		    	  $explored_orderings =~ /$ordering[$r2]<$ordering[$r1]/ or
		 	  $explored_orderings =~ /$ordering[$r1]<$ordering[$r2]/
		    );		 
		 # Now check if moving r1 before r2 could possibly change things
		 # There are three possibilities:
		 #   (1) r1 and r2 have overlapping structural descriptions (bleeding/counterbleeding)
		 #   (2) struc desc of r1 overlaps with output of r2 (currently feeding, try counterfeeding)
		 #   (3) struc desc of r2 overlaps with output of r1 (currently counterfeeding, try feeding)
		 # The struc descs are:  $rules[$ordering[$r1]][0]  and $rules[$ordering[$r2]][0] 
		 # "Overlapping" means that at least one of their "terms" could match the same segment
		 # (a "term" in this simplified conception is an expression that refers to a single segment)
		 if (overlap($rules[$ordering[$r1]][0], $rules[$ordering[$r2]][0])) {
		     print "Rules $ordering[$r1] and $ordering[$r2] have overlapping struc descs (potential for bleeding/counterbleeding)\n";
		     print "\t\t[$rules[$ordering[$r1]][0]] and [$rules[$ordering[$r2]][0]]\n";		     		     
		     ($swap1, $swap2) = ($r1, $r2);
		 } elsif (overlap($rules[$ordering[$r2]][0], $rules[$ordering[$r1]][1])) {
		     print "Rule $ordering[$r2] operates on the output of $ordering[$r1] (currently counterfeeding; trying feeding)\n";
		     print "\t\t[$rules[$ordering[$r2]][0]] and [$rules[$ordering[$r1]][1]]\n";		     		     
		     ($swap1, $swap2) = ($r1, $r2);		     
		 } elsif (overlap($rules[$ordering[$r1]][0], $rules[$ordering[$r2]][1])) {
		     print "Rule $ordering[$r1] operates on the output of $ordering[$r2] (currently feeding; trying counterfeeding)\n";
		     print "\t\t[$rules[$ordering[$r1]][0]] and [$rules[$ordering[$r2]][1]]\n";		     		     
		     ($swap1, $swap2) = ($r1, $r2);		     
		 } 
		 if ($swap1 >= 0) {
		     last FINDSWAPPAIR;		     
		 }
	     }
	 }
	 # If we got through the FINDSWAPPAIR block and didn't find any eligible swaps, then we're stuck
	 if ($swap1 == -1) {
	     print "****It appears that there is no ordering of these rules that will correctly derive the data****\n";
	     check_accuracy(@ordering);	     
	     exit;
	 } else { # Try this swap
	     print "\tAttempting to move rule $ordering[$swap1] before $ordering[$swap2]\n";	     	
	     @proposed_ordering = (@ordering[0..$swap2-1],$ordering[$swap1],@ordering[$swap2..$swap1-1],@ordering[$swap1+1..$number_of_rules]);	     
	     print "Proposed ordering: @proposed_ordering\n";	     
	     
	     $new_number_correct = check_accuracy(@proposed_ordering);	     
	     if ($new_number_correct > $current_number_correct) {
		 # Aha, this helped!  A new crucial ordering
		 # Save the new order
		 @ordering = @proposed_ordering;
		 $current_number_correct = $new_number_correct;			 
		 $known_orderings .= "$ordering[$swap1]<$ordering[$swap2] ";		 
		 $explored_orderings = undef;		 		 
		 print "\tNew order is better: rule $proposed_ordering[$swap1] must come before rule $proposed_ordering[$swap2]\n";		 
		 print "\t\t$rules[$ordering[$swap1]][0] -> $rules[$ordering[$swap1]][1]   PRECEDES   ";		 
		 print "$rules[$ordering[$swap2]][0] -> $rules[$ordering[$swap2]][1]\n";
		 print "\tKnown orderings: $known_orderings\n";	
		 print "\tCurrent order: @ordering\n";		 		 
	     } elsif ($new_number_correct < $current_number_correct) {
		 # A detrimental move; make sure we don't do it again (at least until something else has changed)
		 $explored_orderings .= "$ordering[$swap2]<$ordering[$swap1] ";
		 print "\tNew order is worse: rule $ordering[$swap2] probably comes before rule $ordering[$swap1]\n";		 		 		 
	     }
	     # It's also possible that the move made no difference, and we shouldn't try it again.
	     #  However, I think it may also be possible that something which seems not to make a difference at
	     #  the moment might turn out to be crucial; So, we should keep track of the irrelevant_orderings at the moment,
	     #  but keep in mind that if we change any other ordering, then we might as well re-check orderings
	     #  that were previously irrelevant (by forgetting that they were irrelevant)
	     else {
		 print "\tNew order makes no difference; leaving things as they were\n\n";		 		 
		 $explored_orderings .= "$ordering[$swap2]<$ordering[$swap1] ";
	     }	     
	 }	 
    }    
    $total_iterations += $iterations;        
    print "Trial $t took $iterations iterations\n";    
    printf LOGFILE "Trial $t took $iterations iterations\n";
}

# Now that we're done, the average iterations is the total over the number of trials
$average_iterations = $total_iterations / $number_of_trials;
printf "\nAfter $number_of_trials trials, the average solution time is %.2f iterations\n", $average_iterations;


sub check_accuracy {   
    my @check_ordering = @_;        
    my $correct = 0;        
    for ($i = 0; $i <= $#inputs; $i++) {
	# We'll start with the current input, and transform it
	$output = $inputs[$i];        
	for ($r = 0; $r <= $number_of_rules; $r++) {
	    # Nab the backreferences
	    if ($output =~ /$rules[$check_ordering[$r]][0]/) {
		@backrefs = ($output =~ /$rules[$check_ordering[$r]][0]/);
	    }    
	    # Now do the replacement, putting in the "dummy" backref $1, etc.
	    $output =~ s/$rules[$check_ordering[$r]][0]/$rules[$check_ordering[$r]][1]/g;
	    while ($output =~ /\$(\d+)/) {
		$replacement = $backrefs['$1'];	    	    
		$output =~ s/\$$1/$replacement/;	    
	    }
	}
	# Now check answer against the "real" answer in the checkfile
	if ($output eq $answers[$i]) {
	    $correct++;		
	} else {
	    print "\tIncorrect on form $i  ($inputs[$i]: deriving [$output] instead of [$answers[$i]])\n";	        
	}
    }
    return $correct;
}

sub overlap {
    my $string1 = @_[0];    
    my $string2 = @_[1];
    
    if (length($string1) > length($string2) ) {
	return ($string1 =~ /\Q$string2\E/);		
    } else {
        return ($string2 =~ /\Q$string1\E/);        
    }
}

