SFST szintaxis vizsgálat

A Programozás Wiki wikiből
################ dc_readme of dc_check sfst syntax check suite #############
#
#  Syntax check suite for sfst fst files
#  Usage: 
#    1. copy all dc_* files into your working directory 
#    2. sh dc_do_check.sh your_fst_file.fst  - or
#       sh dc_checkall.sh  - to check all *.fst files
#
# Sample program dc_x.fst compiles usig fst-compiler: 
#---------------------------------------------------
#fst-compiler-utf8 dc_x.fst dc_x.a
#x.fst: 2
#  symbols1.fst: 35
#x.fst: 69
#x.fst:69: warning: assignment of empty transducer to: $rp$
xx.fst: 89
#  ninfl1.fst: 77
#x.fst: 92
#x.fst:92: warning: assignment of empty transducer to: $ma$
#x.fst: 96
#reading words from noun-reg1.lex...finished
#x.fst: 99
#x.fst:99: warning: assignment of empty transducer to: $phon1$
#x.fst: 100
#
#  Fst-compiler reports empty transducers, but gives no hint whatsoever
#  to the exact location of the problem(s)
#  Even if there are no empty transducers, the fst-compiler result can be 
#  missing or completely wrong due to definition and 
#  setting errors, program inconsistencies, etc...
#
#  Syntax checker dc_check helps to locate the problem(s).
#
# Expected result:
# -----------------
# sh dc_do_check.sh dc_x.fst
# Warning: transducer rp3 redefined near line 30
# Warning: unused transducers: $j2$ $rp5$ $rp8$ $ine1$ 
# Warning: near to line 31 variable #vowel name starting with # and not ending with #: #vowel
# Warning: near to line 39 variable $rp5 name starting with $ and not ending with $: $rp5
# Warning: Too many definitions for oeoez1 element:o <> defs:o a cas:2 bas:2 
# Warning: Too many definitions for J element:j <> defs:<> j <> cas:3 bas:2 
# Warning: Variable OOEE  missing from alphabet
# Warning: Variables plvp oeoez1 oooe plv oeoez2 AAA oeoe AA plv3 oeoez3 uue VV OOEE plv1 EOA plv2 oeoez plvs OEOE1 J AE1 UUE OEOE UUE1 JP AE  missing from alphabet
#
#
#  Checks done are:
#  1. Does setting set value for all ambigous  alphabet chars?
#  2. Is setting only for by alphabet allowed chars?
#  3. Warn, if a variable is doubly defined in alphabet with the same value.
#     This is very likely a typing error
#  4. Warn, if setting count >= alphabet values
#     This would cause some not shown word forms
#  5. Are all inflection variables used in alphabet?  
#  6. Do all names starting with $ or # also end with $ or #? (warning)
#  7. Are all defined transducers used somehow? 
#     If not, this is very likely an error in the logic
#  8. Check for redefined transducers, probaly wrong naming
#  9. Check for undefined but used transducers
#
#
#  If *.a files are used instead of sources,
#  you must put together a source file in order to make the syntax check.
#  #include files are allowed and handled properly by dc_do_check.sh
#  test suite handles properly besides include also comments and 
#  continuation lines
#
#  dc_checkall.sh checks all fst files in a directory
#   using dc_do_check.sh
#
#  Requires a working perl interpreter on the system
#  
#  Files:
#  dc_checkvar1.pl, dc_contin.pl dc_osszefuz.pl 
#  dc_do_check.sh dc_checkall.sh dc_x.fst dc_symbols1.fst dc_ninfl1.fst 
#  dc_readme dc_sav.sh
#
#  Simplifications:
#  dc_check modifies internally in agreement variables name the '=' to '_' for
#  better handling ($=abc$ -> $_abc$) during the test, and prints
#  the modified name.
#  for simplicity, dc_check does not fulfil in included files 
#  embedded include commands and reports found #include statements 
#  as erroneous #..# variables.
#
#  dc_check was tested with:
#     trmorph, SMOR, XMOR, Omorfi for sfst (Finnish sfst) and Humorph.
#  dc_check is a great help for almost all of the above; the only exception
#   is SMOR with lots of unusual variable names 
#   ($..$-s with embedded $-s, <..> with embedded $ and the like), where
#   dc_check reports numerous unjustified variable name inconsistencies.
#        
#  If there are questions/problems/requests, please report
#  to: 
#
############### dc_sav.sh ##################
#
tar cvf dc_check.tar dc_*.sh dc_*.pl dc_x.fst dc_symbols1.fst dc_ninfl1.fst  dc_readme dc_sav.sh
rm -rf dc_check.tar.gz
gzip dc_check.tar
cp dc_check.tar.gz ../../sich
#################### dc_checkall.sh ###############xxx
#
# check all *.fst files in a directory
# using dc_do_check.sh
#
for f in *.fst
do
  echo "-------processing $f"
  sh dc_do_check.sh $f
done
#################### dc_do_check.sh  ###############
#
# check suite for sfst fst files
# check a file using dc_osszefuz.pl dc_contin.pl and dc_checkvar1.pl
#
perl dc_osszefuz.pl <$1 >$1.check1
perl dc_contin.pl <$1.check1 >$1.check2
perl dc_checkvar1.pl <$1.check2 
rm -rf $1.check1  $1.check2

use strict;
#################### dc_checkvar1.pl ###############xxx
#
# check for undefined transducers
# check for unused and redefined transducers
# check for variables starting with $ or # also end with $ or #
# check if <> variables of an alphabet are set up properly
# 
my (@arr, $as, $folytatas, %barr, @b1arr, %carr, $was_alphabet);
# pattern for transducer and #..# variables name
my $pat1 = "A-Za-z0-9_\$\#\/+&\(\)\=";

my @input = <STDIN>;
#
#  check for undefined transducers
#
my ($linenr,  @def_arr, $sda, $out, $out1, $debug);
$debug = 0;
foreach(@input){
   ++$linenr;
   chomp;
  my ($l0, $l1,@list, @list0, @list1);
  my $current = $_;
  my @arr = split(/=/, $current);
  my $as = @arr;
  if($arr[0] =~  /\$([$pat1\-]*)/){
     @list = ($arr[0] =~ /\$([$pat1\-]*)/g);
     foreach(@list){
        # print "element:$_ last:$last\n";
         my $ix = is_in($_, '$');
         if($ix != -1){
             my $seg = substr($_,0,$ix);
             #print "elem:'$_' seg:'$seg'\n";
             push(@list0, $seg);
         }
     }
  }   
  if(defined($arr[1]) and $arr[1] =~  /\$([$pat1\-]*)/){
     @list = ($arr[1] =~ /\$([$pat1\-]*)/g);
     foreach(@list){
        # print "element:$_ last:$last\n";
         my $ix = is_in($_, '$');
         if($ix != -1){
             my $seg = substr($_,0,$ix);
             #print "2elem:'$_' seg:'$seg'\n";
             push(@list1, $seg);
         }
     }
  }  
  $l0 = @list0;
  $out = '';
  $out1 = '';
  $sda = @def_arr;
  if($debug>2){
    my $sds = @def_arr;
    if($sds){
     print "line:$current sda:@def_arr\n";
    }
  }
  if($sda){
   for(my $i = 0; $i < $sda; $i++){
     my $elem = $def_arr[$i];
     $out1 .= '_'.$elem;
   }
   $out1 .= '_';
  }
 if($l1){
   for(my $i = 0; $i < $l1; $i++){
     my $elem = $list1[$i];
     $out .= '_'.$elem;
     if($out1 !~ /$elem/ and $as > 1 ){
           print "Warning: using undefined transducer $elem near line $linenr\n"; 
     }
   }
   $out .= '_';
  }
  #print "l0:$l0 as:$as $current\n";
 if($l0 and $as < 2){
   for(my $i = 0; $i < $l0; $i++){
     my $elem = $list0[$i];
     $out .= '_'.$elem;
     if($out1 !~ /$elem/ ){
           print "Warning: using undefined transducer $elem near line $linenr\n"; 
     }
   }
   $out .= '_';
  }
  # fill defined transducer list
 if($l0 > 1 or  $l1> 0){
  if($debug>3){print "line:$linenr cur:$current as:$as l1:$l0 l1:$l1 darr:@def_arr out:$out\n";}
 }
  if($as > 1 and $l0 ){
    for(my $i = 0; $i < $l0; $i++){
      my $elem = $list0[$i];
      my $elem1 = '_'.$elem.'_';
      if($out !~ /$elem1/ ){
         my $found;
         foreach(@def_arr){
           if($_ eq $elem){ $found = 1; last; }
         }
         if(!$found) {
           push(@def_arr,$elem);
           if($debug>3){print "after add $elem in def_arr: @def_arr\n";}
           #foreach(@def_arr){
           #   print "after add in def_arr: $_\n";
           #}
         } #found 
       } # ($out !~ /$elem1/)
     } # for $i 
   } #$as > 1 and $l0
 } # foreach input   
#
#  check for unused and redefined transducers
#
my ($linenr,  @def_arr, $sda, $out,  $debug);
$debug = 0;
foreach(@input){
   ++$linenr;
   chomp;
  my ($l0, $l1,@list, @list0, @list1);
  my $current = $_;
  my @arr = split(/=/, $current);
  my $as = @arr;
  if($arr[0] =~  /\$([$pat1\-]*)/){
     @list = ($arr[0] =~ /\$([$pat1\-]*)/g);
     foreach(@list){
         my $ix = is_in($_, '$');
         if($ix != -1){
             my $seg = substr($_,0,$ix);
             #print "2elem:'$_' seg:'$seg'\n";
             push(@list0, $seg);
         }
     }
  }   
  if(defined($arr[1]) and $arr[1] =~  /\$([$pat1\-]*)/){
     @list = ($arr[1] =~ /\$([$pat1\-]*)/g);
     foreach(@list){
        # print "element:$_ last:$last\n";
         my $ix = is_in($_, '$');
         if($ix != -1){
             my $seg = substr($_,0,$ix);
             #print "2elem:'$_' seg:'$seg'\n";
             push(@list1, $seg);
         }
     }
  }  
  $l0 = @list0;
  $l1 = @list1; 
  $out = '';
  if($l1){
   for(my $i = 0; $i < $l1; $i++){
     my $elem = $list1[$i];
     $out .= '_'.$elem;
   }
   $out .= '_';
  }
  # fill defined transducer list
 if($l0 > 1 or  $l1> 0){
  if($debug>3){print "line:$linenr cur:$current as:$as l1:$l0 l1:$l1 darr:@def_arr out:$out\n";}
 }
  if($as > 1 and $l0 ){
    for(my $i = 0; $i < $l0; $i++){
      my $elem = $list0[$i];
      my $elem1 = '_'.$elem.'_';
      if($out !~ /$elem1/ ){
         my $found;
         foreach(@def_arr){
           if($_ eq $elem){ $found = 1; last; }
         }
         if(!$found) {
           push(@def_arr,$elem);
           if($debug>3){print "after add $elem in def_arr: @def_arr\n";}
           #foreach(@def_arr){
           #   print "after add in def_arr: $_\n";
           #}
         } #found 
         else{
           print "Warning: transducer $elem redefined near line $linenr\n"; 
         } # else
      } # ($out !~ /$elem1/)
     } # for $i 
   } #$as > 1 and $l0
  if($as > 1 and $l0 and $l1){
  # empty def_arr transducer list
    my (@seg_arr);
    for(my $i = 0; $i < $l1; $i++){
      my $elem = $list1[$i];
      if($debug>3){print "search for $elem in @def_arr list1:@list1 l0:$l0 l1:$l1 \n";}
      foreach(@def_arr){
          if($_ eq $elem){ 
             push(@seg_arr, $elem);
             if($debug>3){print "added $elem to @seg_arr\n";}
          } 
       }
     } # $i
     if($debug>2){print "remove @seg_arr from @def_arr new def-arr:@def_arr\n";}
     my $adefarr = remove_element_from_array(\@def_arr, \@seg_arr);
     @def_arr = @$adefarr;
   } # $l0 and $l1 
   elsif($as <=1 and $l0){
    my (@seg_arr);
    for(my $i = 0; $i < $l0; $i++){
      my $elem = @list0[$i];
         my $found = -1;
         foreach(@def_arr){
           if($_ eq $elem){ 
              if($debug>3){print "2. remove from def_arr $elem\n";}
             push(@seg_arr, $elem);
             $found = $i;
             last;
           }
         }
         if($found != -1) {
           my $adefarr = remove_element_from_array(\@def_arr, \@seg_arr);
          @def_arr = @$adefarr;
          if($debug>2){print "2.new def-arr:@def_arr\n";}
         }
      } # $i
    } # $l0 only
   my $dnr = @def_arr;
   if($dnr){
   # print "line $linenr def_arr:@def_arr\n";
   }
} # foreach input   
#
# print result for unused transducers
#  
$out = '';
my ($cnt, $var);
$var = "transducer";
foreach(@def_arr){
   $out .= '$'.$_.'$ ';
   if(++$cnt > 1){$var = "transducers";}
}
if($out ne ''){
   print "Warning: unused $var: $out\n";
}
#
# utility for unused transducers
# remove elements of arref, that are contained in sref
#
sub remove_element_from_array($$){
    my ($arref, $sref) = @_;
    my @def_arr = @$arref;
    my @seg_arr = @$sref;
    my @seg1_arr;
    my $sa = @seg_arr;
    my ($found, $debug);
    if($debug){print "sa:$sa da:@def_arr\n";}
    if($sa) {
       foreach(@def_arr){
             my $elem = $_;
             $found = 0;
             foreach(@seg_arr){
              if ($_ eq $elem){
                 $found =1;
              }
           }
           if(!$found){
              push(@seg1_arr, $elem);
           }
         }
         @def_arr = @seg1_arr;
         @seg1_arr = ();
     }
     return \@def_arr;
}
    
#
#  check for variables starting with $ or # also end with $ or #
#
my $linenr = 0;
foreach(@input){
   ++$linenr;
   chomp;
  my $current = $_;
  if($current =~  /#([$pat1\-]*)/){
     my @list = ($current =~ /#([$pat1\-]*)/g);
     foreach(@list){
          if(is_in($_, '#') == -1){
          print "Warning: near to line $linenr variable \#$_ name starting with \# and not ending with \#: \#$_\n";
         }
     }
  }
  if($current =~  /\$([$pat1\-]*)/){
     my @list = ($current =~ /\$([$pat1\-]*)/g);
     foreach(@list){
        # print "element:$_ last:$last\n";
         if(is_in($_, '$') == -1){
          print "Warning: near to line $linenr variable \$$_ name starting with \$ and not ending with \$: \$$_\n";
         }
     }
  }   
}
#
# is element in a string- because of $ must be this complicated
#
sub is_in($$){
   my ($elem, $char)=@_;
   my @arr = split(//,$elem);
   my $ix;
   foreach(@arr){
     if($_ eq $char){ return $ix;}
     ++$ix;
   }
   return -1;
} 
#
#  replace all #name# variables with the fullest version
#    result in hash %repl
#
my (%repl);
foreach(@input){
   chomp;
   if($_ =~ /#(.*)#\s*=\s*(.*)\s*$/){
        my $seg2 = $2;
        my $seg1 = $1;
        my ($name, $elements);
        while (($name, $elements) = each(%repl))
        {
          my $seg3 = '#'.$name.'#';
          if($seg2 =~ /$seg3/){
             $seg2 =~ s/$seg3/$elements/;
           }
        }
        #print "writing $seg1 = $seg2\n";
        $repl{$seg1} = $seg2;
   }
}
#
#  collect all inflection variables in array @darr
#
my (@darr);
foreach(@input){
   chomp;
   if($_ =~ /{?<(.*)>}?:{(.*)}\s*$/){
     #print "1:$1 2:$2 l:$_\n";
     my $seg = $2;
     if($2 =~ /<(.*)>/){
        #print "in 2 $2  seg:$seg\n";
        $seg =~ s/>[a-záéóúűőüöí]*</ /g;
        $seg =~ s/>[a-záéóúűőüöí]*\s*$/ /g;
        #print "in 2 $1  seg:$seg\n";
        $seg =~ s/</ /g;
        #print "in 2 $1  seg:$seg l:$_\n";
        my @arr = split(/\s+/, $seg);
        foreach(@arr){
           my $seg1 = $_;
           my $found = 0;
          foreach(@darr){
            if($_ eq $seg1){$found = 1; last;}
          }
          if(!$found and $_ ne ''){
              #print "write in darr:$_\n";
              push(@darr, $_); 
          }
        } #@arr
     } # if $2
   } # if $_
}
#
# check if <> variables of an alphabet are set up properly
# %barr: in alphabet defined vars
# %carr: transducers
# @b1arr:#..# vars
#
foreach(@input){
   chomp;
   if($_ =~ /ALPHABET\s*=/){
     if($was_alphabet){
       check_previous_alphabet();
       %barr = ();
       %carr = ();
       @b1arr = ();
     }
     $was_alphabet = 1;
     #print "1. $_\n";
     @arr = split(/\s+/, $_);
     $as = @arr;
     for(my $i = 0; $i < $as; $i++){
        #print "2. $arr[$i]\n";
        if($arr[$i] =~ /<(.*)>:(.*)/){
           #print "3. 1:$1 2:$2\n";
           my $mas;
           if(substr($2,0,1) eq '[' and substr($2, length($2)-1,1) eq ']'){
              my $seg = substr($2,1);
              $seg = substr($seg,0,length($seg)-1);
              #print "seg:'$seg'\n";
              my @marr = split(//, $seg);
              my $ms = @marr;
              for(my $i = 0; $i < $ms; $i++){
                if($marr[$i] eq '<' or $marr[$i] eq '#'){
                   my $end = '>';
                   if($marr[$i] eq '#') {$end = '#';}
                   $mas .= $marr[$i];
                   $i++;
                   while($marr[$i] ne $end and $i < $ms){
                     $mas .= $marr[$i++]
                   }
                   if($marr[$i] eq $end){$mas .= "$end ";}
                   else {$mas .= ' ';}
                } else{
                   $mas .= "$marr[$i] ";
                }
              }
              $mas = substr($mas, 0, length($mas)-1);
           } else {$mas =  $2;}
           if(!defined($barr{$1})){
                #print "writing into barr 1:$1 2:$2\n";
                $barr{$1} = $mas;
           }
           else{
                check_for_double_element($barr{$1}, $2, $1);
                 $barr{$1} .= ' '.$mas;
           }
          # print "2:'$2' mas:'$mas'\n";
        } elsif ($arr[$i] =~ /#(.*)#/){
            my $seg1 = $1;
            my $found;
            foreach(@b1arr){
               if($_ eq $seg1){$found = 1; last;}
            }
            if(!$found){
               push(@b1arr, $seg1);
              # print "added to b1arr:$seg1\n";
            }
        }     
     }
   } # ALPHABET
   else{
    if($_ =~ /<([^ ]*)>\s*<=>\s*([^ ]*)/){
           #print "4. match 1:$1 2:$2 l:$_\n";
           if(!defined($carr{$1})){
                $carr{$1} = $2;
           }
           else{
                $carr{$1} .= ' '.$2;
           }
     }
   }
}

sub check_previous_alphabet(){
my ($name, $elements, $wassign, $size, $debug);
$debug = 0;
if($debug){
  $size = keys(%barr);
  #print "bsize:$size\n";
  if($size){
  print "-----------------------barr\n";
  while (($name, $elements) = each(%barr))
  {
	print "$name has $elements\n";
  }
  }
  $size = keys(%carr);
  if($size){
    print "-----------------------carr\n";
    while (($name, $elements) = each(%carr))
    {
	  print "$name has $elements\n";
    }
    print "-----------------------\n";
   }# size
} #debug
 while (($name, $elements) = each(%barr))
 {
	#print "$name has $elements\n";
    my (@ba, @ca, $bas, $cas);
    @ba = split(/\s+/, $elements);
    $bas = @ba;
    $cas = 0;
    if(defined($carr{$name})){
      @ca = split(/\s+/, $carr{$name});
      $cas = @ca;
    } else{
      if($bas > 1){
        print "Definition missing for $name, element: $elements bas:$bas\n";
        $wassign = 1;
      }
    }
    if(!$wassign){
      if($cas > $bas-1){
         print "Warning: Too many definitions for $name element:$elements defs:$carr{$name} cas:$cas bas:$bas \n";
      }elsif($cas < $bas -1){
         print "Too few definitions for $name element:$elements defs:$carr{$name} cas:$cas bas:$bas \n";
      }
      if($cas){
         my $found = 0;
         foreach(@ca){
           if($bas){
              my $seg = $_;
              foreach(@ba){
               if($_ eq $seg){$found = 1; last;}
               #else {print "elem:$_ seg:$seg\n";}
              }
              if(!$found){
                print "Wrong definition for $name not found:$seg \n";
              }
           } #$bas
         } #@ca
      } #cas
      if($bas > 1){
         if(!$cas){ print "Undefined $name element:$elements \n";}
      }
    } # wassign
    $wassign = 0;
 }
 check_if_alphabet_uses_all_inflect_variables();
}
#
# check if alphabet uses all inflect variables
# %repl contains fullest variable names
# @darr contains all inflection variables
# %barr contains all alphabet <...> variables
# @b1arr contains all alphabet #...# variables
# %carr contains all variable settings
# all saved names without <> or ##
#
sub check_if_alphabet_uses_all_inflect_variables(){
   my ($debug, $name, $elements);
   if($debug > 1){while (($name, $elements) = each(%barr))
    {
   	  print "barr: $name has $elements\n";
    }
    foreach(@darr){
      print "darr:$_\n";
    }
   }

   my @missing;
   foreach(@darr){
     my $seg1 = $_;
     my $found;
     # search in %barr first
    my ($name, $elements);
    while (($name, $elements) = each(%barr))
    {
	  if($debug){print "search for $seg1 in barr, $name has $elements\n";}
      if($name eq $seg1){
           $found = 1;
           if($debug){print "found $seg1\n";}
           #last;
      }
    }
    if($debug){if(!$found){ print "not found $seg1 yet\n";}}
    # search in @b1arr-%repl second
    if(!$found){
      foreach(@b1arr){  
        if(defined($repl{$_})){
         my $seg2 = $repl{$_};
         if($seg2 =~ $seg1){
            if($debug){print "found $seg1 in b1arr/tpl $seg2\n";}
            $found = 1; 
            last;
         }
        }
      }
    } 
    if(!$found){ 
       my $found2;
       foreach(@missing){
        if($_ eq $seg1) {$found2 = 1; last;}
       }
       if(!$found2){push(@missing, $seg1);}
    }
   }
    my ($out, $count, $var);
    $var = "Variable";
    foreach(@missing){
      $out .= $_.' ';
      if(++$count > 1){$var = "Variables";}
    }
    if(length($out)){ print "Warning: $var $out missing from alphabet\n";}
    else {print "All variables found in alphabet\n";}
}
#
# Utility for variable check
#
sub check_for_double_element($$$){
  my ($arr, $elem, $name) = @_;
  my @sa = split(/\s+/, $arr);
  foreach(@sa){
     if($_ eq $elem){
         print "Element $elem twice in $name\n";
     }
  }
}
use strict;
#################### dc_contin.pl ###############xxx
#
# put together lines separated by \ (continuation lines)
#
my (@output, $ol, $contin);
while(<STDIN>){
    chomp;
    #print "cont:$contin line:$_\n";
    if($_ =~ /(.*)\\\s*$/){
       #print "itt, $_\n";
       if($contin){
          $ol .= ' '.$1;
       }else{
          $ol = $1;
       }
       $contin = 1;
    } else{
      if($contin){
          $ol .= ' '.$_;
          $contin = 0;
          push(@output, $ol);
          $ol = '';
      } else{
          push(@output, $_);
      }
    }                
}
foreach(@output){
   # replace in agreement variables $=abc$ with $_abc$
   $_ =~ s/\$=([^ ]*)\$/\$_$1\$/g;
   $_ =~ s/\#=([^ ]*)\#/\#_$1\#/g;
   print "$_\n";
}
use strict;
#################### dc_osszefuz.pl ###############xxx
#
# exclude comments
# include includes
#
my (@output, @arr);
while(<STDIN>){
   chomp;
   if($_ =~ /^#include\s+"(.*)"/){
      open FILE, "$1" or die $1;
      my @lines = <FILE>;
      close FILE;
      foreach(@lines){
       chomp;
       @arr = split(/%/, $_);
       if(length($arr[0])){
        push(@output, $arr[0]);
       }
      } 
   } else{
      @arr = split(/%/, $_);
      if(length($arr[0])){
        push(@output, $arr[0]);
      }
   }
}
foreach(@output){
   print "$_\n";
}

Lásd még[szerkesztés]