SFST szintaxis vizsgálat
Innen: Programozás Wiki
Ugrás a navigációhozUgrás a kereséshez################ dc_readme of dc_check sfst syntax check suite #############
#
# Syntax check suite for sfst fst files
# Usage:
# 1. copy all dc_* files into your working directory
# 2. sh dc_do_check.sh your_fst_file.fst - or
# sh dc_checkall.sh - to check all *.fst files
#
# Sample program dc_x.fst compiles usig fst-compiler:
#---------------------------------------------------
#fst-compiler-utf8 dc_x.fst dc_x.a
#x.fst: 2
# symbols1.fst: 35
#x.fst: 69
#x.fst:69: warning: assignment of empty transducer to: $rp$
xx.fst: 89
# ninfl1.fst: 77
#x.fst: 92
#x.fst:92: warning: assignment of empty transducer to: $ma$
#x.fst: 96
#reading words from noun-reg1.lex...finished
#x.fst: 99
#x.fst:99: warning: assignment of empty transducer to: $phon1$
#x.fst: 100
#
# Fst-compiler reports empty transducers, but gives no hint whatsoever
# to the exact location of the problem(s)
# Even if there are no empty transducers, the fst-compiler result can be
# missing or completely wrong due to definition and
# setting errors, program inconsistencies, etc...
#
# Syntax checker dc_check helps to locate the problem(s).
#
# Expected result:
# -----------------
# sh dc_do_check.sh dc_x.fst
# Warning: transducer rp3 redefined near line 30
# Warning: unused transducers: $j2$ $rp5$ $rp8$ $ine1$
# Warning: near to line 31 variable #vowel name starting with # and not ending with #: #vowel
# Warning: near to line 39 variable $rp5 name starting with $ and not ending with $: $rp5
# Warning: Too many definitions for oeoez1 element:o <> defs:o a cas:2 bas:2
# Warning: Too many definitions for J element:j <> defs:<> j <> cas:3 bas:2
# Warning: Variable OOEE missing from alphabet
# Warning: Variables plvp oeoez1 oooe plv oeoez2 AAA oeoe AA plv3 oeoez3 uue VV OOEE plv1 EOA plv2 oeoez plvs OEOE1 J AE1 UUE OEOE UUE1 JP AE missing from alphabet
#
#
# Checks done are:
# 1. Does setting set value for all ambigous alphabet chars?
# 2. Is setting only for by alphabet allowed chars?
# 3. Warn, if a variable is doubly defined in alphabet with the same value.
# This is very likely a typing error
# 4. Warn, if setting count >= alphabet values
# This would cause some not shown word forms
# 5. Are all inflection variables used in alphabet?
# 6. Do all names starting with $ or # also end with $ or #? (warning)
# 7. Are all defined transducers used somehow?
# If not, this is very likely an error in the logic
# 8. Check for redefined transducers, probaly wrong naming
# 9. Check for undefined but used transducers
#
#
# If *.a files are used instead of sources,
# you must put together a source file in order to make the syntax check.
# #include files are allowed and handled properly by dc_do_check.sh
# test suite handles properly besides include also comments and
# continuation lines
#
# dc_checkall.sh checks all fst files in a directory
# using dc_do_check.sh
#
# Requires a working perl interpreter on the system
#
# Files:
# dc_checkvar1.pl, dc_contin.pl dc_osszefuz.pl
# dc_do_check.sh dc_checkall.sh dc_x.fst dc_symbols1.fst dc_ninfl1.fst
# dc_readme dc_sav.sh
#
# Simplifications:
# dc_check modifies internally in agreement variables name the '=' to '_' for
# better handling ($=abc$ -> $_abc$) during the test, and prints
# the modified name.
# for simplicity, dc_check does not fulfil in included files
# embedded include commands and reports found #include statements
# as erroneous #..# variables.
#
# dc_check was tested with:
# trmorph, SMOR, XMOR, Omorfi for sfst (Finnish sfst) and Humorph.
# dc_check is a great help for almost all of the above; the only exception
# is SMOR with lots of unusual variable names
# ($..$-s with embedded $-s, <..> with embedded $ and the like), where
# dc_check reports numerous unjustified variable name inconsistencies.
#
# If there are questions/problems/requests, please report
# to:
#
############### dc_sav.sh ##################
#
tar cvf dc_check.tar dc_*.sh dc_*.pl dc_x.fst dc_symbols1.fst dc_ninfl1.fst dc_readme dc_sav.sh
rm -rf dc_check.tar.gz
gzip dc_check.tar
cp dc_check.tar.gz ../../sich
#################### dc_checkall.sh ###############xxx
#
# check all *.fst files in a directory
# using dc_do_check.sh
#
for f in *.fst
do
echo "-------processing $f"
sh dc_do_check.sh $f
done
#################### dc_do_check.sh ###############
#
# check suite for sfst fst files
# check a file using dc_osszefuz.pl dc_contin.pl and dc_checkvar1.pl
#
perl dc_osszefuz.pl <$1 >$1.check1
perl dc_contin.pl <$1.check1 >$1.check2
perl dc_checkvar1.pl <$1.check2
rm -rf $1.check1 $1.check2
use strict;
#################### dc_checkvar1.pl ###############xxx
#
# check for undefined transducers
# check for unused and redefined transducers
# check for variables starting with $ or # also end with $ or #
# check if <> variables of an alphabet are set up properly
#
my (@arr, $as, $folytatas, %barr, @b1arr, %carr, $was_alphabet);
# pattern for transducer and #..# variables name
my $pat1 = "A-Za-z0-9_\$\#\/+&\(\)\=";
my @input = <STDIN>;
#
# check for undefined transducers
#
my ($linenr, @def_arr, $sda, $out, $out1, $debug);
$debug = 0;
foreach(@input){
++$linenr;
chomp;
my ($l0, $l1,@list, @list0, @list1);
my $current = $_;
my @arr = split(/=/, $current);
my $as = @arr;
if($arr[0] =~ /\$([$pat1\-]*)/){
@list = ($arr[0] =~ /\$([$pat1\-]*)/g);
foreach(@list){
# print "element:$_ last:$last\n";
my $ix = is_in($_, '$');
if($ix != -1){
my $seg = substr($_,0,$ix);
#print "elem:'$_' seg:'$seg'\n";
push(@list0, $seg);
}
}
}
if(defined($arr[1]) and $arr[1] =~ /\$([$pat1\-]*)/){
@list = ($arr[1] =~ /\$([$pat1\-]*)/g);
foreach(@list){
# print "element:$_ last:$last\n";
my $ix = is_in($_, '$');
if($ix != -1){
my $seg = substr($_,0,$ix);
#print "2elem:'$_' seg:'$seg'\n";
push(@list1, $seg);
}
}
}
$l0 = @list0;
$out = '';
$out1 = '';
$sda = @def_arr;
if($debug>2){
my $sds = @def_arr;
if($sds){
print "line:$current sda:@def_arr\n";
}
}
if($sda){
for(my $i = 0; $i < $sda; $i++){
my $elem = $def_arr[$i];
$out1 .= '_'.$elem;
}
$out1 .= '_';
}
if($l1){
for(my $i = 0; $i < $l1; $i++){
my $elem = $list1[$i];
$out .= '_'.$elem;
if($out1 !~ /$elem/ and $as > 1 ){
print "Warning: using undefined transducer $elem near line $linenr\n";
}
}
$out .= '_';
}
#print "l0:$l0 as:$as $current\n";
if($l0 and $as < 2){
for(my $i = 0; $i < $l0; $i++){
my $elem = $list0[$i];
$out .= '_'.$elem;
if($out1 !~ /$elem/ ){
print "Warning: using undefined transducer $elem near line $linenr\n";
}
}
$out .= '_';
}
# fill defined transducer list
if($l0 > 1 or $l1> 0){
if($debug>3){print "line:$linenr cur:$current as:$as l1:$l0 l1:$l1 darr:@def_arr out:$out\n";}
}
if($as > 1 and $l0 ){
for(my $i = 0; $i < $l0; $i++){
my $elem = $list0[$i];
my $elem1 = '_'.$elem.'_';
if($out !~ /$elem1/ ){
my $found;
foreach(@def_arr){
if($_ eq $elem){ $found = 1; last; }
}
if(!$found) {
push(@def_arr,$elem);
if($debug>3){print "after add $elem in def_arr: @def_arr\n";}
#foreach(@def_arr){
# print "after add in def_arr: $_\n";
#}
} #found
} # ($out !~ /$elem1/)
} # for $i
} #$as > 1 and $l0
} # foreach input
#
# check for unused and redefined transducers
#
my ($linenr, @def_arr, $sda, $out, $debug);
$debug = 0;
foreach(@input){
++$linenr;
chomp;
my ($l0, $l1,@list, @list0, @list1);
my $current = $_;
my @arr = split(/=/, $current);
my $as = @arr;
if($arr[0] =~ /\$([$pat1\-]*)/){
@list = ($arr[0] =~ /\$([$pat1\-]*)/g);
foreach(@list){
my $ix = is_in($_, '$');
if($ix != -1){
my $seg = substr($_,0,$ix);
#print "2elem:'$_' seg:'$seg'\n";
push(@list0, $seg);
}
}
}
if(defined($arr[1]) and $arr[1] =~ /\$([$pat1\-]*)/){
@list = ($arr[1] =~ /\$([$pat1\-]*)/g);
foreach(@list){
# print "element:$_ last:$last\n";
my $ix = is_in($_, '$');
if($ix != -1){
my $seg = substr($_,0,$ix);
#print "2elem:'$_' seg:'$seg'\n";
push(@list1, $seg);
}
}
}
$l0 = @list0;
$l1 = @list1;
$out = '';
if($l1){
for(my $i = 0; $i < $l1; $i++){
my $elem = $list1[$i];
$out .= '_'.$elem;
}
$out .= '_';
}
# fill defined transducer list
if($l0 > 1 or $l1> 0){
if($debug>3){print "line:$linenr cur:$current as:$as l1:$l0 l1:$l1 darr:@def_arr out:$out\n";}
}
if($as > 1 and $l0 ){
for(my $i = 0; $i < $l0; $i++){
my $elem = $list0[$i];
my $elem1 = '_'.$elem.'_';
if($out !~ /$elem1/ ){
my $found;
foreach(@def_arr){
if($_ eq $elem){ $found = 1; last; }
}
if(!$found) {
push(@def_arr,$elem);
if($debug>3){print "after add $elem in def_arr: @def_arr\n";}
#foreach(@def_arr){
# print "after add in def_arr: $_\n";
#}
} #found
else{
print "Warning: transducer $elem redefined near line $linenr\n";
} # else
} # ($out !~ /$elem1/)
} # for $i
} #$as > 1 and $l0
if($as > 1 and $l0 and $l1){
# empty def_arr transducer list
my (@seg_arr);
for(my $i = 0; $i < $l1; $i++){
my $elem = $list1[$i];
if($debug>3){print "search for $elem in @def_arr list1:@list1 l0:$l0 l1:$l1 \n";}
foreach(@def_arr){
if($_ eq $elem){
push(@seg_arr, $elem);
if($debug>3){print "added $elem to @seg_arr\n";}
}
}
} # $i
if($debug>2){print "remove @seg_arr from @def_arr new def-arr:@def_arr\n";}
my $adefarr = remove_element_from_array(\@def_arr, \@seg_arr);
@def_arr = @$adefarr;
} # $l0 and $l1
elsif($as <=1 and $l0){
my (@seg_arr);
for(my $i = 0; $i < $l0; $i++){
my $elem = @list0[$i];
my $found = -1;
foreach(@def_arr){
if($_ eq $elem){
if($debug>3){print "2. remove from def_arr $elem\n";}
push(@seg_arr, $elem);
$found = $i;
last;
}
}
if($found != -1) {
my $adefarr = remove_element_from_array(\@def_arr, \@seg_arr);
@def_arr = @$adefarr;
if($debug>2){print "2.new def-arr:@def_arr\n";}
}
} # $i
} # $l0 only
my $dnr = @def_arr;
if($dnr){
# print "line $linenr def_arr:@def_arr\n";
}
} # foreach input
#
# print result for unused transducers
#
$out = '';
my ($cnt, $var);
$var = "transducer";
foreach(@def_arr){
$out .= '$'.$_.'$ ';
if(++$cnt > 1){$var = "transducers";}
}
if($out ne ''){
print "Warning: unused $var: $out\n";
}
#
# utility for unused transducers
# remove elements of arref, that are contained in sref
#
sub remove_element_from_array($$){
my ($arref, $sref) = @_;
my @def_arr = @$arref;
my @seg_arr = @$sref;
my @seg1_arr;
my $sa = @seg_arr;
my ($found, $debug);
if($debug){print "sa:$sa da:@def_arr\n";}
if($sa) {
foreach(@def_arr){
my $elem = $_;
$found = 0;
foreach(@seg_arr){
if ($_ eq $elem){
$found =1;
}
}
if(!$found){
push(@seg1_arr, $elem);
}
}
@def_arr = @seg1_arr;
@seg1_arr = ();
}
return \@def_arr;
}
#
# check for variables starting with $ or # also end with $ or #
#
my $linenr = 0;
foreach(@input){
++$linenr;
chomp;
my $current = $_;
if($current =~ /#([$pat1\-]*)/){
my @list = ($current =~ /#([$pat1\-]*)/g);
foreach(@list){
if(is_in($_, '#') == -1){
print "Warning: near to line $linenr variable \#$_ name starting with \# and not ending with \#: \#$_\n";
}
}
}
if($current =~ /\$([$pat1\-]*)/){
my @list = ($current =~ /\$([$pat1\-]*)/g);
foreach(@list){
# print "element:$_ last:$last\n";
if(is_in($_, '$') == -1){
print "Warning: near to line $linenr variable \$$_ name starting with \$ and not ending with \$: \$$_\n";
}
}
}
}
#
# is element in a string- because of $ must be this complicated
#
sub is_in($$){
my ($elem, $char)=@_;
my @arr = split(//,$elem);
my $ix;
foreach(@arr){
if($_ eq $char){ return $ix;}
++$ix;
}
return -1;
}
#
# replace all #name# variables with the fullest version
# result in hash %repl
#
my (%repl);
foreach(@input){
chomp;
if($_ =~ /#(.*)#\s*=\s*(.*)\s*$/){
my $seg2 = $2;
my $seg1 = $1;
my ($name, $elements);
while (($name, $elements) = each(%repl))
{
my $seg3 = '#'.$name.'#';
if($seg2 =~ /$seg3/){
$seg2 =~ s/$seg3/$elements/;
}
}
#print "writing $seg1 = $seg2\n";
$repl{$seg1} = $seg2;
}
}
#
# collect all inflection variables in array @darr
#
my (@darr);
foreach(@input){
chomp;
if($_ =~ /{?<(.*)>}?:{(.*)}\s*$/){
#print "1:$1 2:$2 l:$_\n";
my $seg = $2;
if($2 =~ /<(.*)>/){
#print "in 2 $2 seg:$seg\n";
$seg =~ s/>[a-záéóúűőüöí]*</ /g;
$seg =~ s/>[a-záéóúűőüöí]*\s*$/ /g;
#print "in 2 $1 seg:$seg\n";
$seg =~ s/</ /g;
#print "in 2 $1 seg:$seg l:$_\n";
my @arr = split(/\s+/, $seg);
foreach(@arr){
my $seg1 = $_;
my $found = 0;
foreach(@darr){
if($_ eq $seg1){$found = 1; last;}
}
if(!$found and $_ ne ''){
#print "write in darr:$_\n";
push(@darr, $_);
}
} #@arr
} # if $2
} # if $_
}
#
# check if <> variables of an alphabet are set up properly
# %barr: in alphabet defined vars
# %carr: transducers
# @b1arr:#..# vars
#
foreach(@input){
chomp;
if($_ =~ /ALPHABET\s*=/){
if($was_alphabet){
check_previous_alphabet();
%barr = ();
%carr = ();
@b1arr = ();
}
$was_alphabet = 1;
#print "1. $_\n";
@arr = split(/\s+/, $_);
$as = @arr;
for(my $i = 0; $i < $as; $i++){
#print "2. $arr[$i]\n";
if($arr[$i] =~ /<(.*)>:(.*)/){
#print "3. 1:$1 2:$2\n";
my $mas;
if(substr($2,0,1) eq '[' and substr($2, length($2)-1,1) eq ']'){
my $seg = substr($2,1);
$seg = substr($seg,0,length($seg)-1);
#print "seg:'$seg'\n";
my @marr = split(//, $seg);
my $ms = @marr;
for(my $i = 0; $i < $ms; $i++){
if($marr[$i] eq '<' or $marr[$i] eq '#'){
my $end = '>';
if($marr[$i] eq '#') {$end = '#';}
$mas .= $marr[$i];
$i++;
while($marr[$i] ne $end and $i < $ms){
$mas .= $marr[$i++]
}
if($marr[$i] eq $end){$mas .= "$end ";}
else {$mas .= ' ';}
} else{
$mas .= "$marr[$i] ";
}
}
$mas = substr($mas, 0, length($mas)-1);
} else {$mas = $2;}
if(!defined($barr{$1})){
#print "writing into barr 1:$1 2:$2\n";
$barr{$1} = $mas;
}
else{
check_for_double_element($barr{$1}, $2, $1);
$barr{$1} .= ' '.$mas;
}
# print "2:'$2' mas:'$mas'\n";
} elsif ($arr[$i] =~ /#(.*)#/){
my $seg1 = $1;
my $found;
foreach(@b1arr){
if($_ eq $seg1){$found = 1; last;}
}
if(!$found){
push(@b1arr, $seg1);
# print "added to b1arr:$seg1\n";
}
}
}
} # ALPHABET
else{
if($_ =~ /<([^ ]*)>\s*<=>\s*([^ ]*)/){
#print "4. match 1:$1 2:$2 l:$_\n";
if(!defined($carr{$1})){
$carr{$1} = $2;
}
else{
$carr{$1} .= ' '.$2;
}
}
}
}
sub check_previous_alphabet(){
my ($name, $elements, $wassign, $size, $debug);
$debug = 0;
if($debug){
$size = keys(%barr);
#print "bsize:$size\n";
if($size){
print "-----------------------barr\n";
while (($name, $elements) = each(%barr))
{
print "$name has $elements\n";
}
}
$size = keys(%carr);
if($size){
print "-----------------------carr\n";
while (($name, $elements) = each(%carr))
{
print "$name has $elements\n";
}
print "-----------------------\n";
}# size
} #debug
while (($name, $elements) = each(%barr))
{
#print "$name has $elements\n";
my (@ba, @ca, $bas, $cas);
@ba = split(/\s+/, $elements);
$bas = @ba;
$cas = 0;
if(defined($carr{$name})){
@ca = split(/\s+/, $carr{$name});
$cas = @ca;
} else{
if($bas > 1){
print "Definition missing for $name, element: $elements bas:$bas\n";
$wassign = 1;
}
}
if(!$wassign){
if($cas > $bas-1){
print "Warning: Too many definitions for $name element:$elements defs:$carr{$name} cas:$cas bas:$bas \n";
}elsif($cas < $bas -1){
print "Too few definitions for $name element:$elements defs:$carr{$name} cas:$cas bas:$bas \n";
}
if($cas){
my $found = 0;
foreach(@ca){
if($bas){
my $seg = $_;
foreach(@ba){
if($_ eq $seg){$found = 1; last;}
#else {print "elem:$_ seg:$seg\n";}
}
if(!$found){
print "Wrong definition for $name not found:$seg \n";
}
} #$bas
} #@ca
} #cas
if($bas > 1){
if(!$cas){ print "Undefined $name element:$elements \n";}
}
} # wassign
$wassign = 0;
}
check_if_alphabet_uses_all_inflect_variables();
}
#
# check if alphabet uses all inflect variables
# %repl contains fullest variable names
# @darr contains all inflection variables
# %barr contains all alphabet <...> variables
# @b1arr contains all alphabet #...# variables
# %carr contains all variable settings
# all saved names without <> or ##
#
sub check_if_alphabet_uses_all_inflect_variables(){
my ($debug, $name, $elements);
if($debug > 1){while (($name, $elements) = each(%barr))
{
print "barr: $name has $elements\n";
}
foreach(@darr){
print "darr:$_\n";
}
}
my @missing;
foreach(@darr){
my $seg1 = $_;
my $found;
# search in %barr first
my ($name, $elements);
while (($name, $elements) = each(%barr))
{
if($debug){print "search for $seg1 in barr, $name has $elements\n";}
if($name eq $seg1){
$found = 1;
if($debug){print "found $seg1\n";}
#last;
}
}
if($debug){if(!$found){ print "not found $seg1 yet\n";}}
# search in @b1arr-%repl second
if(!$found){
foreach(@b1arr){
if(defined($repl{$_})){
my $seg2 = $repl{$_};
if($seg2 =~ $seg1){
if($debug){print "found $seg1 in b1arr/tpl $seg2\n";}
$found = 1;
last;
}
}
}
}
if(!$found){
my $found2;
foreach(@missing){
if($_ eq $seg1) {$found2 = 1; last;}
}
if(!$found2){push(@missing, $seg1);}
}
}
my ($out, $count, $var);
$var = "Variable";
foreach(@missing){
$out .= $_.' ';
if(++$count > 1){$var = "Variables";}
}
if(length($out)){ print "Warning: $var $out missing from alphabet\n";}
else {print "All variables found in alphabet\n";}
}
#
# Utility for variable check
#
sub check_for_double_element($$$){
my ($arr, $elem, $name) = @_;
my @sa = split(/\s+/, $arr);
foreach(@sa){
if($_ eq $elem){
print "Element $elem twice in $name\n";
}
}
}
use strict;
#################### dc_contin.pl ###############xxx
#
# put together lines separated by \ (continuation lines)
#
my (@output, $ol, $contin);
while(<STDIN>){
chomp;
#print "cont:$contin line:$_\n";
if($_ =~ /(.*)\\\s*$/){
#print "itt, $_\n";
if($contin){
$ol .= ' '.$1;
}else{
$ol = $1;
}
$contin = 1;
} else{
if($contin){
$ol .= ' '.$_;
$contin = 0;
push(@output, $ol);
$ol = '';
} else{
push(@output, $_);
}
}
}
foreach(@output){
# replace in agreement variables $=abc$ with $_abc$
$_ =~ s/\$=([^ ]*)\$/\$_$1\$/g;
$_ =~ s/\#=([^ ]*)\#/\#_$1\#/g;
print "$_\n";
}
use strict;
#################### dc_osszefuz.pl ###############xxx
#
# exclude comments
# include includes
#
my (@output, @arr);
while(<STDIN>){
chomp;
if($_ =~ /^#include\s+"(.*)"/){
open FILE, "$1" or die $1;
my @lines = <FILE>;
close FILE;
foreach(@lines){
chomp;
@arr = split(/%/, $_);
if(length($arr[0])){
push(@output, $arr[0]);
}
}
} else{
@arr = split(/%/, $_);
if(length($arr[0])){
push(@output, $arr[0]);
}
}
}
foreach(@output){
print "$_\n";
}