| 1 | #!/usr/bin/perl -w |
|---|
| 2 | # A script to extract most parser-related information from STD.pm, |
|---|
| 3 | # in a simple "functions" form (runable in perl5/ruby/etc), |
|---|
| 4 | # to aid the development of new parsers in p6 bootstrap efforts, |
|---|
| 5 | # by making the information more accessible. |
|---|
| 6 | # |
|---|
| 7 | # Eg, |
|---|
| 8 | # class Term does PrecOp[|%term] {} |
|---|
| 9 | # becomes something like |
|---|
| 10 | # class_does_PrecOp('term'); |
|---|
| 11 | # |
|---|
| 12 | # Why functions and not yaml? I've found it easier to read and use. |
|---|
| 13 | # |
|---|
| 14 | # What is the difference vis cheat and metholate? |
|---|
| 15 | # Their focus is on code transformation, with a single target. |
|---|
| 16 | # Transforming STD.pm into p6 a pugs-level implementation can run directly. |
|---|
| 17 | # This script is more literary decomposition. Providing human readable and |
|---|
| 18 | # metaprogramming bits for the "hand" creation of a bootstrap p6 parser in |
|---|
| 19 | # non-p6 programming languages. Only really has to succeed once of course. |
|---|
| 20 | # cheat and metholate would clearly be sufficient if we had an activily |
|---|
| 21 | # developed pugs-level p6 implementation, or the hope of having one soon. |
|---|
| 22 | # That doesn't appear to be the case, and it's not clear 'runs on pugs' is |
|---|
| 23 | # sufficient? |
|---|
| 24 | # |
|---|
| 25 | # Development |
|---|
| 26 | # Set of constructs recognized (almost?) complete. |
|---|
| 27 | # Rest sufficiently unique to likely to remain residue. |
|---|
| 28 | # Todo |
|---|
| 29 | # inherit proto args |
|---|
| 30 | # condition arguments |
|---|
| 31 | # code and pattern element extraction is still quite buggy. |
|---|
| 32 | # check output is syntactically correct (p5,rb,py?). |
|---|
| 33 | # inquire questions |
|---|
| 34 | # |
|---|
| 35 | # Questions |
|---|
| 36 | # |
|---|
| 37 | # |
|---|
| 38 | # Notes |
|---|
| 39 | # Q: why isn't |
|---|
| 40 | # proto rule statement_control (:$endsym is context = / \s <nofat> /) { } |
|---|
| 41 | # instead \s+ ? |
|---|
| 42 | # A: because statement control rules will eat \s+ anyway via sigspace, |
|---|
| 43 | # and because this ought to have been a lookahead anyway. |
|---|
| 44 | |
|---|
| 45 | use warnings; |
|---|
| 46 | use strict; |
|---|
| 47 | use utf8; |
|---|
| 48 | use Regexp::Common; |
|---|
| 49 | |
|---|
| 50 | binmode(STDOUT, ":utf8"); |
|---|
| 51 | binmode(STDERR, ":utf8"); |
|---|
| 52 | sub print_usage_and_exit { |
|---|
| 53 | print STDERR "Usage: $0 REPORT-DIRECTORY [INPUT-FILE]\n"; |
|---|
| 54 | exit(2); |
|---|
| 55 | } |
|---|
| 56 | print_usage_and_exit if not @ARGV or not -d $ARGV[0]; |
|---|
| 57 | my $dir = $ARGV[0]; $dir =~ s/\/$//; |
|---|
| 58 | my $std_file = $ARGV[1] || "./STD.pm"; |
|---|
| 59 | open(F,"<$std_file") or die $!; binmode(F,":utf8"); |
|---|
| 60 | my $std = join("",<F>); close(F); |
|---|
| 61 | local $_ = $std; |
|---|
| 62 | |
|---|
| 63 | sub spew { |
|---|
| 64 | my($fn,$s)=@_; |
|---|
| 65 | open(F,">$dir/$fn") or die; |
|---|
| 66 | binmode(F, ":utf8"); |
|---|
| 67 | print F $s; |
|---|
| 68 | close(F); |
|---|
| 69 | } |
|---|
| 70 | |
|---|
| 71 | sub remove_clutter { |
|---|
| 72 | s/^ *\#.*\n//mg; |
|---|
| 73 | s/ \# .*//mg; |
|---|
| 74 | s/(?<=\s)\#\'//g; |
|---|
| 75 | #s/^=begin\s+(.*?\S)\s*\n(?:.|\n)*?^=end\s+\1\s*\n//mg; |
|---|
| 76 | # no, eg, =begin perlhints <-> matched by =end perlhints . |
|---|
| 77 | s/^=begin\s+(\S+).*\n(?:.|\n)*?^=end\s+\1.*\n//mg; |
|---|
| 78 | |
|---|
| 79 | # grammar Perl:ver<6.0.0.alpha>:auth<http://perl.org>; |
|---|
| 80 | s/^grammar.*//m; |
|---|
| 81 | # BEGIN { say "compiling STD" } |
|---|
| 82 | s/^BEGIN { say.*//m; |
|---|
| 83 | |
|---|
| 84 | # say "Starting..."; to end-of-file |
|---|
| 85 | s/^say .*//ms; |
|---|
| 86 | |
|---|
| 87 | while(s/\n *\n/\n/g){} |
|---|
| 88 | s/^\n+//; |
|---|
| 89 | } |
|---|
| 90 | sub quote { |
|---|
| 91 | my($s)=@_; |
|---|
| 92 | $s =~ s/\\/\\\\/g; |
|---|
| 93 | $s =~ s/\'/\\\'/g; |
|---|
| 94 | $s; |
|---|
| 95 | } |
|---|
| 96 | sub replace_sym { |
|---|
| 97 | my($sym,$s)=@_; |
|---|
| 98 | $sym = quote($sym); |
|---|
| 99 | $s =~ s/<sym>/'$sym'/g; # these '' are for the rx literal. |
|---|
| 100 | $s; |
|---|
| 101 | } |
|---|
| 102 | sub strip_oneliners { # bad idea - ws is significant. |
|---|
| 103 | my($s)=@_; |
|---|
| 104 | return $s if $s =~ /\n/; |
|---|
| 105 | $s =~ s/^\s+//; |
|---|
| 106 | $s =~ s/\s+$//; |
|---|
| 107 | return $s; |
|---|
| 108 | } |
|---|
| 109 | sub remove_solo_whatever { |
|---|
| 110 | my($s)=@_; |
|---|
| 111 | my $tmp = $s; $tmp =~ s/{\*\}//; $s = $tmp if $tmp !~ /{\*\}/; |
|---|
| 112 | return $s; |
|---|
| 113 | } |
|---|
| 114 | sub extract { |
|---|
| 115 | local $_ = $_; |
|---|
| 116 | my $braces = "{([^\n]*?|.*?^)}"; #/sm |
|---|
| 117 | my $out = ""; |
|---|
| 118 | my($data,$rest,$both) = ("","",""); |
|---|
| 119 | my @data_list; |
|---|
| 120 | my @patterns; |
|---|
| 121 | |
|---|
| 122 | # Resist bitrot |
|---|
| 123 | my %extractors=map{($_,1)} qw( |
|---|
| 124 | cls1 cls2 const methodsub |
|---|
| 125 | proto_empty proto_endsym_nofat proto_endsym proto_defeq |
|---|
| 126 | stmt1 stmt2 stmt_mod stmt_prefix |
|---|
| 127 | category |
|---|
| 128 | rul_sym rul_plain |
|---|
| 129 | ); |
|---|
| 130 | my %used = map{($_,undef)} keys(%extractors); |
|---|
| 131 | |
|---|
| 132 | my $do_extraction = sub { |
|---|
| 133 | while (1) { |
|---|
| 134 | if(undef) {} |
|---|
| 135 | # constant |
|---|
| 136 | elsif(/\A(constant \%(\w+) += {\s*(.*?\S) *}; *\n)/) { |
|---|
| 137 | my($all,$name,$prec)=($1,$2,$3); $used{const}=1; |
|---|
| 138 | $prec =~ s/ +/ /g; |
|---|
| 139 | $prec =~ s/:(prec|assoc)<(.*?)>,?/$2/g; |
|---|
| 140 | $out .= "constant_prec('$name','$prec');\n"; |
|---|
| 141 | $_ = substr($_,length($all)); |
|---|
| 142 | } |
|---|
| 143 | # category |
|---|
| 144 | elsif(/\A(token category:(\w+) +{( <sym> )} *\n)/) { |
|---|
| 145 | push(@patterns,$3); $used{category}=1; |
|---|
| 146 | $out .= "token_category('$2');\n"; |
|---|
| 147 | $_ = substr($_,length($1)); |
|---|
| 148 | } |
|---|
| 149 | # proto |
|---|
| 150 | elsif(/\A(proto token (\w+) { } *\n)/) { $used{proto_empty}=1; |
|---|
| 151 | $out .= "proto_token_simple('$2');\n"; |
|---|
| 152 | $_ = substr($_,length($1)); |
|---|
| 153 | } |
|---|
| 154 | elsif(/\A(proto (token|rule) +(\w+) \(:\$endsym is context = '(nofat)'\) { } *\n)/) { |
|---|
| 155 | push(@patterns,$4); $used{proto_endsym_nofat}=1; |
|---|
| 156 | $out .= "proto_endsym_nofat_angle_$2('$3');\n"; |
|---|
| 157 | $_ = substr($_,length($1)); |
|---|
| 158 | } |
|---|
| 159 | elsif(/\A(proto token (\w+) is defequiv\(%(\w+)\) { } *\n)/) { |
|---|
| 160 | $out .= "proto_token_defequiv('$2','$3');\n"; $used{proto_defeq}=1; |
|---|
| 161 | $_ = substr($_,length($1)); |
|---|
| 162 | } |
|---|
| 163 | elsif(/\A(proto (token|rule) +(\w+) \(:\$endsym is context = '(\w+)'\) { } *\n)/) { |
|---|
| 164 | push(@patterns,$4); $used{proto_endsym}=1; |
|---|
| 165 | $out .= "proto_endsym_other_${2}('$3','$4');\n"; |
|---|
| 166 | $_ = substr($_,length($1)); |
|---|
| 167 | } |
|---|
| 168 | # class |
|---|
| 169 | elsif(/\A(class (\w+) +does PrecOp\[\|%(.*?)\] *{} *\n)/) { |
|---|
| 170 | my($all,$name,$prec)=($1,$2,$3); $used{cls1}=1; |
|---|
| 171 | die "assert" if lc($name) ne $prec; |
|---|
| 172 | $out .= "class_does_PrecOp('$prec');\n"; |
|---|
| 173 | $_ = substr($_,length($all)); |
|---|
| 174 | } |
|---|
| 175 | elsif(/\A(class (\w+) +does QLang +{(.*?)^} *\n)/sm) { |
|---|
| 176 | my($all,$name,$block)=($1,$2,$3); $used{cls2}=1; |
|---|
| 177 | $block = quote($block); |
|---|
| 178 | $out .= "class_does_QLang('$name','$block');\n"; |
|---|
| 179 | $_ = substr($_,length($all)); |
|---|
| 180 | } |
|---|
| 181 | # method and sub |
|---|
| 182 | elsif(/\A((multi )?(method|sub) +(\w+) *\(([^\)]*?)\)[ \n]*{(.*?)^} *\n)/sm) { |
|---|
| 183 | my($all,$multi,$methsub,$name,$args,$block)=($1,($2||""),$3,$4,$5,$6); $used{methodsub}=1; |
|---|
| 184 | $multi = 'multi_' if $multi; |
|---|
| 185 | $args = quote($args); |
|---|
| 186 | $block = quote($block); |
|---|
| 187 | $out .= "grammar_$multi$methsub('$name','$args','$block');\n"; |
|---|
| 188 | $_ = substr($_,length($all)); |
|---|
| 189 | } |
|---|
| 190 | # statement |
|---|
| 191 | elsif(/\A(rule statement_control:(\w+) +{(<sym> <block> {\*} )}.*\n)/) { |
|---|
| 192 | push(@patterns,$3); $used{stmt1}=1; |
|---|
| 193 | $out .= "rule_statement_control_block('$2');\n"; |
|---|
| 194 | $_ = substr($_,length($1)); |
|---|
| 195 | } |
|---|
| 196 | elsif(/\A(rule statement_control:(\w+) +{\\ *\n(.*?\n)} *\n)/s) { |
|---|
| 197 | my($all,$name,$pat)=($1,$2,$3); $used{stmt2}=1; |
|---|
| 198 | push(@patterns,$pat); |
|---|
| 199 | $pat = quote(replace_sym($name,$pat)); |
|---|
| 200 | $out .= "rule_statement_control('$name','$pat');\n"; |
|---|
| 201 | $_ = substr($_,length($all)); |
|---|
| 202 | } |
|---|
| 203 | elsif(/\A(rule statement_mod_(cond|loop):(\w+) +{(<sym> <modifier_expr> {\*} )}.*\n)/) { |
|---|
| 204 | push(@patterns,$4); $used{stmt_mod}=1; |
|---|
| 205 | $out .= "rule_statement_mod_$2('$3');\n"; |
|---|
| 206 | $_ = substr($_,length($1)); |
|---|
| 207 | } |
|---|
| 208 | elsif(/\A(token statement_prefix:(\w+) +{( <sym> <statement> {\*} )}.*\n)/) { |
|---|
| 209 | push(@patterns,$3); $used{stmt_prefix}=1; |
|---|
| 210 | $out .= "token_statement_prefix('$2');\n"; |
|---|
| 211 | $_ = substr($_,length($1)); |
|---|
| 212 | } |
|---|
| 213 | # rul foo:sym |
|---|
| 214 | elsif( |
|---|
| 215 | /\A((token|rule)[ ] |
|---|
| 216 | ([a-z_]+): |
|---|
| 217 | (?:sym.[ ]* (\S+? (?:[ ]\S+)?) [ ]*[»>}\]\"\'] |
|---|
| 218 | |(\w+)) |
|---|
| 219 | (?:[ ]*\((?:[ ]*-->[ ]*(\w+)[ ]*)?\))? |
|---|
| 220 | [ \n]* |
|---|
| 221 | $braces |
|---|
| 222 | (?:[ ]*\#=[ ]([^\n]+))? |
|---|
| 223 | [ ]*\n |
|---|
| 224 | (?=\w|\z)) |
|---|
| 225 | /smx) |
|---|
| 226 | { |
|---|
| 227 | my($all,$tr,$cat,$sym,$sym2,$type,$pat,$callit)=($1,$2,$3,$4,$5,($6||""),$7,($8||"")); $used{rul_sym}=1; |
|---|
| 228 | push(@patterns,$pat); |
|---|
| 229 | $sym = $sym2 if not defined($sym); |
|---|
| 230 | $pat = remove_solo_whatever($pat); |
|---|
| 231 | $pat = '' if $pat =~ /^\s*<sym>\s*$/; |
|---|
| 232 | $pat = '' if $pat =~ /^\s*\'\Q$sym\E\'\s*$/; |
|---|
| 233 | $callit = '' if $callit eq $sym; |
|---|
| 234 | $pat = quote(replace_sym($sym,$pat)); |
|---|
| 235 | $type = lc $type; |
|---|
| 236 | $out .= "${tr}_op('$cat','$sym','$type','$pat','$callit');\n"; |
|---|
| 237 | $_ = substr($_,length($all)); |
|---|
| 238 | } |
|---|
| 239 | # rul foo |
|---|
| 240 | elsif(/\A((token|rule|regex) (\w+)(?: *\(([^\)]*)\))? *$braces *\n)/sm) { |
|---|
| 241 | my($all,$tr,$name,$args,$pat)=($1,$2,$3,$4,$5); $used{rul_plain}=1; |
|---|
| 242 | push(@patterns,$pat); |
|---|
| 243 | $pat = remove_solo_whatever($pat); |
|---|
| 244 | $pat = quote(replace_sym($name,$pat)); |
|---|
| 245 | if(defined $args) { |
|---|
| 246 | $out .= "${tr}_pattern_with_args('$name','$args','$pat');\n"; |
|---|
| 247 | }else{ |
|---|
| 248 | $out .= "${tr}_pattern('$name','$pat');\n"; |
|---|
| 249 | } |
|---|
| 250 | $_ = substr($_,length($all)); |
|---|
| 251 | } |
|---|
| 252 | # interior lines |
|---|
| 253 | elsif(/\A(.*\n)/) { |
|---|
| 254 | $rest .= $1; |
|---|
| 255 | $both .= $1; |
|---|
| 256 | $_ = substr($_,length($1)); |
|---|
| 257 | } |
|---|
| 258 | # done |
|---|
| 259 | elsif(/\A\z/) { |
|---|
| 260 | last; |
|---|
| 261 | } |
|---|
| 262 | else { |
|---|
| 263 | die "ERROR: match failed. File doesnt end in newline?\n$_"; |
|---|
| 264 | } |
|---|
| 265 | if($out =~ /^(?!\A)\w/m) { |
|---|
| 266 | die "BUG: a regex overshot, matching:\n$out"; |
|---|
| 267 | } |
|---|
| 268 | $data .= $out; |
|---|
| 269 | $both .= $out; |
|---|
| 270 | push(@data_list,$out); |
|---|
| 271 | $out = ""; |
|---|
| 272 | } |
|---|
| 273 | }; |
|---|
| 274 | |
|---|
| 275 | my $wrestle_with_Regex = sub { |
|---|
| 276 | my($body)=@_; |
|---|
| 277 | $body =~ s/^ //mg; |
|---|
| 278 | s/^(token )(ws|stdstopper) /${1}regex__$2 /mg; |
|---|
| 279 | $body; |
|---|
| 280 | }; |
|---|
| 281 | s/^grammar Regex is Perl { *\n(.*?)^}/$wrestle_with_Regex->($1)/mse |
|---|
| 282 | or die "Couldn't find Regex to wrestle with it."; |
|---|
| 283 | |
|---|
| 284 | $do_extraction->(); |
|---|
| 285 | |
|---|
| 286 | # Resist bitrot |
|---|
| 287 | my $problem = ""; |
|---|
| 288 | for my $k (keys(%used)) { |
|---|
| 289 | $problem .= "An undeclared extractor $k was reported used. Bug.\n" |
|---|
| 290 | if(!$extractors{$k}); |
|---|
| 291 | $problem .= "Extractor $k was not used. The STD.pm format must have changed.\n" |
|---|
| 292 | if(!$used{$k}); |
|---|
| 293 | } |
|---|
| 294 | print STDERR $problem; |
|---|
| 295 | #die "Aborting.\n" if $problem; |
|---|
| 296 | |
|---|
| 297 | spew("items-all.pm",$both); |
|---|
| 298 | spew("items-data.pm",$data); |
|---|
| 299 | spew("items-nondata.pm",$rest); |
|---|
| 300 | |
|---|
| 301 | my @sorted = sort { |
|---|
| 302 | my $atop = $a=~/^(.+)/;my $btop = $b=~/^(.*)/; $a cmp $b } @data_list; |
|---|
| 303 | spew("items-data-sorted.pm",join("",@sorted)); |
|---|
| 304 | |
|---|
| 305 | open(F,"|grep '^\\w'|sort>$dir/items-summary") or die; |
|---|
| 306 | binmode(F, ":utf8"); |
|---|
| 307 | print F $both; |
|---|
| 308 | close(F); |
|---|
| 309 | |
|---|
| 310 | pattern_analysis(\@patterns); |
|---|
| 311 | } |
|---|
| 312 | sub pattern_analysis { |
|---|
| 313 | my($patterns)=@_; |
|---|
| 314 | my $pats = join("", map{/\n\z/?$_:"$_\n"} @$patterns); |
|---|
| 315 | local $_ = $pats; |
|---|
| 316 | s/ +/ /g; |
|---|
| 317 | s/\n /\n/g; |
|---|
| 318 | my @code; |
|---|
| 319 | my $re = $RE{balanced}{-parens=>'{}()<>[]'}; |
|---|
| 320 | while(s/(?:(?<=\s)|(?<=^))(\{(?!(?:\.\.\.|\*)\})$re})/{...}/ms) { |
|---|
| 321 | my $code = $1; |
|---|
| 322 | $code =~ s/\n/ /g; |
|---|
| 323 | push(@code,$code."\n"); |
|---|
| 324 | } |
|---|
| 325 | while(/^\s*(:my.*)/mgc) { |
|---|
| 326 | push(@code,$1."\n"); |
|---|
| 327 | } |
|---|
| 328 | |
|---|
| 329 | my $preserve_spaces = sub {my($s)=@_;$s=~s/ /\014/g;$s}; |
|---|
| 330 | s/(:my.+)/$preserve_spaces->($1)/eg; |
|---|
| 331 | s/((?<!\')#.+)/$preserve_spaces->($1)/eg; |
|---|
| 332 | |
|---|
| 333 | $re = $RE{delimited}{-delim=>'"'}; |
|---|
| 334 | #s/(?<=\s)($re)/$preserve_spaces->($1)/eg; |
|---|
| 335 | $re = $RE{delimited}{-delim=>"'"}; |
|---|
| 336 | #s/(?<=\s)($re)/$preserve_spaces->($1)/eg; |
|---|
| 337 | $re = $RE{delimited}{-delim=>'/'}; |
|---|
| 338 | #s/(?<=\s)($re)/$preserve_spaces->($1)/eg; |
|---|
| 339 | $re = $RE{balanced}{-parens=>'<>'}; |
|---|
| 340 | s/(?<=\s)($re)/$preserve_spaces->($1)/eg; |
|---|
| 341 | |
|---|
| 342 | s/ /\n/g; |
|---|
| 343 | s/\014/ /g; |
|---|
| 344 | s/\n+/\n/g; s/\A\n//; |
|---|
| 345 | my $elements = $_; |
|---|
| 346 | |
|---|
| 347 | spew("pattern-dump",$pats); |
|---|
| 348 | spew("pattern-elements",$elements); |
|---|
| 349 | do { |
|---|
| 350 | my $elems = $elements; |
|---|
| 351 | local $_ = $elems; |
|---|
| 352 | s/^#= .+/#= xxx/mg; |
|---|
| 353 | $elems = $_; |
|---|
| 354 | my @e = map{"$_\n"} split(/\n/,$elems); |
|---|
| 355 | my %u = map{($_,1)} @e; |
|---|
| 356 | my $uniq = join("",sort keys %u); |
|---|
| 357 | spew("pattern-elements-unique",$uniq); |
|---|
| 358 | }; |
|---|
| 359 | spew("pattern-code",join("",@code)); |
|---|
| 360 | } |
|---|
| 361 | |
|---|
| 362 | spew("file_0.pm",$_); |
|---|
| 363 | remove_clutter; |
|---|
| 364 | spew("file_1.pm",$_); |
|---|
| 365 | system("cd $dir; diff -u file_0.pm file_1.pm > file_1.diff"); |
|---|
| 366 | extract; |
|---|
| 367 | |
|---|
| 368 | #; Local Variables: |
|---|
| 369 | #; perl-indent-level: 2 |
|---|
| 370 | #; End: |
|---|
| 371 | #; vim: shiftwidth=2: |
|---|