root/misc/STD/STD_extract

Revision 20129, 11.2 kB (checked in by putter, 10 months ago)

misc/winter_jig/STD/STD_extract: Sync'ed with current STD.pm. Added more bitrot detection to make it easier next time.

  • Property svn:mime-type set to text/plain; charset=UTF-8
  • Property svn:eol-style set to native
  • Property svn:executable set to *
Line 
1#!/usr/bin/perl -w
2# A script to extract most parser-related information from STD.pm,
3# in a simple "functions" form (runable in perl5/ruby/etc),
4# to aid the development of new parsers in p6 bootstrap efforts,
5# by making the information more accessible.
6#
7# Eg,
8#   class Term            does PrecOp[|%term]                   {}
9# becomes something like
10#   class_does_PrecOp('term');
11#
12# Why functions and not yaml?  I've found it easier to read and use.
13#
14# What is the difference vis cheat and metholate?
15#  Their focus is on code transformation, with a single target.
16#  Transforming STD.pm into p6 a pugs-level implementation can run directly.
17#  This script is more literary decomposition.  Providing human readable and
18#  metaprogramming bits for the "hand" creation of a bootstrap p6 parser in
19#  non-p6 programming languages.  Only really has to succeed once of course.
20#  cheat and metholate would clearly be sufficient if we had an activily
21#  developed pugs-level p6 implementation, or the hope of having one soon.
22#  That doesn't appear to be the case, and it's not clear 'runs on pugs' is
23#  sufficient?
24#
25# Development
26#  Set of constructs recognized (almost?) complete.
27#    Rest sufficiently unique to likely to remain residue.
28# Todo
29#  inherit proto args
30#  condition arguments
31#   code and pattern element extraction is still quite buggy.
32#  check output is syntactically correct (p5,rb,py?).
33#  inquire questions
34
35# Questions
36#
37#
38# Notes
39#  Q: why isn't
40#    proto rule statement_control (:$endsym is context = / \s <nofat> /) { }
41#   instead \s+ ?
42#   A: because statement control rules will eat \s+ anyway via sigspace,
43#    and because this ought to have been a lookahead anyway.
44
45use warnings;
46use strict;
47use utf8;
48use Regexp::Common;
49
50binmode(STDOUT, ":utf8");
51binmode(STDERR, ":utf8");
52sub print_usage_and_exit {
53  print STDERR "Usage: $0 REPORT-DIRECTORY [INPUT-FILE]\n";
54  exit(2);
55}
56print_usage_and_exit if not @ARGV or not -d $ARGV[0];
57my $dir = $ARGV[0]; $dir =~ s/\/$//;
58my $std_file = $ARGV[1] || "./STD.pm";
59open(F,"<$std_file") or die $!; binmode(F,":utf8");
60my $std = join("",<F>); close(F);
61local $_ = $std;
62
63sub spew {
64  my($fn,$s)=@_;
65  open(F,">$dir/$fn") or die;
66  binmode(F, ":utf8");
67  print F $s;
68  close(F);
69}
70
71sub remove_clutter {
72  s/^ *\#.*\n//mg;
73  s/ \# .*//mg;
74  s/(?<=\s)\#\'//g;
75  #s/^=begin\s+(.*?\S)\s*\n(?:.|\n)*?^=end\s+\1\s*\n//mg;
76  # no, eg, =begin perlhints <-> matched by =end perlhints .
77  s/^=begin\s+(\S+).*\n(?:.|\n)*?^=end\s+\1.*\n//mg;
78 
79  # grammar Perl:ver<6.0.0.alpha>:auth<http://perl.org>;
80  s/^grammar.*//m;
81  # BEGIN { say "compiling STD" }
82  s/^BEGIN { say.*//m;
83 
84  # say "Starting...";   to end-of-file
85  s/^say .*//ms;
86 
87  while(s/\n *\n/\n/g){}
88  s/^\n+//;
89}
90sub quote {
91  my($s)=@_;
92  $s =~ s/\\/\\\\/g;
93  $s =~ s/\'/\\\'/g;
94  $s;
95}
96sub replace_sym {
97  my($sym,$s)=@_;
98  $sym = quote($sym);
99  $s =~ s/<sym>/'$sym'/g; # these '' are for the rx literal.
100  $s;
101}
102sub strip_oneliners { # bad idea - ws is significant.
103  my($s)=@_;
104  return $s if $s =~ /\n/;
105  $s =~ s/^\s+//;
106  $s =~ s/\s+$//;
107  return $s;
108}
109sub remove_solo_whatever {
110  my($s)=@_;
111  my $tmp = $s; $tmp =~ s/{\*\}//; $s = $tmp if $tmp !~ /{\*\}/;
112  return $s;
113}
114sub extract {
115  local $_ = $_;
116  my $braces = "{([^\n]*?|.*?^)}"; #/sm
117  my $out = "";
118  my($data,$rest,$both) = ("","","");
119  my @data_list;
120  my @patterns;
121
122  # Resist bitrot
123  my %extractors=map{($_,1)} qw(
124    cls1 cls2 const methodsub
125    proto_empty proto_endsym_nofat proto_endsym proto_defeq
126    stmt1 stmt2 stmt_mod stmt_prefix
127    category
128    rul_sym rul_plain
129  );
130  my %used = map{($_,undef)} keys(%extractors);
131
132  my $do_extraction = sub {
133    while (1) {
134      if(undef) {}
135      # constant
136      elsif(/\A(constant \%(\w+) += {\s*(.*?\S) *}; *\n)/) {
137        my($all,$name,$prec)=($1,$2,$3);  $used{const}=1;
138        $prec =~ s/ +/ /g;
139        $prec =~ s/:(prec|assoc)<(.*?)>,?/$2/g;
140        $out .= "constant_prec('$name','$prec');\n";
141        $_ = substr($_,length($all));
142      }
143      # category
144      elsif(/\A(token category:(\w+) +{( <sym> )} *\n)/) {
145        push(@patterns,$3);  $used{category}=1;
146        $out .= "token_category('$2');\n";
147        $_ = substr($_,length($1));
148      }
149      # proto
150      elsif(/\A(proto token (\w+) { } *\n)/) {  $used{proto_empty}=1;
151        $out .= "proto_token_simple('$2');\n";
152        $_ = substr($_,length($1));
153      }
154      elsif(/\A(proto (token|rule) +(\w+) \(:\$endsym is context = '(nofat)'\) { } *\n)/) {
155        push(@patterns,$4);  $used{proto_endsym_nofat}=1;
156        $out .= "proto_endsym_nofat_angle_$2('$3');\n";
157        $_ = substr($_,length($1));
158      }
159      elsif(/\A(proto token (\w+) is defequiv\(%(\w+)\) { } *\n)/) {
160        $out .= "proto_token_defequiv('$2','$3');\n";  $used{proto_defeq}=1;
161        $_ = substr($_,length($1));
162      }
163      elsif(/\A(proto (token|rule) +(\w+) \(:\$endsym is context = '(\w+)'\) { } *\n)/) {
164        push(@patterns,$4);  $used{proto_endsym}=1;
165        $out .= "proto_endsym_other_${2}('$3','$4');\n";
166        $_ = substr($_,length($1));
167      }
168      # class
169      elsif(/\A(class (\w+) +does PrecOp\[\|%(.*?)\] *{} *\n)/) {
170        my($all,$name,$prec)=($1,$2,$3);  $used{cls1}=1;
171        die "assert" if lc($name) ne $prec;
172        $out .= "class_does_PrecOp('$prec');\n";
173        $_ = substr($_,length($all));
174      }
175      elsif(/\A(class (\w+) +does QLang +{(.*?)^} *\n)/sm) {
176        my($all,$name,$block)=($1,$2,$3);  $used{cls2}=1;
177        $block = quote($block);
178        $out .= "class_does_QLang('$name','$block');\n";
179        $_ = substr($_,length($all));
180      }
181      # method and sub
182      elsif(/\A((multi )?(method|sub) +(\w+) *\(([^\)]*?)\)[ \n]*{(.*?)^} *\n)/sm) {
183        my($all,$multi,$methsub,$name,$args,$block)=($1,($2||""),$3,$4,$5,$6);  $used{methodsub}=1;
184        $multi = 'multi_' if $multi;
185        $args = quote($args);
186        $block = quote($block);
187        $out .= "grammar_$multi$methsub('$name','$args','$block');\n";
188        $_ = substr($_,length($all));
189      }
190      # statement
191      elsif(/\A(rule statement_control:(\w+) +{(<sym> <block> {\*} )}.*\n)/) {
192        push(@patterns,$3);  $used{stmt1}=1;
193        $out .= "rule_statement_control_block('$2');\n";
194        $_ = substr($_,length($1));
195      }
196      elsif(/\A(rule statement_control:(\w+) +{\\ *\n(.*?\n)} *\n)/s) {
197        my($all,$name,$pat)=($1,$2,$3);  $used{stmt2}=1;
198        push(@patterns,$pat);
199        $pat = quote(replace_sym($name,$pat));
200        $out .= "rule_statement_control('$name','$pat');\n";
201        $_ = substr($_,length($all));
202      }
203      elsif(/\A(rule statement_mod_(cond|loop):(\w+) +{(<sym> <modifier_expr> {\*} )}.*\n)/) {
204        push(@patterns,$4);  $used{stmt_mod}=1;
205        $out .= "rule_statement_mod_$2('$3');\n";
206        $_ = substr($_,length($1));
207      }
208      elsif(/\A(token statement_prefix:(\w+) +{( <sym> <statement> {\*} )}.*\n)/) {
209        push(@patterns,$3);  $used{stmt_prefix}=1;
210        $out .= "token_statement_prefix('$2');\n";
211        $_ = substr($_,length($1));
212      }
213      # rul foo:sym
214      elsif(
215        /\A((token|rule)[ ]
216            ([a-z_]+):
217            (?:sym.[ ]* (\S+? (?:[ ]\S+)?) [ ]*[»>}\]\"\']
218             |(\w+))
219            (?:[ ]*\((?:[ ]*-->[ ]*(\w+)[ ]*)?\))?
220            [ \n]*
221            $braces
222            (?:[ ]*\#=[ ]([^\n]+))?
223            [ ]*\n
224            (?=\w|\z))
225            /smx)
226    {
227        my($all,$tr,$cat,$sym,$sym2,$type,$pat,$callit)=($1,$2,$3,$4,$5,($6||""),$7,($8||""));  $used{rul_sym}=1;
228        push(@patterns,$pat);
229        $sym = $sym2 if not defined($sym);
230        $pat = remove_solo_whatever($pat);
231        $pat = '' if $pat =~ /^\s*<sym>\s*$/;
232        $pat = '' if $pat =~ /^\s*\'\Q$sym\E\'\s*$/;
233        $callit = '' if $callit eq $sym;
234        $pat = quote(replace_sym($sym,$pat));
235        $type = lc $type;
236        $out .= "${tr}_op('$cat','$sym','$type','$pat','$callit');\n";
237        $_ = substr($_,length($all));
238      }
239      # rul foo
240      elsif(/\A((token|rule|regex) (\w+)(?: *\(([^\)]*)\))? *$braces *\n)/sm) {
241        my($all,$tr,$name,$args,$pat)=($1,$2,$3,$4,$5);  $used{rul_plain}=1;
242        push(@patterns,$pat);
243        $pat = remove_solo_whatever($pat);
244        $pat = quote(replace_sym($name,$pat));
245        if(defined $args) {
246          $out .= "${tr}_pattern_with_args('$name','$args','$pat');\n";
247        }else{
248          $out .= "${tr}_pattern('$name','$pat');\n";
249        }
250        $_ = substr($_,length($all));
251      }
252      # interior lines
253      elsif(/\A(.*\n)/) {
254        $rest .= $1;
255        $both .= $1;
256        $_ = substr($_,length($1));
257      }
258      # done
259      elsif(/\A\z/) {
260        last;
261      }
262      else {
263        die "ERROR: match failed.  File doesnt end in newline?\n$_";
264      }
265      if($out =~ /^(?!\A)\w/m) {
266        die "BUG: a regex overshot, matching:\n$out";
267      }
268      $data .= $out;
269      $both .= $out;
270      push(@data_list,$out);
271      $out = "";
272    }
273  };
274
275  my $wrestle_with_Regex = sub {
276    my($body)=@_;
277    $body =~ s/^    //mg;
278    s/^(token )(ws|stdstopper) /${1}regex__$2 /mg;
279    $body;
280  };
281  s/^grammar Regex is Perl { *\n(.*?)^}/$wrestle_with_Regex->($1)/mse
282      or die "Couldn't find Regex to wrestle with it.";
283
284  $do_extraction->();
285
286  # Resist bitrot
287  my $problem = "";
288  for my $k (keys(%used)) {
289    $problem .= "An undeclared extractor $k was reported used.  Bug.\n"
290        if(!$extractors{$k});
291    $problem .= "Extractor $k was not used.  The STD.pm format must have changed.\n"
292        if(!$used{$k});
293  }
294  print STDERR $problem;
295  #die "Aborting.\n" if $problem;
296
297  spew("items-all.pm",$both);
298  spew("items-data.pm",$data);
299  spew("items-nondata.pm",$rest);
300
301  my @sorted = sort {
302    my $atop = $a=~/^(.+)/;my $btop = $b=~/^(.*)/; $a cmp $b } @data_list;
303  spew("items-data-sorted.pm",join("",@sorted));
304
305  open(F,"|grep '^\\w'|sort>$dir/items-summary") or die;
306  binmode(F, ":utf8");
307  print F $both;
308  close(F);
309
310  pattern_analysis(\@patterns);
311}
312sub pattern_analysis {
313  my($patterns)=@_;
314  my $pats = join("", map{/\n\z/?$_:"$_\n"} @$patterns);
315  local $_ = $pats;
316  s/ +/ /g;
317  s/\n /\n/g;
318  my @code;
319  my $re = $RE{balanced}{-parens=>'{}()<>[]'};
320  while(s/(?:(?<=\s)|(?<=^))(\{(?!(?:\.\.\.|\*)\})$re})/{...}/ms) {
321    my $code = $1;
322    $code =~ s/\n/  /g;
323    push(@code,$code."\n");
324  }
325  while(/^\s*(:my.*)/mgc) {
326    push(@code,$1."\n");
327  }
328
329  my $preserve_spaces = sub {my($s)=@_;$s=~s/ /\014/g;$s};
330  s/(:my.+)/$preserve_spaces->($1)/eg;
331  s/((?<!\')#.+)/$preserve_spaces->($1)/eg;
332
333  $re = $RE{delimited}{-delim=>'"'};
334  #s/(?<=\s)($re)/$preserve_spaces->($1)/eg;
335  $re = $RE{delimited}{-delim=>"'"};
336  #s/(?<=\s)($re)/$preserve_spaces->($1)/eg;
337  $re = $RE{delimited}{-delim=>'/'};
338  #s/(?<=\s)($re)/$preserve_spaces->($1)/eg;
339  $re = $RE{balanced}{-parens=>'<>'};
340  s/(?<=\s)($re)/$preserve_spaces->($1)/eg;
341
342  s/ /\n/g;
343  s/\014/ /g;
344  s/\n+/\n/g; s/\A\n//;
345  my $elements = $_;
346
347  spew("pattern-dump",$pats);
348  spew("pattern-elements",$elements);
349  do {
350    my $elems = $elements;
351    local $_ = $elems;
352    s/^#= .+/#= xxx/mg;
353    $elems = $_;
354    my @e = map{"$_\n"} split(/\n/,$elems);
355    my %u = map{($_,1)} @e;
356    my $uniq = join("",sort keys %u);
357    spew("pattern-elements-unique",$uniq);
358  };
359  spew("pattern-code",join("",@code));
360}
361
362spew("file_0.pm",$_);
363remove_clutter;
364spew("file_1.pm",$_);
365system("cd $dir; diff -u file_0.pm file_1.pm > file_1.diff");
366extract;
367
368#; Local Variables:
369#; perl-indent-level: 2
370#; End:
371#; vim: shiftwidth=2:
Note: See TracBrowser for help on using the browser.