Changeset 3029 for src/pge

Show
Ignore:
Timestamp:
05/12/05 04:01:26 (4 years ago)
Author:
autrijus
svk:copy_cache_prev:
4536
Message:

* sync up with PGE for character classes.

Location:
src/pge
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • src/pge/PGE.pir

    r3027 r3029  
    1818.sub "__onload" @LOAD 
    1919    .local pmc load 
     20    load_bytecode "Data/Escape.imc" 
    2021    load = find_global "PGE::TokenHash", "__onload" 
    2122    load() 
  • src/pge/PGE/Exp.pir

    r3027 r3029  
    1515    PGE::Dot       - match any character 
    1616    PGE::CCShortcut - character class shortcuts (\d, \D, \w, etc.) 
     17    PGE::CharClass - character classes (<[abcde]>, <-[abcde]>) 
    1718    PGE::WS        - <?ws> rule 
    1819    PGE::Anchor    - matching of ^, ^^, $, $$, \b, \B anchors 
     
    4546    $P0 = subclass expclass, "PGE::Exp::Dot" 
    4647    $P0 = subclass expclass, "PGE::Exp::CCShortcut" 
     48    $P0 = subclass expclass, "PGE::Exp::CharClass" 
    4749    $P0 = subclass expclass, "PGE::Exp::WS" 
    4850    $P0 = subclass expclass, "PGE::Exp::Anchor" 
     
    640642    emit(code, "  %s_f:", label) 
    641643    emit(code, "    goto fail") 
     644.end 
     645 
     646.namespace [ "PGE::Exp::CharClass" ] 
     647 
     648# Note:  The implementation interface for CharClass may change 
     649# in the near future, so don't rely on this too heavily just yet. 
     650# (pmichaud, 2005-05-11) 
     651 
     652.sub gen method 
     653    .param pmc code 
     654    .param string label 
     655    .param string next 
     656    .local string token 
     657    .local int min, max, isgreedy, iscut 
     658    .local pmc emit 
     659    .local string charclass, charmatch 
     660    (min, max, isgreedy, iscut) = self."_getattributes"() 
     661    emit = find_global "PGE::Exp", "emit" 
     662    $P0 = find_global "Data::Escape", "String" 
     663    charclass = self["charclass"] 
     664    charclass = $P0(charclass, '"') 
     665    charmatch = self["charmatch"] 
     666    emit(code, "\n  %s:", label) 
     667    emit(code, "    rep = 0") 
     668    unless isgreedy goto lazy 
     669    emit(code, "  %s_1:", label) 
     670    emit(code, "    if pos >= lastpos goto %s_2", label) 
     671    emit(code, "    if rep >= %d goto %s_2", max, label) 
     672    emit(code, "    $S0 = substr target, pos, 1") 
     673    emit(code, "    $I0 = index \"%s\", $S0", charclass) 
     674    emit(code, "    %s $I0 == -1 goto %s_2", charmatch, label) 
     675    emit(code, "    inc pos") 
     676    emit(code, "    inc rep") 
     677    emit(code, "    goto %s_1", label) 
     678    emit(code, "  %s_2:", label) 
     679    emit(code, "    if rep < %d goto fail", min) 
     680    unless iscut goto greedy_1 
     681    emit(code, "    goto %s", next) 
     682    .return () 
     683  greedy_1: 
     684    emit(code, "    if rep == %d goto %s", min, next) 
     685    self.emitsub(code, next, "pos", "rep") 
     686    emit(code, "    dec pos") 
     687    emit(code, "    dec rep") 
     688    emit(code, "    goto %s_2", label) 
     689    .return () 
     690  lazy: 
     691    emit(code, "  %s_0:", label) 
     692    emit(code, "    if rep < %d goto %s_1", min, label) 
     693    unless iscut goto lazy_1 
     694    emit(code, "    goto %s", next) 
     695    goto lazy_2 
     696  lazy_1: 
     697    emit(code, "    if rep >= %d goto %s", max, next) 
     698    emit(code, "    if pos > lastpos goto fail") 
     699    self.emitsub(code, next, "pos", "rep") 
     700  lazy_2: 
     701    emit(code, "  %s_1:", label) 
     702    emit(code, "    $S0 = substr target, pos, 1") 
     703    emit(code, "    $I0 = index \"%s\", pos") 
     704    emit(code, "    %s $I0 == -1 goto fail", charmatch) 
     705    emit(code, "    inc rep") 
     706    emit(code, "    inc pos") 
     707    emit(code, "    goto %s_0", label)  
    642708.end 
    643709 
  • src/pge/PGE/P6Rule.pir

    r3027 r3029  
    6767    p6meta['$8'] = $P0 
    6868    p6meta['$9'] = $P0 
    69     $P0 = find_global "PGE::P6Rule", "p6rule_parse_subrule"      # XXX: TODO 
     69    $P0 = find_global "PGE::P6Rule", "p6rule_parse_subrule"  
    7070    p6meta['<'] = $P0 
    7171    p6meta['>'] = u 
    72     $P0 = find_global "PGE::P6Rule", "p6rule_parse_charclass" 
     72    $P0 = find_global "PGE::P6Rule", "p6rule_parse_ccshortcut" 
    7373    p6meta['\d'] = $P0 
    7474    p6meta['\D'] = $P0 
     
    7979    p6meta['\n'] = $P0 
    8080    p6meta['\N'] = $P0 
     81    $P0 = find_global "PGE::P6Rule", "p6rule_parse_charclass" 
     82    p6meta['<['] = $P0 
     83    p6meta['<-['] = $P0 
     84    p6meta['<+['] = $P0 
    8185.end 
    8286 
     
    438442 
    439443 
    440 =item C<p6rule_parse_charclass(STR pattern, PMC lex)> 
     444=item C<p6rule_parse_ccshortcut(STR pattern, PMC lex)> 
    441445 
    442446Parses a character class of some sort, including the \n, \N, \s, \S, 
     
    445449=cut 
    446450 
    447 .sub p6rule_parse_charclass 
     451.sub p6rule_parse_ccshortcut 
    448452    .param string pattern 
    449453    .param pmc lex 
     
    455459    $I0 = length token 
    456460    p6rule_parse_skip(pattern, lex, $I0) 
     461    .return (exp) 
     462.end 
     463 
     464=item C<p6rule_parse_charclass(STR pattern, PMC lex, STR token)> 
     465 
     466Parse a character class in a rule expression. 
     467 
     468Note: The interface for PGE::Exp::CharClass may change in the 
     469near future, so don't rely on this code too strongly just yet. 
     470(pmichaud, 2005-05-11) 
     471 
     472=cut 
     473 
     474.sub p6rule_parse_charclass 
     475    .param string pattern 
     476    .param pmc lex 
     477    .param string token 
     478    .local int pos, plen 
     479    .local string charclass 
     480    .local int range 
     481    .local pmc exp 
     482    pos = lex["pos"] 
     483    plen = lex["plen"] 
     484    $I0 = length token 
     485    pos += $I0 
     486    charclass = '' 
     487    range = 0 
     488  scan: 
     489    if pos >= plen goto no_close_err 
     490    $S0 = substr pattern, pos, 1 
     491    if $S0 == ']' goto end_class 
     492    if $S0 == '-' goto unescaped_hyphen 
     493    if $S0 == '.' goto start_range 
     494    unless $S0 == '\\' goto add_char 
     495  backslash: 
     496    inc pos 
     497    $S0 = substr pattern, pos, 1 
     498    $I0 = index "nrtfae0", $S0 
     499    if $I0 == -1 goto add_char 
     500    $S0 = substr "\n\r\t\f\a\e\0", $I0, 1 
     501  add_char: 
     502    inc pos 
     503    if range goto add_range 
     504    concat charclass, $S0 
     505    goto scan 
     506  add_range: 
     507    range = 0 
     508    $I2 = ord charclass, -1 
     509    $I0 = ord $S0 
     510  add_range_1: 
     511    inc $I2 
     512    if $I2 > $I0 goto scan 
     513    $S1 = chr $I2 
     514    concat charclass, $S1 
     515    goto add_range_1 
     516  start_range: 
     517    if range goto add_range 
     518    $S1 = substr pattern, pos, 2 
     519    unless $S1 == ".." goto add_char 
     520    pos += 2 
     521    range = 1 
     522    goto scan 
     523  end_class: 
     524    $S0 = substr pattern, pos, 2 
     525    unless $S0 == "]>" goto unescaped_bracket 
     526    pos += 2 
     527    lex["pos"] = pos 
     528    p6rule_parse_skip(pattern, lex, 0) 
     529    $P0 = find_global "PGE::Exp", "new" 
     530    exp = $P0("PGE::Exp::CharClass") 
     531    exp["charclass"] = charclass 
     532    $S0 = substr token, 1, 1 
     533    if $S0 == "-" goto charclass_negate 
     534    exp["charmatch"] = "if" 
     535    goto end 
     536  charclass_negate: 
     537    exp["charmatch"] = "unless" 
     538    goto end 
     539  unescaped_hyphen: 
     540    p6rule_parse_error(pattern, lex, "Unescaped '-' in charclass (use '..' or '\\-')") 
     541    goto end 
     542  no_close_err: 
     543    p6rule_parse_error(pattern, lex, "No closing ']>' for character class") 
     544    goto end 
     545  unescaped_bracket: 
     546    p6rule_parse_error(pattern, lex, "Unescaped ']' in character class") 
     547    goto end 
     548  end: 
    457549    .return (exp) 
    458550.end 
     
    543635  quant:                                           # qexp is the atom to quant 
    544636  quant_quest: 
    545     if c != '?' goto quant_plus 
     637    if c != "?" goto quant_plus 
    546638    pos = "p6rule_parse_skip"(pattern, lex, 1) 
    547639    qexp["min"] = 0 
    548640    goto quant_greedy 
    549641  quant_plus: 
    550     if c != '+' goto quant_star 
     642    if c != "+" goto quant_star 
    551643    pos = "p6rule_parse_skip"(pattern, lex, 1) 
    552644    qexp["max"] = PGE_INF 
     
    554646    goto quant_greedy 
    555647  quant_star: 
    556     if c != '*' goto quant_greedy 
     648    if c != "*" goto quant_greedy 
    557649    pos = "p6rule_parse_skip"(pattern, lex, 1) 
    558650    c = substr pattern, pos, 1 
  • src/pge/README

    r2879 r3029  
    4343the load_bytecode operation, as in 
    4444 
    45     load_bytecode "runtime/parrot/library/PGE.pbc"           
     45    load_bytecode "PGE.pbc"           
    4646 
    4747This imports the C<PGE::p6rule> subroutine, which can be used to 
     
    9494 
    9595PGE doesn't (yet) properly handle nested repetitions of zero-length  
    96 patterns in groups -- that's coming next. 
     96patterns in groups -- that's coming soon. 
    9797 
    9898This is just the first-cut framework for building the  
    9999remainder of the engine, so many items (lookaround,  
    100 conjunctions, closures, character classes, and hypotheticals) 
     100conjunctions, closures, and hypotheticals) 
    101101just aren't implemented yet.  They're on their way! 
    102102