| 1 | use v6-alpha; |
|---|
| 2 | |
|---|
| 3 | my %words; |
|---|
| 4 | |
|---|
| 5 | sub load_db returns Void { |
|---|
| 6 | return() unless "words.db.pl" ~~ :e; |
|---|
| 7 | my $db = open("words.db.pl") orelse die "Cannot open the words.db.pl file: $!"; |
|---|
| 8 | for (=$db) -> $_line { |
|---|
| 9 | my $line = $_line; |
|---|
| 10 | my ($key, $value) = split("\t", $line); |
|---|
| 11 | %words{"$key"} = $value; |
|---|
| 12 | } |
|---|
| 13 | $db.close(); |
|---|
| 14 | } |
|---|
| 15 | |
|---|
| 16 | sub save_db returns Void { |
|---|
| 17 | my $db = open("words.db.pl", :w) orelse die "Cannot open the words.db.pl file: $!"; |
|---|
| 18 | for (%words.kv) -> $key, $value { |
|---|
| 19 | $db.say($key ~ "\t" ~ $value); |
|---|
| 20 | } |
|---|
| 21 | $db.close(); |
|---|
| 22 | } |
|---|
| 23 | |
|---|
| 24 | sub parse_file (Str $file) returns Hash { |
|---|
| 25 | my %words_in_file; |
|---|
| 26 | my $fh = open("$file") orelse die "Cannot open the '$file' file: $!"; |
|---|
| 27 | for (=$fh) -> $_line { |
|---|
| 28 | my $line = $_line; |
|---|
| 29 | while ($line ~~ s:perl5/(\w+)[ \t\n\r]//) { |
|---|
| 30 | %words_in_file{lc($0)}++; |
|---|
| 31 | } |
|---|
| 32 | } |
|---|
| 33 | $fh.close; |
|---|
| 34 | return %words_in_file; |
|---|
| 35 | } |
|---|
| 36 | |
|---|
| 37 | sub add_words (Str $category, %words_in_file) returns Void { |
|---|
| 38 | for (%words_in_file.kv) -> $key, $value { |
|---|
| 39 | %words{"$category-$key"} += $value; |
|---|
| 40 | } |
|---|
| 41 | } |
|---|
| 42 | |
|---|
| 43 | sub classify (%words_in_file) returns Void { |
|---|
| 44 | |
|---|
| 45 | my %count; |
|---|
| 46 | my $total = 0; |
|---|
| 47 | |
|---|
| 48 | for (%words.kv) -> $key, $value { |
|---|
| 49 | $key ~~ rx:perl5/^(.+)-(.+)$/; |
|---|
| 50 | %count{$0} += $value; |
|---|
| 51 | $total += $value; |
|---|
| 52 | } |
|---|
| 53 | |
|---|
| 54 | my %score; |
|---|
| 55 | for (%words_in_file.keys) -> $word { |
|---|
| 56 | for (%count.kv) -> $category, $count { |
|---|
| 57 | if (defined(%words{"$category-$word"})) { |
|---|
| 58 | %score{$category} += log(%words{"$category-$word"} / $count); |
|---|
| 59 | } |
|---|
| 60 | else { |
|---|
| 61 | %score{$category} += log(0.01 / $count); |
|---|
| 62 | } |
|---|
| 63 | } |
|---|
| 64 | } |
|---|
| 65 | |
|---|
| 66 | for (%count.kv) -> $category, $count { |
|---|
| 67 | %score{$category} += log($count / $total) |
|---|
| 68 | } |
|---|
| 69 | |
|---|
| 70 | # do this weird sort block because: |
|---|
| 71 | # %score{$^a} <=> %score{$^b} |
|---|
| 72 | # does not currently work |
|---|
| 73 | for (%count.keys.sort:{ %score{$^a} == %score{$^b} ?? 0 !! %score{$^a} > %score{$^b} ?? -1 !! 1 }) -> $category { |
|---|
| 74 | say("$category %score{$category}"); |
|---|
| 75 | } |
|---|
| 76 | } |
|---|
| 77 | |
|---|
| 78 | load_db(); |
|---|
| 79 | |
|---|
| 80 | if (@*ARGS[0] eq 'add' && +@*ARGS == 3) { |
|---|
| 81 | add_words(@*ARGS[1], parse_file(@*ARGS[2])); |
|---|
| 82 | } |
|---|
| 83 | elsif (@*ARGS[0] eq 'classify' && +@*ARGS == 2) { |
|---|
| 84 | classify(parse_file(@*ARGS[1])); |
|---|
| 85 | } |
|---|
| 86 | else { |
|---|
| 87 | say("USAGE: |
|---|
| 88 | add <category> <file> |
|---|
| 89 | classify <file>"); |
|---|
| 90 | } |
|---|
| 91 | |
|---|
| 92 | save_db(); |
|---|