[2d232ed] | 1 | use warnings; |
---|
| 2 | use strict; |
---|
| 3 | |
---|
| 4 | package BarnOwl::Parse; |
---|
| 5 | |
---|
| 6 | use base qw(Exporter); |
---|
[30c5aab] | 7 | our @EXPORT_OK = qw(tokenize tokenize_with_point); |
---|
[2d232ed] | 8 | |
---|
| 9 | # TODO: have the main function return whether or not it was a valid parse, with |
---|
| 10 | # possible error messages or something. (Still give a parse of some sort on |
---|
| 11 | # invalid parses, just let us know it's invalid if we care.) This is to |
---|
| 12 | # implement command-line-ish things in Perl. |
---|
| 13 | |
---|
| 14 | =for doc |
---|
| 15 | |
---|
| 16 | Ideally, this should use the same codepath we use to /actually/ |
---|
| 17 | tokenize commands, but for now, make sure this is kept in sync with |
---|
| 18 | owl_parseline in util.c |
---|
| 19 | |
---|
| 20 | Unlike owl_parseline, we always return a result, even in the presence |
---|
| 21 | of parse errors, since we may be called on incomplete command-lines. |
---|
| 22 | |
---|
| 23 | The owl_parseline rules are: |
---|
| 24 | |
---|
| 25 | * Tokenize on ' ' and '\t' |
---|
| 26 | * ' and " are quote characters |
---|
| 27 | * \ has no effect |
---|
| 28 | |
---|
| 29 | =cut |
---|
| 30 | |
---|
| 31 | my $boring = qr{[^'" \t]}; |
---|
| 32 | my $quote = qr{['"]}; |
---|
| 33 | my $space = qr{[ \t]}; |
---|
| 34 | |
---|
[30c5aab] | 35 | sub tokenize_with_point { |
---|
[2d232ed] | 36 | my $line = shift; |
---|
| 37 | my $point = shift; |
---|
| 38 | |
---|
| 39 | my @words = (); |
---|
| 40 | my $cword = 0; |
---|
| 41 | my $cword_start; |
---|
| 42 | my $cword_end; |
---|
| 43 | my $word_point; |
---|
| 44 | |
---|
| 45 | my $word = ''; |
---|
| 46 | my $wstart = 0; |
---|
| 47 | my $skipped = 0; |
---|
| 48 | my $have_word = 0; |
---|
| 49 | |
---|
| 50 | pos($line) = 0; |
---|
| 51 | while(pos($line) < length($line)) { |
---|
| 52 | if($line =~ m{\G ($boring+) }gcx) { |
---|
| 53 | $word .= $1; |
---|
| 54 | $have_word = 1; |
---|
| 55 | } elsif ($line =~ m{\G ($quote)}gcx) { |
---|
| 56 | my $chr = $1; |
---|
| 57 | $skipped++ if pos($line) > $point; |
---|
| 58 | if($line =~ m{\G ([^$chr]*) $chr}gcx) { |
---|
| 59 | $word .= $1; |
---|
| 60 | $skipped++ if pos($line) > $point; |
---|
| 61 | } else { |
---|
| 62 | $word .= substr($line, pos($line)); |
---|
| 63 | pos($line) = length($line); |
---|
| 64 | } |
---|
| 65 | $have_word = 1; |
---|
| 66 | } |
---|
| 67 | |
---|
| 68 | if ($line =~ m{\G ($space+|$)}gcx) { |
---|
| 69 | my $wend = pos($line) - length($1); |
---|
| 70 | if ($have_word) { |
---|
| 71 | push @words, $word; |
---|
| 72 | $cword++ unless $wend >= $point; |
---|
| 73 | if(($wend >= $point) && !defined($word_point)) { |
---|
| 74 | $word_point = length($word) - ($wend - $point) + $skipped; |
---|
| 75 | $cword_start = $wstart; |
---|
| 76 | $cword_end = $wend; |
---|
| 77 | } |
---|
| 78 | } |
---|
| 79 | # Always reset, so we get $wstart right |
---|
| 80 | $word = ''; |
---|
| 81 | $wstart = pos($line); |
---|
| 82 | $skipped = 0; |
---|
| 83 | $have_word = 0; |
---|
| 84 | } |
---|
| 85 | } |
---|
| 86 | |
---|
| 87 | if(length($word)) { die("Internal error, leftover=$word"); } |
---|
| 88 | |
---|
| 89 | unless(defined($word_point)) { |
---|
| 90 | $word_point = 0; |
---|
| 91 | $cword_start = $cword_end = $point; |
---|
| 92 | } |
---|
| 93 | |
---|
| 94 | return (\@words, $cword, $word_point, $cword_start, $cword_end); |
---|
| 95 | } |
---|
[30c5aab] | 96 | |
---|
| 97 | sub tokenize { |
---|
| 98 | my $line = shift; |
---|
| 99 | |
---|
| 100 | my ($words, $word, $word_point, |
---|
| 101 | $word_start, $word_end) = tokenize_with_point($line, 0); |
---|
| 102 | return $words; |
---|
| 103 | } |
---|