1 | use warnings; |
---|
2 | use strict; |
---|
3 | |
---|
4 | package BarnOwl::Parse; |
---|
5 | |
---|
6 | use base qw(Exporter); |
---|
7 | our @EXPORT_OK = qw(tokenize tokenize_with_point); |
---|
8 | |
---|
9 | # TODO: have the main function return whether or not it was a valid parse, with |
---|
10 | # possible error messages or something. (Still give a parse of some sort on |
---|
11 | # invalid parses, just let us know it's invalid if we care.) This is to |
---|
12 | # implement command-line-ish things in Perl. |
---|
13 | |
---|
14 | =for doc |
---|
15 | |
---|
16 | Ideally, this should use the same codepath we use to /actually/ |
---|
17 | tokenize commands, but for now, make sure this is kept in sync with |
---|
18 | owl_parseline in util.c |
---|
19 | |
---|
20 | Unlike owl_parseline, we always return a result, even in the presence |
---|
21 | of parse errors, since we may be called on incomplete command-lines. |
---|
22 | |
---|
23 | The owl_parseline rules are: |
---|
24 | |
---|
25 | * Tokenize on ' ' and '\t' |
---|
26 | * ' and " are quote characters |
---|
27 | * \ has no effect |
---|
28 | |
---|
29 | =cut |
---|
30 | |
---|
31 | my $boring = qr{[^'" \t]}; |
---|
32 | my $quote = qr{['"]}; |
---|
33 | my $space = qr{[ \t]}; |
---|
34 | |
---|
35 | sub tokenize_with_point { |
---|
36 | my $line = shift; |
---|
37 | my $point = shift; |
---|
38 | |
---|
39 | my @words = (); |
---|
40 | my $cword = 0; |
---|
41 | my $cword_start; |
---|
42 | my $cword_end; |
---|
43 | my $word_point; |
---|
44 | |
---|
45 | my $word = ''; |
---|
46 | my $wstart = 0; |
---|
47 | my $skipped = 0; |
---|
48 | my $have_word = 0; |
---|
49 | |
---|
50 | pos($line) = 0; |
---|
51 | while(pos($line) < length($line)) { |
---|
52 | if($line =~ m{\G ($boring+) }gcx) { |
---|
53 | $word .= $1; |
---|
54 | $have_word = 1; |
---|
55 | } elsif ($line =~ m{\G ($quote)}gcx) { |
---|
56 | my $chr = $1; |
---|
57 | $skipped++ if pos($line) > $point; |
---|
58 | if($line =~ m{\G ([^$chr]*) $chr}gcx) { |
---|
59 | $word .= $1; |
---|
60 | $skipped++ if pos($line) > $point; |
---|
61 | } else { |
---|
62 | $word .= substr($line, pos($line)); |
---|
63 | pos($line) = length($line); |
---|
64 | } |
---|
65 | $have_word = 1; |
---|
66 | } |
---|
67 | |
---|
68 | if ($line =~ m{\G ($space+|$)}gcx) { |
---|
69 | my $wend = pos($line) - length($1); |
---|
70 | if ($have_word) { |
---|
71 | push @words, $word; |
---|
72 | $cword++ unless $wend >= $point; |
---|
73 | if(($wend >= $point) && !defined($word_point)) { |
---|
74 | $word_point = length($word) - ($wend - $point) + $skipped; |
---|
75 | $cword_start = $wstart; |
---|
76 | $cword_end = $wend; |
---|
77 | } |
---|
78 | } |
---|
79 | # Always reset, so we get $wstart right |
---|
80 | $word = ''; |
---|
81 | $wstart = pos($line); |
---|
82 | $skipped = 0; |
---|
83 | $have_word = 0; |
---|
84 | } |
---|
85 | } |
---|
86 | |
---|
87 | if(length($word)) { die("Internal error, leftover=$word"); } |
---|
88 | |
---|
89 | unless(defined($word_point)) { |
---|
90 | $word_point = 0; |
---|
91 | $cword_start = $cword_end = $point; |
---|
92 | } |
---|
93 | |
---|
94 | return (\@words, $cword, $word_point, $cword_start, $cword_end); |
---|
95 | } |
---|
96 | |
---|
97 | sub tokenize { |
---|
98 | my $line = shift; |
---|
99 | |
---|
100 | my ($words, $word, $word_point, |
---|
101 | $word_start, $word_end) = tokenize_with_point($line, 0); |
---|
102 | return $words; |
---|
103 | } |
---|