source: perl/lib/BarnOwl/Parse.pm @ 8d16e58

release-1.10release-1.6release-1.7release-1.8release-1.9
Last change on this file since 8d16e58 was 30c5aab, checked in by David Benjamin <davidben@mit.edu>, 15 years ago
Add function for point-less parses Often, you don't care very much about the point when you do the parse. We rename existing tokenize to tokenize_with_point and add tokenize that does point-less parses. Signed-off-by: David Benjamin <davidben@mit.edu>
  • Property mode set to 100644
File size: 2.7 KB
RevLine 
[2d232ed]1use warnings;
2use strict;
3
4package BarnOwl::Parse;
5
6use base qw(Exporter);
[30c5aab]7our @EXPORT_OK = qw(tokenize tokenize_with_point);
[2d232ed]8
9# TODO: have the main function return whether or not it was a valid parse, with
10# possible error messages or something.  (Still give a parse of some sort on
11# invalid parses, just let us know it's invalid if we care.) This is to
12# implement command-line-ish things in Perl.
13
14=for doc
15
16Ideally, this should use the same codepath we use to /actually/
17tokenize commands, but for now, make sure this is kept in sync with
18owl_parseline in util.c
19
20Unlike owl_parseline, we always return a result, even in the presence
21of parse errors, since we may be called on incomplete command-lines.
22
23The owl_parseline rules are:
24
25* Tokenize on ' ' and '\t'
26* ' and " are quote characters
27* \ has no effect
28
29=cut
30
31my $boring = qr{[^'" \t]};
32my $quote  = qr{['"]};
33my $space  = qr{[ \t]};
34
[30c5aab]35sub tokenize_with_point {
[2d232ed]36    my $line = shift;
37    my $point = shift;
38
39    my @words = ();
40    my $cword = 0;
41    my $cword_start;
42    my $cword_end;
43    my $word_point;
44
45    my $word = '';
46    my $wstart = 0;
47    my $skipped = 0;
48    my $have_word = 0;
49
50    pos($line) = 0;
51    while(pos($line) < length($line)) {
52        if($line =~ m{\G ($boring+) }gcx) {
53            $word .= $1;
54            $have_word = 1;
55        } elsif ($line =~ m{\G ($quote)}gcx) {
56            my $chr = $1;
57            $skipped++ if pos($line) > $point;
58            if($line =~ m{\G ([^$chr]*) $chr}gcx) {
59                $word .= $1;
60                $skipped++ if pos($line) > $point;
61            } else {
62                $word .= substr($line, pos($line));
63                pos($line) = length($line);
64            }
65            $have_word = 1;
66        }
67
68        if ($line =~ m{\G ($space+|$)}gcx) {
69            my $wend = pos($line) - length($1);
70            if ($have_word) {
71                push @words, $word;
72                $cword++ unless $wend >= $point;
73                if(($wend >= $point) && !defined($word_point)) {
74                    $word_point = length($word) - ($wend - $point) + $skipped;
75                    $cword_start = $wstart;
76                    $cword_end   = $wend;
77                }
78            }
79            # Always reset, so we get $wstart right
80            $word = '';
81            $wstart = pos($line);
82            $skipped = 0;
83            $have_word = 0;
84        }
85    }
86
87    if(length($word)) { die("Internal error, leftover=$word"); }
88
89    unless(defined($word_point)) {
90        $word_point = 0;
91        $cword_start = $cword_end = $point;
92    }
93
94    return (\@words, $cword, $word_point, $cword_start, $cword_end);
95}
[30c5aab]96
97sub tokenize {
98    my $line = shift;
99
100    my ($words, $word, $word_point,
101        $word_start, $word_end) = tokenize_with_point($line, 0);
102    return $words;
103}
Note: See TracBrowser for help on using the repository browser.