# Lex.pm: Class Used to Generate Lexical Analyzers package RDA::Object::Lex; # $Id: Lex.pm,v 1.10 2015/05/13 14:53:52 RDA Exp $ # ARCS: $Header: /home/cvs/cvs/RDA_8/src/scripting/lib/RDA/Object/Lex.pm,v 1.10 2015/05/13 14:53:52 RDA Exp $ # # Change History # 20150513 MSC Improve the documentation. =head1 NAME RDA::Object::Lex - Class Used to Generate Lexical Analyzers =head1 SYNOPSIS use RDA::Object::Lex; @def = (qw(ADDOP [-+] LEFTP [\(] RIGHTP [\)] INTEGER [1-9][0-9]* NEWLINE \n ), qw(STRING), [qw(" (?:[^"]+|"")* ")], qw(ERROR .*), sub { die qq{Cannot analyze: "$_[1]"}}, ); $lex = RDA::Object::Lex->new('DEMO'); $lex->set_trace(63); $lex->init(@def)->from(\*DATA); print "Tokenization of DATA:\n"; TOKEN:for (;;) { $tok = $lex->get_next; last TOKEN if $lex->eoi; print "Line $.\t"; print "Type: ".$tok->get_oid."\t"; print "Content:->".$tok->get_text."<-\n"; } __END__ 1+2-5 "a multiline string with an embedded "" in it" an invalid string with a "" in it" =head1 DESCRIPTION The class C creates lexical analyzers. It is a subclass of L. A lexical analyzer is defined by list of tokens passed as arguments to the C or C method. Tokens are instances of the L class. The definition of a token usually comprises two arguments: a symbolic name (like C), followed by a regular expression. When an anonymous subroutine is given as third argument, it is called when the token is recognized. Its arguments are the C instance and the string recognized by the regular expression. The return value is used as the new string contents of the token object. The order in which the lexical analyzer examines the regular expressions is determined by the order in which these expressions are passed as arguments to the C or C method. The token returned by the lexical analyzer corresponds to the first regular expression which matches (this strategy is different from that used by Lex, which returns the longest match possible out of all that can be recognized). The lexical analyzer can recognize tokens which span multiple records. When the definition of the token comprises more than one regular expression (placed within a reference to an anonymous array), the analyzer reads as many records as required to recognize the token (see the documentation for the L class). When the start pattern is found, the analyzer looks for the end, and if necessary, reads more records. No backtracking is done in case of failure. The analyzer can be used to analyze an isolated character string or a stream of data coming from a file handle. At the end of the input data, the analyzer returns a C instance named C (End Of Input). =head2 Start Conditions You can associate start conditions with the token-recognition rules that comprise your lexical analyzer. When start conditions are used, the rule that succeeds is no longer necessarily the first rule that matches. A token symbol may be preceded by a start condition specifier for the associated recognition rule. For example: qw(C1:TERMINAL_1 REGEXP), sub { # associated action }, qw(TERMINAL_2 REGEXP), sub { # associated action }, Symbol C will be recognized only if start condition C is active. Start conditions are activated and deactivated using respectively the C and C methods. C resets the analysis automaton. Start conditions can be combined using AND/OR operators as follows: C1:SYMBOL condition C1 C1:C2:SYMBOL condition C1 AND condition C2 C1,C2:SYMBOL condition C1 OR condition C2 There are two types of start conditions: I and I, which are declared by class methods C and C respectively. With an inclusive start condition, all rules are active regardless of whether or not they are qualified with the start condition. With an exclusive start condition, only the rules qualified with the start condition are active; all other rules are deactivated. For example, use RDA::Object::Lex; @def = ( 'EXPECT', 'expect-floats', sub { $lex->start('expect'); return $_[1]; }, 'expect:FLOAT', '\d+\.\d+', sub { print "found a float: $_[1]\n"; return $_[1]; }, 'expect:NEWLINE', '\n', sub { $lex->end('expect') ; return $_[1]; }, 'NEWLINE2', '\n', 'INT', '\d+', sub { print "found an integer: $_[1] \n"; return $_[1]; }, 'DOT', '\.', sub { print "found a period\n"; return $_[1] }, ); RDA::Object::Lex->exclusive('expect'); $lex = RDA::Object::Lex->new('DEMO',@def); The special start condition C is always verified. =head1 METHODS The following methods are available: =cut use strict; BEGIN { use Exporter; use RDA::Text qw(debug get_string); use RDA::Driver::Template; use RDA::Object; use RDA::Object::Token; } use integer; # Define the global public variables use vars qw($STRINGS $VERSION @DUMP @ISA %SDCL); $VERSION = sprintf('%d.%02d', q$Revision: 1.10 $ =~ /(\d+)\.(\d+)/); @DUMP = ( str => { map {$_ => 1} qw(_cod _c_f _c_s _smc) }, ); @ISA = qw(RDA::Object Exporter); %SDCL = ( dep => [qw(RDA::Object::Token)], flg => 1, met => { 'end' => {ret => 0}, 'eoi' => {ret => 0}, 'exclusive' => {ret => 0}, 'flush' => {ret => 0}, 'from' => {ret => 0}, 'get_buffer' => {ret => 0}, 'get_length' => {ret => 0}, 'get_line' => {ret => 0}, 'get_next' => {ret => 0}, 'get_offset' => {ret => 0}, 'get_pos' => {ret => 0}, 'get_skip' => {ret => 0}, 'get_state' => {ret => 0}, 'get_text' => {ret => 0}, 'get_token' => {ret => 0}, 'hold' => {ret => 0}, 'inclusive' => {ret => 0}, 'init' => {ret => 0, evl => 'C'}, 'is_debug' => {ret => 0}, 'is_hold' => {ret => 0}, 'is_string' => {ret => 0}, 'is_trace' => {ret => 0}, 'read_line' => {ret => 0}, 'restart' => {ret => 0}, 'set_eoi' => {ret => 0}, 'set_pos' => {ret => 0}, 'set_line' => {ret => 0}, 'set_state' => {ret => 0}, 'set_text' => {ret => 0}, 'set_token' => {ret => 0}, 'set_trace' => {ret => 0}, 'skip' => {ret => 0}, 'start' => {ret => 0}, }, new => 1, trc => 'LEX', ); # Define the global private constants my $TOKEN_CLASS = 'RDA::Object::Token'; # Root class my $DEFAULT_TOKEN = $TOKEN_CLASS->new('DEFAULT', q{.*}); # Default token my $DEFAULT_SUB = sub { $_[0]->gen_lex; # lexical analyzer generation return &{$_[0]->{'_sub'}}; # lexical analyzer execution }; # Define the global private variables # Report the package version sub Version { return $VERSION; } # Create a prototype my $dummy = q{}; my $lexer = bless { ## no critic (Interpolation) _all => [], # Token list _buf => \$dummy, # String to tokenize _cod => [], # Current lexical analyzer _c_f => [], # Cached subroutine definition _c_s => [], # Cached subroutine definition _def => {}, # Token definition hash _eoi => 1, # End of input indicator _exc => {}, # Exclusive conditions _hld => 0, # Save or not what is consumed _hnd => [], # Lexical analyzer closure environnement _ifh => \*STDIN, # Default input file handle _lgt => \$dummy, # Length of the current record _lin => \$dummy, # Number of the current record _inc => {}, # Inclusive conditions _nxt => $DEFAULT_TOKEN, _off => \$dummy, # Offset from the beginning _pos => \$dummy, # Position in the current record _sav => q{}, # Saved string _skp => q{[ \t]+}, # Pattern to skip _sta => {INITIAL => \$dummy }, _str => 0, # True when you must analyze a string _sub => $DEFAULT_SUB, _s_f => sub {}, # Cache for the stream lexer _s_s => sub {}, # Cache for the string lexer }, __PACKAGE__; # Define the template # %%...%% are processed by the RDA::Driver::Template class # In %%...%%, $slf is the RDA::Driver::Template instance # Regular expressions must be delimited by // or m!! $lexer->set_template(RDA::Driver::Template->new( ## no critic (Interpolation,Long,Newline) LEX_WITH_SKIP => q@ if ($LEX_POS < $LEX_LGT and $LEX_BUF =~ /\G(?:%%&SKIP%%)/cg) { $lgt = pos($LEX_BUF) - $LEX_POS; $LEX_OFF += $lgt; $LEX_POS += $lgt; %%&IS_HOLD ? #LEX_HOLD_SKIP : ''%% } @, LEX_WITH_SKIP_LAST_READ => q@ if ($LEX_BUF =~ /\G(?:%%&SKIP%%)/cg) { # skip this pattern $lgt = pos($LEX_BUF) - $LEX_POS; $LEX_OFF += $lgt; $LEX_POS += $lgt; %%&IS_HOLD ? #LEX_HOLD_SKIP : ''%% } else { last READ; } @, LEX_HOLD_SKIP => q@$slf->save_text($1);@, LEX_HEADER_STRING => q@ { pos($LEX_BUF) = $LEX_POS; my $lgt = 0; # %%&SKIP ne '' ? #LEX_WITH_SKIP : ''%% if ($LEX_POS == $LEX_LGT) { $slf->set_eoi'(1); return $LEX_TOK = $RDA::Object::Token::EOI; } $LEX_TOK = undef; my $str = ''; # CASE:{ @, LEX_HEADER_STREAM => q@ { pos($LEX_BUF) = $LEX_POS; my $lgt = 0; my $LEX_IFH = $$LEX_FHR; # %%&SKIP ne '' ? #LEX_WITH_SKIP : ''%% if ($LEX_POS == $LEX_LGT) { return $LEX_TOK = $RDA::Object::Token::EOI if $slf->eoi; READ: { do { unless (defined($LEX_BUF = <$LEX_IFH>)) { $slf->set_eoi(1); return $LEX_TOK = $RDA::Object::Token::EOI; } pos($LEX_BUF) = $LEX_POS = 0; $LEX_LGT = length($LEX_BUF); $LEX_REC++; %%&SKIP ne '' ? #LEX_WITH_SKIP_LAST_READ : ''%% } while ($LEX_POS == $LEX_LGT); }# READ } my $str = ''; $LEX_TOK = undef; # CASE: { @, LEX_HOLD_TOKEN => q@$slf->save_text($str);@, LEX_FOOTER => q@ } #CASE %%&IS_HOLD ? #LEX_HOLD_TOKEN : ''%% $slf->set_token($LEX_TOK); $LEX_TOK; } @, )); =head2 S<$h-Enew($oid,@definition)> The object constructor. This method takes the lexical analyzer name and its definition as arguments. The definition a list of C instances, or a list of triplets permitting their creation. The triplets consist of: the symbolic name of the token, the regular expression necessary for its recognition, and possibly an anonymous subroutine that is called when the token is recognized. =cut sub new { my ($ctx, $oid, @arg) = @_; my ($slf, @tok); # Validate the lexical analyzer identifier die get_string('BAD_OID') unless $oid && $oid =~ m/^([A-Za-z]\w{0,31})$/; # Create the lexical analyzer object $slf = $ctx->clone; $slf->{'lvl'} = 0; $slf->{'oid'} = uc($oid); return @arg ? $slf->init(@arg) : $slf; } # Return a copy of the lexical analyzer sub clone { my ($src) = @_; my ($cls); return bless {%{$src}}, $cls if ($cls = ref($src)); # Clone the instance return bless {%{$src->get_prototype}}, $src; # Clone the prototype } =head2 S<$h-Eeoi> This method indicates whether there is no more data to analyze. =cut sub eoi { return shift->{'_eoi'}; } =head2 S<$h-Eflush> When saving of the consumed strings is activated, this method returns and clears the buffer containing the character strings recognized up to now. This is only useful when C has been called to activate saving of consumed strings. =cut sub flush { my ($slf) = @_; my ($txt); $txt = $slf->{'_sav'}; $slf->{'_sav'} = q{}; return $txt; } =head2 S<$h-Eget_buffer> This method returns the contents of the internal buffer of the lexical analyzer. =cut sub get_buffer { return ${shift->{'_buf'}}; } =head2 S<$h-Eget_code> This method returns the code of the current lexical analyzer =cut sub get_code { return join(q{}, @{shift->{'_cod'}}); } =head2 S<$h-Eget_length> This method returns the length of the current record. =cut sub get_length { return ${shift->{'_lgt'}}; } =head2 S<$h-Eget_line> This method returns the line number of the current record. It always returns 1 when a character string is being analyzed. The C method increments the line number. =cut sub get_line { return ${shift->{'_lin'}}; } =head2 S<$h-Eget_offset> This method returns the number of characters already consumed since the beginning of the analyzed data stream. =cut sub get_offset { return ${shift->{'_off'}}; } =head2 S<$h-Eget_pos> This methods returns the end position of the last token found in the current record. =cut sub get_pos { return ${shift->{'_pos'}}; } =head2 S<$h-Eget_skip> This method returns the current skip pattern. =cut sub get_skip { return shift->{'_skp'}; } =head2 S<$h-Eget_sub> This method returns the anonymous subroutine that performs the lexical analysis. Example: my $tok; my $sub = $lexer->get_sub; while (($tok = &{$sub}()) ne $RDA::Object::Token::EOI) { print $tok->get_oid."\t".$tok->get_text."\n"; } # or my $tok; local *tokenizer = $lexer->get_sub; while (($tok = tokenizer()) ne $RDA::Object::Token::EOI) { print $tok->get_oid."\t".$tok->get_text."\n"; } =cut sub get_sub { my ($slf) = @_; return $slf->{'_sub'} if ref($slf->{'_sub'}) eq 'CODE'; return $slf->gen_lex; } =head2 S<$h-Eget_token> This method returns the a reference of the object corresponding to the last recognized token. =cut sub get_token { return shift->{'_nxt'}; } =head2 S<$h-Ehold> This method activates or deactivates saving of the consumed strings. The return value is the current setting (TRUE or FALSE). You can obtain the contents of the buffer using the C method, which also empties the buffer. =cut sub hold { my ($slf) = @_; if (ref($slf)) { $slf->del_code; } else { $slf = $slf->get_prototype; } return $slf->{'_hld'} = not $slf->{'_hld'}; } =head2 S<$h-Eis_debug> This method indicates whether debug messages should be printed on the standard output. =cut sub is_debug { return shift->{'lvl'} & 16; ## no critic (Bit) } =head2 S<$h-Eis_hold> This method indicates whether consumed data is saved. =cut sub is_hold { return shift->{'_hld'}; } =head2 S<$h-Eis_string> This method indicates whether a character string is analyzed. =cut sub is_string { return shift->{'_str'}; } =head2 S<$h-Eis_trace> This method indicates whether the analysis should be traced. =cut sub is_trace { return shift->{'lvl'} & 32; ## no critic (Bit) } =head2 S<$h-Eread_line> Reads data from the input specified by the C method. Returns the result of the reading. Example: use RDA::Object::Lex; $lex = RDA::Object::Lex->new('DEMO'); print $lex->read_line # read and print one line while !$lex->eoi; =cut sub read_line { my ($slf) = @_; my ($ifh, $rec); $ifh = $slf->{'_ifh'}; if (defined($rec = <$ifh>)) { ${$slf->{'_lin'}}++; } else { $slf->{'_eoi'} = 1; } return $rec; } =head2 S<$h-Ereset> This method clears the internal buffer of the lexical analyzer and erases all tokens already recognized. =cut sub reset ## no critic (Builtin) { my ($slf) = @_; ${$slf->{'_lin'}} = 0; ${$slf->{'_lgt'}} = 0; ${$slf->{'_off'}} = 0; ${$slf->{'_pos'}} = 0; ${$slf->{'_buf'}} = q{}; $slf->{'_sav'} = q{}; $slf->{'_eoi'} = 0; $slf->set_state('INITIAL'); if ($slf->{'_nxt'}) { $slf->{'_nxt'}->set_text; $slf->{'_nxt'} = 0; } return $slf; } =head2 S<$h-Eset_text($text)> This method adds the specified text to the saved data. =cut sub save_text { my ($slf, $txt) = @_; return $slf->{'_sav'} .= $txt; } =head2 S<$h-Eset_eoi($flag)> This method sets the end-of-input indicator. =cut sub set_eoi { my ($slf, $flg) = @_; return $slf->{'_eoi'} = $flg; } =head2 S<$h-Eset_line($num)> This method sets the value of the line number. =cut sub set_line { my ($slf, $num) = @_; return ${$slf->{'_lin'}} = $num; } =head2 S<$h-Eset_pos($pos)> This methods specifies the position of the beginning of the next token to be recognized in the current line. =cut sub set_pos { my ($slf, $pos) = @_; return ${$slf->{'_pos'}} = $pos; } =head2 S<$h-Eset_trace([$level])> This method specifies the trace level and returns the previous trace level. It propagates the trace level to the definition tokens. =over 11 =item S< B > When true, signs the parts =item S< B > When true, traces the parts =item S< B > When true, traces the parts evaluation =item S< B > When true, traces the context definitions =item S< B > When true, traces the data analysis =item S< B > When true, traces the lexical analyzer operations =back =cut sub set_trace { my ($slf, $lvl) = @_; my ($old); $old = $slf->{'lvl'}; if (defined($lvl)) { $lvl = 0 if $lvl < 0; $slf->{'lvl'} = $lvl; foreach my $tok (@{$slf->{'_all'}}) { $tok->set_level($lvl); } } return $old; } =head2 S<$h-Eset_token($token)> This method sets the specified token as the current token. It is useful for re-qualifying a token inside the anonymous subroutine associated with the original token. =cut sub set_token { my ($slf, $tok) = @_; return $slf->{'_nxt'} = $tok; } =head1 LEXICAL ANALYZER DEFINITION METHODS =head2 S<$h-Einit(@definition)> This method defines the lexical analyzer. =cut sub init { my ($slf, @arg) = @_; $slf->{'_def'} = {}; $slf->{'_all'} = [map {$slf->{'_def'}->{$_->get_oid} = $_->set_lexer($slf)} $TOKEN_CLASS->factory(@arg)]; return $slf->reset; } =head2 S<$h-Eskip([$pattern])> This methods returns or sets the regular expression for consuming inter-token strings. Changing the skip pattern causes the recompilation of the lexical analyzer. For example, RDA::Object::Lex->skip('\s*#(?s:.*)|\s+'); @tok = RDA::Object::Lex->new('DEMO',INTEGER => '\d+')->analyze(\*DATA); print "@tok\n"; # print INTEGER 1 INTEGER 2 INTEGER 3 INTEGER 4 EOI __END__ 1 # first string to skip 2 3# second string to skip 4 =cut sub skip { my ($slf, $pat) = @_; if (ref($slf)) { # Instance method if (defined($pat) && $pat ne $slf->{'_skp'}) { debug(qq{LEX> Skip value: "$pat"}) if $slf->is_debug; $slf->{'_skp'} = $pat; $slf->del_code; } } else { $slf = $slf->get_prototype; if (defined($pat) && $pat ne $slf->{'_skp'}) { debug(qq{LEX> Prototype skip value: "$pat"}) if $slf->is_debug; $slf->{'_skp'} = $pat; } } return $slf->{'_skp'}; } # Delete the code already generated sub del_code { my ($slf) = @_; @{$slf->{'_c_f'}} = (); @{$slf->{'_c_s'}} = (); @{$slf->{'_cod'}} = (); $slf->{'_sub'} = $DEFAULT_SUB; return $slf; } # sub gen_code { my ($slf) = @_; my ($tpl); debug(q{LEX> gen_code()}) if $slf->is_debug; $tpl = $slf->get_template; # Generate the header part debug(q{LEX> - header}) if $slf->is_debug; $tpl->set_context($slf->{'lvl'}, IS_HOLD => $slf->is_hold, IS_TRACE => $slf->is_trace, SKIP => $slf->get_skip, ); $slf->{'_cod'}->[0] = $tpl->eval_part($slf->{'_str'} ? 'LEX_HEADER_STRING' : 'LEX_HEADER_STREAM'); # Generate the body part debug(q{LEX> - body}) if $slf->is_debug; $slf->{'_cod'}->[1] = join(q{}, map {$_->gen_code} @{$slf->{'_all'}}); # Generate the footer part debug(q{LEX> - footer}) if $slf->is_debug; $slf->{'_cod'}->[2] = $tpl->eval_part('LEX_FOOTER'); return $slf; } # Generate a condition sub gen_condition { my ($slf, $spc) = @_; my ($cnd, @cnd, @itm, %exc, %inc); return q{} if $spc =~ /^ALL:/; # special condition %exc = %{$slf->exclusive}; %inc = %{$slf->inclusive}; return q{} unless $spc || keys(%exc); if ($spc =~ /^(.+):/g) # For example, A:B:C: or A,C: { foreach my $grp (split(/:/, $1)) { foreach my $itm (@itm = split(/,/, $grp)) { die get_string('NO_COND', $itm) unless $itm eq 'INITIAL' || exists($exc{$itm}) || exists($inc{$itm}); delete($exc{$itm}); delete($inc{$itm}); } push(@cnd, q{(}.join(q{ or }, map {"\$$_"} @itm).q{)}); } $cnd = (@cnd == 1) ? $cnd[0] : q{(}.join(q{ and }, @cnd).q{)}; } $cnd = defined($cnd) ? $cnd = q{not (}.join(q{ or }, @cnd) . qq{) and $cnd} : $cnd = q{not (}.join(q{ or }, @cnd) . q{)} if (@cnd = map {"\$$_"} keys(%exc)); debug(qq{LEX> gen_condition(): $spc -> $cnd}) if $slf->is_debug; return defined($cnd) ? qq{$cnd and } : q{}; } # Generate the lexical analyzer sub gen_lex { my ($slf) = @_; my ($cod, $mac, %state); my ($LEX_BUF, $LEX_FHR, $LEX_LGT, $LEX_OFF, $LEX_POS, $LEX_REC, $LEX_TOK); $slf->gen_code unless @{$slf->{'_cod'}}; debug(q{LEX> Lexical analyzer generation...}) if $slf->is_debug; # Set the lexical analyzer context $LEX_FHR = \$slf->{'_ifh'}; $LEX_BUF = q{}; # Buffer to analyze $LEX_LGT = 0; # Buffer length $LEX_OFF = 0; # Offset from the beginning $LEX_POS = 0; # Current position in buffer $LEX_REC = 0; # Current record number $LEX_TOK = q{}; # Token instance $slf->set_handles(\$LEX_BUF, \$LEX_LGT, \$LEX_REC, \$LEX_POS, \$LEX_OFF, \%state); # Validate the lexical analyzer code $mac = $slf->gen_state_machine; $cod = $slf->get_code; eval qq!$mac; \$slf->{'_sub'} = sub $cod!; ## no critic (Eval) if ($@ or $slf->is_debug) { my $lin = 0; $mac =~ s/^/sprintf("%3d ", ++$lin)/meg; $cod =~ s/^/sprintf("%3d ", ++$lin)/meg; debug($mac, $cod); debug(q{LEX> }, $@); die "\n" if $@; } return $slf->{'_sub'}; } # sub get_definition { my ($slf, $nam) = @_; return $slf->{'_def'}->{$nam}; } # Get environnement of the lexer closure sub get_handles { my ($slf) = @_; return [$slf->{'_buf'}, $slf->{'_lgt'}, $slf->{'_lin'}, $slf->{'_pos'}, $slf->{'_off'}, $slf->{'_sta'}, ]; } # sub get_prototype { return $lexer || {}; } # sub get_template { return shift->{'_tpl'}; } # Set environnement of the lexer closure sub set_handles { my ($slf, @arg) = @_; return ($slf->{'_buf'}, $slf->{'_lgt'}, $slf->{'_lin'}, $slf->{'_pos'}, $slf->{'_off'}, $slf->{'_sta'}, ) = @arg; } # sub set_template { my ($slf, $val) = @_; return $slf->{'_tpl'} = $val; } # sub _save_handles { my ($slf) = @_; return $slf->{'_hnd'} = $slf->get_handles; } # sub _switch_handles { my ($slf) = @_; my ($bkp); $bkp = $slf->get_handles; $slf->set_handles(@{$slf->{'_hnd'}}); return $slf->{'_hnd'} = $bkp; } =head1 ANALYSIS METHODS =head2 S<$h-Eanalyze($source)> This method analyzes the specified data and returns a list of pairs consisting of a token name followed by recognized text. The data can be a character string or a reference to a file handle. Examples: @tok = RDA::Object::Lex->new('DEMO', qw(PLUS [+] NUMBER \d+))->analyze("3+3+3"); @tok = RDA::Object::Lex->new('DEMO', qw(PLUS [+] NUMBER \d+))->analyze(\*STREAM); =cut sub analyze { my ($slf, @src) = @_; my ($nxt, $tok, @tok); die get_string('NO_DATA') unless defined($src[0]); $slf->from(@src); $nxt = $slf->{'_sub'}; for (;;) ## no critic(Loop) { $tok = &{$nxt}($slf); push(@tok, $tok->get_oid, $tok->get_text); return @tok if $slf->{'_eoi'}; } } =head2 S<$h-Eevery($sub)> This method avoids having to write a reading loop in order to analyze a stream of data. The argument is an anonymous subroutine executed after the recognition of each token. For example, to analyze the string C<1+2>, you can write: use RDA::Object::Lex; $lex = RDA::Object::Lex->new('DEMO',qw(ADDOP [-+] INTEGER \d+ )); $lex->from("1+2"); $lex->every (sub { print $_[0]->get_oid."\t"; print $_[0]->get_text."\n"; }); The first argument of the anonymous subroutine is the L instance recognized. =cut sub every { my ($slf, $act) = @_; my ($ref, $tok); die get_string('NO_CODE') unless ($ref = ref($act)) && $ref eq 'CODE'; for (;;) ## no critic (Loop) { $tok = &{$slf->{'_sub'}}($slf); return if $slf->{'_eoi'}; &{$act}($tok); } } =head2 S<$h-Efrom([$source...])> This method specifies the source of the data to be analyzed. The argument of this method can be a string (or list of strings) or a reference to a file handle. It returns a reference to the calling lexical analyzer. By default, it reads data from the standard input. Examples: $ifh = IO::File->new; $ifh->open('from($ifh); $lex->from(\*DATA); $lex->from('the data to be analyzed'); =head2 S<$h-Efrom> This method returns the file handle when defined or an undefined value when the input is a string. =cut sub from { my ($slf, @src) = @_; my ($ref, $src); return $slf->{'_ifh'} ? $slf : undef unless @src; $ref = ref($src = $src[0]); return ($ref eq 'RDA::Object::Buffer') ? _from_ifh($slf, $src->get_handle) : ($ref eq 'IO::File') ? _from_ifh($slf, $src) : _is_file($src) ? _from_ifh($slf, $src) : _from_string($slf, join(q{ }, @src)); } sub _is_file { my ($src) = @_; my ($num); eval {$num = fileno($src)}; return defined($num); } sub _from_ifh { my ($slf, $ifh) = @_; debug(q{LEX> From stream}) if $slf->is_debug; $slf->{'_ifh'} = $ifh; if (@{$slf->{'_c_f'}}) { debug(q{LEX> Code already exists}) if $slf->is_debug; if ($slf->{'_str'}) # when STREAM is not the current context { $slf->{'_cod'} = [@{$slf->{'_c_f'}}]; $slf->{'_sub'} = $slf->{'_s_f'}; $slf->_switch_handles; $slf->{'_str'} = 0; } } else { debug(q{LEX> STREAM code generation}) if $slf->is_debug; $slf->{'_str'} = 0; $slf->gen_code; $slf->_save_handles; $slf->gen_lex; $slf->{'_c_f'} = [@{$slf->{'_cod'}}]; $slf->{'_s_f'} = $slf->{'_sub'}; } return $slf->reset; } sub _from_string { my ($slf, $buf) = @_; debug(q{LEX> From string}) if $slf->is_debug; if (@{$slf->{'_c_s'}}) { debug(q{LEX> Code already exists}) if $slf->is_debug; unless ($slf->{'_str'}) { $slf->{'_cod'} = [@{$slf->{'_c_s'}}]; $slf->{'_sub'} = $slf->{'_s_s'}; $slf->_switch_handles; $slf->{'_str'} = 1; } } else { debug(q{LEX> STRING code generation}) if $slf->is_debug; $slf->{'_str'} = 1; $slf->gen_code; $slf->_save_handles; $slf->gen_lex; $slf->{'_c_s'} = [@{$slf->{'_cod'}}]; $slf->{'_s_s'} = $slf->{'_sub'}; } ${$slf->{'_buf'}} = $buf; ${$slf->{'_lgt'}} = length($buf); return $slf->reset; } =head2 S<$h-Eget_next> This method searches for the next token and return its reference. Its returns the C instance at the end of the data. For example, $lex = RDA::Object::Lex->new('DEMO',@token); print $lex->get_next->get_oid; # print the token type print $lex->get_text; # print the token content =cut sub get_next { my ($slf) = @_; my ($tok); $tok = eval {&{$slf->{'_sub'}}($slf)}; die $@ if $@; return $tok; } =head2 S<$h-Eparse([$source...])> This method analyzes the data in one call. =cut sub parse { my ($slf, @src) = @_; my ($nxt); die get_string('NO_DATA') unless defined($src[0]); $slf->from(@src); $nxt = $slf->{'_sub'}; &{$nxt}($slf) until $slf->{'_eoi'}; return $slf; } =head1 STATE MACHINE METHODS =head2 S<$h-Eend($name)> This method deactivates the specified condition. =cut sub end { my ($slf, $nam) = @_; return ${$slf->{'_sta'}->{$nam}} = 0; } =head2 S<$h-Eexclusive($name...)> This method declares that the specified conditions are I. =cut sub exclusive { my ($slf, @arg) = @_; $slf = $slf->get_prototype unless ref($slf); return $slf->{'_exc'} = {map {$_ => 1} @arg} if @arg; return $slf->{'_exc'}; } # sub gen_state_machine { my ($slf) = @_; my ($cod); ## no critic (Interpolation) $cod = 'my $INITIAL = 1;'."\n" .q!$state{'INITIAL'} = \\$INITIAL;!."\n"; foreach my $nam (keys(%{$slf->exclusive}), keys(%{$slf->inclusive})) { $cod .= q!my $!.$nam.q! = 0; ! .q!$state{'! .$nam .q!'} = \\$!.$nam.q!;!."\n"; } return $slf->set_state_machine($cod); } =head2 S<$h-Eget_state($name)> This method returns the state of the specified condition. =cut sub get_state { my ($slf, $nam) = @_; return ${$slf->{'_sta'}->{$nam}}; } =head2 S<$h-Eget_state_machine> This method returns the state machine code. =cut sub get_state_machine { return shift->{'_smc'}; } =head2 S<$h-Einclusive($name...)> This method declares that the specifies conditions are I. =cut sub inclusive { my ($slf, @arg) = @_; $slf = $slf->get_prototype unless ref($slf); return $slf->{'_inc'} = {map {$_ => 1} @arg} if @arg; return $slf->{'_inc'}; } =head2 S<$h-Erestart> This method re-initializes the analysis automaton. The sole active condition becomes the condition C. =cut sub restart { my ($slf) = @_; my ($sta); $sta = $slf->{'_sta'}; foreach my $nam (keys(%{$sta})) { ${$sta->{$nam}} = 0; } return ${$sta->{'INITIAL'}} = 1; } =head2 S<$h-Eset_state($name,$value)> This method assigns a new state to the specified condition. =cut sub set_state { my ($slf, $nam, $val) = @_; return ${$slf->{'_sta'}->{$nam}} = $val; } =head2 S<$h-Eset_state_machine($code)> This method specifies the state machine code. =cut sub set_state_machine { my ($slf, $cod) = @_; return $slf->{'_smc'} = $cod; } =head2 S<$h-Estart($name)> This method activates the specified condition. =cut sub start { my ($slf, $nam) = @_; if ($nam eq 'INITIAL') { $slf->restart; } else { $slf->restart if exists($slf->{'_exc'}->{$nam}); ${$slf->{'_sta'}->{$nam}} = 1; } return; } 1; __END__ =head1 SEE ALSO L, L, L, L, L, L, L, L, L =head1 COPYRIGHT NOTICE Copyright (c) 1996, 2016, Oracle and/or its affiliates. All rights reserved. =head1 TRADEMARK NOTICE Oracle and Java are registered trademarks of Oracle and/or its affiliates. Other names may be trademarks of their respective owners. =cut