# Token.pm: Class Used to Manage Lexical Units package RDA::Object::Token; # $Id: Token.pm,v 1.6 2015/04/29 13:55:40 RDA Exp $ # ARCS: $Header: /home/cvs/cvs/RDA_8/src/scripting/lib/RDA/Object/Token.pm,v 1.6 2015/04/29 13:55:40 RDA Exp $ # # Change History # 20150211 MSC Fix symbols. =head1 NAME RDA::Object::Token - Class Used to Manage Lexical Units =head1 SYNOPSIS use RDA::Object::Lex; @tok = qw(ADDOP [-+] INTEGER [1-9]\d*); $lex = RDA::Object::Lex->new(@tok)->from(\*DATA); $ADDOP = $lex->get_definition('ADDOP'); $INTEGER = $lex->get_definition('INTEGER'); $txt = $INTEGER->get_next; print "$txt\n" if $INTEGER->status; $txt = $ADDOP->get_next; print "$txt\n" if $ADDOP->status; print "$txt\n" if $INTEGER->isnext(\$txt); __END__ 1+2 =head1 DESCRIPTION The C package defines the lexical units used by lexical analyzers. The C and C methods of the L package indirectly creates a C instance for each recognized lexical unit. It is a sub class of L. =head1 SUBCLASSES Subclasses of the C class are defined. They allow to recognize specific structures. Following subclasses are available: =over 2 =item * C : for defining 'ordinary' lexical units; =item * C : for defining lexical units that may require reading additional data; =item * C : for action-related lexical units; =item * C : for recognizing, for example, strings within double quotation marks. =back =head1 ERROR HANDLING To handle the cases of nonrecognition of lexical units, you can define a special C object at the end of the list of tokens that defines the lexical analyzer. When the search for this token succeeds, it is then possible to call an error handling routine. =head1 METHODS The following methods are available: =cut use strict; BEGIN { use Exporter; use RDA::Text qw(get_string); use RDA::Object; use RDA::Token::Action; use RDA::Token::Complex; use RDA::Token::Quoted; use RDA::Token::Simple; } # Define the global public variables use vars qw($AUTOLOAD $EOI $STRINGS $VERSION @DUMP @ISA %SDCL); $EOI = __PACKAGE__->new('EOI'); $VERSION = sprintf('%d.%02d', q$Revision: 1.6 $ =~ /(\d+)\.(\d+)/); @DUMP = ( hsh => {'RDA::Object::Token' => 1, 'RDA::Token::Action' => 1, 'RDA::Token::Complex' => 1, 'RDA::Token::Quoted' => 1, 'RDA::Token::Simple' => 1, }, ); @ISA = qw(RDA::Object Exporter); %SDCL = ( dep => [qw(RDA::Token::Action RDA::Token::Complex RDA::Token::Quoted RDA::Token::Simple)], flg => 1, inc => [qw(RDA::Object)], met => { 'get_attr' => {ret => 0}, 'get_condition' => {ret => 0}, 'get_expression' => {ret => 0}, 'get_next' => {ret => 0}, 'get_re' => {ret => 0}, 'get_status' => {ret => 0}, 'get_text' => {ret => 0}, 'set_attr' => {ret => 0}, 'set_expression' => {ret => 0}, 'set_status' => {ret => 0}, 'set_text' => {ret => 0}, }, new => 1, ); # Define the global private constants # Define the global private variables # Report the package version sub Version { return $VERSION; } =head2 Snew($name,$pattern,$sub)> The object constructor. This method takes a symbolic name, a regular expression, and an anonymous subroutine as arguments. The regular expression is either a simple regular expression, or a reference to an array containing from one to three regular expressions. In the latter case, the lexical unit can span several lines. For example, it can be a character string delimited by quotation marks, comments in a C program, and so on. The regular expressions are used to recognize: =over 3 =item 1. The beginning of the lexical unit, =item 2. The body of the lexical unit; when this second expression is missing, it uses C<(?:.*?)>, =item 3. The end of the lexical unit; when this last expression is missing, the first one is used. The end of the lexical unit cannot span several lines. =back For example, qw(STRING), [qw(" (?:[^"\\\\]+|\\\\(?:.|\n))* ")], These regular expressions can recognize multi-line strings delimited by quotation marks, where the backslash is used to quote the quotation marks appearing within the string. Note the quadrupling of the backslashes. Here is a variation of the previous example which uses the C option to include the newline in the characters recognized by "C<.>": qw(STRING), [qw(" (?s:[^"\\\\]+|\\\\.)* ")], The anonymous subroutine is called when the lexical unit is recognized by the lexical analyzer. This subroutine takes two arguments: a reference to the token object, and the string recognized by the regular expression. The scalar returned by the anonymous subroutine defines the character string memorized in the token object. C is represented by a blessed hash reference. The following special keys are used: =over 12 =item S< B<'lvl' > > Trace level =item S< B<'oid' > > Symbolic name =item S< B<'_act'> > Associated sub =item S< B<'_cnd'> > Associated conditions =item S< B<'_det'> > Token decoration =item S< B<'_exp'> > For an action token =item S< B<'_lex'> > Lexical analyzer instance =item S< B<'_pat'> > Regular expression (can be an array reference) =item S< B<'_sta'> > Object status =item S< B<'_tpl'> > Associated template =item S< B<'_txt'> > Recognized text =back Internal keys are prefixed by an underscore. =cut sub new { my ($cls, @arg) = @_; my ($cnd, $oid); ($cnd, $oid) = _parse_name($arg[0]); return bless { lvl => 0, # Trace level oid => $oid, # Symbolic name _act => $arg[2], # Associated sub _cnd => $cnd, # Associated conditions _det => {}, # Token decoration _exp => $arg[4], # For an action token _lex => $arg[3], # Lexical analyzer instance _pat => $arg[1], # Regular expression (can be an array reference) _sta => 0, # Object status _tpl => {}, # Associated template _txt => q{}, # Recognized text }, ref($cls) || $cls; } sub _parse_name { my ($nam) = @_; return ($1, $2) if $nam =~ /^(.+:)(.+)/; # Ex. A:B:C:SYMBOL, A,C:SYMBOL return (q{}, $nam); } # sub AUTOLOAD ## no critic (Autoload) { my ($slf, $val) = @_; my ($nam); return unless ref($slf); $nam = $AUTOLOAD; $nam =~ s/.*://; return defined($val) ? $slf->{'_det'}->{$nam} = $val : $slf->{'_det'}->{$nam}; } # Destroy the object sub DESTROY { } =head2 RDA::Object::Token->factory($definition...) This method creates a list of token objects from a list of token specifications. The list can include objects of the C class or of a class derived from it. =cut sub factory { my ($slf, @arg) = @_; my ($cls, $pat, $nam, $sub, @tok); die get_string('NO_DEFINITION') unless @arg; while (@arg) { if (ref($nam = shift(@arg)) && $nam->isa(__PACKAGE__)) { # Add the token to the list push(@tok, $nam); } else { # Parse the specification and add it to the list $cls = (ref($pat = shift(@arg)) eq 'ARRAY') ? 'RDA::Token::Complex' : 'RDA::Token::Simple'; $sub = (@_ && ref($arg[0]) eq 'CODE') ? shift(@arg) : undef; push(@tok, $cls->new($nam, $pat, $sub)); } } push(@tok, 'RDA::Token::Simple'->new('ALL:DEFAULT', q{.*})); return @tok; } =head2 S<$h-Eget_action> This method returns the anonymous subroutine defined within the object. =cut sub get_action { return shift->{'_act'}; } =head2 S<$h-Eget_attr($name)> This method returns the value of the specified attribute. =cut sub get_attr { my ($slf, $nam) = @_; return $slf->{'_det'}->{$nam}; } =head2 S<$h-Eget_condition> This method returns the associated condition. =cut sub get_condition { return shift->{'_cnd'}; } =head2 S<$h-Eget_expression> This method returns the associated expression. =cut sub get_expression { return shift->{'_exp'}; } =head2 S<$h-Eget_next> This method activates searching for the lexical unit defined by the regular expression contained in the object. When this lexical unit is recognized on the character stream to analyze, it returns the string found and sets the status of the object to true. =cut sub get_next { my ($slf) = @_; my ($lex, $nxt, $txt); $lex = $slf->{'_lex'}; $nxt = $lex->get_token; if ($nxt == $EOI) { $slf->{'_sta'} = ($slf == $EOI) ? 1 : 0; return; } $lex->get_next unless $nxt; if ($slf == $lex->get_token) { $lex->set_token(0); # now no pending token $txt = $slf->{'_txt'}; $slf->{'_txt'} = q{}; $slf->{'_sta'} = 1; return $txt; } $slf->{'_sta'} = 0; return; } =head2 S<$h-Eget_re> This method returns the regular expression associated to the object. =cut sub get_re { return shift->{'_pat'}; } =head2 S<$h-Eget_status(status)> The method indicates whether the last search of the lexical unit succeeded or failed. =cut sub get_status { return shift->{'_sta'}; } =head2 S<$h-Eget_text> This method returns the character string that was recognized using this object. =cut sub get_text { return shift->{'_txt'}; } =head2 S<$h-Eset_attr($nam,$val)> This method sets an attribute value. =cut sub set_attr { my ($slf, $nam, $val) = @_; return $slf->{'_det'}->{$nam} = $val; } =head2 S<$h-Eset_expression> This method sets an associate expression. =cut sub set_expression { my ($slf, $val) = @_; return $slf->{'_exp'} = $val; } =head2 S<$h-Eset_status($val)> This method overrides the existing value and sets it to the specified value. =cut sub set_status { my ($slf, $val) = @_; return $slf->{'_sta'} = $val } =head2 S<$h-Eset_text($string)> This method associates the specified character string to the lexical unit. =cut sub set_text { my ($slf, $val) = @_; return $slf->{'_txt'} = $val; } # --- Internal routines ------------------------------------------------------- # Return the associated lexical analyzer sub get_lexer { return shift->{'_lex'}; } # Return a reference to the template object sub get_template { return shift->{'_tpl'}; } # Define the associated lexical analyzer sub set_lexer { my ($slf, $lex) = @_; $slf->{'_lex'} = $lex; $slf->{'lvl'} = $lex->set_trace; return $slf; } # Associate a template to the lexical unit sub set_template { my ($slf, $val) = @_; $slf->{'_tpl'} = $val; return $slf; } 1; __END__ =head1 SEE ALSO L, L, L, L, L, L, L, L, L =head1 COPYRIGHT NOTICE Copyright (c) 1996, 2016, Oracle and/or its affiliates. All rights reserved. =head1 TRADEMARK NOTICE Oracle and Java are registered trademarks of Oracle and/or its affiliates. Other names may be trademarks of their respective owners. =cut