# Lex.pm: Class Used to Generate Lexical Analyzers

package RDA::Object::Lex;

# $Id: Lex.pm,v 1.10 2015/05/13 14:53:52 RDA Exp $
# ARCS: $Header: /home/cvs/cvs/RDA_8/src/scripting/lib/RDA/Object/Lex.pm,v 1.10 2015/05/13 14:53:52 RDA Exp $
#
# Change History
# 20150513  MSC  Improve the documentation.

=head1 NAME

RDA::Object::Lex - Class Used to Generate Lexical Analyzers

=head1 SYNOPSIS

 use RDA::Object::Lex;

 @def = (qw(ADDOP    [-+]
            LEFTP    [\(]
            RIGHTP   [\)]
            INTEGER  [1-9][0-9]*
            NEWLINE  \n
           ),
         qw(STRING), [qw(" (?:[^"]+|"")* ")],
         qw(ERROR    .*), sub { die qq{Cannot analyze: "$_[1]"}},
        );

 $lex = RDA::Object::Lex->new('DEMO');
 $lex->set_trace(63);
 $lex->init(@def)->from(\*DATA);

 print "Tokenization of DATA:\n";
 TOKEN:for (;;)
 { $tok = $lex->get_next;
   last TOKEN if $lex->eoi;
   print "Line $.\t";
   print "Type: ".$tok->get_oid."\t";
   print "Content:->".$tok->get_text."<-\n";
 }

 __END__
 1+2-5
        "a multiline
        string with an embedded "" in it"
        an invalid string with a "" in it"

=head1 DESCRIPTION

The class C<RDA::Object::Lex> creates lexical analyzers. It is a subclass of
L<RDA::Object|RDA::Object>.

A lexical analyzer is defined by list of tokens passed as arguments to the
C<new> or C<init> method. Tokens are instances of the
L<RDA::Object::Token|RDA::Object::Token> class. The definition of a token
usually comprises two arguments: a symbolic name (like C<INTEGER>), followed by
a regular expression. When an anonymous subroutine is given as third argument,
it is called when the token is recognized. Its arguments are the
C<RDA::Object::Token> instance and the string recognized by the regular
expression. The return value is used as the new string contents of the token
object.

The order in which the lexical analyzer examines the regular
expressions is determined by the order in which these expressions are
passed as arguments to the C<new> or C<init> method. The token returned by the
lexical analyzer corresponds to the first regular expression which matches
(this strategy is different from that used by Lex, which returns the longest
match possible out of all that can be recognized).

The lexical analyzer can recognize tokens which span multiple records. When the
definition of the token comprises more than one regular expression (placed
within a reference to an anonymous array), the analyzer reads as many records
as required to recognize the token (see the documentation for the
L<RDA::Object::Token|RDA::Object::Token> class). When the start pattern is
found, the analyzer looks for the end, and if necessary, reads more records. No
backtracking is done in case of failure.

The analyzer can be used to analyze an isolated character string or a stream of
data coming from a file handle. At the end of the input data, the analyzer
returns a C<RDA::Object::Token> instance named C<EOI> (End Of Input).

=head2 Start Conditions

You can associate start conditions with the token-recognition rules that
comprise your lexical analyzer. When start conditions are used, the rule that
succeeds is no longer necessarily the first rule that matches.

A token symbol may be preceded by a start condition specifier for the
associated recognition rule. For example:

 qw(C1:TERMINAL_1  REGEXP), sub { # associated action },
 qw(TERMINAL_2     REGEXP), sub { # associated action },

Symbol C<TERMINAL_1> will be recognized only if start condition C<C1>
is active. Start conditions are activated and deactivated using respectively
the C<start(CONDITION_NAME)> and C<end(CONDITION_NAME)> methods.

C<start('INITIAL')> resets the analysis automaton.

Start conditions can be combined using AND/OR operators as follows:

 C1:SYMBOL      condition C1

 C1:C2:SYMBOL   condition C1 AND condition C2

 C1,C2:SYMBOL   condition C1 OR  condition C2

There are two types of start conditions: I<inclusive> and I<exclusive>,
which are declared by class methods C<inclusive()> and C<exclusive()>
respectively.  With an inclusive start condition, all rules are active
regardless of whether or not they are qualified with the start condition.
With an exclusive start condition, only the rules qualified with
the start condition are active; all other rules are deactivated.

For example,

 use RDA::Object::Lex;

 @def = (
   'EXPECT', 'expect-floats', sub { $lex->start('expect');
                                    return $_[1];
                                  },
   'expect:FLOAT', '\d+\.\d+', sub { print "found a float: $_[1]\n";
                                     return $_[1];
                                   },
   'expect:NEWLINE', '\n', sub { $lex->end('expect') ;
                                 return $_[1];
                               },
   'NEWLINE2', '\n',
   'INT', '\d+', sub { print "found an integer: $_[1] \n";
                       return $_[1];
                     },
   'DOT', '\.', sub { print "found a period\n";
                      return $_[1]
                    },
  );

 RDA::Object::Lex->exclusive('expect');
 $lex = RDA::Object::Lex->new('DEMO',@def);

The special start condition C<ALL> is always verified.

=head1 METHODS

The following methods are available:

=cut

use strict;

BEGIN
{ use Exporter;
  use RDA::Text qw(debug get_string);
  use RDA::Driver::Template;
  use RDA::Object;
  use RDA::Object::Token;
}

use integer;

# Define the global public variables
use vars qw($STRINGS $VERSION @DUMP @ISA %SDCL);
$VERSION = sprintf('%d.%02d', q$Revision: 1.10 $ =~ /(\d+)\.(\d+)/);
@DUMP    = (
  str => { map {$_ => 1} qw(_cod _c_f _c_s _smc) },
  );
@ISA     = qw(RDA::Object Exporter);
%SDCL    = (
  dep => [qw(RDA::Object::Token)],
  flg => 1,
  met => {
    'end'        => {ret => 0},
    'eoi'        => {ret => 0},
    'exclusive'  => {ret => 0},
    'flush'      => {ret => 0},
    'from'       => {ret => 0},
    'get_buffer' => {ret => 0},
    'get_length' => {ret => 0},
    'get_line'   => {ret => 0},
    'get_next'   => {ret => 0},
    'get_offset' => {ret => 0},
    'get_pos'    => {ret => 0},
    'get_skip'   => {ret => 0},
    'get_state'  => {ret => 0},
    'get_text'   => {ret => 0},
    'get_token'  => {ret => 0},
    'hold'       => {ret => 0},
    'inclusive'  => {ret => 0},
    'init'       => {ret => 0, evl => 'C'},
    'is_debug'   => {ret => 0},
    'is_hold'    => {ret => 0},
    'is_string'  => {ret => 0},
    'is_trace'   => {ret => 0},
    'read_line'  => {ret => 0},
    'restart'    => {ret => 0},
    'set_eoi'    => {ret => 0},
    'set_pos'    => {ret => 0},
    'set_line'   => {ret => 0},
    'set_state'  => {ret => 0},
    'set_text'   => {ret => 0},
    'set_token'  => {ret => 0},
    'set_trace'  => {ret => 0},
    'skip'       => {ret => 0},
    'start'      => {ret => 0},
    },
  new => 1,
  trc => 'LEX',
  );

# Define the global private constants
my $TOKEN_CLASS = 'RDA::Object::Token'; # Root class

my $DEFAULT_TOKEN  = $TOKEN_CLASS->new('DEFAULT', q{.*}); # Default token
my $DEFAULT_SUB = sub { $_[0]->gen_lex;            # lexical analyzer generation
                        return &{$_[0]->{'_sub'}}; # lexical analyzer execution
                      };

# Define the global private variables

# Report the package version
sub Version
{ return $VERSION;
}

# Create a prototype
my $dummy = q{};
my $lexer = bless {
  ## no critic (Interpolation)
  _all => [],        # Token list
  _buf => \$dummy,   # String to tokenize
  _cod => [],        # Current lexical analyzer
  _c_f => [],        # Cached subroutine definition
  _c_s => [],        # Cached subroutine definition
  _def => {},        # Token definition hash
  _eoi => 1,         # End of input indicator
  _exc => {},        # Exclusive conditions
  _hld => 0,         # Save or not what is consumed
  _hnd => [],        # Lexical analyzer closure environnement
  _ifh => \*STDIN,   # Default input file handle
  _lgt => \$dummy,   # Length of the current record
  _lin => \$dummy,   # Number of the current record
  _inc => {},        # Inclusive conditions
  _nxt => $DEFAULT_TOKEN,
  _off => \$dummy,   # Offset from the beginning
  _pos => \$dummy,   # Position in the current record
  _sav => q{},       # Saved string
  _skp => q{[ \t]+}, # Pattern to skip
  _sta => {INITIAL => \$dummy },
  _str => 0,         # True when you must analyze a string
  _sub => $DEFAULT_SUB,
  _s_f => sub {},    # Cache for the stream lexer
  _s_s => sub {},    # Cache for the string lexer
  }, __PACKAGE__;

# Define the template
#   %%...%% are processed by the RDA::Driver::Template class
#   In %%...%%, $slf is the RDA::Driver::Template instance
#   Regular expressions must be delimited by // or m!!

$lexer->set_template(RDA::Driver::Template->new(
  ## no critic (Interpolation,Long,Newline)
  LEX_WITH_SKIP => q@
  if ($LEX_POS < $LEX_LGT and $LEX_BUF =~ /\G(?:%%&SKIP%%)/cg)
  { $lgt = pos($LEX_BUF) - $LEX_POS;
    $LEX_OFF += $lgt;
    $LEX_POS += $lgt;
    %%&IS_HOLD ? #LEX_HOLD_SKIP : ''%%
  }
@,
  LEX_WITH_SKIP_LAST_READ => q@
  if ($LEX_BUF =~ /\G(?:%%&SKIP%%)/cg)
  { # skip this pattern
    $lgt = pos($LEX_BUF) - $LEX_POS;
    $LEX_OFF += $lgt;
    $LEX_POS += $lgt;
    %%&IS_HOLD ? #LEX_HOLD_SKIP : ''%%
  }
  else
  { last READ;
  }
@,
  LEX_HOLD_SKIP =>  q@$slf->save_text($1);@,
  LEX_HEADER_STRING => q@
  { pos($LEX_BUF) = $LEX_POS;
    my $lgt = 0;
    #
    %%&SKIP ne '' ? #LEX_WITH_SKIP : ''%%
    if ($LEX_POS == $LEX_LGT)
    { $slf->set_eoi'(1);
      return $LEX_TOK = $RDA::Object::Token::EOI;
    }
    $LEX_TOK = undef;
    my $str = '';
    #
  CASE:{
@,
  LEX_HEADER_STREAM => q@
  { pos($LEX_BUF) = $LEX_POS;
    my $lgt = 0;
    my $LEX_IFH = $$LEX_FHR;
    #
    %%&SKIP ne '' ? #LEX_WITH_SKIP : ''%%
    if ($LEX_POS == $LEX_LGT)
    { return $LEX_TOK = $RDA::Object::Token::EOI if $slf->eoi;
      READ: {
        do
        { unless (defined($LEX_BUF = <$LEX_IFH>))
          { $slf->set_eoi(1);
            return $LEX_TOK = $RDA::Object::Token::EOI;
          }
          pos($LEX_BUF) = $LEX_POS = 0;
          $LEX_LGT = length($LEX_BUF);
          $LEX_REC++;
          %%&SKIP ne '' ? #LEX_WITH_SKIP_LAST_READ : ''%%
        } while ($LEX_POS == $LEX_LGT);
      }# READ
    }
    my $str = '';
    $LEX_TOK = undef;
    #
    CASE: {
@,
  LEX_HOLD_TOKEN => q@$slf->save_text($str);@,
  LEX_FOOTER => q@
  } #CASE
  %%&IS_HOLD ? #LEX_HOLD_TOKEN : ''%%
  $slf->set_token($LEX_TOK);
  $LEX_TOK;
}
@,
  ));

=head2 S<$h-E<gt>new($oid,@definition)>

The object constructor. This method takes the lexical analyzer name and its
definition as arguments. The definition a list of C<RDA::Object::Token>
instances, or a list of triplets permitting their creation. The triplets
consist of: the symbolic name of the token, the regular expression necessary
for its recognition, and possibly an anonymous subroutine that is called when
the token is recognized.

=cut

sub new
{ my ($ctx, $oid, @arg) = @_;
  my ($slf, @tok);

  # Validate the lexical analyzer identifier
  die get_string('BAD_OID') unless $oid && $oid =~ m/^([A-Za-z]\w{0,31})$/;

  # Create the lexical analyzer object
  $slf = $ctx->clone;
  $slf->{'lvl'} = 0;
  $slf->{'oid'} = uc($oid);
  return @arg ? $slf->init(@arg) : $slf;
}

# Return a copy of the lexical analyzer
sub clone
{ my ($src) = @_;
  my ($cls);

  return bless {%{$src}}, $cls if ($cls = ref($src)); # Clone the instance
  return bless {%{$src->get_prototype}}, $src;        # Clone the prototype
}

=head2 S<$h-E<gt>eoi>

This method indicates whether there is no more data to analyze.

=cut

sub eoi
{ return shift->{'_eoi'};
}

=head2 S<$h-E<gt>flush>

When saving of the consumed strings is activated, this method returns and
clears the buffer containing the character strings recognized up to now. This
is only useful when C<hold> has been called to activate saving of consumed
strings.

=cut

sub flush
{ my ($slf) = @_;
  my ($txt);

  $txt = $slf->{'_sav'};
  $slf->{'_sav'} = q{};
  return $txt;
}

=head2 S<$h-E<gt>get_buffer>

This method returns the contents of the internal buffer of the lexical
analyzer.

=cut

sub get_buffer
{ return ${shift->{'_buf'}};
}

=head2 S<$h-E<gt>get_code>

This method returns the code of the current lexical analyzer

=cut

sub get_code
{ return join(q{}, @{shift->{'_cod'}});
}

=head2 S<$h-E<gt>get_length>

This method returns the length of the current record.

=cut

sub get_length
{ return ${shift->{'_lgt'}};
}

=head2 S<$h-E<gt>get_line>

This method returns the line number of the current record. It always returns 1
when a character string is being analyzed. The C<read_line()> method increments
the line number.

=cut

sub get_line
{ return ${shift->{'_lin'}};
}

=head2 S<$h-E<gt>get_offset>

This method returns the number of characters already consumed since the
beginning of the analyzed data stream.

=cut

sub get_offset
{ return ${shift->{'_off'}};
}

=head2 S<$h-E<gt>get_pos>

This methods returns the end position of the last token found in the current
record.

=cut

sub get_pos
{ return ${shift->{'_pos'}};
}

=head2 S<$h-E<gt>get_skip>

This method returns the current skip pattern.

=cut

sub get_skip
{ return shift->{'_skp'};
}

=head2 S<$h-E<gt>get_sub>

This method returns the anonymous subroutine that performs the lexical
analysis.

Example:

 my $tok;
 my $sub = $lexer->get_sub;
 while (($tok = &{$sub}()) ne $RDA::Object::Token::EOI)
 { print $tok->get_oid."\t".$tok->get_text."\n";
 }

 # or

 my $tok;
 local *tokenizer = $lexer->get_sub;
 while (($tok = tokenizer()) ne $RDA::Object::Token::EOI)
 { print $tok->get_oid."\t".$tok->get_text."\n";
 }

=cut

sub get_sub
{ my ($slf) = @_;

  return $slf->{'_sub'} if ref($slf->{'_sub'}) eq 'CODE';
  return $slf->gen_lex;
}

=head2 S<$h-E<gt>get_token>

This method returns the a reference of the object corresponding to the last
recognized token.

=cut

sub get_token
{ return shift->{'_nxt'};
}

=head2 S<$h-E<gt>hold>

This method activates or deactivates saving of the consumed strings. The return
value is the current setting (TRUE or FALSE).

You can obtain the contents of the buffer using the C<flush> method, which also
empties the buffer.

=cut

sub hold
{ my ($slf) = @_;

  if (ref($slf))
  { $slf->del_code;
  }
  else
  { $slf = $slf->get_prototype;
  }
  return $slf->{'_hld'} = not $slf->{'_hld'};
}

=head2 S<$h-E<gt>is_debug>

This method indicates whether debug messages should be printed on the standard
output.

=cut

sub is_debug
{ return shift->{'lvl'} & 16;  ## no critic (Bit)
}

=head2 S<$h-E<gt>is_hold>

This method indicates whether consumed data is saved.

=cut

sub is_hold
{ return shift->{'_hld'};
}

=head2 S<$h-E<gt>is_string>

This method indicates whether a character string is analyzed.

=cut

sub is_string
{ return shift->{'_str'};
}

=head2 S<$h-E<gt>is_trace>

This method indicates whether the analysis should be traced.

=cut

sub is_trace
{ return shift->{'lvl'} & 32;  ## no critic (Bit)
}

=head2 S<$h-E<gt>read_line>

Reads data from the input specified by the C<from> method. Returns the
result of the reading.

Example:

 use RDA::Object::Lex;

 $lex = RDA::Object::Lex->new('DEMO');
 print $lex->read_line # read and print one line
   while !$lex->eoi;

=cut

sub read_line
{ my ($slf) = @_;
  my ($ifh, $rec);

  $ifh = $slf->{'_ifh'};
  if (defined($rec = <$ifh>))
  { ${$slf->{'_lin'}}++;
  }
  else
  { $slf->{'_eoi'} = 1;
  }
  return $rec;
}

=head2 S<$h-E<gt>reset>

This method clears the internal buffer of the lexical analyzer and erases all
tokens already recognized.

=cut

sub reset  ## no critic (Builtin)
{ my ($slf) = @_;

  ${$slf->{'_lin'}} = 0;
  ${$slf->{'_lgt'}} = 0;
  ${$slf->{'_off'}} = 0;
  ${$slf->{'_pos'}} = 0;
  ${$slf->{'_buf'}} = q{};
  $slf->{'_sav'} = q{};
  $slf->{'_eoi'} = 0;
  $slf->set_state('INITIAL');
  if ($slf->{'_nxt'})
  { $slf->{'_nxt'}->set_text;
    $slf->{'_nxt'} = 0;
  }
  return $slf;
}

=head2 S<$h-E<gt>set_text($text)>

This method adds the specified text to the saved data.

=cut

sub save_text
{ my ($slf, $txt) = @_;

  return $slf->{'_sav'} .= $txt;
}

=head2 S<$h-E<gt>set_eoi($flag)>

This method sets the end-of-input indicator.

=cut

sub set_eoi
{ my ($slf, $flg) = @_;

  return $slf->{'_eoi'} = $flg;
}

=head2 S<$h-E<gt>set_line($num)>

This method sets the value of the line number.

=cut

sub set_line
{ my ($slf, $num) = @_;

  return ${$slf->{'_lin'}} = $num;
}

=head2 S<$h-E<gt>set_pos($pos)>

This methods specifies the position of the beginning of the next token to be
recognized in the current line.

=cut

sub set_pos
{ my ($slf, $pos) = @_;

  return ${$slf->{'_pos'}} = $pos;
}

=head2 S<$h-E<gt>set_trace([$level])>

This method specifies the trace level and returns the previous trace level. It
propagates the trace level to the definition tokens.

=over 11

=item S<    B<Bit 0> > When true, signs the parts

=item S<    B<Bit 1> > When true, traces the parts

=item S<    B<Bit 2> > When true, traces the parts evaluation

=item S<    B<Bit 3> > When true, traces the context definitions

=item S<    B<Bit 4> > When true, traces the data analysis

=item S<    B<Bit 5> > When true, traces the lexical analyzer operations

=back

=cut

sub set_trace
{ my ($slf, $lvl) = @_;
  my ($old);

  $old = $slf->{'lvl'};
  if (defined($lvl))
  { $lvl = 0 if $lvl < 0;
    $slf->{'lvl'} = $lvl;
    foreach my $tok (@{$slf->{'_all'}})
    { $tok->set_level($lvl);
    }
  }
  return $old;
}

=head2 S<$h-E<gt>set_token($token)>

This method sets the specified token as the current token. It is useful for
re-qualifying a token inside the anonymous subroutine associated with the
original token.

=cut

sub set_token
{ my ($slf, $tok) = @_;

  return $slf->{'_nxt'} = $tok;
}

=head1 LEXICAL ANALYZER DEFINITION METHODS

=head2 S<$h-E<gt>init(@definition)>

This method defines the lexical analyzer.

=cut

sub init
{ my ($slf, @arg) = @_;

  $slf->{'_def'} = {};
  $slf->{'_all'} = [map {$slf->{'_def'}->{$_->get_oid} = $_->set_lexer($slf)}
                        $TOKEN_CLASS->factory(@arg)];
  return $slf->reset;
}

=head2 S<$h-E<gt>skip([$pattern])>

This methods returns or sets the regular expression for consuming inter-token
strings. Changing the skip pattern causes the recompilation of the lexical
analyzer.

For example,

 RDA::Object::Lex->skip('\s*#(?s:.*)|\s+');
 @tok = RDA::Object::Lex->new('DEMO',INTEGER => '\d+')->analyze(\*DATA);
 print "@tok\n"; # print INTEGER 1 INTEGER 2 INTEGER 3 INTEGER 4 EOI
 __END__
 1 # first string to skip
 2
 3# second string to skip
 4

=cut

sub skip
{ my ($slf, $pat) = @_;

  if (ref($slf))
  { # Instance method
    if (defined($pat) && $pat ne $slf->{'_skp'})
    { debug(qq{LEX> Skip value: "$pat"}) if $slf->is_debug;
      $slf->{'_skp'} = $pat;
      $slf->del_code;
    }
  }
  else
  { $slf = $slf->get_prototype;
    if (defined($pat) && $pat ne $slf->{'_skp'})
    { debug(qq{LEX> Prototype skip value: "$pat"}) if $slf->is_debug;
      $slf->{'_skp'} = $pat;
    }
  }
  return $slf->{'_skp'};
}

# Delete the code already generated
sub del_code
{ my ($slf) = @_;

  @{$slf->{'_c_f'}} = ();
  @{$slf->{'_c_s'}} = ();
  @{$slf->{'_cod'}} = ();
  $slf->{'_sub'} = $DEFAULT_SUB;
  return $slf;
}

#
sub gen_code
{ my ($slf) = @_;
  my ($tpl);

  debug(q{LEX> gen_code()}) if $slf->is_debug;
  $tpl = $slf->get_template;

  # Generate the header part
  debug(q{LEX> - header}) if $slf->is_debug;
  $tpl->set_context($slf->{'lvl'},
    IS_HOLD       => $slf->is_hold,
    IS_TRACE      => $slf->is_trace,
    SKIP          => $slf->get_skip,
    );
  $slf->{'_cod'}->[0] = $tpl->eval_part($slf->{'_str'}
    ? 'LEX_HEADER_STRING'
    : 'LEX_HEADER_STREAM');

  # Generate the body part
  debug(q{LEX> - body}) if $slf->is_debug;
  $slf->{'_cod'}->[1] = join(q{}, map {$_->gen_code} @{$slf->{'_all'}});

  # Generate the footer part
  debug(q{LEX> - footer}) if $slf->is_debug;
  $slf->{'_cod'}->[2] = $tpl->eval_part('LEX_FOOTER');

  return $slf;
}

# Generate a condition
sub gen_condition
{ my ($slf, $spc) = @_;
  my ($cnd, @cnd, @itm, %exc, %inc);

  return q{} if $spc =~ /^ALL:/; # special condition

  %exc = %{$slf->exclusive};
  %inc = %{$slf->inclusive};
  return q{} unless $spc || keys(%exc);

  if ($spc =~ /^(.+):/g)  # For example, A:B:C: or A,C:
  { foreach my $grp (split(/:/, $1))
    { foreach my $itm (@itm = split(/,/, $grp))
      { die get_string('NO_COND', $itm)
          unless $itm eq 'INITIAL' || exists($exc{$itm}) || exists($inc{$itm});
        delete($exc{$itm});
        delete($inc{$itm});
      }
      push(@cnd, q{(}.join(q{ or }, map {"\$$_"} @itm).q{)});
    }
    $cnd = (@cnd == 1)
      ? $cnd[0]
      : q{(}.join(q{ and }, @cnd).q{)};
  }
  $cnd = defined($cnd)
    ? $cnd = q{not (}.join(q{ or }, @cnd) . qq{) and $cnd}
    : $cnd = q{not (}.join(q{ or }, @cnd) . q{)}
    if (@cnd = map {"\$$_"} keys(%exc));
  debug(qq{LEX> gen_condition(): $spc -> $cnd}) if $slf->is_debug;
  return defined($cnd) ? qq{$cnd and } : q{};
}

# Generate the lexical analyzer
sub gen_lex
{ my ($slf) = @_;
  my ($cod, $mac, %state);
  my ($LEX_BUF, $LEX_FHR, $LEX_LGT, $LEX_OFF, $LEX_POS, $LEX_REC, $LEX_TOK);

  $slf->gen_code unless @{$slf->{'_cod'}};
  debug(q{LEX> Lexical analyzer generation...}) if $slf->is_debug;

  # Set the lexical analyzer context
  $LEX_FHR = \$slf->{'_ifh'};
  $LEX_BUF = q{}; # Buffer to analyze
  $LEX_LGT = 0;   # Buffer length
  $LEX_OFF = 0;   # Offset from the beginning
  $LEX_POS = 0;   # Current position in buffer
  $LEX_REC = 0;   # Current record number
  $LEX_TOK = q{}; # Token instance
  $slf->set_handles(\$LEX_BUF, \$LEX_LGT, \$LEX_REC, \$LEX_POS, \$LEX_OFF,
    \%state);

  # Validate the lexical analyzer code
  $mac = $slf->gen_state_machine;
  $cod = $slf->get_code;
  eval qq!$mac; \$slf->{'_sub'} = sub $cod!;  ## no critic (Eval)
  if ($@ or $slf->is_debug)
  { my $lin = 0;
    $mac =~ s/^/sprintf("%3d ", ++$lin)/meg;
    $cod =~ s/^/sprintf("%3d ", ++$lin)/meg;
    debug($mac, $cod);
    debug(q{LEX> }, $@);
    die "\n" if $@;
  }
  return $slf->{'_sub'};
}

#
sub get_definition
{ my ($slf, $nam) = @_;

  return $slf->{'_def'}->{$nam};
}

# Get environnement of the lexer closure
sub get_handles
{ my ($slf) = @_;

  return [$slf->{'_buf'},
          $slf->{'_lgt'},
          $slf->{'_lin'},
          $slf->{'_pos'},
          $slf->{'_off'},
          $slf->{'_sta'},
         ];
}

#
sub get_prototype
{ return $lexer || {};
}

#
sub get_template
{ return shift->{'_tpl'};
}

# Set environnement of the lexer closure
sub set_handles
{ my ($slf, @arg) = @_;

  return ($slf->{'_buf'},
          $slf->{'_lgt'},
          $slf->{'_lin'},
          $slf->{'_pos'},
          $slf->{'_off'},
          $slf->{'_sta'},
         ) = @arg;
}

#
sub set_template
{ my ($slf, $val) = @_;

  return $slf->{'_tpl'} = $val;
}

#
sub _save_handles
{ my ($slf) = @_;

  return $slf->{'_hnd'} = $slf->get_handles;
}

#
sub _switch_handles
{ my ($slf) = @_;
  my ($bkp);

  $bkp = $slf->get_handles;
  $slf->set_handles(@{$slf->{'_hnd'}});
  return $slf->{'_hnd'} = $bkp;
}

=head1 ANALYSIS METHODS

=head2 S<$h-E<gt>analyze($source)>

This method analyzes the specified data and returns a list of pairs consisting
of a token name followed by recognized text. The data can be a character string
or a reference to a file handle.

Examples:

 @tok = RDA::Object::Lex->new('DEMO',
          qw(PLUS [+] NUMBER \d+))->analyze("3+3+3");
 @tok = RDA::Object::Lex->new('DEMO',
          qw(PLUS [+] NUMBER \d+))->analyze(\*STREAM);

=cut

sub analyze
{ my ($slf, @src) = @_;
  my ($nxt, $tok, @tok);

  die get_string('NO_DATA') unless defined($src[0]);
  $slf->from(@src);
  $nxt = $slf->{'_sub'};
  for (;;)  ## no critic(Loop)
  { $tok = &{$nxt}($slf);
    push(@tok, $tok->get_oid, $tok->get_text);
    return @tok if $slf->{'_eoi'};
  }
}

=head2 S<$h-E<gt>every($sub)>

This method avoids having to write a reading loop in order to analyze a stream
of data. The argument is an anonymous subroutine executed after the recognition
of each token. For example, to analyze the string C<1+2>, you can write:

 use RDA::Object::Lex;

 $lex = RDA::Object::Lex->new('DEMO',qw(ADDOP [-+]
                                        INTEGER \d+
                                       ));

 $lex->from("1+2");
 $lex->every (sub {
   print $_[0]->get_oid."\t";
   print $_[0]->get_text."\n";
   });

The first argument of the anonymous subroutine is the
L<RDA::Object::Token|RDA::Object::Token> instance recognized.

=cut

sub every
{ my ($slf, $act) = @_;
  my ($ref, $tok);

  die get_string('NO_CODE') unless ($ref = ref($act)) && $ref eq 'CODE';
  for (;;)  ## no critic (Loop)
  { $tok = &{$slf->{'_sub'}}($slf);
    return if $slf->{'_eoi'};
    &{$act}($tok);
  }
}

=head2 S<$h-E<gt>from([$source...])>

This method specifies the source of the data to be analyzed. The argument of
this method can be a string (or list of strings) or a reference to a file
handle. It returns a reference to the calling lexical analyzer.

By default, it reads data from the standard input.

Examples:

 $ifh = IO::File->new;
 $ifh->open('<filename');
 $lex->from($ifh);

 $lex->from(\*DATA);

 $lex->from('the data to be analyzed');

=head2 S<$h-E<gt>from>

This method returns the file handle when defined or an undefined value when
the input is a string.

=cut

sub from
{ my ($slf, @src) = @_;
  my ($ref, $src);

  return $slf->{'_ifh'} ? $slf : undef unless @src;

  $ref = ref($src = $src[0]);
  return
    ($ref eq 'RDA::Object::Buffer') ? _from_ifh($slf, $src->get_handle) :
    ($ref eq 'IO::File')            ? _from_ifh($slf, $src) :
    _is_file($src)                  ? _from_ifh($slf, $src) :
                                      _from_string($slf, join(q{ }, @src));
}

sub _is_file
{ my ($src) = @_;
  my ($num);

  eval {$num = fileno($src)};
  return defined($num);
}

sub _from_ifh
{ my ($slf, $ifh) = @_;

  debug(q{LEX> From stream}) if $slf->is_debug;
  $slf->{'_ifh'} = $ifh;
  if (@{$slf->{'_c_f'}})
  { debug(q{LEX> Code already exists}) if $slf->is_debug;
    if ($slf->{'_str'})  # when STREAM is not the current context
    { $slf->{'_cod'} = [@{$slf->{'_c_f'}}];
      $slf->{'_sub'} = $slf->{'_s_f'};
      $slf->_switch_handles;
      $slf->{'_str'} = 0;
    }
  }
  else
  { debug(q{LEX> STREAM code generation}) if $slf->is_debug;
    $slf->{'_str'} = 0;
    $slf->gen_code;
    $slf->_save_handles;
    $slf->gen_lex;
    $slf->{'_c_f'} = [@{$slf->{'_cod'}}];
    $slf->{'_s_f'} = $slf->{'_sub'};
  }
  return $slf->reset;
}

sub _from_string
{ my ($slf, $buf) = @_;

  debug(q{LEX> From string}) if $slf->is_debug;
  if (@{$slf->{'_c_s'}})
  { debug(q{LEX> Code already exists}) if $slf->is_debug;
    unless ($slf->{'_str'})
    { $slf->{'_cod'} = [@{$slf->{'_c_s'}}];
      $slf->{'_sub'} = $slf->{'_s_s'};
      $slf->_switch_handles;
      $slf->{'_str'} = 1;
    }
  }
  else
  { debug(q{LEX> STRING code generation}) if $slf->is_debug;
    $slf->{'_str'} = 1;
    $slf->gen_code;
    $slf->_save_handles;
    $slf->gen_lex;
    $slf->{'_c_s'} = [@{$slf->{'_cod'}}];
    $slf->{'_s_s'} = $slf->{'_sub'};
  }
  ${$slf->{'_buf'}} = $buf;
  ${$slf->{'_lgt'}} = length($buf);
  return $slf->reset;
}

=head2 S<$h-E<gt>get_next>

This method searches for the next token and return its reference. Its returns
the C<Token::EOI> instance at the end of the data.

For example,

 $lex = RDA::Object::Lex->new('DEMO',@token);
 print $lex->get_next->get_oid;  # print the token type
 print $lex->get_text;           # print the token content

=cut

sub get_next
{ my ($slf) = @_;
  my ($tok);

  $tok = eval {&{$slf->{'_sub'}}($slf)};
  die $@ if $@;
  return $tok;
}

=head2 S<$h-E<gt>parse([$source...])>

This method analyzes the data in one call.

=cut

sub parse
{ my ($slf, @src) = @_;
  my ($nxt);

  die get_string('NO_DATA') unless defined($src[0]);
  $slf->from(@src);
  $nxt = $slf->{'_sub'};
  &{$nxt}($slf) until $slf->{'_eoi'};
  return $slf;
}

=head1 STATE MACHINE METHODS

=head2 S<$h-E<gt>end($name)>

This method deactivates the specified condition.

=cut

sub end
{ my ($slf, $nam) = @_;

  return ${$slf->{'_sta'}->{$nam}} = 0;
}

=head2 S<$h-E<gt>exclusive($name...)>

This method declares that the specified conditions are I<exclusive>.

=cut

sub exclusive
{ my ($slf, @arg) = @_;

  $slf = $slf->get_prototype unless ref($slf);
  return $slf->{'_exc'} = {map {$_ => 1} @arg} if @arg;
  return $slf->{'_exc'};
}

#
sub gen_state_machine
{ my ($slf) = @_;
  my ($cod);

  ## no critic (Interpolation)
  $cod = 'my $INITIAL = 1;'."\n"
    .q!$state{'INITIAL'} = \\$INITIAL;!."\n";
  foreach my $nam (keys(%{$slf->exclusive}), keys(%{$slf->inclusive}))
  { $cod .= q!my $!.$nam.q! = 0; !
      .q!$state{'! .$nam
      .q!'} = \\$!.$nam.q!;!."\n";
  }
  return $slf->set_state_machine($cod);
}

=head2 S<$h-E<gt>get_state($name)>

This method returns the state of the specified condition.

=cut

sub get_state
{ my ($slf, $nam) = @_;

  return ${$slf->{'_sta'}->{$nam}};
}

=head2 S<$h-E<gt>get_state_machine>

This method returns the state machine code.

=cut

sub get_state_machine
{ return shift->{'_smc'};
}

=head2 S<$h-E<gt>inclusive($name...)>

This method declares that the specifies conditions are I<inclusive>.

=cut

sub inclusive
{ my ($slf, @arg) = @_;

  $slf = $slf->get_prototype unless ref($slf);
  return $slf->{'_inc'} = {map {$_ => 1} @arg} if @arg;
  return $slf->{'_inc'};
}

=head2 S<$h-E<gt>restart>

This method re-initializes the analysis automaton. The sole active condition
becomes the condition C<INITIAL>.

=cut

sub restart
{ my ($slf) = @_;
  my ($sta);

  $sta = $slf->{'_sta'};
  foreach my $nam (keys(%{$sta}))
  { ${$sta->{$nam}} = 0;
  }
  return ${$sta->{'INITIAL'}} = 1;
}

=head2 S<$h-E<gt>set_state($name,$value)>

This method assigns a new state to the specified condition.

=cut

sub set_state
{ my ($slf, $nam, $val) = @_;

  return ${$slf->{'_sta'}->{$nam}} = $val;
}

=head2 S<$h-E<gt>set_state_machine($code)>

This method specifies the state machine code.

=cut

sub set_state_machine
{ my ($slf, $cod) = @_;

  return $slf->{'_smc'} = $cod;
}

=head2 S<$h-E<gt>start($name)>

This method activates the specified condition.

=cut

sub start
{ my ($slf, $nam) = @_;

  if ($nam eq 'INITIAL')
  { $slf->restart;
  }
  else
  { $slf->restart if exists($slf->{'_exc'}->{$nam});
    ${$slf->{'_sta'}->{$nam}} = 1;
  }
  return;
}

1;

__END__

=head1 SEE ALSO

L<RDA::Agent|RDA::Agent>,
L<RDA::Driver::Template|RDA::Driver::Template>,
L<RDA::Object|RDA::Object>,
L<RDA::Object::Token|RDA::Object::Token>,
L<RDA::Text|RDA::Text>,
L<RDA::Token::Action|RDA::Token::Action>,
L<RDA::Token::Complex|RDA::Token::Complex>,
L<RDA::Token::Quoted|RDA::Token::Quoted>,
L<RDA::Token::Simple|RDA::Token::Simple>

=head1 COPYRIGHT NOTICE

Copyright (c) 1996, 2016, Oracle and/or its affiliates. All rights reserved.

=head1 TRADEMARK NOTICE

Oracle and Java are registered trademarks of Oracle and/or its
affiliates. Other names may be trademarks of their respective owners.

=cut