// Repository URL to install this package:
// Version: 2023.12.1
/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
using System;
using System.IO;
using System.Collections.Generic;
using System.Text;
using Antlr4.Runtime;
using Antlr4.Runtime.Atn;
using Antlr4.Runtime.Misc;
using Antlr4.Runtime.Sharpen;
namespace Antlr4.Runtime
{
/// <summary>A lexer is a recognizer that draws input symbols from a character stream.</summary>
/// <remarks>
/// A lexer is a recognizer that draws input symbols from a character stream.
/// Lexer grammars result in a subclass of this object. A Lexer object
/// uses simplified match() and error recovery mechanisms in the interest
/// of speed.
/// </remarks>
public abstract class Lexer : Recognizer<int, LexerATNSimulator>, ITokenSource
{
/// <summary>Mode the lexer starts in; also the mode restored by Reset().</summary>
public const int DEFAULT_MODE = 0;
/// <summary>Alias for <c>TokenConstants.DefaultChannel</c>.</summary>
public const int DefaultTokenChannel = TokenConstants.DefaultChannel;
/// <summary>Alias for <c>TokenConstants.HiddenChannel</c>.</summary>
public const int Hidden = TokenConstants.HiddenChannel;
/// <summary>Minimum character value, 0x0000.</summary>
public const int MinCharValue = 0x0000;
/// <summary>Maximum character value, 0x10FFFF.</summary>
public const int MaxCharValue = 0x10FFFF;
/// <summary>Character stream the lexer reads from; replaced by SetInputStream().</summary>
private ICharStream _input;
/// <summary>Writer supplied to the constructor as <c>output</c>.</summary>
protected readonly TextWriter Output;
/// <summary>
/// Writer supplied to the constructor as <c>errorOutput</c>; NotifyListeners()
/// hands it to the error listener.
/// </summary>
protected readonly TextWriter ErrorOutput;
/// <summary>
/// This lexer paired with its current input stream; passed to the token
/// factory whenever a token is created.
/// </summary>
private Tuple<ITokenSource, ICharStream> _tokenFactorySourcePair;
/// <summary>How to create token objects.</summary>
private ITokenFactory _factory = CommonTokenFactory.Default;
/// <summary>The goal of all lexer rules/methods is to create a token object.</summary>
/// <remarks>
/// This is an instance variable as multiple rules may collaborate to
/// create a single token. NextToken() returns this object after
/// matching lexer rule(s). If you subclass to allow multiple token
/// emissions, then set this to the last token to be matched or
/// something non-null so that the automatic token emit mechanism will not
/// emit another token.
/// </remarks>
private IToken _token;
/// <summary>
/// What character index in the stream did the current token start at?
/// Needed, for example, to get the text for the current token.
/// </summary>
/// <remarks>
/// Set at the start of NextToken(); -1 until then and after Reset().
/// </remarks>
private int _tokenStartCharIndex = -1;
/// <summary>The line on which the first character of the token resides.</summary>
private int _tokenStartLine;
/// <summary>The character position of the first character within the line.</summary>
private int _tokenStartColumn;
/// <summary>Once we see EOF on the char stream, the next token will be EOF.</summary>
/// <remarks>
/// If you have DONE : EOF ; then you see DONE EOF.
/// </remarks>
private bool _hitEOF;
/// <summary>The channel number for the current token.</summary>
private int _channel;
/// <summary>The token type for the current token.</summary>
private int _type;
/// <summary>Modes saved by PushMode() and restored by PopMode().</summary>
private readonly Stack<int> _modeStack = new Stack<int>();
/// <summary>The mode passed to the interpreter when matching the next token.</summary>
private int _mode = Antlr4.Runtime.Lexer.DEFAULT_MODE;
/// <summary>
/// You can set the text for the current token to override what is in
/// the input char buffer.
/// </summary>
/// <remarks>
/// Set through the Text property; null means "no override".
/// </remarks>
private string _text;
/// <summary>
/// Creates a lexer over <paramref name="input"/> that uses
/// <c>Console.Out</c> for output and <c>Console.Error</c> for errors.
/// </summary>
/// <param name="input">Character stream to tokenize.</param>
public Lexer(ICharStream input)
    : this(input, Console.Out, Console.Error)
{
}
/// <summary>
/// Creates a lexer over <paramref name="input"/> with explicit output and
/// error writers.
/// </summary>
/// <param name="input">Character stream to tokenize.</param>
/// <param name="output">Writer stored in <c>Output</c>.</param>
/// <param name="errorOutput">Writer stored in <c>ErrorOutput</c>.</param>
public Lexer(ICharStream input, TextWriter output, TextWriter errorOutput)
{
    _input = input;
    Output = output;
    ErrorOutput = errorOutput;

    // Tokens created later carry this (source, stream) pair.
    _tokenFactorySourcePair = Tuple.Create((ITokenSource)this, input);
}
/// <summary>
/// Returns the lexer to its initial state: rewinds the input (when there is
/// one), clears all per-token state, goes back to the default mode with an
/// empty mode stack, and resets the interpreter.
/// </summary>
public virtual void Reset()
{
    // Rewind the input, if there is one.
    if (_input != null)
    {
        _input.Seek(0);
    }

    // Forget everything about the token in progress.
    _token = null;
    _text = null;
    _type = TokenConstants.InvalidType;
    _channel = TokenConstants.DefaultChannel;
    _tokenStartCharIndex = -1;
    _tokenStartLine = -1;
    _tokenStartColumn = -1;
    _hitEOF = false;

    // Default mode, nothing pushed.
    _modeStack.Clear();
    _mode = DEFAULT_MODE;

    Interpreter.Reset();
}
/// <summary>
/// Return a token from this source; i.e., match a token on the char
/// stream.
/// </summary>
/// <remarks>
/// A match whose type ends up as <c>TokenTypes.Skip</c> produces no token and
/// matching starts over; a type of <c>TokenTypes.More</c> keeps matching into
/// the same token. Once the end of the input has been seen, the next pass
/// through the loop emits the EOF token.
/// </remarks>
/// <returns>The current token after <c>Emit()</c> or <c>EmitEOF()</c> has run.</returns>
/// <exception cref="InvalidOperationException">The input stream is null.</exception>
public virtual IToken NextToken()
{
if (_input == null)
{
throw new InvalidOperationException("nextToken requires a non-null input stream.");
}
// Mark the start location in the char stream so unbuffered streams are
// guaranteed to at least have the text of the current token.
int tokenStartMarker = _input.Mark();
try
{
// Each iteration starts a fresh token; the loop repeats only when a
// token is skipped (see outer_continue below).
while (true)
{
if (_hitEOF)
{
EmitEOF();
return _token;
}
// Start-of-token bookkeeping: clear the previous token and remember
// where this one begins.
_token = null;
_channel = TokenConstants.DefaultChannel;
_tokenStartCharIndex = _input.Index;
_tokenStartColumn = Interpreter.Column;
_tokenStartLine = Interpreter.Line;
_text = null;
// Repeats while the type is More, so several matches accumulate into
// one token that keeps the start position recorded above.
do
{
_type = TokenConstants.InvalidType;
int ttype;
try
{
ttype = Interpreter.Match(_input, _mode);
}
catch (LexerNoViableAltException e)
{
// Report the error, let Recover() deal with the input, and treat
// the failed match as a skipped token.
NotifyListeners(e);
Recover(e);
ttype = TokenTypes.Skip;
}
if (_input.LA(1) == IntStreamConstants.EOF)
{
_hitEOF = true;
}
// _type is still InvalidType unless something assigned it while
// matching (Skip(), More() and the Type setter all do); in that case
// use the type returned by Match.
if (_type == TokenConstants.InvalidType)
{
_type = ttype;
}
if (_type == TokenTypes.Skip)
{
// Emulates a labeled "continue" of the outer while loop.
goto outer_continue;
}
}
while (_type == TokenTypes.More);
// Emit automatically unless a token has already been set.
if (_token == null)
{
Emit();
}
return _token;
outer_continue: ;
}
}
finally
{
// make sure we release marker after match or
// unbuffered char stream will keep buffering
_input.Release(tokenStartMarker);
}
}
/// <summary>
/// Instructs the lexer to skip creating a token for the current lexer rule
/// and look for another token.
/// </summary>
/// <remarks>
/// Sets the current token type to <c>TokenTypes.Skip</c>. NextToken() emits
/// nothing for a match that ends with that type and starts matching the
/// next token instead.
/// </remarks>
public virtual void Skip()
{
    this._type = TokenTypes.Skip;
}
/// <summary>
/// Sets the current token type to <c>TokenTypes.More</c>, which makes
/// NextToken() keep matching without emitting a token or resetting the
/// recorded token start.
/// </summary>
public virtual void More()
{
    this._type = TokenTypes.More;
}
/// <summary>Switches the lexer to mode <paramref name="m"/> without touching the mode stack.</summary>
/// <param name="m">The mode to make current.</param>
public virtual void Mode(int m)
{
    this._mode = m;
}
/// <summary>
/// Saves the current mode on the mode stack and then switches to
/// <paramref name="m"/>.
/// </summary>
/// <param name="m">The mode to enter.</param>
public virtual void PushMode(int m)
{
    int previousMode = _mode;
    _modeStack.Push(previousMode);
    Mode(m);
}
/// <summary>
/// Restores the most recently pushed mode and makes it current.
/// </summary>
/// <returns>The current mode after the switch.</returns>
/// <exception cref="InvalidOperationException">The mode stack is empty.</exception>
public virtual int PopMode()
{
    if (_modeStack.Count < 1)
    {
        throw new InvalidOperationException();
    }

    Mode(_modeStack.Pop());
    return _mode;
}
/// <summary>Gets or sets the factory used to create token objects.</summary>
public virtual ITokenFactory TokenFactory
{
    get { return _factory; }
    set { _factory = value; }
}
/// <summary>Sets the char stream and resets the lexer.</summary>
/// <param name="input">The stream subsequent tokens are read from.</param>
public virtual void SetInputStream(ICharStream input)
{
    // Detach the old stream before resetting; the base Reset() only seeks
    // a non-null input, so the outgoing stream is left where it is.
    _input = null;
    _tokenFactorySourcePair = Tuple.Create((ITokenSource)this, _input);
    Reset();

    // Attach the new stream and rebuild the factory pair around it.
    _input = input;
    _tokenFactorySourcePair = Tuple.Create((ITokenSource)this, input);
}
/// <summary>Gets the <c>SourceName</c> reported by the current input stream.</summary>
public virtual string SourceName
{
    get { return _input.SourceName; }
}
/// <summary>Gets the current input stream, typed as <c>IIntStream</c>.</summary>
public override IIntStream InputStream
{
    get { return _input; }
}
/// <summary>
/// Explicit <c>ITokenSource</c> implementation: gets the current input
/// stream, typed as <c>ICharStream</c>.
/// </summary>
ICharStream ITokenSource.InputStream
{
    get { return _input; }
}
/// <summary>
/// Stores <paramref name="token"/> as the current token.
/// </summary>
/// <remarks>
/// By default multiple emits per NextToken() invocation are not supported,
/// for efficiency reasons. To support them, subclass and override this
/// method, NextToken, and the Token property so that tokens are pushed into
/// a list and pulled from that list rather than kept in a single variable
/// as this implementation does.
/// </remarks>
/// <param name="token">The token to make current.</param>
public virtual void Emit(IToken token)
{
    _token = token;
}
/// <summary>
/// The standard method called to automatically emit a token at the
/// outermost lexical rule.
/// </summary>
/// <remarks>
/// The token is created by the token factory from the current type, text
/// override, channel and start position, and it ends at the character
/// just before the current input index. Override this method to emit
/// custom token objects, or provide a new factory.
/// </remarks>
/// <returns>The token that was created and emitted.</returns>
public virtual IToken Emit()
{
    IToken created = _factory.Create(
        _tokenFactorySourcePair,
        _type,
        _text,
        _channel,
        _tokenStartCharIndex,
        CharIndex - 1,
        _tokenStartLine,
        _tokenStartColumn);

    Emit(created);
    return created;
}
/// <summary>
/// Creates an EOF token on the default channel at the current input index,
/// line and column, and emits it.
/// </summary>
/// <returns>The EOF token that was created and emitted.</returns>
public virtual IToken EmitEOF()
{
    int column = Column;
    int currentLine = Line;

    // Stop index is one less than the start index: the EOF token covers no characters.
    IToken eofToken = _factory.Create(
        _tokenFactorySourcePair,
        TokenConstants.EOF,
        null,
        TokenConstants.DefaultChannel,
        _input.Index,
        _input.Index - 1,
        currentLine,
        column);

    Emit(eofToken);
    return eofToken;
}
/// <summary>Gets or sets the interpreter's current line.</summary>
public virtual int Line
{
    get { return Interpreter.Line; }
    set { Interpreter.Line = value; }
}
/// <summary>Gets or sets the interpreter's current column.</summary>
public virtual int Column
{
    get { return Interpreter.Column; }
    set { Interpreter.Column = value; }
}
/// <summary>Gets the index of the current character of lookahead.</summary>
public virtual int CharIndex
{
    get { return _input.Index; }
}
/// <summary>Gets the input index at which the current token started.</summary>
public virtual int TokenStartCharIndex
{
    get { return _tokenStartCharIndex; }
}
/// <summary>Gets the line on which the current token started.</summary>
public virtual int TokenStartLine
{
    get { return _tokenStartLine; }
}
/// <summary>Gets the column at which the current token started.</summary>
public virtual int TokenStartColumn
{
    get { return _tokenStartColumn; }
}
/// <summary>
/// Gets the text matched so far for the current token, or the text
/// override if one has been set; sets the complete text of the current
/// token.
/// </summary>
/// <remarks>
/// Setting the text wipes any previous change to it. While no override is
/// set, the text comes from the interpreter's view of the input.
/// </remarks>
public virtual string Text
{
    get
    {
        // An explicit override always wins over the matched input.
        return _text ?? Interpreter.GetText(_input);
    }
    set
    {
        _text = value;
    }
}
/// <summary>Gets or sets the current token.</summary>
/// <remarks>Override if emitting multiple tokens.</remarks>
public virtual IToken Token
{
    get { return this._token; }
    set { this._token = value; }
}
/// <summary>Gets or sets the token type of the current token.</summary>
public virtual int Type
{
    get { return _type; }
    set { _type = value; }
}
/// <summary>Gets or sets the channel number of the current token.</summary>
public virtual int Channel
{
    get { return _channel; }
    set { _channel = value; }
}
/// <summary>
/// Gets the stack of saved modes. This is the lexer's own stack instance,
/// not a copy.
/// </summary>
public virtual Stack<int> ModeStack
{
    get { return _modeStack; }
}
/// <summary>Gets or sets the current lexer mode; the mode stack is not touched.</summary>
public virtual int CurrentMode
{
    get { return _mode; }
    set { _mode = value; }
}
/// <summary>
/// Gets or sets whether the end of the input has been seen; when true,
/// NextToken() emits an EOF token.
/// </summary>
public virtual bool HitEOF
{
    get { return _hitEOF; }
    set { _hitEOF = value; }
}
/// <summary>Gets the channel names; this base implementation returns null.</summary>
public virtual string[] ChannelNames
{
    get { return null; }
}
/// <summary>Gets the mode names; this base implementation returns null.</summary>
public virtual string[] ModeNames
{
    get { return null; }
}
/// <summary>Returns a list of all token objects in the input char stream.</summary>
/// <remarks>
/// Forces all tokens to be loaded by calling NextToken() until it returns
/// the EOF token. The EOF token itself is not included.
/// </remarks>
/// <returns>The tokens in the order they were produced.</returns>
public virtual IList<IToken> GetAllTokens()
{
    List<IToken> collected = new List<IToken>();
    for (IToken current = NextToken();
         current.Type != TokenConstants.EOF;
         current = NextToken())
    {
        collected.Add(current);
    }

    return collected;
}
/// <summary>
/// Recovers from a failed match by skipping one character so matching can
/// be tried again, unless the input is already at EOF.
/// </summary>
/// <param name="e">The exception that reported the failed match.</param>
public virtual void Recover(LexerNoViableAltException e)
{
    bool atEnd = _input.LA(1) == IntStreamConstants.EOF;
    if (atEnd)
    {
        return;
    }

    // Consume through the interpreter rather than the stream directly.
    Interpreter.Consume(_input);
}
/// <summary>
/// Reports a token recognition error to the error listeners. The message
/// quotes the input from the start of the current token up to the current
/// input index.
/// </summary>
/// <param name="e">The exception describing the failed match.</param>
public virtual void NotifyListeners(LexerNoViableAltException e)
{
    Interval offendingSpan = Interval.Of(_tokenStartCharIndex, _input.Index);
    string offendingText = _input.GetText(offendingSpan);
    string message = "token recognition error at: '" + GetErrorDisplay(offendingText) + "'";

    IAntlrErrorListener<int> dispatcher = ErrorListenerDispatch;
    dispatcher.SyntaxError(ErrorOutput, this, 0, _tokenStartLine, _tokenStartColumn, message, e);
}
/// <summary>
/// Builds the display form of <paramref name="s"/> for error messages by
/// running each code point through <c>GetErrorDisplay(int)</c>.
/// </summary>
/// <remarks>
/// A well-formed surrogate pair is decoded into a single code point. An
/// unpaired surrogate has no code point, so it is appended unchanged;
/// this keeps error reporting from throwing on malformed input.
/// </remarks>
/// <param name="s">The text to display; null is treated as empty.</param>
/// <returns>The display string; empty when <paramref name="s"/> is null or empty.</returns>
public virtual string GetErrorDisplay(string s)
{
    if (string.IsNullOrEmpty(s))
    {
        return string.Empty;
    }

    StringBuilder buf = new StringBuilder(s.Length);
    int i = 0;
    while (i < s.Length)
    {
        char current = s[i];
        bool startsPair = Char.IsHighSurrogate(current)
            && i + 1 < s.Length
            && Char.IsLowSurrogate(s[i + 1]);

        if (startsPair)
        {
            // Two UTF-16 code units form one supplementary code point.
            buf.Append(GetErrorDisplay(Char.ConvertToUtf32(current, s[i + 1])));
            i += 2;
        }
        else if (Char.IsSurrogate(current))
        {
            // Unpaired surrogate: Char.ConvertToUtf32 would throw here.
            buf.Append(current);
            i++;
        }
        else
        {
            buf.Append(GetErrorDisplay((int)current));
            i++;
        }
    }

    return buf.ToString();
}
/// <summary>
/// Returns the display form of a single code point for error messages.
/// </summary>
/// <remarks>
/// EOF is shown as <c>&lt;EOF&gt;</c>, and newline, tab and carriage return
/// as their backslash escapes. Any other valid code point is returned as
/// is. Values that <c>Char.ConvertFromUtf32</c> rejects are rendered
/// instead of throwing: a surrogate code unit is returned as the single
/// char it is, and a value outside <c>MinCharValue</c>..<c>MaxCharValue</c>
/// as its decimal number in angle brackets.
/// </remarks>
/// <param name="c">The code point, or <c>TokenConstants.EOF</c>.</param>
/// <returns>The text to show for <paramref name="c"/>.</returns>
public virtual string GetErrorDisplay(int c)
{
    switch (c)
    {
        case TokenConstants.EOF:
            return "<EOF>";
        case '\n':
            return "\\n";
        case '\t':
            return "\\t";
        case '\r':
            return "\\r";
    }

    if (c < MinCharValue || c > MaxCharValue)
    {
        // Not a Unicode code point at all; show the raw number.
        return "<" + c.ToString(System.Globalization.CultureInfo.InvariantCulture) + ">";
    }

    if (c >= 0xD800 && c <= 0xDFFF)
    {
        // Surrogate code units are not valid arguments to ConvertFromUtf32.
        return ((char)c).ToString();
    }

    return Char.ConvertFromUtf32(c);
}
/// <summary>
/// Returns the display form of <paramref name="c"/> wrapped in single quotes.
/// </summary>
/// <param name="c">The code point to display.</param>
/// <returns>The quoted result of <c>GetErrorDisplay(int)</c>.</returns>
public virtual string GetCharErrorDisplay(int c)
{
    return "'" + GetErrorDisplay(c) + "'";
}
/// <summary>
/// Lexers can normally match any char in its vocabulary after matching
/// a token, so do the easy thing and just kill a character and hope
/// it all works out.
/// </summary>
/// <remarks>
/// You can instead use the rule invocation stack to do sophisticated
/// error recovery if you are in a fragment rule. Nothing is consumed
/// when the input is already at EOF, matching the check made by
/// <c>Recover(LexerNoViableAltException)</c>.
/// </remarks>
/// <param name="re">The recognition error being recovered from.</param>
public virtual void Recover(RecognitionException re)
{
    // There is no character left to skip at EOF.
    if (_input.LA(1) == IntStreamConstants.EOF)
    {
        return;
    }

    // TODO: Do we lose character or line position information?
    _input.Consume();
}
}
}