import java.io.Reader;
import java.io.IOException;
import java.util.Stack;
import java.io.EOFException;

/**
 * This is an abstract base class for a hierarchy of table-driven lexers
 * (presumably for small alphabets like 7-bit ASCII).  This essentially
 * provides the DFA engine to interpret a lexer table; to implement a real
 * lexer, subclass this class and build the DFA table <code>_tab</code> (e.g.,
 * in the constructor of the derived class) before calling either {@link
 * #next()} or {@link #hasNext()}.  This does also provide handling of the
 * pushback mechanism.
 * <p>
 * The DFA interpreted by this engine provides four actions (specified by the
 * <code>A_*</code> constants, described in the protected interface):
 * <dl>
 * <dt><code>A_SHIFT</code></dt>
 * <dd>Shift current input onto currently accumulating token and continue
 * lexing.</dd>
 * <dt><code>A_DROP</code></dt>
 * <dd>Drop current input and continue lexing.</dd>
 * <dt><code>A_RET</code></dt>
 * <dd>Return current token; shift current input onto next token.</dd>
 * <dt><code>A_DRET</code></dt>
 * <dd>Return current token; drop current input and reset accumulator token to
 * empty.</dd>
 * </dl>
 * <p>
 * Implementors must also provide the protected {@link #_getEOFIdx()} method.
 * When the input stream reaches <code>EOF</code>, the <code>next()</code>
 * method will generate a pseudo-input character of value
 * <code>_getEOFIdx()</code> and look up the appropriate action at
 * <code>_tab[currentState][_getEOFIdx()]</code>.  The derived class can
 * determine whether a real token (e.g., of type <code>T_EOF</code>) should be
 * returned, or whether the lexer should simply return <code>null</code> when
 * it encounters <code>EOF</code>.  An attempt to read <em>beyond</em>
 * <code>EOF</code> will always generate an <code>EOFException</code>.
 * <p>
 * Finally, concrete derived classes must implement the {@link #_startState()}
 * method, which tells the engine what index to start the DFA at.
 *
 * @author Terran Lane
 * @version 1.0
 */

public abstract class AbstractTableLexer implements Lexer {

  /**
   * Creates a lexer engine that reads characters from the given reader.
   * The pushback buffer and token accumulator start out empty, and the DFA
   * is positioned at the state reported by {@link #_startState()}.
   * <p>
   * NOTE: this constructor invokes the overridable {@link #_startState()}
   * method, so implementations of that method must not depend on state that
   * is only initialized by the derived class's own constructor.
   *
   * @param r Source of input characters; must be non-null.
   * @exception NullPointerException If <code>r</code> is <code>null</code>.
   */
  public AbstractTableLexer(Reader r) {
    if (r==null) {
      throw new NullPointerException("Reader arg to AbstractTableLexer must " +
                                     "be non-null");
    }
    _in=r;
    _pbBuf=new Stack();
    _curStr=new StringBuffer();
    _curState=_startState();
    _atEOF=false;
  }

  /**
   * Returns <code>true</code> iff there is still another token available,
   * either in the pushback buffer or on the input stream.  Note that this
   * may require an entire token's worth of characters to be scanned from the
   * input stream; a token scanned this way is placed on the pushback buffer
   * so that the following {@link #next()} call returns it.
   *
   * @return <code>true</code> if there is at least one token remaining;
   * otherwise <code>false</code>.
   * @exception IOException If an error occurs on the underlying input stream
   * @exception IllegalCharException If a character outside the legal range
   * is encountered.
   */
  public boolean hasNext() throws IOException, IllegalCharException {
    // The pushback buffer must be consulted BEFORE the EOF flag: the final
    // token of the stream is typically scanned (setting _atEOF) and then
    // pushed back by this very method, and it must remain visible to
    // subsequent hasNext() calls until next() actually consumes it.
    if (!_pbBuf.isEmpty()) { return true; }
    if (_atEOF) { return false; }
    Token t=next();
    if (t==null) {
      return false;
    }
    pushBack(t);
    return true;
  }

  /**
   * Retrieve and return the next token from either the pushback buffer (if
   * it is non-empty) or the input stream (if it is not at end-of-file).
   * When the DFA table entry that triggers a return is flagged as "not a
   * token" (see {@link _LexAct#isAToken()}), <code>null</code> is returned
   * instead of a token.
   *
   * @return Most recently pushed-back token if there is one; otherwise the
   * next token scanned from the input stream, or <code>null</code> if the
   * DFA table indicates that no real token was accumulated.
   * @exception IOException If a problem is encountered with the underlying
   * input stream
   * @exception EOFException If the pushback buffer is empty and an attempt
   * to scan an additional token is made after <code>EOF</code> has been
   * encountered on the input stream.
   * @exception IllegalCharException If a character whose value is beyond
   * the width of the current DFA table row is encountered
   * @exception IllegalStateException If the DFA table contains an action
   * code that is not one of the <code>A_*</code> constants.
   */
  public Token next() throws IOException, IllegalCharException {
    if (!_pbBuf.empty()) {
      return (Token)_pbBuf.pop();
    }
    if (_atEOF) {
      throw new EOFException("Attempt to scan beyond end of stream");
    }
    while (true) {
      int c=_in.read();
      if (c>=_tab[_curState].length) {
        throw new IllegalCharException("Illegal Character '" +
                                       (char)c + "' (" + c + ")");
      }
      if (c<0) {
        // EOF: substitute the pseudo-input index supplied by the subclass
        _atEOF=true;
        c=_getEOFIdx();
      }
      // Look the table entry up once; every branch below refers to the entry
      // for the state we were in when the character was read.
      final _LexAct entry=_tab[_curState][c];
      Token t;
      switch (entry.getAct()) {
        case A_RET:
          // return the currently accumulated token, and shift current
          // character onto beginning of next token
          t=_finishToken(entry);
          _curStr.append((char)c);
          _curState=entry.getNextState();
          return t;
        case A_DROP:
          // simply drop the current character and continue (don't return the
          // currently accumulating token)
          _curState=entry.getNextState();
          break;
        case A_SHIFT:
          // shift the current character onto the buffer and continue
          _curStr.append((char)c);
          _curState=entry.getNextState();
          break;
        case A_DRET:
          // drop the current character, but treat it as a token terminator;
          // return the currently accumulated token.
          t=_finishToken(entry);
          _curState=entry.getNextState();
          return t;
        default:
          // A malformed table entry.  Fail loudly: silently ignoring it
          // would leave the DFA state untouched and could spin forever once
          // the stream is at EOF.
          throw new IllegalStateException("action=" + entry.getAct() +
                                          ", _curState=" + _curState +
                                          ", c=" + c + "(" + (char)c + ")" +
                                          ", _curStr=" + _curStr);
      }
    }
  }

  /**
   * Push a token onto the pushback buffer.  Tokens are handed back by
   * {@link #next()} in last-in, first-out order, ahead of anything still
   * unread on the input stream.
   *
   * @param t Token to push back.
   * @return Number of tokens held in the pushback buffer after this push.
   */
  public int pushBack(Token t) {
    _pbBuf.push(t);
    return _pbBuf.size();
  }

  /* ******************** end of public interface ******************** */

  /** Lexer action: Return current token; shift current input onto next
      token. */
  protected static final int A_RET=1;
  /** Lexer action: Drop current input and continue lexing. */
  protected static final int A_DROP=2;
  /** Lexer action: Shift current input onto currently accumulating token
      and continue lexing. */
  protected static final int A_SHIFT=3;
  /** Lexer action: Return current token; drop current input and reset
      accumulator token to empty. */
  protected static final int A_DRET=4;

  /**
   * Index of the <code>EOF</code> input in the DFA table.  Must be defined
   * by any concrete derived class.
   */
  abstract protected int _getEOFIdx();

  /**
   * Index of the initial state for the DFA.  Must be defined by any concrete
   * derived class.  This is called from the base-class constructor.
   */
  abstract protected int _startState();
  
  /**
   * Inner class representing a lexical action, including next state and
   * action to take (e.g., RETURN, SHIFT, or DROP).  These objects are stored
   * in the <code>_tab</code> DFA table and looked up by the lexer engine to
   * decide what to do at each step.  In addition to next state and action,
   * these objects provide an "is a token" method which is used to determine
   * whether a real token is available to be returned, or if there is no
   * token left.  This can be used when the stream reaches EOF, but when
   * there is no lexical token preceding the EOF so that the lexer can
   * return <code>null</code>.  Alternatively, the implementing lexer can
   * choose to provide a "<code>TT_EOF</code>" token type and request an
   * explicit token return for EOF.
   */
  protected static class _LexAct {
    /**
     * Creates a new <code>_LexAct</code> instance.
     *
     * @param nState Next DFA state from this (state,input) pair
     * @param act Lexer action to take (one of the <code>A_*</code>
     * constants)
     * @param ttype Token type to assign to a token created at this state
     * (i.e., if the current (state,action) pair is <code>A_RET</code>,
     * <code>A_DRET</code>, or <code>EOF</code> is reached).
     * @param isTok Boolean indicating whether this should be considered a
     * real token and returned, or if <code>EOF</code> has been reached
     * without accumulating a real token.
     * @exception IllegalArgumentException If <code>nState</code> or
     * <code>act</code> is negative.
     */
    public _LexAct(int nState, int act, int ttype, boolean isTok) {
      if (nState<0 || act<0) {
        throw new IllegalArgumentException("State/Action/TokType tuple for " +
                                           "_LexAct object must both be " +
                                           ">=0, not nState=" + nState +
                                           ",act=" + act +
                                           ",ttype=" + ttype);
      }
      _ns=nState;
      _a=act;
      _t=ttype;
      _isTok=isTok;
    }

    /**
     * Equivalent to <code>_LexAct(nState,act,ttype,true)</code>.  This is
     * the default state that will normally be wanted.
     */
    public _LexAct(int nState, int act, int ttype) {
      this(nState,act,ttype,true);
    }

    /** Return the next state in the DFA from this (state,action) pair */
    public int getNextState() { return _ns; }
    /** Return the lexer action to take from this (state,input) pair. */
    public int getAct() { return _a; }
    /** Return the type of the accumulated token. */
    public int getTokType() { return _t; }
    /** Whether the current buffer is an actual token, or should be discarded
        on return. */
    public boolean isAToken() { return _isTok; }

    /* ******************** end of public interface ******************** */
    private final int _ns;
    private final int _a;
    private final int _t;
    private final boolean _isTok;
  }

  /**
   * DFA table for the lexer.  This must be instantiated and filled in by any
   * concrete deriving lexical analyzer class.  This table must be of the
   * form<br>
   * <pre>
   * _tab[curState][curInput]
   * </pre><br>
   * Where each entry specifies the action/type/etc info for that
   * (state,input) pair.
   */
  protected _LexAct[][] _tab;

  /* ******************** end of protected interface ******************** */

  /**
   * Builds the token to hand back for a returning action and empties the
   * accumulator.  Yields <code>null</code> when the table entry says that
   * the accumulated text is not a real token.
   */
  private Token _finishToken(_LexAct entry) {
    Token t=null;
    if (entry.isAToken()) {
      t=new BaseToken(_curStr.toString(),entry.getTokType());
    }
    _curStr.setLength(0);
    return t;
  }

  private final Stack _pbBuf;
  private StringBuffer _curStr;
  private int _curState;
  private final Reader _in;
  private boolean _atEOF;
}
