001/* StreamTokenizer.java -- parses streams of characters into tokens
002   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003  Free Software Foundation
003
004This file is part of GNU Classpath.
005
006GNU Classpath is free software; you can redistribute it and/or modify
007it under the terms of the GNU General Public License as published by
008the Free Software Foundation; either version 2, or (at your option)
009any later version.
010 
011GNU Classpath is distributed in the hope that it will be useful, but
012WITHOUT ANY WARRANTY; without even the implied warranty of
013MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014General Public License for more details.
015
016You should have received a copy of the GNU General Public License
017along with GNU Classpath; see the file COPYING.  If not, write to the
018Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
01902110-1301 USA.
020
021Linking this library statically or dynamically with other modules is
022making a combined work based on this library.  Thus, the terms and
023conditions of the GNU General Public License cover the whole
024combination.
025
026As a special exception, the copyright holders of this library give you
027permission to link this library with independent modules to produce an
028executable, regardless of the license terms of these independent
029modules, and to copy and distribute the resulting executable under
030terms of your choice, provided that you also meet, for each linked
031independent module, the terms and conditions of the license of that
032module.  An independent module is a module which is not derived from
033or based on this library.  If you modify this library, you may extend
034this exception to your version of the library, but you are not
035obligated to do so.  If you do not wish to do so, delete this
036exception statement from your version. */
037
038package java.io;
039
040import gnu.java.lang.CPStringBuilder;
041
/**
 * Splits a stream of characters into tokens: numbers, words, quoted
 * strings, end-of-line markers and single ordinary characters.  How each
 * character is classified is controlled by per-character attribute tables
 * (for the values 0x00 through 0xFF) and a handful of mode flags, all of
 * which can be changed through the methods of this class.
 * <p>
 * Characters above 0xFF have no table entry and are always treated as
 * alphabetic.
 *
 * @author Warren Levy (warrenl@cygnus.com)
 * @date October 25, 1998.  
 */
/* Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3
 * "The Java Language Specification", ISBN 0-201-63451-1
 * plus online API docs for JDK 1.2 beta from http://www.javasoft.com.
 * Status:  Believed complete and correct.
 */
 
public class StreamTokenizer
{
  /** A constant indicating that the end of the stream has been read. */
  public static final int TT_EOF = -1;

  /** A constant indicating that the end of the line has been read. */
  public static final int TT_EOL = '\n';

  /** A constant indicating that a number token has been read. */
  public static final int TT_NUMBER = -2;

  /** A constant indicating that a word token has been read. */
  public static final int TT_WORD = -3;

  /** A constant indicating that no tokens have been read yet. */
  private static final int TT_NONE = -4;

  /* Largest character value that has an entry in the attribute tables. */
  private static final int MAX_TABLE_CHAR = 0xFF;

  /**
   * Contains the type of the token read resulting from a call to nextToken
   * The rules are as follows:
   * <ul>
   * <li>For a token consisting of a single ordinary character, this is the 
   *     value of that character.</li>
   * <li>For a quoted string, this is the value of the quote character</li>
   * <li>For a word, this is TT_WORD</li>
   * <li>For a number, this is TT_NUMBER</li>
   * <li>For the end of the line, this is TT_EOL</li>
   * <li>For the end of the stream, this is TT_EOF</li>
   * </ul>
   */
  public int ttype = TT_NONE;

  /** The String associated with word and string tokens. */
  public String sval;

  /** The numeric value associated with number tokens. */
  public double nval;

  /* Indicates whether end-of-line is recognized as a token. */
  private boolean eolSignificant = false;

  /* Indicates whether word tokens are automatically made lower case. */
  private boolean lowerCase = false;

  /* Indicates whether C++ style comments are recognized and skipped. */
  private boolean slashSlash = false;

  /* Indicates whether C style comments are recognized and skipped. */
  private boolean slashStar = false;

  /* Attribute tables of each byte from 0x00 to 0xFF. */
  private final boolean[] whitespace = new boolean[MAX_TABLE_CHAR + 1];
  private final boolean[] alphabetic = new boolean[MAX_TABLE_CHAR + 1];
  private final boolean[] numeric = new boolean[MAX_TABLE_CHAR + 1];
  private final boolean[] quote = new boolean[MAX_TABLE_CHAR + 1];
  private final boolean[] comment = new boolean[MAX_TABLE_CHAR + 1];

  /* The Reader associated with this class (one character of pushback). */
  private final PushbackReader in;

  /* Indicates if a token has been pushed back. */
  private boolean pushedBack = false;

  /* Contains the current line number of the reader. */
  private int lineNumber = 1;

  /**
   * Creates a tokenizer that reads bytes from an <code>InputStream</code>,
   * decoding them with a default <code>InputStreamReader</code>.  The
   * parsing defaults are those described for
   * <code>StreamTokenizer(Reader)</code>.
   *
   * @param is The <code>InputStream</code> to read from
   *
   * @deprecated Since JDK 1.1.
   */
  public StreamTokenizer(InputStream is)
  {
    this(new InputStreamReader(is));
  }

  /**
   * Creates a tokenizer that reads characters from a <code>Reader</code>.
   * <p>
   * The parsing tables are initialized as follows:
   * <ul>
   * <li>The values 'A' through 'Z', 'a' through 'z' and 0xA0 through 0xFF
   *     are alphabetic</li>
   * <li>The values 0x00 through 0x20 are whitespace</li>
   * <li>The values '\'' and '"' are quote characters</li>
   * <li>'/' is a comment character</li>
   * <li>Numbers will be parsed</li>
   * <li>EOL is not treated as significant</li>
   * <li>C  and C++ (//) comments are not recognized</li>
   * </ul>
   *
   * @param r The <code>Reader</code> to read chars from
   */
  public StreamTokenizer(Reader r)
  {
    in = new PushbackReader(r);

    whitespaceChars(0x00, 0x20);
    wordChars('A', 'Z');
    wordChars('a', 'z');
    wordChars(0xA0, 0xFF);
    commentChar('/');
    quoteChar('\'');
    quoteChar('"');
    parseNumbers();
  }

  /**
   * This method sets the comment attribute on the specified
   * character.  Other attributes for the character are cleared.
   * Values outside 0x00 - 0xFF are ignored.
   *
   * @param ch The character to set the comment attribute for, passed as an int
   */
  public void commentChar(int ch)
  {
    if (inTable(ch))
      {
        resetChar(ch);
        comment[ch] = true;
      }
  }

  /**
   * This method sets a flag that indicates whether or not the end of line
   * sequence terminates and is a token.  This defaults to
   * <code>false</code>.
   *
   * @param flag <code>true</code> if EOL is significant, <code>false</code>
   *             otherwise
   */
  public void eolIsSignificant(boolean flag)
  {
    eolSignificant = flag;
  }

  /**
   * This method returns the current line number.  Note that if the 
   * <code>pushBack()</code> method is called, it has no effect on the
   * line number returned by this method.
   *
   * @return The current line number
   */
  public int lineno()
  {
    return lineNumber;
  }

  /**
   * This method sets a flag that indicates whether or not alphabetic
   * tokens that are returned should be converted to lower case.
   * 
   * @param flag <code>true</code> to convert to lower case,
   *             <code>false</code> otherwise
   */
  public void lowerCaseMode(boolean flag)
  {
    lowerCase = flag;
  }

  /* True if CH has an entry in the attribute tables. */
  private static boolean inTable(int ch)
  {
    return ch >= 0 && ch <= MAX_TABLE_CHAR;
  }

  private boolean isWhitespace(int ch)
  {
    return inTable(ch) && whitespace[ch];
  }

  /* Everything above the table range counts as alphabetic. */
  private boolean isAlphabetic(int ch)
  {
    if (ch > MAX_TABLE_CHAR)
      return true;
    return ch >= 0 && alphabetic[ch];
  }

  private boolean isNumeric(int ch)
  {
    return inTable(ch) && numeric[ch];
  }

  private boolean isQuote(int ch)
  {
    return inTable(ch) && quote[ch];
  }

  private boolean isComment(int ch)
  {
    return inTable(ch) && comment[ch];
  }

  /**
   * This method reads the next token from the stream.  It sets the 
   * <code>ttype</code> variable to the appropriate token type and 
   * returns it.  It also can set <code>sval</code> or <code>nval</code>
   * as described below.  If a token was pushed back with
   * <code>pushBack()</code>, that token's type is returned again without
   * reading anything.  Otherwise the parsing strategy is as follows:
   * <ul>
   * <li>Skip any whitespace characters.  Each line terminator ("\n", "\r"
   * or "\r\n") skipped this way advances the line number and, if EOL is
   * significant, is returned as TT_EOL.</li>
   * <li>If the C++ comment sequence "//" is encountered, and the parser
   * is configured to handle that sequence, then the remainder of the line
   * is skipped and another token is read exactly as if a character with
   * the comment attribute was encountered.</li>
   * <li>If the C comment sequence "/*" is encountered, and the parser
   * is configured to handle that sequence, then all characters up to and
   * including the comment terminator sequence are discarded and another
   * token is parsed.</li>
   * <li>If a numeric character is encountered, attempt to parse a numeric
   * value.  Leading '-' characters indicate a numeric only if followed by
   * another non-'-' numeric; otherwise '-' is returned as an ordinary
   * character.  The value of the numeric token is terminated
   * by either the first non-numeric encountered, a '-', or the second
   * occurrence of '.'.  The token type returned is TT_NUMBER and
   * <code>nval</code> is set to the value parsed (0.0 if the collected
   * characters, such as a lone ".", do not form a number).</li>
   * <li>If an alphabetic character is parsed, all subsequent characters
   * are read until the first character that is neither alphabetic nor
   * numeric is encountered.  The token type returned is TT_WORD and the
   * value parsed is stored in <code>sval</code>.  If lower case mode is
   * set, the token stored in <code>sval</code> is converted to lower
   * case.</li>
   * <li>If a comment character is parsed, then all remaining characters on
   * the current line are skipped and another token is parsed.  Any EOL or
   * EOF's encountered are not discarded, but rather terminate the comment.</li>
   * <li>If a quote character is parsed, then all characters up to the 
   * second occurrence of the same quote character, an unescaped line
   * terminator or the end of the stream are parsed into a
   * <code>String</code>.  This <code>String</code> is stored as
   * <code>sval</code>, but is not converted to lower case, even if lower case
   * mode is enabled.  The token type returned is the value of the quote
   * character encountered.  The escape sequences
   * \a, \b, \f, \n, \r, \t, \v, \" (double quote), \' (single quote), \\
   * (backslash) and \XXX (octal escape) are converted to the appropriate
   * char values.  Invalid escape sequences are left in untranslated,
   * backslash included.  Unicode escapes are not recognized.</li>
   * <li>If all cases above are not met, then the character is an ordinary
   * character that is parsed as a token by itself.  The char encountered
   * is returned as the token type.</li>
   * </ul>
   *
   * @return The token type
   * @exception IOException If an I/O error occurs
   */
  public int nextToken() throws IOException
  {
    if (pushedBack)
      {
        pushedBack = false;
        if (ttype != TT_NONE)
          return ttype;
      }

    sval = null;

    // Every pass either returns a token or discards exactly one comment
    // and starts over.  Looping (rather than recursing) keeps the stack
    // depth constant however many comments follow one another.
    while (true)
      {
        int ch;

        // Skip whitespace.  Deal with EOL along the way.
        while (isWhitespace(ch = in.read()))
          {
            if (ch == '\n' || ch == '\r')
              {
                lineNumber++;
                if (ch == '\r')
                  skipLineFeedAfterCarriageReturn();
                if (eolSignificant)
                  return (ttype = TT_EOL);
              }
          }

        if (ch == '/')
          {
            int next = in.read();
            if (next == '/' && slashSlash)
              {
                skipRestOfLine();
                continue;
              }
            if (next == '*' && slashStar)
              {
                skipBlockComment();
                continue;
              }
            // Just a lone '/': give back the look-ahead and classify the
            // slash through the attribute tables like any other character.
            if (next != TT_EOF)
              in.unread(next);
          }

        if (ch == TT_EOF)
          return (ttype = TT_EOF);
        if (isNumeric(ch))
          return readNumber(ch);
        if (isAlphabetic(ch))
          return readWord(ch);
        if (isComment(ch))
          {
            skipRestOfLine();
            continue;
          }
        if (isQuote(ch))
          return readQuoted(ch);

        // Ordinary character: it is its own token type.
        return (ttype = ch);
      }
  }

  /* Called right after a '\r' was consumed: swallows a directly following
     '\n' so that "\r\n" counts as one line terminator.  Any other
     character is pushed back.  */
  private void skipLineFeedAfterCarriageReturn() throws IOException
  {
    int next = in.read();
    if (next != '\n' && next != TT_EOF)
      in.unread(next);
  }

  /* Discards characters up to, but not including, the next line terminator
     or the end of the stream.  */
  private void skipRestOfLine() throws IOException
  {
    int ch;
    do
      ch = in.read();
    while (ch != '\n' && ch != '\r' && ch != TT_EOF);

    if (ch != TT_EOF)
      in.unread(ch);
  }

  /* Discards a C style comment whose opening sequence has already been
     consumed, counting the lines it spans.  An unterminated comment ends
     at the end of the stream.  */
  private void skipBlockComment() throws IOException
  {
    while (true)
      {
        int ch = in.read();
        if (ch == TT_EOF)
          return;

        if (ch == '*')
          {
            int next = in.read();
            if (next == '/')
              return;
            // Re-examine the character: it may itself be a '*' that
            // starts the real terminator.
            if (next != TT_EOF)
              in.unread(next);
          }
        else if (ch == '\n' || ch == '\r')
          {
            lineNumber++;
            if (ch == '\r')
              skipLineFeedAfterCarriageReturn();
          }
      }
  }

  /* Parses a number whose first (numeric) character FIRST has already been
     consumed.  Returns TT_NUMBER with nval set, or '-' when a minus sign
     turns out not to introduce a number.  */
  private int readNumber(int first) throws IOException
  {
    int ch = first;
    boolean negative = false;
    if (ch == '-')
      {
        // Read ahead to see if this is an ordinary '-' rather than numeric.
        ch = in.read();
        if (!isNumeric(ch) || ch == '-')
          {
            if (ch != TT_EOF)
              in.unread(ch);
            return (ttype = '-');
          }
        negative = true;
      }

    StringBuilder digits = new StringBuilder();
    digits.append((char) ch);

    // A leading '.' already is the decimal point; a second one must end
    // the token (".5.3" is 0.5 followed by 0.3).
    boolean seenPoint = (ch == '.');
    while (isNumeric(ch = in.read()) && ch != '-')
      {
        if (ch == '.')
          {
            if (seenPoint)
              break;
            seenPoint = true;
          }
        digits.append((char) ch);
      }

    if (ch != TT_EOF)
      in.unread(ch);

    ttype = TT_NUMBER;
    try
      {
        nval = Double.parseDouble(digits.toString());
      }
    catch (NumberFormatException ignored)
      {
        // Deliberate: something like a lone "." is still reported as a
        // number token, with the value zero.
        nval = 0.0;
      }
    if (negative)
      nval = -nval;
    return ttype;
  }

  /* Parses a word whose first character FIRST has already been consumed.
     Returns TT_WORD with sval set.  */
  private int readWord(int first) throws IOException
  {
    StringBuilder word = new StringBuilder();
    word.append((char) first);

    int ch;
    while (isAlphabetic(ch = in.read()) || isNumeric(ch))
      word.append((char) ch);
    if (ch != TT_EOF)
      in.unread(ch);

    ttype = TT_WORD;
    sval = word.toString();
    if (lowerCase)
      sval = sval.toLowerCase();
    return ttype;
  }

  /* Parses a quoted string whose opening quote QUOTECHAR has already been
     consumed.  Returns QUOTECHAR with sval set to the string contents.  */
  private int readQuoted(int quoteChar) throws IOException
  {
    ttype = quoteChar;
    StringBuilder text = new StringBuilder();

    int ch;
    while ((ch = in.read()) != quoteChar && ch != '\n' && ch != '\r'
           && ch != TT_EOF)
      {
        if (ch == '\\')
          ch = readEscape();
        text.append((char) ch);
      }

    // Throw away matching quote char, but keep a line terminator that cut
    // the string short so it is tokenized normally.
    if (ch != quoteChar && ch != TT_EOF)
      in.unread(ch);

    sval = text.toString();
    return ttype;
  }

  /* Decodes the escape sequence following a backslash that has already
     been consumed inside a quoted string, and returns the character it
     stands for.  For an unrecognized sequence the backslash itself is
     returned and the following character is left in the stream, so the
     sequence ends up in the string untranslated.  */
  private int readEscape() throws IOException
  {
    int ch = in.read();
    switch (ch)
      {
      case 'a':
        return 0x7;
      case 'b':
        return '\b';
      case 'f':
        return 0xC;
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'v':
        return 0xB;
      case '\n':
      case '\r':
      case '\"':
      case '\'':
      case '\\':
        // Escaped literally: the character stands for itself.
        return ch;
      default:
        break;
      }

    if (ch >= '0' && ch <= '7')
      {
        // Octal escape: up to three digits, the third one only if the
        // first digit is at most 3 so the value fits in 0 - 0377.
        int firstDigit = ch;
        int value = ch - '0';
        int next = in.read();
        if (next >= '0' && next <= '7')
          {
            value = value * 8 + (next - '0');
            next = in.read();
            if (next >= '0' && next <= '7' && firstDigit <= '3')
              {
                value = value * 8 + (next - '0');
                next = in.read();
              }
          }
        if (next != TT_EOF)
          in.unread(next);
        return value;
      }

    // Not an escape we know.  Keep the backslash and let the caller read
    // the following character again as plain text.
    if (ch != TT_EOF)
      in.unread(ch);
    return '\\';
  }

  /* Clears every attribute of CH, which must be within the table range. */
  private void resetChar(int ch)
  {
    whitespace[ch] = false;
    alphabetic[ch] = false;
    numeric[ch] = false;
    quote[ch] = false;
    comment[ch] = false;
  }

  /**
   * This method makes the specified character an ordinary character.  This
   * means that none of the attributes (whitespace, alphabetic, numeric,
   * quote, or comment) will be set on this character.  This character will
   * parse as its own token.  Values outside 0x00 - 0xFF are ignored.
   *
   * @param ch The character to make ordinary, passed as an int
   */
  public void ordinaryChar(int ch)
  {
    if (inTable(ch))
      resetChar(ch);
  }

  /**
   * This method makes all the characters in the specified range, range
   * terminators included, ordinary.  This means the none of the attributes
   * (whitespace, alphabetic, numeric, quote, or comment) will be set on
   * any of the characters in the range.  This makes each character in this
   * range parse as its own token.  The range is clipped to 0x00 - 0xFF.
   *
   * @param low The low end of the range of values to make ordinary
   * @param hi The high end of the range of values to make ordinary
   */
  public void ordinaryChars(int low, int hi)
  {
    int last = Math.min(hi, MAX_TABLE_CHAR);
    for (int i = Math.max(low, 0); i <= last; i++)
      resetChar(i);
  }

  /**
   * This method sets the numeric attribute on the characters '0' - '9' and
   * the characters '.' and '-'.
   * When this method is used, the result of giving other attributes
   * (whitespace, quote, or comment) to the numeric characters may
   * vary depending on the implementation. For example, if
   * parseNumbers() and then whitespaceChars('1', '1') are called,
   * this implementation reads "121" as 2, while some other implementation
   * will read it as 21.
   */
  public void parseNumbers()
  {
    for (int digit = '0'; digit <= '9'; digit++)
      numeric[digit] = true;

    numeric['.'] = true;
    numeric['-'] = true;
  }

  /**
   * Puts the current token back into the StreamTokenizer so
   * <code>nextToken</code> will return the same value on the next call.
   * May cause the lineno method to return an incorrect value
   * if lineno is called before the next call to nextToken.
   */
  public void pushBack()
  {
    pushedBack = true;
  }

  /**
   * This method sets the quote attribute on the specified character.
   * Other attributes for the character are cleared.
   * Values outside 0x00 - 0xFF are ignored.
   *
   * @param ch The character to set the quote attribute for, passed as an int.
   */
  public void quoteChar(int ch)
  {
    if (inTable(ch))
      {
        resetChar(ch);
        quote[ch] = true;
      }
  }

  /**
   * This method removes all attributes (whitespace, alphabetic, numeric,
   * quote, and comment) from all characters.  It is equivalent to calling
   * <code>ordinaryChars(0x00, 0xFF)</code>.
   *
   * @see #ordinaryChars(int, int)
   */
  public void resetSyntax()
  {
    ordinaryChars(0x00, 0xFF);
  }

  /**
   * This method sets a flag that indicates whether or not "C++" language style
   * comments ("//" comments through EOL ) are handled by the parser.
   * If this is <code>true</code> commented out sequences are skipped and
   * ignored by the parser.  This defaults to <code>false</code>.
   *
   * @param flag <code>true</code> to recognized and handle "C++" style
   *             comments, <code>false</code> otherwise
   */
  public void slashSlashComments(boolean flag)
  {
    slashSlash = flag;
  }

  /**
   * This method sets a flag that indicates whether or not "C" language style
   * comments (with nesting not allowed) are handled by the parser.
   * If this is <code>true</code> commented out sequences are skipped and
   * ignored by the parser.  This defaults to <code>false</code>.
   *
   * @param flag <code>true</code> to recognized and handle "C" style comments,
   *             <code>false</code> otherwise
   */
  public void slashStarComments(boolean flag)
  {
    slashStar = flag;
  }

  /**
   * This method returns the current token value as a <code>String</code> in
   * the form "Token[x], line n", where 'n' is the current line number as
   * returned by <code>lineno()</code> and 'x' is determined as follows.
   * <p>
   * <ul>
   * <li>If no token has been read, then 'x' is "NOTHING"</li>
   * <li>If <code>ttype</code> is TT_EOF, then 'x' is "EOF"</li>
   * <li>If <code>ttype</code> is TT_EOL, then 'x' is "EOL"</li>
   * <li>If <code>ttype</code> is TT_WORD, then 'x' is <code>sval</code></li>
   * <li>If <code>ttype</code> is TT_NUMBER, then 'x' is "n=strnval" where
   * 'strnval' is <code>String.valueOf(nval)</code>.</li>
   * <li>If <code>ttype</code> is a quote character, then 'x' is
   * <code>sval</code></li>
   * <li>For all other cases, 'x' is <code>ttype</code> as a character
   * enclosed in single quotes</li>
   * </ul>
   *
   * @return The description of the current token
   */
  public String toString()
  {
    String tempstr;
    if (ttype == TT_EOF)
      tempstr = "EOF";
    else if (ttype == TT_EOL)
      tempstr = "EOL";
    else if (ttype == TT_WORD)
      tempstr = sval;
    else if (ttype == TT_NUMBER)
      tempstr = "n=" + nval;
    else if (ttype == TT_NONE)
      tempstr = "NOTHING";
    else if (isQuote(ttype))
      tempstr = sval;   // quoted string: show its contents
    else // must be an ordinary char.
      tempstr = "\'" + (char) ttype + "\'";

    return "Token[" + tempstr + "], line " + lineno();
  }

  /**
   * This method sets the whitespace attribute for all characters in the
   * specified range, range terminators included.  All other attributes of
   * those characters are cleared.  The range is clipped to 0x00 - 0xFF.
   *
   * @param low The low end of the range of values to set the whitespace
   * attribute for
   * @param hi The high end of the range of values to set the whitespace
   * attribute for
   */
  public void whitespaceChars(int low, int hi)
  {
    int last = Math.min(hi, MAX_TABLE_CHAR);
    for (int i = Math.max(low, 0); i <= last; i++)
      {
        resetChar(i);
        whitespace[i] = true;
      }
  }

  /**
   * This method sets the alphabetic attribute for all characters in the
   * specified range, range terminators included.  Attributes the
   * characters already have are left in place.  The range is clipped to
   * 0x00 - 0xFF.
   *
   * @param low The low end of the range of values to set the alphabetic
   * attribute for
   * @param hi The high end of the range of values to set the alphabetic
   * attribute for
   */
  public void wordChars(int low, int hi)
  {
    int last = Math.min(hi, MAX_TABLE_CHAR);
    for (int i = Math.max(low, 0); i <= last; i++)
      alphabetic[i] = true;
  }
}