001/* Parser.java -- HTML parser
002   Copyright (C) 2005 Free Software Foundation, Inc.
003
004This file is part of GNU Classpath.
005
006GNU Classpath is free software; you can redistribute it and/or modify
007it under the terms of the GNU General Public License as published by
008the Free Software Foundation; either version 2, or (at your option)
009any later version.
010
011GNU Classpath is distributed in the hope that it will be useful, but
012WITHOUT ANY WARRANTY; without even the implied warranty of
013MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014General Public License for more details.
015
016You should have received a copy of the GNU General Public License
017along with GNU Classpath; see the file COPYING.  If not, write to the
018Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
01902110-1301 USA.
020
021Linking this library statically or dynamically with other modules is
022making a combined work based on this library.  Thus, the terms and
023conditions of the GNU General Public License cover the whole
024combination.
025
026As a special exception, the copyright holders of this library give you
027permission to link this library with independent modules to produce an
028executable, regardless of the license terms of these independent
029modules, and to copy and distribute the resulting executable under
030terms of your choice, provided that you also meet, for each linked
031independent module, the terms and conditions of the license of that
032module.  An independent module is a module which is not derived from
033or based on this library.  If you modify this library, you may extend
034this exception to your version of the library, but you are not
035obligated to do so.  If you do not wish to do so, delete this
036exception statement from your version. */
037
038
039package javax.swing.text.html.parser;
040
041import java.io.IOException;
042import java.io.Reader;
043
044import javax.swing.text.ChangedCharSetException;
045import javax.swing.text.SimpleAttributeSet;
046
047/*
048 * FOR DEVELOPERS: To avoid regression, please run the package test
049 * textsuite/javax.swing.text.html.parser/AllParserTests after your
050 * modifications.
051 */
052
053/**
054 * <p>A simple error-tolerant HTML parser that uses a DTD document
055 * to access data on the possible tokens, arguments and syntax.</p>
056 * <p> The parser reads an HTML content from a Reader and calls various
057 * notifying methods (which should be overridden in a subclass)
058 * when tags or data are encountered.</p>
059 * <p>Some HTML elements need no opening or closing tags. The
060 * task of this parser is to invoke the tag handling methods also when
061 * the tags are not explicitly specified and must be supposed using
062 * information, stored in the DTD.
063 * For  example, parsing the document
064 * <p>&lt;table&gt;&lt;tr&gt;&lt;td&gt;a&lt;td&gt;b&lt;td&gt;c&lt;/tr&gt; <br>
065 * will invoke exactly the same handling methods, in exactly the same order
066 * (and with the same parameters), as if parsing the document: <br>
067 * <em>&lt;html&gt;&lt;head&gt;&lt;/head&gt;&lt;body&gt;&lt;table&gt;&lt;
068 * tbody&gt;</em>&lt;tr&gt;&lt;td&gt;a<em>&lt;/td&gt;</em>&lt;td&gt;b<em>
069 * &lt;/td&gt;</em>&lt;td&gt;c<em>&lt;/td&gt;&lt;/tr&gt;</em>&lt;
070 * <em>/tbody&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;</em></p>
071 * (supposed tags are given in italics). The parser also supports
072 * obsolete elements of HTML syntax.<p>
073 * </p>
074 * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
075 */
076public class Parser 
077  implements DTDConstants
078{
079  /**
080   * The document template description that will be used to parse the documents.
081   */
082  protected DTD dtd;
083
084  /**
085   * The value of this field determines whether or not the Parser will be
086   * strict in enforcing SGML compatibility. The default value is false,
087   * stating that the parser should do everything to parse and get at least
088   * some information even from the incorrectly written HTML input.
089   */
090  protected boolean strict;
091
092  /**
093   * The package level reference to the working HTML parser in this
094   * implementation.
095   */
096  final gnu.javax.swing.text.html.parser.support.Parser gnu;
097
098  /**
099   * Creates a new parser that uses the given DTD to access data on the
100   * possible tokens, arguments and syntax. There is no single - step way
101   * to get a default DTD; you must either refer to the implementation -
102   * specific packages, write your own DTD or obtain the working instance
103   * of parser in other way, for example, by calling
104   * {@link javax.swing.text.html.HTMLEditorKit#getParser() }.
105   * @param a_dtd A DTD to use.
106   */
107  public Parser(DTD a_dtd)
108  {
109    dtd = a_dtd;
110
111    final Parser j = this;
112
113    gnu =
114      new gnu.javax.swing.text.html.parser.support.Parser(dtd)
115        {
116          protected final void handleComment(char[] comment)
117          {
118            j.handleComment(comment);
119          }
120
121          protected final void handleEOFInComment()
122          {
123            j.handleEOFInComment();
124          }
125
126          protected final void handleEmptyTag(TagElement tag)
127            throws javax.swing.text.ChangedCharSetException
128          {
129            j.handleEmptyTag(tag);
130          }
131
132          protected final void handleStartTag(TagElement tag)
133          {
134            j.handleStartTag(tag);
135          }
136
137          protected final void handleEndTag(TagElement tag)
138          {
139            j.handleEndTag(tag);
140          }
141
142          protected final void handleError(int line, String message)
143          {
144            j.handleError(line, message);
145          }
146
147          protected final void handleText(char[] text)
148          {
149            j.handleText(text);
150          }
151
152          protected final void handleTitle(char[] title)
153          {
154            j.handleTitle(title);
155          }
156
157          protected final void markFirstTime(Element element)
158          {
159            j.markFirstTime(element);
160          }
161
162          protected final void startTag(TagElement tag)
163            throws ChangedCharSetException
164          {
165            j.startTag(tag);
166          }
167
168          protected final void endTag(boolean omitted)
169          {
170            j.endTag(omitted);
171          }
172
173          protected TagElement makeTag(Element element)
174          {
175            return j.makeTag(element);
176          }
177
178          protected TagElement makeTag(Element element, boolean isSupposed)
179          {
180            return j.makeTag(element, isSupposed);
181          }
182        };
183  }
184
185  /**
186   * Parse the HTML text, calling various methods in response to the
187   * occurence of the corresponding HTML constructions.
188   * @param reader The reader to read the source HTML from.
189   * @throws IOException If the reader throws one.
190   */
191  public synchronized void parse(Reader reader)
192    throws IOException
193  {
194    gnu.parse(reader);
195  }
196
197  /**
198   * Parses DTD markup declaration. Currently returns without action.
199   * @return null.
200   * @throws java.io.IOException
201   */
202  public String parseDTDMarkup()
203    throws IOException
204  {
205    return gnu.parseDTDMarkup();
206  }
207
208  /**
209   * Parse DTD document declarations. Currently only parses the document
210   * type declaration markup.
211   * @param strBuff
212   * @return true if this is a valid DTD markup declaration.
213   * @throws IOException
214   */
215  protected boolean parseMarkupDeclarations(StringBuffer strBuff)
216    throws IOException
217  {
218    return gnu.parseMarkupDeclarations(strBuff);
219  }
220
221  /**
222   * Get the attributes of the current tag.
223   * @return The attribute set, representing the attributes of the current tag.
224   */
225  protected SimpleAttributeSet getAttributes()
226  {
227    return gnu.getAttributes();
228  }
229
230  /**
231   * Get the number of the document line being parsed.
232   * @return The current line.
233   */
234  protected int getCurrentLine()
235  {
236    return gnu.hTag.where.beginLine;
237  }
238
239  /**
240   * Get the current position in the document being parsed.
241   * @return The current position.
242   */
243  protected int getCurrentPos()
244  {
245    return gnu.hTag.where.startPosition;
246  }
247
248  /**
249   * The method is called when the HTML end (closing) tag is found or if
250   * the parser concludes that the one should be present in the
251   * current position. The method is called immediatly
252   * before calling the handleEndTag().
253   * @param omitted True if the tag is no actually present in the document,
254   * but is supposed by the parser (like &lt;/html&gt; at the end of the
255   * document).
256   */
257  protected void endTag(boolean omitted)
258  {
259    // This default implementation does nothing.
260  }
261
262  /**
263   * Invokes the error handler. The default method in this implementation
264   * finally delegates the call to handleError, also providing the number of the
265   * current line.
266   */
267  protected void error(String msg)
268  {
269    gnu.error(msg);
270  }
271
272  /**
273   * Invokes the error handler. The default method in this implementation
274   * finally delegates the call to error (msg+": '"+invalid+"'").
275   */
276  protected void error(String msg, String invalid)
277  {
278    gnu.error(msg, invalid);
279  }
280
281  /**
282   * Invokes the error handler. The default method in this implementation
283   * finally delegates the call to error (parm1+" "+ parm2+" "+ parm3).
284   */
285  protected void error(String parm1, String parm2, String parm3)
286  {
287    gnu.error(parm1, parm2, parm3);
288  }
289
290  /**
291   * Invokes the error handler. The default method in this implementation
292   * finally delegates the call to error
293   * (parm1+" "+ parm2+" "+ parm3+" "+ parm4).
294   */
295  protected void error(String parm1, String parm2, String parm3, String parm4)
296  {
297    gnu.error(parm1, parm2, parm3, parm4);
298  }
299
300  /**
301   * In this implementation, this is never called and returns without action.
302   */
303  protected void flushAttributes()
304  {
305    gnu.flushAttributes();
306  }
307
308  /**
309   * Handle HTML comment. The default method returns without action.
310   * @param comment The comment being handled
311   */
312  protected void handleComment(char[] comment)
313  {
314    // This default implementation does nothing.
315  }
316
317  /**
318   * This is additionally called in when the HTML content terminates
319   * without closing the HTML comment. This can only happen if the
320   * HTML document contains errors (for example, the closing --;gt is
321   * missing. The default method calls the error handler.
322   */
323  protected void handleEOFInComment()
324  {
325    gnu.error("Unclosed comment");
326  }
327
328  /**
329   * Handle the tag with no content, like &lt;br&gt;. The method is
330   * called for the elements that, in accordance with the current DTD,
331   * has an empty content.
332   * @param tag The tag being handled.
333   * @throws javax.swing.text.ChangedCharSetException
334   */
335  protected void handleEmptyTag(TagElement tag)
336    throws ChangedCharSetException
337  {
338    // This default implementation does nothing.
339  }
340
341  /**
342   * The method is called when the HTML closing tag ((like &lt;/table&gt;)
343   * is found or if the parser concludes that the one should be present
344   * in the current position.
345   * @param tag The tag being handled
346   */
347  protected void handleEndTag(TagElement tag)
348  {
349    // This default implementation does nothing.
350  }
351
352  /* Handle error that has occured in the given line. */
353  protected void handleError(int line, String message)
354  {
355    // This default implementation does nothing.
356  }
357
358  /**
359   * The method is called when the HTML opening tag ((like &lt;table&gt;)
360   * is found or if the parser concludes that the one should be present
361   * in the current position.
362   * @param tag The tag being handled
363   */
364  protected void handleStartTag(TagElement tag)
365  {
366    // This default implementation does nothing.
367  }
368
369  /**
370   * Handle the text section.
371   * <p> For non-preformatted section, the parser replaces
372   * \t, \r and \n by spaces and then multiple spaces
373   * by a single space. Additionaly, all whitespace around
374   * tags is discarded.
375   * </p>
376   * <p> For pre-formatted text (inside TEXAREA and PRE), the parser preserves
377   * all tabs and spaces, but removes <b>one</b>  bounding \r, \n or \r\n,
378   * if it is present. Additionally, it replaces each occurence of \r or \r\n
379   * by a single \n.</p>
380   *
381   * @param text A section text.
382   */
383  protected void handleText(char[] text)
384  {
385    // This default implementation does nothing.
386  }
387
388  /**
389   * Handle HTML &lt;title&gt; tag. This method is invoked when
390   * both title starting and closing tags are already behind.
391   * The passed argument contains the concatenation of all
392   * title text sections.
393   * @param title The title text.
394   */
395  protected void handleTitle(char[] title)
396  {
397    // This default implementation does nothing.
398  }
399
400  /**
401   * Constructs the tag from the given element. In this implementation,
402   * this is defined, but never called.
403   * @param element the base element of the tag.
404   * @return the tag
405   */
406  protected TagElement makeTag(Element element)
407  {
408    return makeTag(element, false);
409  }
410
411  /**
412   * Constructs the tag from the given element.
413   * @param element the tag base {@link javax.swing.text.html.parser.Element}
414   * @param isSupposed true if the tag is not actually present in the
415   * html input, but the parser supposes that it should to occur in
416   * the current location.
417   * @return the tag
418   */
419  protected TagElement makeTag(Element element, boolean isSupposed)
420  {
421    return new TagElement(element, isSupposed);
422  }
423
424  /**
425   * This is called when the tag, representing the given element,
426   * occurs first time in the document.
427   * @param element
428   */
429  protected void markFirstTime(Element element)
430  {
431    // This default implementation does nothing.
432  }
433
434  /**
435   * The method is called when the HTML opening tag ((like &lt;table&gt;)
436   * is found or if the parser concludes that the one should be present
437   * in the current position. The method is called immediately before
438   * calling the handleStartTag.
439   * @param tag The tag
440   */
441  protected void startTag(TagElement tag)
442    throws ChangedCharSetException
443  {
444    // This default implementation does nothing.
445  }
446}