Thread: [Htmlparser-cvs] htmlparser/src/org/htmlparser/sax Attributes.java,NONE,1.1 Feedback.java,NONE,1.1 L

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/sax
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv11717/src/org/htmlparser/sax

Added Files:
	Attributes.java Feedback.java Locator.java XMLReader.java 
	package.html 
Log Message:
Implement rudimentary sax parser.
Currently exposes DOM parser via sax project (http://sourceforge.net/projects/sax) interfaces.

--- NEW FILE: Locator.java ---
// HTMLParser Library $Name:  $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/sax/Locator.java,v $
// $Author: derrickoswald $
// $Date: 2004/07/14 01:58:02 $
// $Revision: 1.1 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.sax;

import org.htmlparser.Parser;
import org.htmlparser.lexer.Lexer;

/**
 * Transforms character offsets into line and column in the HTML file.
 */
public class Locator
    implements
        org.xml.sax.Locator
{
    /**
     * Underlying parser object.
     */
    protected Parser mParser;

    /**
     * Creates a locator for the given parser.
     * @param parser The parser with the {@link org.htmlparser.lexer.Page Page} being accessed.
     */
    public Locator (Parser parser)
    {
        mParser = parser;
    }

    /**
     * Return the public identifier for the current document event.
     *
     * <p>The return value is the public identifier of the document
     * entity or of the external parsed entity in which the markup
     * triggering the event appears.</p>
     *
     * @return A string containing the public identifier, or
     *         null if none is available.
     * @see #getSystemId
     */
    public String getPublicId ()
    {
        return (null); // I assume this would be <title></title>
    }

    /**
     * Return the system identifier for the current document event.
     *
     * <p>The return value is the system identifier of the document
     * entity or of the external parsed entity in which the markup
     * triggering the event appears.</p>
     *
     * <p>If the system identifier is a URL, the parser must resolve it
     * fully before passing it to the application.  For example, a file
     * name must always be provided as a <em>file:...</em> URL, and other
     * kinds of relative URI are also resolved against their bases.</p>
     *
     * @return A string containing the system identifier, or null
     *         if none is available.
     * @see #getPublicId
     */
    public String getSystemId ()
    {
        return (mParser.getURL ());
    }

    /**
     * Return the line number where the current document event ends.
     * Lines are delimited by line ends, which are defined in
     * the XML specification.
     *
     * <p><strong>Warning:</strong> The return value from the method
     * is intended only as an approximation for the sake of diagnostics;
     * it is not intended to provide sufficient information
     * to edit the character content of the original XML document.
     * In some cases, these "line" numbers match what would be displayed
     * as columns, and in others they may not match the source text
     * due to internal entity expansion.  </p>
     *
     * <p>The return value is an approximation of the line number
     * in the document entity or external parsed entity where the
     * markup triggering the event appears.</p>
     *
     * <p>If possible, the SAX driver should provide the line position 
     * of the first character after the text associated with the document 
     * event.  The first line is line 1.</p>
     *
     * @return The line number, or -1 if none is available.
     * @see #getColumnNumber
     */
    public int getLineNumber ()
    {
        Lexer lexer;

        lexer = mParser.getLexer ();
        return (lexer.getPage ().row (lexer.getCursor ()));
    }

    /**
     * Return the column number where the current document event ends.
     * This is one-based number of Java <code>char</code> values since
     * the last line end.
     *
     * <p><strong>Warning:</strong> The return value from the method
     * is intended only as an approximation for the sake of diagnostics;
     * it is not intended to provide sufficient information
     * to edit the character content of the original XML document.
     * For example, when lines contain combining character sequences, wide
     * characters, surrogate pairs, or bi-directional text, the value may
     * not correspond to the column in a text editor's display. </p>
     *
     * <p>The return value is an approximation of the column number
     * in the document entity or external parsed entity where the
     * markup triggering the event appears.</p>
     *
     * <p>If possible, the SAX driver should provide the line position 
     * of the first character after the text associated with the document 
     * event.  The first column in each line is column 1.</p>
     *
     * @return The column number, or -1 if none is available.
     * @see #getLineNumber
     */
    public int getColumnNumber ()
    {
        Lexer lexer;

        lexer = mParser.getLexer ();
        return (lexer.getPage ().column (lexer.getCursor ()));
    }
}

--- NEW FILE: package.html ---
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head>
<!--
 HTMLParser Library $Name:  $ - A java-based parser for HTML
 http://sourceforge.org/projects/htmlparser
 Copyright (C) 2004 Derrick Oswald

 Revision Control Information

 $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/sax/package.html,v $
 $Author: derrickoswald $
 $Date: 2004/07/14 01:58:02 $
 $Revision: 1.1 $

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-->
</head>
<body bgcolor="white">
The sax package implements a SAX (Simple API for XML) parser for HTML.
It uses the SAX 2 interfaces available from the <A
href="http://sourceforge.net/projects/sax/">sax</A> project.<br>
The HTML parser sax package is currently in it's infancy and just exposes the DOM Parser
via a SAX interface. The driver name is "org.htmlparser.sax.XMLReader" and a
simplistic test program is available in the org.htmlparser.tests package as
SAXTest.java.<br>
Some major pieces are missing, like namespace support (HTML files won't
generally have much in the way of namespaces), attribute type info, resolvers
and DTD handlers, among many other things.<br>
</body>
</html>
--- NEW FILE: XMLReader.java ---
// HTMLParser Library $Name:  $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/sax/XMLReader.java,v $
// $Author: derrickoswald $
// $Date: 2004/07/14 01:58:02 $
// $Revision: 1.1 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.sax;

import java.io.IOException;

import org.xml.sax.ContentHandler;
import org.xml.sax.DTDHandler;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.NamespaceSupport;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.ParserFeedback;

/**
 * SAX parser.
 * Generates callbacks on the {@link ContentHandler} based on encountered nodes.
 * <br><em>Preliminary</em>.
 * <pre>
 * org.xml.sax.XMLReader reader = org.xml.sax.helpers.XMLReaderFactory.createXMLReader ("org.htmlparser.sax.XMLReader");
 * org.xml.sax.ContentHandler content = new MyContentHandler ();
 * reader.setContentHandler (content);
 * org.xml.sax.ErrorHandler errors = new MyErrorHandler ();
 * reader.setErrorHandler (errors);
 * reader.parse ("http://cbc.ca");
 * </pre>
 */
public class XMLReader
    implements
        org.xml.sax.XMLReader
{
    /**
     * Determines if namespace handling is on.
     * All XMLReaders are required to recognize the feature names:
     * <ul>
     * <li><code>http://xml.org/sax/features/namespaces</code> -
     *     a value of "true" indicates namespace URIs and unprefixed
     *     local names for element and attribute names will be available</li>
     * <li><code>http://xml.org/sax/features/namespace-prefixes</code> -
     *     a value of "true" indicates that XML qualified names (with
     *     prefixes) and attributes (including xmlns* attributes) will
     *     be available.
     * </ul>
     */
    protected boolean mNameSpaces; // namespaces

    /**
     * Determines if namespace prefix handling is on.
     * @see #mNameSpaces
     */
    protected boolean mNameSpacePrefixes; // namespace-prefixes

    /**
     * <em> not implemented</em>
     */
    protected EntityResolver mEntityResolver;

    /**
     * <em> not implemented</em>
     */
    protected DTDHandler mDTDHandler;

    /**
     * The content callback object.
     */
    protected ContentHandler mContentHandler;

    /**
     * The error handler object.
     */
    protected ErrorHandler mErrorHandler;

    /**
     * The underlying DOM parser.
     */
    protected Parser mParser;

    /**
     * Namspace utility object.
     */
    protected NamespaceSupport mSupport;

    /**
     * Qualified name parts.
     */
    protected String mParts[];

    /**
     * Create an SAX parser.
     */
    public XMLReader ()
    {
        mNameSpaces = true;
        mNameSpacePrefixes = false;

        mEntityResolver = null;
        mDTDHandler = null;
        mContentHandler = null;
        mErrorHandler = null;

        mSupport = new NamespaceSupport ();
        mSupport.pushContext ();
        mSupport.declarePrefix ("", "http://www.w3.org/TR/REC-html40");
        // todo:
        //    xmlns:html='http://www.w3.org/TR/REC-html40'
        // or xmlns:html='http://www.w3.org/1999/xhtml'
        mParts = new String[3];
    }

    ////////////////////////////////////////////////////////////////////
    // Configuration.
    ////////////////////////////////////////////////////////////////////

    /**
     * Look up the value of a feature flag.
     *
     * <p>The feature name is any fully-qualified URI.  It is
     * possible for an XMLReader to recognize a feature name but
     * temporarily be unable to return its value.
     * Some feature values may be available only in specific
     * contexts, such as before, during, or after a parse.
     * Also, some feature values may not be programmatically accessible.
     * (In the case of an adapter for SAX1 {@link Parser}, there is no
     * implementation-independent way to expose whether the underlying
     * parser is performing validation, expanding external entities,
     * and so forth.) </p>
     *
     * <p>All XMLReaders are required to recognize the
     * http://xml.org/sax/features/namespaces and the
     * http://xml.org/sax/features/namespace-prefixes feature names.</p>
     *
     * <p>Typical usage is something like this:</p>
     *
     * <pre>
     * XMLReader r = new MySAXDriver();
     *
     *                         // try to activate validation
     * try {
     *   r.setFeature("http://xml.org/sax/features/validation", true);
     * } catch (SAXException e) {
     *   System.err.println("Cannot activate validation."); 
     * }
     *
     *                         // register event handlers
     * r.setContentHandler(new MyContentHandler());
     * r.setErrorHandler(new MyErrorHandler());
     *
     *                         // parse the first document
     * try {
     *   r.parse("http://www.foo.com/mydoc.xml");
     * } catch (IOException e) {
     *   System.err.println("I/O exception reading XML document");
     * } catch (SAXException e) {
     *   System.err.println("XML exception reading document.");
     * }
     * </pre>
     *
     * <p>Implementors are free (and encouraged) to invent their own features,
     * using names built on their own URIs.</p>
     *
     * @param name The feature name, which is a fully-qualified URI.
     * @return The current value of the feature (true or false).
     * @exception org.xml.sax.SAXNotRecognizedException If the feature
     *            value can't be assigned or retrieved.
     * @exception org.xml.sax.SAXNotSupportedException When the
     *            XMLReader recognizes the feature name but 
     *            cannot determine its value at this time.
     * @see #setFeature
     */
    public boolean getFeature (String name)
        throws SAXNotRecognizedException, SAXNotSupportedException
    {
        boolean ret;

        if (name.equals ("http://xml.org/sax/features/namespaces"))
            ret = mNameSpaces;
        else if (name.equals ("http://xml.org/sax/features/namespace-prefixes"))
            ret = mNameSpacePrefixes;
        else
            throw new SAXNotSupportedException (name + " not yet understood");

        return (ret);
    }

    /**
     * Set the value of a feature flag.
     *
     * <p>The feature name is any fully-qualified URI.  It is
     * possible for an XMLReader to expose a feature value but
     * to be unable to change the current value.
     * Some feature values may be immutable or mutable only 
     * in specific contexts, such as before, during, or after 
     * a parse.</p>
     *
     * <p>All XMLReaders are required to support setting
     * http://xml.org/sax/features/namespaces to true and
     * http://xml.org/sax/features/namespace-prefixes to false.</p>
     *
     * @param name The feature name, which is a fully-qualified URI.
     * @param value The requested value of the feature (true or false).
     * @exception org.xml.sax.SAXNotRecognizedException If the feature
     *            value can't be assigned or retrieved.
     * @exception org.xml.sax.SAXNotSupportedException When the
     *            XMLReader recognizes the feature name but 
     *            cannot set the requested value.
     * @see #getFeature
     */
    public void setFeature (String name, boolean value)
	throws SAXNotRecognizedException, SAXNotSupportedException
    {
        if (name.equals ("http://xml.org/sax/features/namespaces"))
            mNameSpaces = value;
        else if (name.equals ("http://xml.org/sax/features/namespace-prefixes"))
            mNameSpacePrefixes = value;
        else
            throw new SAXNotSupportedException (name + " not yet understood");
    }

    /**
     * Look up the value of a property.
     *
     * <p>The property name is any fully-qualified URI.  It is
     * possible for an XMLReader to recognize a property name but
     * temporarily be unable to return its value.
     * Some property values may be available only in specific
     * contexts, such as before, during, or after a parse.</p>
     *
     * <p>XMLReaders are not required to recognize any specific
     * property names, though an initial core set is documented for
     * SAX2.</p>
     *
     * <p>Implementors are free (and encouraged) to invent their own properties,
     * using names built on their own URIs.</p>
     *
     * @param name The property name, which is a fully-qualified URI.
     * @return The current value of the property.
     * @exception org.xml.sax.SAXNotRecognizedException If the property
     *            value can't be assigned or retrieved.
     * @exception org.xml.sax.SAXNotSupportedException When the
     *            XMLReader recognizes the property name but 
     *            cannot determine its value at this time.
     * @see #setProperty
     */
    public Object getProperty (String name)
	throws SAXNotRecognizedException, SAXNotSupportedException
    {
        throw new SAXNotSupportedException (name + " not yet understood");
    }

    /**
     * Set the value of a property.
     *
     * <p>The property name is any fully-qualified URI.  It is
     * possible for an XMLReader to recognize a property name but
     * to be unable to change the current value.
     * Some property values may be immutable or mutable only 
     * in specific contexts, such as before, during, or after 
     * a parse.</p>
     *
     * <p>XMLReaders are not required to recognize setting
     * any specific property names, though a core set is defined by 
     * SAX2.</p>
     *
     * <p>This method is also the standard mechanism for setting
     * extended handlers.</p>
     *
     * @param name The property name, which is a fully-qualified URI.
     * @param value The requested value for the property.
     * @exception org.xml.sax.SAXNotRecognizedException If the property
     *            value can't be assigned or retrieved.
     * @exception org.xml.sax.SAXNotSupportedException When the
     *            XMLReader recognizes the property name but 
     *            cannot set the requested value.
     */
    public void setProperty (String name, Object value)
	throws SAXNotRecognizedException, SAXNotSupportedException
    {
        throw new SAXNotSupportedException (name + " not yet understood");
    }

    ////////////////////////////////////////////////////////////////////
    // Event handlers.
    ////////////////////////////////////////////////////////////////////

    /**
     * Allow an application to register an entity resolver.
     *
     * <p>If the application does not register an entity resolver,
     * the XMLReader will perform its own default resolution.</p>
     *
     * <p>Applications may register a new or different resolver in the
     * middle of a parse, and the SAX parser must begin using the new
     * resolver immediately.</p>
     *
     * @param resolver The entity resolver.
     * @see #getEntityResolver
     */
    public void setEntityResolver (EntityResolver resolver)
    {
        mEntityResolver = resolver;
    }

    /**
     * Return the current entity resolver.
     *
     * @return The current entity resolver, or null if none
     *         has been registered.
     * @see #setEntityResolver
     */
    public EntityResolver getEntityResolver ()
    {
        return (mEntityResolver);
    }

    /**
     * Allow an application to register a DTD event handler.
     *
     * <p>If the application does not register a DTD handler, all DTD
     * events reported by the SAX parser will be silently ignored.</p>
     *
     * <p>Applications may register a new or different handler in the
     * middle of a parse, and the SAX parser must begin using the new
     * handler immediately.</p>
     *
     * @param handler The DTD handler.
     * @see #getDTDHandler
     */
    public void setDTDHandler (DTDHandler handler)
    {
        mDTDHandler = handler;
    }

    /**
     * Return the current DTD handler.
     *
     * @return The current DTD handler, or null if none
     *         has been registered.
     * @see #setDTDHandler
     */
    public DTDHandler getDTDHandler ()
    {
        return (mDTDHandler);
    }

    /**
     * Allow an application to register a content event handler.
     *
     * <p>If the application does not register a content handler, all
     * content events reported by the SAX parser will be silently
     * ignored.</p>
     *
     * <p>Applications may register a new or different handler in the
     * middle of a parse, and the SAX parser must begin using the new
     * handler immediately.</p>
     *
     * @param handler The content handler.
     * @see #getContentHandler
     */
    public void setContentHandler (ContentHandler handler)
    {
        mContentHandler = handler;
    }

    /**
     * Return the current content handler.
     *
     * @return The current content handler, or null if none
     *         has been registered.
     * @see #setContentHandler
     */
    public ContentHandler getContentHandler ()
    {
        return (mContentHandler);
    }

    /**
     * Allow an application to register an error event handler.
     *
     * <p>If the application does not register an error handler, all
     * error events reported by the SAX parser will be silently
     * ignored; however, normal processing may not continue.  It is
     * highly recommended that all SAX applications implement an
     * error handler to avoid unexpected bugs.</p>
     *
     * <p>Applications may register a new or different handler in the
     * middle of a parse, and the SAX parser must begin using the new
     * handler immediately.</p>
     *
     * @param handler The error handler.
     * @see #getErrorHandler
     */
    public void setErrorHandler (ErrorHandler handler)
    {
        mErrorHandler = handler;
    }

    /**
     * Return the current error handler.
     *
     * @return The current error handler, or null if none
     *         has been registered.
     * @see #setErrorHandler
     */
    public ErrorHandler getErrorHandler ()
    {
        return (mErrorHandler);
    }

    ////////////////////////////////////////////////////////////////////
    // Parsing.
    ////////////////////////////////////////////////////////////////////

    /**
     * Parse an XML document.
     *
     * <p>The application can use this method to instruct the XML
     * reader to begin parsing an XML document from any valid input
     * source (a character stream, a byte stream, or a URI).</p>
     *
     * <p>Applications may not invoke this method while a parse is in
     * progress (they should create a new XMLReader instead for each
     * nested XML document).  Once a parse is complete, an
     * application may reuse the same XMLReader object, possibly with a
     * different input source.
     * Configuration of the XMLReader object (such as handler bindings and
     * values established for feature flags and properties) is unchanged
     * by completion of a parse, unless the definition of that aspect of
     * the configuration explicitly specifies other behavior.
     * (For example, feature flags or properties exposing
     * characteristics of the document being parsed.)
     * </p>
     *
     * <p>During the parse, the XMLReader will provide information
     * about the XML document through the registered event
     * handlers.</p>
     *
     * <p>This method is synchronous: it will not return until parsing
     * has ended.  If a client application wants to terminate 
     * parsing early, it should throw an exception.</p>
     *
     * @param input The input source for the top-level of the
     *        XML document.
     * @exception org.xml.sax.SAXException Any SAX exception, possibly
     *            wrapping another exception.
     * @exception java.io.IOException An IO exception from the parser,
     *            possibly from a byte stream or character stream
     *            supplied by the application.
     * @see org.xml.sax.InputSource
     * @see #parse(java.lang.String)
     * @see #setEntityResolver
     * @see #setDTDHandler
     * @see #setContentHandler
     * @see #setErrorHandler 
     */
    public void parse (InputSource input)
	throws IOException, SAXException
    {
        throw new SAXException ("parse (InputSource input) is not yet supported");
    }

    /**
     * Parse an XML document from a system identifier (URI).
     *
     * <p>This method is a shortcut for the common case of reading a
     * document from a system identifier.  It is the exact
     * equivalent of the following:</p>
     *
     * <pre>
     * parse(new InputSource(systemId));
     * </pre>
     *
     * <p>If the system identifier is a URL, it must be fully resolved
     * by the application before it is passed to the parser.</p>
     *
     * @param systemId The system identifier (URI).
     * @exception org.xml.sax.SAXException Any SAX exception, possibly
     *            wrapping another exception.
     * @exception java.io.IOException An IO exception from the parser,
     *            possibly from a byte stream or character stream
     *            supplied by the application.
     * @see #parse(org.xml.sax.InputSource)
     */
    public void parse (String systemId)
	throws IOException, SAXException
    {
        Locator locator;
        ParserFeedback feedback;

        if (null != mContentHandler)
            try
            {
                mParser = new Parser (systemId);
                locator = new Locator (mParser);
                if (null != mErrorHandler)
                    feedback = new Feedback (mErrorHandler, locator);
                else
                    feedback = new DefaultParserFeedback (DefaultParserFeedback.QUIET);
                mParser.setFeedback (feedback);

                // OK, try a simplistic parse
                mContentHandler.setDocumentLocator (locator);
                try
                {
                    mContentHandler.startDocument ();
                    for (NodeIterator iterator = mParser.elements (); iterator.hasMoreNodes (); )
                        doSAX (iterator.nextNode ());
                    mContentHandler.endDocument ();
                }
                catch (SAXException se)
                {
                    if (null != mErrorHandler)
                        mErrorHandler.fatalError (
                            new SAXParseException ("contentHandler threw me", locator, se));
                }
            }
            catch (ParserException pe)
            {
                if (null != mErrorHandler)
                    mErrorHandler.fatalError (
                        new SAXParseException (pe.getMessage (), "", systemId, 0, 0));

            }
    }

    /**
     * Process nodes recursively on the DocumentHandler.
     * Calls methods on the handler based on the type and whether it's an end tag.
     * Processes composite tags recursively.
     * Does rudimentary namespace processing according to the state of {@link #mNameSpaces}
     * and {@link #mNameSpacePrefixes}. 
     * @param node The htmlparser node to traverse.
     */
    protected void doSAX (Node node)
        throws
            ParserException,
            SAXException
    {
        Tag tag;
        Tag end;

        if (node instanceof Remark)
        {
            String text = mParser.getLexer ().getPage ().getText (node.getStartPosition (), node.getEndPosition ());
            mContentHandler.ignorableWhitespace (text.toCharArray (), 0, text.length ());
        }
        else if (node instanceof Text)
        {
            String text = mParser.getLexer ().getPage ().getText (node.getStartPosition (), node.getEndPosition ());
            mContentHandler.characters (text.toCharArray (), 0, text.length ());
        }
        else if (node instanceof Tag)
        {
            tag = (Tag)node;
            if (mNameSpaces)
                mSupport.processName (tag.getTagName (), mParts, false);
            else
            {
                mParts[0] = "";
                mParts[1] = "";
            }
            if (mNameSpacePrefixes)
                mParts[2] = tag.getTagName ();
            else if (mNameSpaces)
                mParts[2] = "";
            else
                mParts[2] = tag.getTagName ();

            mContentHandler.startElement (
                mParts[0], // uri
                mParts[1], // local
                mParts[2], // raw
                new Attributes (tag, mSupport, mParts));
            NodeList children = tag.getChildren ();
            if (null != children)
                for (int i = 0; i < children.size (); i++)
                    doSAX (children.elementAt (i));
            end = tag.getEndTag ();
            if (null != end)
            {
                if (mNameSpaces)
                    mSupport.processName (end.getTagName (), mParts, false);
                else
                {
                    mParts[0] = "";
                    mParts[1] = "";
                }
                if (mNameSpacePrefixes)
                    mParts[2] = end.getTagName ();
                else if (mNameSpaces)
                    mParts[2] = "";
                else
                    mParts[2] = end.getTagName ();
                mContentHandler.endElement (
                    mParts[0], // uri
                    mParts[1], // local
                    mParts[2]); // raw
            }
        }
    }
}

--- NEW FILE: Feedback.java ---
// HTMLParser Library $Name:  $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/sax/Feedback.java,v $
// $Author: derrickoswald $
// $Date: 2004/07/14 01:58:02 $
// $Revision: 1.1 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.sax;

import org.xml.sax.ErrorHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXParseException;

import org.htmlparser.util.ParserException;
import org.htmlparser.util.ParserFeedback;
import org.xml.sax.SAXException;

/**
 * Mediates between the feedback mechanism of the htmlparser and an error handler.
 */
public class Feedback
    implements
        ParserFeedback
{
    /**
     * The error handler to call back on.
     */
    protected ErrorHandler mErrorHandler;

    /**
     * The locator for tag positions.
     */
    protected Locator mLocator;

    /**
     * Create a feedback/error handler mediator.
     * @param handler The callback object.
     * @param locator A locator for error locations.
     */
    public Feedback (ErrorHandler handler, Locator locator)
    {
        mErrorHandler = handler;
        mLocator = locator;
    }

    /**
     * <em>Just eats the info message.</em>
     * @param message {@inheritDoc} 
     */
    public void info (String message)
    {
        // swallow
    }

    /**
     * Calls {@link ErrorHandler#warning(SAXParseException) ErrorHandler.warning}.
     * @param message {@inheritDoc} 
     */
    public void warning (String message)
    {
        try
        {
            mErrorHandler.warning (
                new SAXParseException (message, mLocator));
        }
        catch (SAXException se)
        {
            se.printStackTrace ();
        }
    }

    /**
     * Calls {@link ErrorHandler#error(SAXParseException) ErrorHandler.error}.
     * @param message {@inheritDoc} 
     */
    public void error (String message, ParserException e)
    {
        try
        {
            mErrorHandler.error (
                new SAXParseException (message, mLocator, e));
        }
        catch (SAXException se)
        {
            se.printStackTrace ();
        }
    }
}

--- NEW FILE: Attributes.java ---
// HTMLParser Library $Name:  $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/sax/Attributes.java,v $
// $Author: derrickoswald $
// $Date: 2004/07/14 01:58:02 $
// $Revision: 1.1 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.sax;

import java.util.Vector;

import org.htmlparser.Attribute;
import org.htmlparser.Tag;
import org.xml.sax.helpers.NamespaceSupport;

/**
 * Provides access to the tag attributes.
 */
public class Attributes
    implements
        org.xml.sax.Attributes
{
    /**
     * The tag from which attributes are exposed.
     */
    protected Tag mTag;

    /**
     * The utility class that converts namespaces.
     */
    protected NamespaceSupport mSupport;

    /**
     * Elements of the qname.
     * Allocated once for all uses of {@link #mSupport}.
     */
    protected String[] mParts;

    /**
     * Create an attibute access object.
     * @param tag The tag to expose.
     * @param support The namespace converter.
     * @param parts The elements of the qualified name.
     */
    public Attributes (Tag tag, NamespaceSupport support, String[] parts)
    {
        mTag = tag;
        mSupport = support;
        mParts = parts;
    }

    ////////////////////////////////////////////////////////////////////
    // Indexed access.
    ////////////////////////////////////////////////////////////////////

    /**
     * Return the number of attributes in the list.
     *
     * <p>Once you know the number of attributes, you can iterate
     * through the list.</p>
     *
     * @return The number of attributes in the list.
     * @see #getURI(int)
     * @see #getLocalName(int)
     * @see #getQName(int)
     * @see #getType(int)
     * @see #getValue(int)
     */
    public int getLength ()
    {
        return (mTag.getAttributesEx ().size () - 1);
    }

    /**
     * Look up an attribute's Namespace URI by index.
     *
     * @param index The attribute index (zero-based).
     * @return The Namespace URI, or the empty string if none
     *         is available, or null if the index is out of
     *         range.
     * @see #getLength
     */
    public String getURI (int index)
    {
        mSupport.processName (getQName (index), mParts, true);
        return (mParts[0]);
    }

    /**
     * Look up an attribute's local name by index.
     *
     * @param index The attribute index (zero-based).
     * @return The local name, or the empty string if Namespace
     *         processing is not being performed, or null
     *         if the index is out of range.
     * @see #getLength
     */
    public String getLocalName (int index)
    {
        mSupport.processName (getQName (index), mParts, true);
        return (mParts[1]);
    }

    /**
     * Look up an attribute's XML qualified (prefixed) name by index.
     *
     * @param index The attribute index (zero-based).
     * @return The XML qualified name, or the empty string
     *         if none is available, or null if the index
     *         is out of range.
     * @see #getLength
     */
    public String getQName (int index)
    {
        Attribute attribute;
        String ret;

        attribute = (Attribute)(mTag.getAttributesEx ().elementAt (index + 1));
        if (attribute.isWhitespace ())
            ret = "#text";
        else
            ret = attribute.getName ();

        return (ret);
    }

    /**
     * Look up an attribute's type by index.
     *
     * <p>The attribute type is one of the strings "CDATA", "ID",
     * "IDREF", "IDREFS", "NMTOKEN", "NMTOKENS", "ENTITY", "ENTITIES",
     * or "NOTATION" (always in upper case).</p>
     *
     * <p>If the parser has not read a declaration for the attribute,
     * or if the parser does not report attribute types, then it must
     * return the value "CDATA" as stated in the XML 1.0 Recommendation
     * (clause 3.3.3, "Attribute-Value Normalization").</p>
     *
     * <p>For an enumerated attribute that is not a notation, the
     * parser will report the type as "NMTOKEN".</p>
     *
     * @param index The attribute index (zero-based).
     * @return The attribute's type as a string, or null if the
     *         index is out of range.
     * @see #getLength
     */
    public String getType (int index)
    {
        return ("CDATA");
    }

    /**
     * Look up an attribute's value by index.
     *
     * <p>If the attribute value is a list of tokens (IDREFS,
     * ENTITIES, or NMTOKENS), the tokens will be concatenated
     * into a single string with each token separated by a
     * single space.</p>
     *
     * @param index The attribute index (zero-based).
     * @return The attribute's value as a string, or null if the
     *         index is out of range.
     * @see #getLength
     */
    public String getValue (int index)
    {
        Attribute attribute;
        String ret;

        attribute = (Attribute)(mTag.getAttributesEx ().elementAt (index + 1));
        ret = attribute.getValue ();
        if (null == ret)
            ret = "";

        return (ret);
    }

    ////////////////////////////////////////////////////////////////////
    // Name-based query.
    ////////////////////////////////////////////////////////////////////

    /**
     * Look up the index of an attribute by Namespace name.
     *
     * @param uri The Namespace URI, or the empty string if
     *        the name has no Namespace URI.
     * @param localName The attribute's local name.
     * @return The index of the attribute, or -1 if it does not
     *         appear in the list.
     */
    public int getIndex (String uri, String localName)
    {
        Vector attributes;
        int size;
        Attribute attribute;
        String string;
        int ret;

        ret = -1;

        attributes = mTag.getAttributesEx ();
        if (null != attributes)
        {
            size = attributes.size ();
            for (int i = 1; i < size; i++)
            {
                attribute = (Attribute)attributes.elementAt (i);
                string = attribute.getName ();
                if (null != string) // not whitespace
                {
                    mSupport.processName (string, mParts, true);
                    if (  uri.equals (mParts[0])
                        & localName.equalsIgnoreCase (mParts[1]))
                    {
                        ret = i;
                        i = size; // exit fast
                    }
                }
            }
        }

        return (ret);
    }

    /**
     * Look up the index of an attribute by XML qualified (prefixed) name.
     *
     * @param qName The qualified (prefixed) name.
     * @return The index of the attribute, or -1 if it does not
     *         appear in the list.
     */
    public int getIndex (String qName)
    {
        mSupport.processName (qName, mParts, true);
        return (getIndex (mParts[0], mParts[1]));
    }

    /**
     * Look up an attribute's type by Namespace name.
     *
     * <p>See {@link #getType(int) getType(int)} for a description
     * of the possible types.</p>
     *
     * @param uri The Namespace URI, or the empty String if the
     *        name has no Namespace URI.
     * @param localName The local name of the attribute.
     * @return The attribute type as a string, or null if the
     *         attribute is not in the list or if Namespace
     *         processing is not being performed.
     */
    public String getType (String uri, String localName)
    {
        return (null);
    }

    /**
     * Look up an attribute's type by XML qualified (prefixed) name.
     *
     * <p>See {@link #getType(int) getType(int)} for a description
     * of the possible types.</p>
     *
     * @param qName The XML qualified name.
     * @return The attribute type as a string, or null if the
     *         attribute is not in the list or if qualified names
     *         are not available.
     */
    public String getType (String qName)
    {
        return (null);
    }

    /**
     * Look up an attribute's value by Namespace name.
     *
     * <p>See {@link #getValue(int) getValue(int)} for a description
     * of the possible values.</p>
     *
     * @param uri The Namespace URI, or the empty String if the
     *        name has no Namespace URI.
     * @param localName The local name of the attribute.
     * @return The attribute value as a string, or null if the
     *         attribute is not in the list.
     */
    public String getValue (String uri, String localName)
    {
        return (mTag.getAttribute (localName));
    }

    /**
     * Look up an attribute's value by XML qualified (prefixed) name.
     *
     * <p>See {@link #getValue(int) getValue(int)} for a description
     * of the possible values.</p>
     *
     * @param qName The XML qualified name.
     * @return The attribute value as a string, or null if the
     *         attribute is not in the list or if qualified names
     *         are not available.
     */
    public String getValue (String qName)
    {
        mSupport.processName (qName, mParts, true);
        return (getValue (mParts[0], mParts[1]));
    }
}

Thread: [Htmlparser-cvs] htmlparser/src/org/htmlparser/sax Attributes.java,NONE,1.1 Feedback.java,NONE,1.1 L

htmlparser-cvs