[Htmlparser-cvs] htmlparser/src/org/htmlparser PrototypicalNodeFactory.java,NONE,1.1 Parser.java,1.7
Brought to you by:
derrickoswald
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs1:/tmp/cvs-serv16537 Modified Files: Parser.java RemarkNode.java StringNode.java StringNodeFactory.java Added Files: PrototypicalNodeFactory.java Log Message: Remove most of the scanners. The only scanners left are ones that really do something different (script and jsp). Instead of registering a scanner to enable returning a specific tag you now add a tag to a PrototypicalNodeFactory. All known tags are 'registered' by default in a new Parser which is similar to having called the old 'registerDOMScanners()', so tags are fully nested. This is different behaviour, and specifically, you will need to recurse into returned nodes to get at what you want. I've tried to adjust the applications accordingly, but worked examples are still scarce. If you want to return only some of the derived tags while keeping most as generic tags, there are various constructors and manipulators on the factory. See the javadocs and examples in the tests package. Nearly all the old scanner tests are folded into the tag tests. toString() has been revamped. This means that the default Parser mainline now returns an indented listing of tags, making it easy to see the structure of a page. The downside is the text of the page had to have newlines, tabs etc. turned into escape sequences. But if you were really interested in content you would be using toHtml() or toPlainTextString(). 
--- NEW FILE: PrototypicalNodeFactory.java --- // HTMLParser Library $Name: $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2003 Derrick Oswald // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/PrototypicalNodeFactory.java,v $ // $Author: derrickoswald $ // $Date: 2003/12/07 23:41:39 $ // $Revision: 1.1 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser; import java.io.Serializable; import java.util.Hashtable; import java.util.Map; import java.util.Vector; import org.htmlparser.lexer.Page; import org.htmlparser.lexer.nodes.Attribute; import org.htmlparser.lexer.nodes.NodeFactory; import org.htmlparser.nodeDecorators.DecodingNode; import org.htmlparser.nodeDecorators.EscapeCharacterRemovingNode; import org.htmlparser.nodeDecorators.NonBreakingSpaceConvertingNode; //import org.htmlparser.tags.Tag; import org.htmlparser.tags.*; // import everything for now import org.htmlparser.util.ParserException; /** * A node factory based on the prototype pattern. * This factory uses the prototype pattern to generate new Tag nodes. * Prototype tags, in the form of undifferentiated tags are held in a hash * table. 
On a */ public class PrototypicalNodeFactory implements Serializable, NodeFactory { /** * The list of tags to return at the top level. * The list is keyed by tag name. */ protected Map mBlastocyst; /** * Create a new factory with all but DOM tags registered. */ public PrototypicalNodeFactory () { this (false); } /** * Create a new factory with no registered tags. */ public PrototypicalNodeFactory (boolean empty) { clear (); if (!empty) registerTags (); } /** * Create a new factory with the given tag as the only one registered. */ public PrototypicalNodeFactory (Tag tag) { this (true); registerTag (tag); } /** * Create a new factory with the given tags registered. */ public PrototypicalNodeFactory (Tag[] tags) { this (true); for (int i = 0; i < tags.length; i++) registerTag (tags[i]); } /** * Adds a tag to the registry. * @param id The name under which to register the tag. * @param tag The tag to be returned from a createTag(id) call. * @return The tag previously registered with that id, * or <code>null</code> if none. */ public Tag put (String id, Tag tag) { return ((Tag)mBlastocyst.put (id, tag)); } /** * Adds a tag to the registry. * @param id The name under which to register the tag. * @param tag The tag to be returned from a createTag(id) call. */ public Tag get (String id) { return ((Tag)mBlastocyst.get (id)); } /** * Remove a tag from the registry. * @param id The name under which to register the tag. * @return The tag that was registered with that id. */ public Tag remove (String id) { return ((Tag)mBlastocyst.remove (id)); } /** * Clean out the registry. 
*/ public void clear () { mBlastocyst = new Hashtable (); } public void registerTag (Tag tag) { String ids[]; ids = tag.getIds (); for (int i = 0; i < ids.length; i++) put (ids[i], tag); } public void unregisterTag (Tag tag) { String ids[]; ids = tag.getIds (); for (int i = 0; i < ids.length; i++) remove (ids[i]); } public PrototypicalNodeFactory registerTags () { registerTag (new AppletTag ()); registerTag (new BaseHrefTag ()); registerTag (new Bullet ()); registerTag (new BulletList ()); registerTag (new DoctypeTag ()); registerTag (new FormTag ()); registerTag (new FrameSetTag ()); registerTag (new FrameTag ()); registerTag (new ImageTag ()); registerTag (new InputTag ()); registerTag (new JspTag ()); registerTag (new LabelTag ()); registerTag (new LinkTag ()); registerTag (new MetaTag ()); registerTag (new OptionTag ()); registerTag (new ScriptTag ()); registerTag (new SelectTag ()); registerTag (new StyleTag ()); registerTag (new TableColumn ()); registerTag (new TableRow ()); registerTag (new TableTag ()); registerTag (new TextareaTag ()); registerTag (new TitleTag ()); registerTag (new Div ()); registerTag (new Span ()); registerTag (new BodyTag ()); registerTag (new HeadTag ()); registerTag (new Html ()); return (this); } // // NodeFactory interface // /** * Create a new string node. * @param page The page the node is on. * @param start The beginning position of the string. * @param end The ending positiong of the string. */ public Node createStringNode (Page page, int start, int end) { Node ret; ret = new StringNode (page, start, end); return (ret); } /** * Create a new remark node. * @param page The page the node is on. * @param start The beginning position of the remark. * @param end The ending positiong of the remark. */ public Node createRemarkNode (Page page, int start, int end) { return (new RemarkNode (page, start, end)); } /** * Create a new tag node. 
* Note that the attributes vector contains at least one element, * which is the tag name (standalone attribute) at position zero. * This can be used to decide which type of node to create, or * gate other processing that may be appropriate. * @param page The page the node is on. * @param start The beginning position of the tag. * @param end The ending positiong of the tag. * @param attributes The attributes contained in this tag. */ public Node createTagNode (Page page, int start, int end, Vector attributes) throws ParserException { Attribute attribute; String id; Tag prototype; Tag ret; ret = null; if (0 != attributes.size ()) { attribute = (Attribute)attributes.elementAt (0); id = attribute.getName (); if (null != id) { try { id = id.toUpperCase (); if (!id.startsWith ("/")) { if (id.endsWith ("/")) id = id.substring (0, id.length () - 1); prototype = (Tag)mBlastocyst.get (id); if (null != prototype) { ret = (Tag)prototype.clone (); ret.setPage (page); ret.setStartPosition (start); ret.setEndPosition (end); ret.setAttributesEx (attributes); } } } catch (CloneNotSupportedException cnse) { // default to creating a new one } } } if (null == ret) ret = new Tag (page, start, end, attributes); return (ret); } } Index: Parser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.75 retrieving revision 1.76 diff -C2 -d -r1.75 -r1.76 *** Parser.java 9 Nov 2003 17:07:08 -0000 1.75 --- Parser.java 7 Dec 2003 23:41:39 -0000 1.76 *************** *** 49,75 **** import org.htmlparser.nodeDecorators.EscapeCharacterRemovingNode; import org.htmlparser.nodeDecorators.NonBreakingSpaceConvertingNode; ! import org.htmlparser.scanners.AppletScanner; ! import org.htmlparser.scanners.BaseHrefScanner; ! import org.htmlparser.scanners.BodyScanner; ! import org.htmlparser.scanners.BulletListScanner; ! import org.htmlparser.scanners.CompositeTagScanner; ! 
import org.htmlparser.scanners.DivScanner; ! import org.htmlparser.scanners.DoctypeScanner; ! import org.htmlparser.scanners.FormScanner; ! import org.htmlparser.scanners.FrameSetScanner; ! import org.htmlparser.scanners.HeadScanner; ! import org.htmlparser.scanners.HtmlScanner; ! import org.htmlparser.scanners.ImageScanner; ! import org.htmlparser.scanners.JspScanner; ! import org.htmlparser.scanners.LinkScanner; ! import org.htmlparser.scanners.MetaTagScanner; ! import org.htmlparser.scanners.ScriptScanner; ! import org.htmlparser.scanners.StyleScanner; ! import org.htmlparser.scanners.TableScanner; ! import org.htmlparser.scanners.TagScanner; ! import org.htmlparser.scanners.TitleScanner; ! import org.htmlparser.tags.ImageTag; ! import org.htmlparser.tags.LinkTag; ! import org.htmlparser.tags.Tag; import org.htmlparser.util.DefaultParserFeedback; import org.htmlparser.util.IteratorImpl; --- 49,53 ---- import org.htmlparser.nodeDecorators.EscapeCharacterRemovingNode; import org.htmlparser.nodeDecorators.NonBreakingSpaceConvertingNode; ! import org.htmlparser.tags.Tag; // temporarily import org.htmlparser.util.DefaultParserFeedback; import org.htmlparser.util.IteratorImpl; *************** *** 87,143 **** * Typical usage of the parser is as follows : <BR> * [1] Create a parser object - passing the URL and a feedback object to the parser<BR> ! * [2] Register the common scanners. See {@link #registerScanners()} <BR> ! * You wouldnt do this if you want to configure a custom lightweight parser. In that case, ! * you would add the scanners of your choice using {@link #addScanner(TagScanner)}<BR> ! * [3] Enumerate through the elements from the parser object <BR> ! * It is important to note that the parsing occurs when you enumerate, ON DEMAND. This is a thread-safe way, ! * and you only get the control back after a particular element is parsed and returned. ! * ! * <BR> ! * Below is some sample code to parse Yahoo.com and print all the tags. ! * <pre> ! 
* Parser parser = new Parser("http://www.yahoo.com",new DefaultHTMLParserFeedback()); ! * // In this example, we are registering all the common scanners ! * parser.registerScanners(); ! * for (NodeIterator i = parser.elements();i.hasMoreNodes();) { ! * Node node = i.nextNode(); ! * node.print(); ! * } ! * </pre> Below is some sample code to parse Yahoo.com and print only the text ! * information. This scanning will run faster, as there are no scanners ! * registered here. ! * <pre> ! * Parser parser = new Parser("http://www.yahoo.com",new DefaultHTMLParserFeedback()); ! * // In this example, none of the scanners need to be registered ! * // as a string node is not a tag to be scanned for. ! * for (NodeIterator i = parser.elements();i.hasMoreNodes();) { ! * Node node = i.nextNode(); ! * if (node instanceof StringNode) { ! * StringNode stringNode = ! * (StringNode)node; ! * System.out.println(stringNode.getText()); ! * } ! * } ! * </pre> ! * The above snippet will print out only the text contents in the html document.<br> ! * Here's another snippet that will only print out the link urls in a document. ! * This is an example of adding a link scanner. ! * <pre> ! * Parser parser = new Parser("http://www.yahoo.com",new DefaultHTMLParserFeedback()); ! * parser.addScanner(new LinkScanner("-l")); ! * for (NodeIterator i = parser.elements();i.hasMoreNodes();) { ! * Node node = i.nextNode(); ! * if (node instanceof LinkTag) { ! * LinkTag linkTag = (LinkTag)node; ! * System.out.println(linkTag.getLink()); ! * } ! * } ! * </pre> * @see Parser#elements() */ public class Parser implements ! Serializable, ! NodeFactory { // Please don't change the formatting of the version variables below. --- 65,77 ---- * Typical usage of the parser is as follows : <BR> * [1] Create a parser object - passing the URL and a feedback object to the parser<BR> ! * [2] Enumerate through the elements from the parser object <BR> ! 
* It is important to note that the parsing occurs when you enumerate, ON DEMAND. ! * This is a thread-safe way, and you only get the control back after a ! * particular element is parsed and returned, which could be the entire body. * @see Parser#elements() */ public class Parser implements ! Serializable { // Please don't change the formatting of the version variables below. *************** *** 175,187 **** /** - * This object is used by the StringParser to create new StringNodes at runtime, based on - * use configurations of the factory - */ - private StringNodeFactory stringNodeFactory; - - /** * Feedback object. */ ! protected ParserFeedback feedback; /** --- 109,115 ---- /** * Feedback object. */ ! protected ParserFeedback mFeedback; /** *************** *** 191,210 **** /** - * The list of scanners to apply at the top level. - */ - protected Map mScanners; - - /** - * The list of tags to return at the top level. - * The list is keyed by tag name. - */ - protected Map mBlastocyst; - - /** - * The current scanner when recursing into a tag. - */ - protected TagScanner mScanner; - - /** * Variable to store lineSeparator. * This is setup to read <code>line.separator</code> from the System property. --- 119,122 ---- *************** *** 273,279 **** public Parser () { ! setFeedback (null); ! setScanners (null); ! setLexer (new Lexer (new Page (""))); } --- 185,189 ---- public Parser () { ! this (new Lexer (new Page ("")), noFeedback); } *************** *** 300,305 **** { setFeedback (fb); ! setScanners (null); setLexer (lexer); } --- 210,217 ---- { setFeedback (fb); ! if (null == lexer) ! throw new IllegalArgumentException ("lexer cannot be null"); setLexer (lexer); + setNodeFactory (new PrototypicalNodeFactory ()); } *************** *** 314,320 **** ParserException { ! setFeedback (fb); ! setScanners (null); ! setConnection (connection); } --- 226,230 ---- ParserException { ! 
this (new Lexer (connection), fb); } *************** *** 383,389 **** * Set the connection for this parser. * This method creates a new <code>Lexer</code> reading from the connection. ! * It does not adjust the <code>mScanners</code> list ! * or <code>feedback</code> object. Trying to ! * set the connection to null is a noop. * @param connection A fully conditioned connection. The connect() * method will be called so it need not be connected yet. --- 293,297 ---- * Set the connection for this parser. * This method creates a new <code>Lexer</code> reading from the connection. ! * Trying to set the connection to null is a noop. * @param connection A fully conditioned connection. The connect() * method will be called so it need not be connected yet. *************** *** 391,394 **** --- 299,303 ---- * HTTP header is not supported, or an i/o exception occurs creating the * lexer. + * @see #setLexer */ public void setConnection (URLConnection connection) *************** *** 414,420 **** * Set the URL for this parser. * This method creates a new Lexer reading from the given URL. ! * It does not adjust the <code>mScanners</code> list ! * or <code>feedback</code> object. Trying to set the url to null or an ! * empty string is a noop. * @see #setConnection(URLConnection) */ --- 323,327 ---- * Set the URL for this parser. * This method creates a new Lexer reading from the given URL. ! * Trying to set the url to null or an empty string is a noop. * @see #setConnection(URLConnection) */ *************** *** 460,465 **** /** * Set the lexer for this parser. ! * TIt does not adjust the <code>mScanners</code> list ! * or <code>feedback</code> object. * Trying to set the lexer to <code>null</code> is a noop. * @param lexer The lexer object to use. --- 367,373 ---- /** * Set the lexer for this parser. ! * The current NodeFactory is set on the given lexer, since the lexer ! * contains the node factory object. ! * It does not adjust the <code>feedback</code> object. 
* Trying to set the lexer to <code>null</code> is a noop. * @param lexer The lexer object to use. *************** *** 467,474 **** public void setLexer (Lexer lexer) { if (null != lexer) ! { mLexer = lexer; - mLexer.setNodeFactory (this); } } --- 375,388 ---- public void setLexer (Lexer lexer) { + NodeFactory factory; + if (null != lexer) ! { // move a node factory that's been set to the new lexer ! factory = null; ! if (null != getLexer ()) ! factory = getLexer ().getNodeFactory (); ! if (null != factory) ! lexer.setNodeFactory (factory); mLexer = lexer; } } *************** *** 484,520 **** /** ! * Get the number of scanners registered currently in the parser. ! * @return int number of scanners registered. ! */ ! public int getNumScanners() ! { ! return mScanners.size(); ! } ! ! /** ! * This method is to be used to change the set of scanners in the current parser. ! * @param newScanners List of scanner objects to be used during the parsing process. */ ! public void setScanners (Map newScanners) { ! Iterator iterator; ! TagScanner scanner; ! ! flushScanners (); ! if (null != newScanners) ! for (iterator = newScanners.entrySet ().iterator (); iterator.hasNext (); ) ! { ! scanner = (TagScanner)iterator.next (); ! addScanner (scanner); ! } } /** ! * Get the list of scanners registered currently in the parser ! * @return List of scanners currently registered in the parser */ ! public Map getScanners() { ! return mScanners; } --- 398,418 ---- /** ! * Get the current node factory. ! * @return The parser's node factory. */ ! public NodeFactory getNodeFactory () { ! return (getLexer ().getNodeFactory ()); } /** ! * Get the current node factory. ! * @return The parser's node factory. */ ! public void setNodeFactory (NodeFactory factory) { ! if (null == factory) ! throw new IllegalArgumentException ("node factory cannot be null"); ! getLexer ().setNodeFactory (factory); } *************** *** 523,529 **** * @param fb The new feedback object to use. */ ! 
public void setFeedback(ParserFeedback fb) { ! feedback = (null == fb) ? noFeedback : fb; } --- 421,427 ---- * @param fb The new feedback object to use. */ ! public void setFeedback (ParserFeedback fb) { ! mFeedback = (null == fb) ? noFeedback : fb; } *************** *** 532,537 **** * @return HTMLParserFeedback */ ! public ParserFeedback getFeedback() { ! return feedback; } --- 430,436 ---- * @return HTMLParserFeedback */ ! public ParserFeedback getFeedback() ! { ! return (mFeedback); } *************** *** 549,590 **** /** - * Add a new Tag Scanner. - * In typical situations where you require a no-frills parser, use the registerScanners() method to add the most - * common parsers. But when you wish to either compose a parser with only certain scanners registered, use this method. - * It is advantageous to register only the scanners you want, in order to achieve faster parsing speed. This method - * would also be of use when you have developed custom scanners, and need to register them into the parser. - * @param scanner TagScanner object (or derivative) to be added to the list of registered scanners. - */ - public void addScanner(TagScanner scanner) - { - String ids[]; - Tag tag; - - ids = scanner.getID(); - for (int i = 0; i < ids.length; i++) - { - mScanners.put (ids[i], scanner); - // for now, the only way to create a tag is to ask the scanner... - try - { - if (scanner instanceof CompositeTagScanner) - { - tag = ((CompositeTagScanner)scanner).createTag (null, 0, 0, null, null, null, null); - mBlastocyst.put (ids[i], tag); - } - else - { - tag = scanner.createTag (null, 0, 0, null, null, null); - mBlastocyst.put (ids[i], tag); - } - } - catch (Exception e) - { - e.printStackTrace (); - } - } - } - - /** * Returns an iterator (enumeration) to the html nodes. 
Each node can be a tag/endtag/ * string/link/image<br> --- 448,451 ---- *************** *** 593,597 **** * <pre> * Parser parser = new Parser("http://www.yahoo.com"); - * parser.registerScanners(); * for (NodeIterator i = parser.elements();i.hasMoreElements();) { * Node node = i.nextHTMLNode(); --- 454,457 ---- *************** *** 605,608 **** --- 465,469 ---- * if (node instanceof ...) { * // Downcast, and process + * // recursively (nodes within nodes) * } * } *************** *** 612,646 **** public NodeIterator elements () throws ParserException { ! return (new IteratorImpl (getLexer (), feedback)); ! } ! ! /** ! * Flush the current scanners registered. ! * The registered scanners list becomes empty with this call. ! */ ! public void flushScanners() ! { ! mScanners = new Hashtable (); ! mBlastocyst = new Hashtable (); ! } ! ! /** ! * Return the scanner registered in the parser having the ! * given id ! * @param id The id of the requested scanner ! * @return TagScanner The Tag Scanner ! */ ! public TagScanner getScanner (String id) ! { ! Tag tag; ! TagScanner ret; ! ! ret = null; ! ! tag = (Tag)mBlastocyst.get (id); ! if (null != tag) ! ret = (TagScanner)tag.getThisScanner (); ! ! return (ret); } --- 473,477 ---- public NodeIterator elements () throws ParserException { ! return (new IteratorImpl (getLexer (), getFeedback ())); } *************** *** 672,762 **** /** - * This method should be invoked in order to register some common scanners. 
- * The scanners that get added are : <br> - * LinkScanner (filter key "-l")<br> - * ImageScanner (filter key "-i")<br> - * ScriptScanner (filter key "-s") <br> - * StyleScanner (filter key "-t") <br> - * JspScanner (filter key "-j") <br> - * AppletScanner (filter key "-a") <br> - * MetaTagScanner (filter key "-m") <br> - * TitleScanner (filter key "-t") <br> - * DoctypeScanner (filter key "-d") <br> - * FormScanner (filter key "-f") <br> - * FrameSetScanner(filter key "-r") <br> - * BulletListScanner(filter key "-bulletList") <br> - * DivScanner(filter key "-div") <br> - * TableScanner(filter key "") <br> - * <br> - * Call this method after creating the Parser object. e.g. <BR> - * <pre> - * Parser parser = new Parser("http://www.yahoo.com"); - * parser.registerScanners(); - * </pre> - */ - public void registerScanners() { - if (mScanners.size()>0) - { - System.err.println("registerScanners() should be called first, when no other scanner has been registered."); - System.err.println("Other scanners already exist, hence this method call won't have any effect"); - return; - } - addScanner(new LinkScanner(LinkTag.LINK_TAG_FILTER)); - addScanner(new ImageScanner(ImageTag.IMAGE_TAG_FILTER)); - addScanner(new ScriptScanner("-s")); - addScanner(new StyleScanner("-t")); - addScanner(new JspScanner("-j")); - addScanner(new AppletScanner("-a")); - addScanner(new MetaTagScanner("-m")); - addScanner(new TitleScanner("-T")); - addScanner(new DoctypeScanner("-d")); - addScanner(new FormScanner("-f",this)); - addScanner(new FrameSetScanner("-r")); - addScanner(new BaseHrefScanner("-b")); - addScanner(new BulletListScanner("-bulletList",this)); - // addScanner(new SpanScanner("-p")); - addScanner(new DivScanner("-div")); - addScanner(new TableScanner(this)); - } - - /** - * Make a call to registerDomScanners(), instead of registerScanners(), - * when you are interested in retrieving a Dom representation of the html - * page. 
Upon parsing, you will receive an Html object - which will contain - * children, one of which would be the body. This is still evolving, and in - * future releases, you might see consolidation of Html - to provide you - * with methods to access the body and the head. - */ - public void registerDomScanners() { - registerScanners(); - addScanner(new HtmlScanner()); - addScanner(new BodyScanner()); - addScanner(new HeadScanner()); - } - - /** - * Removes a specified scanner object. You can create - * an anonymous object as a parameter. This method - * will use the scanner's key and remove it from the - * registry of scanners. - * e.g. - * <pre> - * removeScanner(new FormScanner("")); - * </pre> - * @param scanner TagScanner object to be removed from the list of registered scanners - */ - public void removeScanner(TagScanner scanner) - { - String[] ids; - - ids = scanner.getID (); - for (int i = 0; i < ids.length; i++) - { - mScanners.remove (ids[i]); - mBlastocyst.remove (ids[i]); - } - } - - /** * Opens a connection using the given url. * @param url The url to open. 
--- 503,506 ---- *************** *** 874,878 **** { Parser parser = new Parser (args[0]); - parser.registerScanners (); System.out.println ("Parsing " + parser.getURL ()); NodeFilter filter; --- 618,621 ---- *************** *** 959,968 **** } - public static Parser createLinkRecognizingParser(String inputHTML) { - Parser parser = createParser(inputHTML); - parser.addScanner(new LinkScanner(LinkTag.LINK_TAG_FILTER)); - return parser; - } - /** * @return String lineSeparator that will be used in toHTML() --- 702,705 ---- *************** *** 970,1080 **** public static String getLineSeparator() { return lineSeparator; - } - - public StringNodeFactory getStringNodeFactory() { - if (stringNodeFactory == null) - stringNodeFactory = new StringNodeFactory(); - return stringNodeFactory; - } - - public void setStringNodeFactory(StringNodeFactory stringNodeFactory) { - this.stringNodeFactory = stringNodeFactory; - } - - // - // NodeFactory interface - // - - /** - * Create a new string node. - * @param page The page the node is on. - * @param start The beginning position of the string. - * @param end The ending positiong of the string. - */ - public Node createStringNode (Page page, int start, int end) - { - Node ret; - - ret = new StringNode (page, start, end); - if (null != stringNodeFactory) - { - if (stringNodeFactory.shouldDecodeNodes ()) - ret = new DecodingNode (ret); - if (stringNodeFactory.shouldRemoveEscapeCharacters ()) - ret = new EscapeCharacterRemovingNode (ret); - if (stringNodeFactory.shouldConvertNonBreakingSpace ()) - ret = new NonBreakingSpaceConvertingNode (ret); - } - - return (ret); - } - - /** - * Create a new remark node. - * @param page The page the node is on. - * @param start The beginning position of the remark. - * @param end The ending positiong of the remark. - */ - public Node createRemarkNode (Page page, int start, int end) - { - return (new RemarkNode (page, start, end)); - } - - /** - * Create a new tag node. 
- * Note that the attributes vector contains at least one element, - * which is the tag name (standalone attribute) at position zero. - * This can be used to decide which type of node to create, or - * gate other processing that may be appropriate. - * @param page The page the node is on. - * @param start The beginning position of the tag. - * @param end The ending positiong of the tag. - * @param attributes The attributes contained in this tag. - */ - public Node createTagNode (Page page, int start, int end, Vector attributes) - throws - ParserException - { - Attribute attribute; - String id; - Tag prototype; - Tag ret; - - ret = null; - - if (0 != attributes.size ()) - { - attribute = (Attribute)attributes.elementAt (0); - id = attribute.getName (); - if (null != id) - { - try - { - id = id.toUpperCase (); - if (!id.startsWith ("/")) - { - if (id.endsWith ("/")) - id = id.substring (0, id.length () - 1); - prototype = (Tag)mBlastocyst.get (id); - if (null != prototype) - { - ret = (Tag)prototype.clone (); - ret.setPage (page); - ret.setStartPosition (start); - ret.setEndPosition (end); - ret.setAttributesEx (attributes); - } - } - } - catch (CloneNotSupportedException cnse) - { - // default to creating a new one - } - } - } - if (null == ret) - ret = new Tag (page, start, end, attributes); - - return (ret); } } --- 707,710 ---- Index: RemarkNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/RemarkNode.java,v retrieving revision 1.37 retrieving revision 1.38 diff -C2 -d -r1.37 -r1.38 *** RemarkNode.java 9 Nov 2003 17:07:08 -0000 1.37 --- RemarkNode.java 7 Dec 2003 23:41:39 -0000 1.38 *************** *** 41,64 **** org.htmlparser.lexer.nodes.RemarkNode { - public final static String REMARK_NODE_FILTER="-r"; - - // /** - // * Tag contents will have the contents of the comment tag. 
- // */ - // String tagContents; - // - // /** - // * The HTMLRemarkTag is constructed by providing the beginning posn, ending posn - // * and the tag contents. - // * @param nodeBegin beginning position of the tag - // * @param nodeEnd ending position of the tag - // * @param tagContents contents of the remark tag - // */ - // public RemarkNode(int nodeBegin, int nodeEnd, String tagContents) - // { - // super(nodeBegin,nodeEnd); - // this.tagContents = tagContents; - // } - /** * Constructor takes in the text string, beginning and ending posns. --- 41,44 ---- *************** *** 73,95 **** /** - * Print the contents of the remark tag. - */ - public String toString() - { - StringBuffer ret; - - ret = new StringBuffer (1024); - ret.append ("Comment Tag : "); - ret.append (getText ()); - ret.append ("; begins at : "); - ret.append (getStartPosition ()); - ret.append ("; ends at : "); - ret.append (getEndPosition ()); - ret.append ("\n"); - - return (ret.toString ()); - } - - /** * Remark visiting code. * @param visitor The <code>NodeVisitor</code> object to invoke --- 53,56 ---- *************** *** 100,103 **** ((NodeVisitor)visitor).visitRemarkNode (this); } - } --- 61,63 ---- Index: StringNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/StringNode.java,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** StringNode.java 9 Nov 2003 17:07:08 -0000 1.45 --- StringNode.java 7 Dec 2003 23:41:39 -0000 1.46 *************** *** 41,62 **** org.htmlparser.lexer.nodes.StringNode { - public static final String STRING_FILTER="-string"; - - // /** - // * The text of the string. - // */ - // protected StringBuffer textBuffer; - // - /** - * Constructor takes in the text string, beginning and ending posns. 
- * @param text The contents of the string line - * @param textBegin The beginning position of the string - * @param textEnd The ending positiong of the string - */ - public StringNode (StringBuffer text, int textBegin,int textEnd) - { - super(new Page (text.toString ()), textBegin,textEnd); - } - /** * Constructor takes in the text string, beginning and ending posns. --- 41,44 ---- *************** *** 68,86 **** { super (page, start, end); - } - - public String toString() - { - StringBuffer ret; - - ret = new StringBuffer (1024); - ret.append ("Text = "); - ret.append (getText ()); - ret.append ("; begins at : "); - ret.append (getStartPosition ()); - ret.append ("; ends at : "); - ret.append (getEndPosition ()); - - return (ret.toString ()); } --- 50,53 ---- Index: StringNodeFactory.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/StringNodeFactory.java,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** StringNodeFactory.java 9 Nov 2003 17:07:08 -0000 1.7 --- StringNodeFactory.java 7 Dec 2003 23:41:39 -0000 1.8 *************** *** 30,33 **** --- 30,34 ---- import java.io.Serializable; + import org.htmlparser.lexer.Page; import org.htmlparser.nodeDecorators.DecodingNode; *************** *** 35,45 **** import org.htmlparser.nodeDecorators.NonBreakingSpaceConvertingNode; ! public class StringNodeFactory implements Serializable { ! /** * Flag to tell the parser to decode strings returned by StringNode's toPlainTextString. * Decoding occurs via the method, org.htmlparser.util.Translate.decode() */ ! private boolean shouldDecodeNodes = false; --- 36,50 ---- import org.htmlparser.nodeDecorators.NonBreakingSpaceConvertingNode; ! public class StringNodeFactory ! extends ! PrototypicalNodeFactory ! implements ! Serializable ! { /** * Flag to tell the parser to decode strings returned by StringNode's toPlainTextString. 
* Decoding occurs via the method, org.htmlparser.util.Translate.decode() */ ! protected boolean mDecode; *************** *** 48,52 **** * Escape character removal occurs via the method, org.htmlparser.util.ParserUtils.removeEscapeCharacters() */ ! private boolean shouldRemoveEscapeCharacters = false; /** --- 53,57 ---- * Escape character removal occurs via the method, org.htmlparser.util.ParserUtils.removeEscapeCharacters() */ ! protected boolean mRemoveEscapes; /** *************** *** 54,98 **** * (i.e. \u00a0) to a space (" "). If true, this will happen inside StringNode's toPlainTextString. */ ! private boolean shouldConvertNonBreakingSpace = false; ! public Node createStringNode( ! StringBuffer textBuffer, ! int textBegin, ! int textEnd) { ! Node newNode = new StringNode(textBuffer, textBegin, textEnd); ! if (shouldDecodeNodes()) ! newNode = new DecodingNode(newNode); ! if (shouldRemoveEscapeCharacters()) ! newNode = new EscapeCharacterRemovingNode(newNode); ! if (shouldConvertNonBreakingSpace()) ! newNode = new NonBreakingSpaceConvertingNode(newNode); ! return newNode; } /** ! * Tells the parser to decode nodes using org.htmlparser.util.Translate.decode() */ ! public void setNodeDecoding(boolean shouldDecodeNodes) { ! this.shouldDecodeNodes = shouldDecodeNodes; ! } ! public boolean shouldDecodeNodes() { ! return shouldDecodeNodes; } ! public void setEscapeCharacterRemoval(boolean shouldRemoveEscapeCharacters) { ! this.shouldRemoveEscapeCharacters = shouldRemoveEscapeCharacters; } ! public boolean shouldRemoveEscapeCharacters() { ! return shouldRemoveEscapeCharacters; } ! public void setNonBreakSpaceConversion(boolean shouldConvertNonBreakSpace) { ! this.shouldConvertNonBreakingSpace = shouldConvertNonBreakSpace; } ! public boolean shouldConvertNonBreakingSpace() { ! return shouldConvertNonBreakingSpace; } } --- 59,148 ---- * (i.e. \u00a0) to a space (" "). If true, this will happen inside StringNode's toPlainTextString. */ ! 
protected boolean mConvertNonBreakingSpaces; ! ! public StringNodeFactory () ! { ! mDecode = false; ! mRemoveEscapes = false; ! mConvertNonBreakingSpaces = false; ! } ! // ! // NodeFactory interface override ! // ! ! /** ! * Create a new string node. ! * @param page The page the node is on. ! * @param start The beginning position of the string. ! * @param end The ending positiong of the string. ! */ ! public Node createStringNode (Page page, int start, int end) ! { ! Node ret; ! ! ret = super.createStringNode (page, start, end); ! if (getDecode ()) ! ret = new DecodingNode (ret); ! if (getRemoveEscapes ()) ! ret = new EscapeCharacterRemovingNode (ret); ! if (getConvertNonBreakingSpaces ()) ! ret = new NonBreakingSpaceConvertingNode (ret); ! ! return (ret); } /** ! * Set the decoding state. ! * @param decode If <code>true</code>, string nodes decode text using {@link org.htmlparser.util.Translate#decode}. */ ! public void setDecode (boolean decode) ! { ! mDecode = decode; ! } ! /** ! * Get the decoding state. ! * @return <code>true</code> if string nodes decode text. ! */ ! public boolean getDecode () ! { ! return (mDecode); } ! /** ! * Set the escape removing state. ! * @param decode If <code>true</code>, string nodes remove escape characters. ! */ ! public void setRemoveEscapes (boolean remove) ! { ! mRemoveEscapes = remove; } ! /** ! * Get the escape removing state. ! * @return The removing state. ! */ ! public boolean getRemoveEscapes () ! { ! return (mRemoveEscapes); } ! /** ! * Set the non-breaking space replacing state. ! * @param convert If <code>true</code>, string nodes replace ;nbsp; characters with spaces. ! */ ! public void setConvertNonBreakingSpaces (boolean convert) ! { ! mConvertNonBreakingSpaces = convert; } ! /** ! * Get the non-breaking space replacing state. ! * @return The replacing state. ! */ ! public boolean getConvertNonBreakingSpaces () ! { ! return (mConvertNonBreakingSpaces); } } |