[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Lexer.java,1.44,1.45
Brought to you by:
derrickoswald
From: Derrick O. <der...@us...> - 2006-04-14 22:18:52
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv31675/src/org/htmlparser/lexer Modified Files: Lexer.java Log Message: Cleanup to isolate htmllexer jar build. Index: Lexer.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v retrieving revision 1.44 retrieving revision 1.45 diff -C2 -d -r1.44 -r1.45 *** Lexer.java 19 Mar 2006 21:26:32 -0000 1.44 --- Lexer.java 14 Apr 2006 22:18:47 -0000 1.45 *************** *** 59,62 **** --- 59,95 ---- NodeFactory { + // Please don't change the formatting of the version variables below. + // This is done so as to facilitate ant script processing. + + /** + * The floating point version number ({@value}). + */ + public static final double + VERSION_NUMBER = 1.6 + ; + + /** + * The type of version ({@value}). + */ + public static final String + VERSION_TYPE = "Integration Build" + ; + + /** + * The date of the version ({@value}). + */ + public static final String + VERSION_DATE = "Mar 19, 2006" + ; + + // End of formatting + + /** + * The display version ({@value}). + */ + public static final String VERSION_STRING = + "" + VERSION_NUMBER + + " (" + VERSION_TYPE + " " + VERSION_DATE + ")"; + /** * The page lexemes are retrieved from. *************** *** 84,87 **** --- 117,140 ---- protected static int mDebugLineTrigger = -1; + // + // Static methods + // + + /** + * Return the version string of this parser. + * @return A string of the form: + * <pre> + * "[floating point number] ([build-type] [build-date])" + * </pre> + */ + public static String getVersion () + { + return (VERSION_STRING); + } + + // + // Constructors + // + /** * Creates a new instance of a Lexer. *************** *** 124,137 **** } ! /** ! * Reset the lexer to start parsing from the beginning again. ! * The underlying components are reset such that the next call to ! * <code>nextNode()</code> will return the first lexeme on the page. ! */ ! public void reset () ! { ! getPage ().reset (); ! setCursor (new Cursor (getPage (), 0)); ! } /** --- 177,183 ---- } ! // ! // Bean patterns ! // /** *************** *** 234,237 **** --- 280,298 ---- } + // + // Public methods + // + + /** + * Reset the lexer to start parsing from the beginning again. + * The underlying components are reset such that the next call to + * <code>nextNode()</code> will return the first lexeme on the page. + */ + public void reset () + { + getPage ().reset (); + setCursor (new Cursor (getPage (), 0)); + } + /** * Get the next node from the source. *************** *** 333,336 **** --- 394,659 ---- /** + * Return CDATA as a text node. + * According to appendix <a href="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data"> + * B.3.2 Specifying non-HTML data</a> of the + * <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a>:<br> + * <quote> + * <b>Element content</b><br> + * When script or style data is the content of an element (SCRIPT and STYLE), + * the data begins immediately after the element start tag and ends at the + * first ETAGO ("</") delimiter followed by a name start character ([a-zA-Z]); + * note that this may not be the element's end tag. + * Authors should therefore escape "</" within the content. Escape mechanisms + * are specific to each scripting or style sheet language. + * </quote> + * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none. + * @exception ParserException If a problem occurs reading from the source. + */ + public Node parseCDATA () + throws + ParserException + { + return (parseCDATA (false)); + } + + /** + * Return CDATA as a text node. + * Slightly less rigid than {@link #parseCDATA()} this method provides for + * parsing CDATA that may contain quoted strings that have embedded + * ETAGO ("</") delimiters and skips single and multiline comments. + * @param quotesmart If <code>true</code> the strict definition of CDATA is + * extended to allow for single or double quoted ETAGO ("</") sequences. + * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none. + * @see #parseCDATA() + * @exception ParserException If a problem occurs reading from the source. + */ + public Node parseCDATA (boolean quotesmart) + throws + ParserException + { + int start; + int state; + boolean done; + char quote; + char ch; + int end; + boolean comment; + + start = mCursor.getPosition (); + state = 0; + done = false; + quote = 0; + comment = false; + + while (!done) + { + ch = mPage.getCharacter (mCursor); + switch (state) + { + case 0: // prior to ETAGO + switch (ch) + { + case Page.EOF: + done = true; + break; + case '\'': + if (quotesmart && !comment) + if (0 == quote) + quote = '\''; // enter quoted state + else if ('\'' == quote) + quote = 0; // exit quoted state + break; + case '"': + if (quotesmart && !comment) + if (0 == quote) + quote = '"'; // enter quoted state + else if ('"' == quote) + quote = 0; // exit quoted state + break; + case '\\': + if (quotesmart) + if (0 != quote) + { + ch = mPage.getCharacter (mCursor); // try to consume escaped character + if (Page.EOF == ch) + done = true; + else if ( (ch != '\\') && (ch != quote)) + mCursor.retreat (); // unconsume char if character was not an escapable char. + } + break; + case '/': + if (quotesmart) + if (0 == quote) + { + // handle multiline and double slash comments (with a quote) + ch = mPage.getCharacter (mCursor); + if (Page.EOF == ch) + done = true; + else if ('/' == ch) + comment = true; + else if ('*' == ch) + { + do + { + do + ch = mPage.getCharacter (mCursor); + while ((Page.EOF != ch) && ('*' != ch)); + ch = mPage.getCharacter (mCursor); + if (ch == '*') + mCursor.retreat (); + } + while ((Page.EOF != ch) && ('/' != ch)); + } + else + mCursor.retreat (); + } + break; + case '\n': + comment = false; + break; + case '<': + if (quotesmart) + { + if (0 == quote) + state = 1; + } + else + state = 1; + break; + default: + break; + } + break; + case 1: // < + switch (ch) + { + case Page.EOF: + done = true; + break; + case '/': + state = 2; + break; + case '!': + ch = mPage.getCharacter (mCursor); + if (Page.EOF == ch) + done = true; + else if ('-' == ch) + { + ch = mPage.getCharacter (mCursor); + if (Page.EOF == ch) + done = true; + else if ('-' == ch) + state = 3; + else + state = 0; + } + else + state = 0; + break; + default: + state = 0; + break; + } + break; + case 2: // </ + comment = false; + if (Page.EOF == ch) + done = true; + else if (Character.isLetter (ch)) + { + done = true; + // back up to the start of ETAGO + mCursor.retreat (); + mCursor.retreat (); + mCursor.retreat (); + } + else + state = 0; + break; + case 3: // <! + comment = false; + if (Page.EOF == ch) + done = true; + else if ('-' == ch) + { + ch = mPage.getCharacter (mCursor); + if (Page.EOF == ch) + done = true; + else if ('-' == ch) + { + ch = mPage.getCharacter (mCursor); + if (Page.EOF == ch) + done = true; + else if ('>' == ch) + state = 0; + else + { + mCursor.retreat (); + mCursor.retreat (); + } + } + else + mCursor.retreat (); + } + break; + default: + throw new IllegalStateException ("how the fuck did we get in state " + state); + } + } + end = mCursor.getPosition (); + + return (makeString (start, end)); + } + + // + // NodeFactory interface + // + + /** + * Create a new string node. + * @param page The page the node is on. + * @param start The beginning position of the string. + * @param end The ending positiong of the string. + * @return The created Text node. + */ + public Text createStringNode (Page page, int start, int end) + { + return (new TextNode (page, start, end)); + } + + /** + * Create a new remark node. + * @param page The page the node is on. + * @param start The beginning position of the remark. + * @param end The ending positiong of the remark. + * @return The created Remark node. + */ + public Remark createRemarkNode (Page page, int start, int end) + { + return (new RemarkNode (page, start, end)); + } + + /** + * Create a new tag node. + * Note that the attributes vector contains at least one element, + * which is the tag name (standalone attribute) at position zero. + * This can be used to decide which type of node to create, or + * gate other processing that may be appropriate. + * @param page The page the node is on. + * @param start The beginning position of the tag. + * @param end The ending positiong of the tag. + * @param attributes The attributes contained in this tag. + * @return The created Tag node. + */ + public Tag createTagNode (Page page, int start, int end, Vector attributes) + { + return (new TagNode (page, start, end, attributes)); + } + + // + // Internal methods + // + + /** * Advance the cursor through a JIS escape sequence. * @param cursor A cursor positioned within the escape sequence. *************** *** 1303,1565 **** } - /** - * Return CDATA as a text node. - * According to appendix <a href="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data"> - * B.3.2 Specifying non-HTML data</a> of the - * <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a>:<br> - * <quote> - * <b>Element content</b><br> - * When script or style data is the content of an element (SCRIPT and STYLE), - * the data begins immediately after the element start tag and ends at the - * first ETAGO ("</") delimiter followed by a name start character ([a-zA-Z]); - * note that this may not be the element's end tag. - * Authors should therefore escape "</" within the content. Escape mechanisms - * are specific to each scripting or style sheet language. - * </quote> - * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none. - * @exception ParserException If a problem occurs reading from the source. - */ - public Node parseCDATA () - throws - ParserException - { - return (parseCDATA (false)); - } - - /** - * Return CDATA as a text node. - * Slightly less rigid than {@link #parseCDATA()} this method provides for - * parsing CDATA that may contain quoted strings that have embedded - * ETAGO ("</") delimiters and skips single and multiline comments. - * @param quotesmart If <code>true</code> the strict definition of CDATA is - * extended to allow for single or double quoted ETAGO ("</") sequences. - * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none. - * @see #parseCDATA() - * @exception ParserException If a problem occurs reading from the source. - */ - public Node parseCDATA (boolean quotesmart) - throws - ParserException - { - int start; - int state; - boolean done; - char quote; - char ch; - int end; - boolean comment; - - start = mCursor.getPosition (); - state = 0; - done = false; - quote = 0; - comment = false; - - while (!done) - { - ch = mPage.getCharacter (mCursor); - switch (state) - { - case 0: // prior to ETAGO - switch (ch) - { - case Page.EOF: - done = true; - break; - case '\'': - if (quotesmart && !comment) - if (0 == quote) - quote = '\''; // enter quoted state - else if ('\'' == quote) - quote = 0; // exit quoted state - break; - case '"': - if (quotesmart && !comment) - if (0 == quote) - quote = '"'; // enter quoted state - else if ('"' == quote) - quote = 0; // exit quoted state - break; - case '\\': - if (quotesmart) - if (0 != quote) - { - ch = mPage.getCharacter (mCursor); // try to consume escaped character - if (Page.EOF == ch) - done = true; - else if ( (ch != '\\') && (ch != quote)) - mCursor.retreat (); // unconsume char if character was not an escapable char. - } - break; - case '/': - if (quotesmart) - if (0 == quote) - { - // handle multiline and double slash comments (with a quote) - ch = mPage.getCharacter (mCursor); - if (Page.EOF == ch) - done = true; - else if ('/' == ch) - comment = true; - else if ('*' == ch) - { - do - { - do - ch = mPage.getCharacter (mCursor); - while ((Page.EOF != ch) && ('*' != ch)); - ch = mPage.getCharacter (mCursor); - if (ch == '*') - mCursor.retreat (); - } - while ((Page.EOF != ch) && ('/' != ch)); - } - else - mCursor.retreat (); - } - break; - case '\n': - comment = false; - break; - case '<': - if (quotesmart) - { - if (0 == quote) - state = 1; - } - else - state = 1; - break; - default: - break; - } - break; - case 1: // < - switch (ch) - { - case Page.EOF: - done = true; - break; - case '/': - state = 2; - break; - case '!': - ch = mPage.getCharacter (mCursor); - if (Page.EOF == ch) - done = true; - else if ('-' == ch) - { - ch = mPage.getCharacter (mCursor); - if (Page.EOF == ch) - done = true; - else if ('-' == ch) - state = 3; - else - state = 0; - } - else - state = 0; - break; - default: - state = 0; - break; - } - break; - case 2: // </ - comment = false; - if (Page.EOF == ch) - done = true; - else if (Character.isLetter (ch)) - { - done = true; - // back up to the start of ETAGO - mCursor.retreat (); - mCursor.retreat (); - mCursor.retreat (); - } - else - state = 0; - break; - case 3: // <! - comment = false; - if (Page.EOF == ch) - done = true; - else if ('-' == ch) - { - ch = mPage.getCharacter (mCursor); - if (Page.EOF == ch) - done = true; - else if ('-' == ch) - { - ch = mPage.getCharacter (mCursor); - if (Page.EOF == ch) - done = true; - else if ('>' == ch) - state = 0; - else - { - mCursor.retreat (); - mCursor.retreat (); - } - } - else - mCursor.retreat (); - } - break; - default: - throw new IllegalStateException ("how the fuck did we get in state " + state); - } - } - end = mCursor.getPosition (); - - return (makeString (start, end)); - } - // ! // NodeFactory interface // /** - * Create a new string node. - * @param page The page the node is on. - * @param start The beginning position of the string. - * @param end The ending positiong of the string. - * @return The created Text node. - */ - public Text createStringNode (Page page, int start, int end) - { - return (new TextNode (page, start, end)); - } - - /** - * Create a new remark node. - * @param page The page the node is on. - * @param start The beginning position of the remark. - * @param end The ending positiong of the remark. - * @return The created Remark node. - */ - public Remark createRemarkNode (Page page, int start, int end) - { - return (new RemarkNode (page, start, end)); - } - - /** - * Create a new tag node. - * Note that the attributes vector contains at least one element, - * which is the tag name (standalone attribute) at position zero. - * This can be used to decide which type of node to create, or - * gate other processing that may be appropriate. - * @param page The page the node is on. - * @param start The beginning position of the tag. - * @param end The ending positiong of the tag. - * @param attributes The attributes contained in this tag. - * @return The created Tag node. - */ - public Tag createTagNode (Page page, int start, int end, Vector attributes) - { - return (new TagNode (page, start, end, attributes)); - } - - /** * Mainline for command line operation * @param args [0] The URL to parse. --- 1626,1634 ---- } // ! // Main program // /** * Mainline for command line operation * @param args [0] The URL to parse. *************** *** 1572,1585 **** ParserException { Lexer lexer; Node node; if (0 >= args.length) System.out.println ("usage: java -jar htmllexer.jar <url>"); else { try { ! ConnectionManager manager = Page.getConnectionManager (); lexer = new Lexer (manager.openConnection (args[0])); while (null != (node = lexer.nextNode (false))) --- 1641,1659 ---- ParserException { + ConnectionManager manager; Lexer lexer; Node node; if (0 >= args.length) + { + System.out.println ("HTML Lexer v" + getVersion () + "\n"); + System.out.println (); System.out.println ("usage: java -jar htmllexer.jar <url>"); + } else { try { ! manager = Page.getConnectionManager (); lexer = new Lexer (manager.openConnection (args[0])); while (null != (node = lexer.nextNode (false))) |