[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Cursor.java,1.10,1.11 Lexer.java,1.11,1.12
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-10-05 13:50:18
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs1:/tmp/cvs-serv9618/lexer Modified Files: Cursor.java Lexer.java Log Message: Add bean-like accessors for positions on Node, AbstractNode and AbstractNodeDecorator. Handle null page in Cursor. Add smartquotes mode in Lexer and CompositeTagScannerHelper. Add simple name constructor in Attribute. Remove emptyxmltag member, replace with computing accessors in TagNode. Removed ScriptScannerHelper and moved scanning logic to ScriptScanner. Reworked extractImageLocn in ImageScanner. Implement extractXMLData in TagScanner. Made virtual tags zero-length in TagData. Added push() to IteratorImpl. Added single node constructor to NodeList. Numerous and various test adjustments. Still 133 failures. Index: Cursor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Cursor.java,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** Cursor.java 29 Sep 2003 00:00:38 -0000 1.10 --- Cursor.java 5 Oct 2003 13:49:41 -0000 1.11 *************** *** 137,146 **** ret = new StringBuffer (9 * 3 + 3); // three ints and delimiters ret.append (getPosition ()); - row = mPage.row (this); - column = mPage.column (this); ret.append ("["); ! ret.append (row); ret.append (","); ! ret.append (column); ret.append ("]"); --- 137,150 ---- ret = new StringBuffer (9 * 3 + 3); // three ints and delimiters ret.append (getPosition ()); ret.append ("["); ! if (null != mPage) ! ret.append (mPage.row (this)); ! else ! ret.append ("?"); ret.append (","); ! if (null != mPage) ! ret.append (mPage.column (this)); ! else ! 
ret.append ("?"); ret.append ("]"); Index: Lexer.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** Lexer.java 29 Sep 2003 00:00:38 -0000 1.11 --- Lexer.java 5 Oct 2003 13:49:42 -0000 1.12 *************** *** 229,232 **** --- 229,246 ---- ParserException { + return nextNode (false); + } + + /** + * Get the next node from the source. + * @param quotesmart If <code>true</code>, strings ignore quoted contents. + * @return A RemarkNode, StringNode or TagNode, or <code>null</code> if no + * more lexemes are present. + * @exception ParserException If there is a problem with the underlying page. + */ + public Node nextNode (boolean quotesmart) + throws + ParserException + { Cursor probe; char ch; *************** *** 258,262 **** probe.retreat (); // remark and tag need this character if ('-' == ch) ! ret = parseRemark (probe); else { --- 272,276 ---- probe.retreat (); // remark and tag need this character if ('-' == ch) ! ret = parseRemark (probe, quotesmart); else { *************** *** 267,274 **** } else ! ret = parseString (probe); break; default: ! ret = parseString (probe); break; } --- 281,288 ---- } else ! ret = parseString (probe, quotesmart); break; default: ! ret = parseString (probe, quotesmart); break; } *************** *** 282,287 **** * letter is encountered, or the input stream is exhausted, in which * case <code>null</code> is returned. */ ! protected Node parseString (Cursor cursor) throws ParserException --- 296,303 ---- * letter is encountered, or the input stream is exhausted, in which * case <code>null</code> is returned. + * @param cursor The position at which to start scanning. + * @param quotesmart If <code>true</code>, strings ignore quoted contents. */ ! 
protected Node parseString (Cursor cursor, boolean quotesmart) throws ParserException *************** *** 292,298 **** --- 308,316 ---- int begin; int end; + char quote; Node ret; done = false; + quote = 0; while (!done) { *************** *** 300,304 **** if (0 == ch) done = true; ! else if ('<' == ch) { ch = mPage.getCharacter (cursor); --- 318,326 ---- if (0 == ch) done = true; ! else if (quotesmart && (0 == quote) && (('\'' == ch) || ('"' == ch))) ! quote = ch; // enter quoted state ! else if (quotesmart && (ch == quote)) ! quote = 0; // exit quoted state ! else if ((0 == quote) && ('<' == ch)) { ch = mPage.getCharacter (cursor); *************** *** 314,319 **** else { ! // it's not a tag, so keep going, ! // the extra characters consumed are in this string } } --- 336,341 ---- else { ! // it's not a tag, so keep going, but check for quotes ! cursor.retreat (); } } *************** *** 443,446 **** --- 465,469 ---- * one slot for each whitespace or attribute/value pair. * The first slot is for attribute name (kind of like a standalone attribute). + * @param cursor The position at which to start scanning. */ protected Node parseTag (Cursor cursor) *************** *** 624,629 **** * in the remark text. * We allow terminators like --!> even though this isn't part of the spec. */ ! protected Node parseRemark (Cursor cursor) throws ParserException --- 647,654 ---- * in the remark text. * We allow terminators like --!> even though this isn't part of the spec. + * @param cursor The position at which to start scanning. + * @param quotesmart If <code>true</code>, strings ignore quoted contents. */ ! protected Node parseRemark (Cursor cursor, boolean quotesmart) throws ParserException *************** *** 644,648 **** state = 1; else ! return (parseString (cursor)); break; case 1: // prior to the second open delimiter --- 669,673 ---- state = 1; else ! 
return (parseString (cursor, quotesmart)); break; case 1: // prior to the second open delimiter *************** *** 650,654 **** state = 2; else ! return (parseString (cursor)); break; case 2: // prior to the first closing delimiter --- 675,679 ---- state = 2; else ! return (parseString (cursor, quotesmart)); break; case 2: // prior to the first closing delimiter *************** *** 656,660 **** state = 3; else if (0 == ch) ! return (parseString (cursor)); // no terminator break; case 3: // prior to the second closing delimiter --- 681,685 ---- state = 3; else if (0 == ch) ! return (parseString (cursor, quotesmart)); // no terminator break; case 3: // prior to the second closing delimiter |