[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Cursor.java,1.11,1.12 Lexer.java,1.12,1.13 Page.java,1.18,1.19
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-10-13 21:48:50
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs1:/tmp/cvs-serv16902/lexer Modified Files: Cursor.java Lexer.java Page.java Log Message: Eliminated ParserHelper static class. Add fixAttributes() to handle bad tags. Provide for more than just an equals sign between the attribute name and the value. Unquote the values in getAttributes() hashtable. Fixed a bug regarding factory creation in script scanner. Returned temporaryFailures classes to serviceability. Skip JSP testing, fix tests broken because of unquoted attribute values. Some JavaDoc cleanup. Index: Cursor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Cursor.java,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** Cursor.java 5 Oct 2003 13:49:41 -0000 1.11 --- Cursor.java 13 Oct 2003 21:48:12 -0000 1.12 *************** *** 87,91 **** /** * Set the position of this cursor. ! * @param The new cursor position. */ public void setPosition (int position) --- 87,91 ---- /** * Set the position of this cursor. ! * @param position The new cursor position. */ public void setPosition (int position) Index: Lexer.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** Lexer.java 5 Oct 2003 13:49:42 -0000 1.12 --- Lexer.java 13 Oct 2003 21:48:12 -0000 1.13 *************** *** 386,390 **** private void empty (Vector attributes, int[] bookmarks) { ! attributes.addElement (new Attribute (mPage, bookmarks[1], bookmarks[2], bookmarks[2] + 1, bookmarks[2] + 1, (char)0)); //attributes.addElement (new Attribute (mPage.getText (bookmarks[1], bookmarks[2]), "", (char)0)); } --- 386,390 ---- private void empty (Vector attributes, int[] bookmarks) { ! 
attributes.addElement (new Attribute (mPage, bookmarks[1], bookmarks[2], bookmarks[2] + 1, -1, (char)0)); //attributes.addElement (new Attribute (mPage.getText (bookmarks[1], bookmarks[2]), "", (char)0)); } *************** *** 500,504 **** } break; ! case 1: // within attributre name if ((0 == ch) || ('>' == ch)) { --- 500,504 ---- } break; ! case 1: // within attribute name if ((0 == ch) || ('>' == ch)) { *************** *** 531,534 **** --- 531,540 ---- bookmarks[5] = bookmarks[3]; } + else if (Character.isWhitespace (ch)) + { + empty (attributes, bookmarks); + bookmarks[0] = bookmarks[3]; + state = 0; + } else state = 3; *************** *** 577,582 **** } } ! return (makeTag (cursor, attributes)); } --- 583,721 ---- } } ! ! // OK, before constructing the node, fix up erroneous attributes ! fixAttributes (attributes); ! return (makeTag (cursor, attributes)); + } + + /** + * Try to resolve bad attributes. + * Look for the following patterns and assume what they meant was the + * construct on the right: + * <p>Rule 1. + * <pre> + * att = -> att= + * </pre> + * An attribute named "=", converts a previous standalone attribute into + * an empty attribute. + * <p>Rule 2. + * <pre> + * att =value -> att=value + * </pre> + * An attribute name beginning with an equals sign, is the value of + * a previous standalone attribute. + * <p>Rule 3. + * <pre> + * att= "value" -> att="value" + * </pre> + * A quoted attribute name, is the value of a previous empty + * attribute. + * <p>Rule 4 and Rule 5. + * <pre> + * att="va"lue" -> att='va"lue' + * att='val'ue' -> att="val'ue" + * </pre> + * An attribute name ending in a quote is a second part of a + * similarly quoted value of a previous attribute. Note, this doesn't + * change the quote value but it should, or the contained quote should be + * removed. 
+ * <p>Note: + * <pre> + * att = "value" -> att="value" + * </pre> + * A quoted attribute name, is the value of a previous standalone + * attribute separated by an attribute named "=" will be handled by + * sequential application of rule 1 and 3. + */ + protected void fixAttributes (Vector attributes) throws ParserException + { + Attribute attribute; + Cursor cursor; + char ch1; // name starting character + char ch2; // name ending character + Attribute prev1; // attribute prior to the current + Attribute prev2; // attribute prior but one to the current + char quote; + + cursor = new Cursor (getPage (), 0); + prev1 = null; + prev2 = null; + // leave the name alone & start with second attribute + for (int i = 2; i < attributes.size (); ) + { + attribute = (Attribute)attributes.elementAt (i); + if (!attribute.isWhitespace ()) + { + cursor.setPosition (attribute.getNameStartPosition ()); + ch1 = attribute.getPage ().getCharacter (cursor); + cursor.setPosition (attribute.getNameEndPosition () - 1); + ch2 = attribute.getPage ().getCharacter (cursor); + if ('=' == ch1) + { // possible rule 1 or 2 + // check for a previous standalone, both rules need it, also check prev1 as a sanity check + if (null != prev2 && prev2.isStandAlone () && prev1.isWhitespace ()) + { + if (1 == attribute.getNameEndPosition () - attribute.getNameStartPosition ()) + { // rule 1, an isolated equals sign + prev2.setValueStartPosition (attribute.getNameEndPosition ()); + attributes.removeElementAt (i); // current + attributes.removeElementAt (i - 1); // whitespace + prev1 = prev2; + prev2 = null; + i--; + continue; + } + else + { + // rule 2, name starts with equals + prev2.setValueStartPosition (attribute.getNameStartPosition () + 1); // past the equals sign + prev2.setValueEndPosition (attribute.getNameEndPosition ()); + attributes.removeElementAt (i); // current + attributes.removeElementAt (i - 1); // whitespace + prev1 = prev2; + prev2 = null; + i--; + continue; + } + } + } + else if ((('\'' 
== ch1) && ('\'' == ch2)) || (('"' == ch1) && ('"' == ch2))) + { // possible rule 3 + // check for a previous empty, also check prev1 as a sanity check + if (null != prev2 && prev2.isEmpty () && prev1.isWhitespace ()) + { // TODO check that name has more than one character + prev2.setValueStartPosition (attribute.getNameStartPosition () + 1); + prev2.setValueEndPosition (attribute.getNameEndPosition () - 1); + prev2.setQuote (ch1); + attributes.removeElementAt (i); // current + attributes.removeElementAt (i - 1); // whitespace + prev1 = prev2; + prev2 = null; + i--; + continue; + } + } + else if (('\'' == ch2) || ('"' == ch2)) + { // possible rule 4 or 5 + // check for a previous valued attribute + if (null != prev1 && prev1.isValued ()) + { // check for a terminating quote of the same type + cursor.setPosition (prev1.getValueEndPosition ()); + ch1 = prev1.getPage ().getCharacter (cursor); // crossing pages with cursor? + if (ch1 == ch2) + { + prev1.setValueEndPosition (attribute.getNameEndPosition () - 1); + attributes.removeElementAt (i); // current + continue; + } + } + } + } + // shift and go on to next attribute + prev2 = prev1; + prev1 = attribute; + i++; + } } Index: Page.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v retrieving revision 1.18 retrieving revision 1.19 diff -C2 -d -r1.18 -r1.19 *** Page.java 30 Sep 2003 02:12:34 -0000 1.18 --- Page.java 13 Oct 2003 21:48:12 -0000 1.19 *************** *** 213,217 **** /** * Deserialize the page. ! * @see #writeObject * @param in The object stream to decode. */ --- 213,217 ---- /** * Deserialize the page. ! * For details see <code>writeObject()</code>. * @param in The object stream to decode. */ *************** *** 802,806 **** /** * Get the text line the position of the cursor lies on. ! * @param cursor The position to calculate for. 
* @return The contents of the URL or file corresponding to the line number * containg the cursor position. --- 802,806 ---- /** * Get the text line the position of the cursor lies on. ! * @param position The position to calculate for. * @return The contents of the URL or file corresponding to the line number * containg the cursor position. |