Thread: [Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Lexer.java,1.31,1.32 PageIndex.java,1.16,1.17 Page.java,1.41,1.42
Brought to you by:
derrickoswald
From: Derrick O. <der...@us...> - 2004-08-01 02:16:13
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv2072/src/org/htmlparser/lexer Modified Files: Lexer.java PageIndex.java Page.java Log Message: Speed optimizations based on profiling. Index: PageIndex.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/PageIndex.java,v retrieving revision 1.16 retrieving revision 1.17 diff -C2 -d -r1.16 -r1.17 *** PageIndex.java 2 Jan 2004 16:24:53 -0000 1.16 --- PageIndex.java 1 Aug 2004 02:16:04 -0000 1.17 *************** *** 45,51 **** { /** * Increment for allocations. */ ! protected static final int mIncrement = 100; /** --- 45,56 ---- { /** + * Starting increment for allocations. + */ + protected static final int mStartIncrement = 100; + + /** * Increment for allocations. */ ! protected int mIncrement; /** *************** *** 73,76 **** --- 78,82 ---- mIndices = new int[mIncrement]; mCount = 0; + mIncrement = mStartIncrement * 2; } *************** *** 136,148 **** { int position; int ret; - // find where it goes - ret = Sort.bsearch (this, cursor); - - // insert, but not twice position = cursor.getPosition (); ! if (!((ret < size ()) && (position == mIndices[ret]))) insertElementAt (position, ret); return (ret); --- 142,175 ---- { int position; + int last; int ret; position = cursor.getPosition (); ! if (0 == mCount) ! { ! 
ret = 0; insertElementAt (position, ret); + } + else + { + last = mIndices[mCount - 1]; + if (position == last) + ret = mCount - 1; + else + if (position > last) + { + ret = mCount; + insertElementAt (position, ret); + } + else + { + // find where it goes + ret = Sort.bsearch (this, cursor); + + // insert, but not twice + if (!((ret < size ()) && (position == mIndices[ret]))) + insertElementAt (position, ret); + } + } return (ret); *************** *** 304,307 **** --- 331,335 ---- { // allocate more space int new_values[] = new int[Math.max (capacity () + mIncrement, index + 1)]; + mIncrement *= 2; if (index < capacity ()) { Index: Page.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v retrieving revision 1.41 retrieving revision 1.42 diff -C2 -d -r1.41 -r1.42 *** Page.java 31 Jul 2004 16:42:31 -0000 1.41 --- Page.java 1 Aug 2004 02:16:04 -0000 1.42 *************** *** 369,372 **** --- 369,373 ---- stream = new Stream (getConnection ().getInputStream ()); } + try { *************** *** 952,968 **** public String getText () { ! String ret; ! ! try ! { ! ret = mSource.getString (0, mSource.offset ()); ! } ! catch (IOException ioe) ! { ! throw new IllegalArgumentException ( ! "can't get all the previous characters - " + ioe.getMessage ()); ! } ! ! return (ret); } --- 953,957 ---- public String getText () { ! return (getText (0, mSource.offset ())); } Index: Lexer.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** Lexer.java 31 Jul 2004 16:42:31 -0000 1.31 --- Lexer.java 1 Aug 2004 02:16:04 -0000 1.32 *************** *** 245,249 **** ParserException { ! Cursor probe; char ch; Node ret; --- 245,249 ---- ParserException { ! 
int start; char ch; Node ret; *************** *** 257,262 **** mDebugLineTrigger = lineno + 1; // trigger on subsequent lines too } ! probe = mCursor.dup (); ! ch = mPage.getCharacter (probe); switch (ch) { --- 257,262 ---- mDebugLineTrigger = lineno + 1; // trigger on subsequent lines too } ! start = mCursor.getPosition (); ! ch = mPage.getCharacter (mCursor); switch (ch) { *************** *** 265,299 **** break; case '<': ! ch = mPage.getCharacter (probe); if (0 == ch) ! ret = makeString (probe); else if ('%' == ch) { ! probe.retreat (); ! ret = parseJsp (probe); } else if ('/' == ch || '%' == ch || Character.isLetter (ch)) { ! probe.retreat (); ! ret = parseTag (probe); } else if ('!' == ch) { ! ch = mPage.getCharacter (probe); if (0 == ch) ! ret = makeString (probe); else { if ('>' == ch) // handle <!> ! ret = makeRemark (probe); else { ! probe.retreat (); // remark and tag need this character if ('-' == ch) ! ret = parseRemark (probe, quotesmart); else { ! probe.retreat (); // tag needs the previous one too ! ret = parseTag (probe); } } --- 265,299 ---- break; case '<': ! ch = mPage.getCharacter (mCursor); if (0 == ch) ! ret = makeString (start, mCursor.getPosition ()); else if ('%' == ch) { ! mCursor.retreat (); ! ret = parseJsp (start); } else if ('/' == ch || '%' == ch || Character.isLetter (ch)) { ! mCursor.retreat (); ! ret = parseTag (start); } else if ('!' == ch) { ! ch = mPage.getCharacter (mCursor); if (0 == ch) ! ret = makeString (start, mCursor.getPosition ()); else { if ('>' == ch) // handle <!> ! ret = makeRemark (start, mCursor.getPosition ()); else { ! mCursor.retreat (); // remark and tag need this character if ('-' == ch) ! ret = parseRemark (start, quotesmart); else { ! mCursor.retreat (); // tag needs the previous one too ! ret = parseTag (start); } } *************** *** 301,309 **** } else ! ret = parseString (probe, quotesmart); break; default: ! probe.retreat (); // string needs to see leading foreslash ! 
ret = parseString (probe, quotesmart); break; } --- 301,309 ---- } else ! ret = parseString (start, quotesmart); break; default: ! mCursor.retreat (); // string needs to see leading foreslash ! ret = parseString (start, quotesmart); break; } *************** *** 364,368 **** * @param quotesmart If <code>true</code>, strings ignore quoted contents. */ ! protected Node parseString (Cursor cursor, boolean quotesmart) throws ParserException --- 364,368 ---- * @param quotesmart If <code>true</code>, strings ignore quoted contents. */ ! protected Node parseString (int start, boolean quotesmart) throws ParserException *************** *** 376,402 **** while (!done) { ! ch = mPage.getCharacter (cursor); if (0 == ch) done = true; else if (0x1b == ch) // escape { ! ch = mPage.getCharacter (cursor); if (0 == ch) done = true; else if ('$' == ch) { ! ch = mPage.getCharacter (cursor); if (0 == ch) done = true; else if ('B' == ch) ! scanJIS (cursor); else { ! cursor.retreat (); ! cursor.retreat (); } } else ! cursor.retreat (); } else if (quotesmart && (0 == quote) && (('\'' == ch) || ('"' == ch))) --- 376,402 ---- while (!done) { ! ch = mPage.getCharacter (mCursor); if (0 == ch) done = true; else if (0x1b == ch) // escape { ! ch = mPage.getCharacter (mCursor); if (0 == ch) done = true; else if ('$' == ch) { ! ch = mPage.getCharacter (mCursor); if (0 == ch) done = true; else if ('B' == ch) ! scanJIS (mCursor); else { ! mCursor.retreat (); ! mCursor.retreat (); } } else ! mCursor.retreat (); } else if (quotesmart && (0 == quote) && (('\'' == ch) || ('"' == ch))) *************** *** 405,413 **** else if (quotesmart && (0 != quote) && ('\\' == ch)) { ! ch = mPage.getCharacter (cursor); //try to consume escaped character if ( (ch != '\\') // escaped backslash && (ch != quote)) // escaped quote character // ( reflects ["] or ['] whichever opened the quotation) ! cursor.retreat(); // unconsume char if character was not an escapable char. 
} else if (quotesmart && (ch == quote)) --- 405,413 ---- else if (quotesmart && (0 != quote) && ('\\' == ch)) { ! ch = mPage.getCharacter (mCursor); //try to consume escaped character if ( (ch != '\\') // escaped backslash && (ch != quote)) // escaped quote character // ( reflects ["] or ['] whichever opened the quotation) ! mCursor.retreat(); // unconsume char if character was not an escapable char. } else if (quotesmart && (ch == quote)) *************** *** 417,421 **** // handle multiline and double slash comments (with a quote) in script like: // I can't handle single quotations. ! ch = mPage.getCharacter (cursor); if (0 == ch) done = true; --- 417,421 ---- // handle multiline and double slash comments (with a quote) in script like: // I can't handle single quotations. ! ch = mPage.getCharacter (mCursor); if (0 == ch) done = true; *************** *** 423,427 **** { do ! ch = mPage.getCharacter (cursor); while ((ch != 0) && (ch != '\n')); } --- 423,427 ---- { do ! ch = mPage.getCharacter (mCursor); while ((ch != 0) && (ch != '\n')); } *************** *** 431,448 **** { do ! ch = mPage.getCharacter (cursor); while ((ch != 0) && (ch != '*')); ! ch = mPage.getCharacter (cursor); if (ch == '*') ! cursor.retreat (); } while ((ch != 0) && (ch != '/')); } else ! cursor.retreat (); } else if ((0 == quote) && ('<' == ch)) { ! ch = mPage.getCharacter (cursor); if (0 == ch) done = true; --- 431,448 ---- { do ! ch = mPage.getCharacter (mCursor); while ((ch != 0) && (ch != '*')); ! ch = mPage.getCharacter (mCursor); if (ch == '*') ! mCursor.retreat (); } while ((ch != 0) && (ch != '/')); } else ! mCursor.retreat (); } else if ((0 == quote) && ('<' == ch)) { ! ch = mPage.getCharacter (mCursor); if (0 == ch) done = true; *************** *** 451,466 **** { done = true; ! cursor.retreat (); ! cursor.retreat (); } else { // it's not a tag, so keep going, but check for quotes ! cursor.retreat (); } } } ! return (makeString (cursor)); } --- 451,466 ---- { done = true; ! 
mCursor.retreat (); ! mCursor.retreat (); } else { // it's not a tag, so keep going, but check for quotes ! mCursor.retreat (); } } } ! return (makeString (start, mCursor.getPosition ())); } *************** *** 468,487 **** * Create a string node based on the current cursor and the one provided. */ ! protected Node makeString (Cursor cursor) throws ParserException { int length; - int begin; - int end; Node ret; ! begin = mCursor.getPosition (); ! end = cursor.getPosition (); ! length = end - begin; if (0 != length) { // got some characters ! mCursor = cursor; ! ret = getNodeFactory ().createStringNode (this.getPage (), begin, end); } else --- 468,482 ---- * Create a string node based on the current cursor and the one provided. */ ! protected Node makeString (int start, int end) throws ParserException { int length; Node ret; ! length = end - start; if (0 != length) { // got some characters ! ret = getNodeFactory ().createStringNode (this.getPage (), start, end); } else *************** *** 583,587 **** * @return The parsed tag. */ ! protected Node parseTag (Cursor cursor) throws ParserException --- 578,582 ---- * @return The parsed tag. */ ! protected Node parseTag (int start) throws ParserException *************** *** 597,605 **** state = 0; bookmarks = new int[8]; ! bookmarks[0] = cursor.getPosition (); while (!done) { ! bookmarks[state + 1] = cursor.getPosition (); ! ch = mPage.getCharacter (cursor); switch (state) { --- 592,600 ---- state = 0; bookmarks = new int[8]; ! bookmarks[0] = mCursor.getPosition (); while (!done) { ! bookmarks[state + 1] = mCursor.getPosition (); ! ch = mPage.getCharacter (mCursor); switch (state) { *************** *** 610,615 **** { // don't consume the opening angle ! cursor.retreat (); ! bookmarks[state + 1] = cursor.getPosition (); } whitespace (attributes, bookmarks); --- 605,610 ---- { // don't consume the opening angle ! mCursor.retreat (); ! 
bookmarks[state + 1] = mCursor.getPosition (); } whitespace (attributes, bookmarks); *************** *** 628,633 **** { // don't consume the opening angle ! cursor.retreat (); ! bookmarks[state + 1] = cursor.getPosition (); } standalone (attributes, bookmarks); --- 623,628 ---- { // don't consume the opening angle ! mCursor.retreat (); ! bookmarks[state + 1] = mCursor.getPosition (); } standalone (attributes, bookmarks); *************** *** 718,722 **** standalone (attributes, bookmarks); bookmarks[0]=bookmarks[6]; ! cursor.retreat(); state=0; } --- 713,717 ---- standalone (attributes, bookmarks); bookmarks[0]=bookmarks[6]; ! mCursor.retreat(); state=0; } *************** *** 740,744 **** standalone (attributes, bookmarks); bookmarks[0]=bookmarks[6]; ! cursor.retreat(); state=0; } --- 735,739 ---- standalone (attributes, bookmarks); bookmarks[0]=bookmarks[6]; ! mCursor.retreat(); state=0; } *************** *** 749,753 **** } ! return (makeTag (cursor, attributes)); } --- 744,748 ---- } ! return (makeTag (start, mCursor.getPosition (), attributes)); } *************** *** 755,777 **** * Create a tag node based on the current cursor and the one provided. */ ! protected Node makeTag (Cursor cursor, Vector attributes) throws ParserException { int length; - int begin; - int end; Node ret; ! begin = mCursor.getPosition (); ! end = cursor.getPosition (); ! length = end - begin; if (0 != length) { // return tag based on second character, '/', '%', Letter (ch), '!' if (2 > length) // this is an error ! return (makeString (cursor)); ! mCursor = cursor; ! ret = getNodeFactory ().createTagNode (this.getPage (), begin, end, attributes); } else --- 750,767 ---- * Create a tag node based on the current cursor and the one provided. */ ! protected Node makeTag (int start, int end, Vector attributes) throws ParserException { int length; Node ret; ! length = end - start; if (0 != length) { // return tag based on second character, '/', '%', Letter (ch), '!' 
if (2 > length) // this is an error ! return (makeString (start, end)); ! ret = getNodeFactory ().createTagNode (this.getPage (), start, end, attributes); } else *************** *** 821,825 **** * @param quotesmart If <code>true</code>, strings ignore quoted contents. */ ! protected Node parseRemark (Cursor cursor, boolean quotesmart) throws ParserException --- 811,815 ---- * @param quotesmart If <code>true</code>, strings ignore quoted contents. */ ! protected Node parseRemark (int start, boolean quotesmart) throws ParserException *************** *** 833,837 **** while (!done) { ! ch = mPage.getCharacter (cursor); if (0 == ch) done = true; --- 823,827 ---- while (!done) { ! ch = mPage.getCharacter (mCursor); if (0 == ch) done = true; *************** *** 845,849 **** state = 1; else ! return (parseString (cursor, quotesmart)); break; case 1: // prior to the second open delimiter --- 835,839 ---- state = 1; else ! return (parseString (start, quotesmart)); break; case 1: // prior to the second open delimiter *************** *** 851,855 **** { // handle <!--> because netscape does ! ch = mPage.getCharacter (cursor); if (0 == ch) done = true; --- 841,845 ---- { // handle <!--> because netscape does ! ch = mPage.getCharacter (mCursor); if (0 == ch) done = true; *************** *** 858,867 **** else { ! cursor.retreat (); state = 2; } } else ! return (parseString (cursor, quotesmart)); break; case 2: // prior to the first closing delimiter --- 848,857 ---- else { ! mCursor.retreat (); state = 2; } } else ! return (parseString (start, quotesmart)); break; case 2: // prior to the first closing delimiter *************** *** 869,873 **** state = 3; else if (0 == ch) ! return (parseString (cursor, quotesmart)); // no terminator break; case 3: // prior to the second closing delimiter --- 859,863 ---- state = 3; else if (0 == ch) ! 
return (parseString (start, quotesmart)); // no terminator break; case 3: // prior to the second closing delimiter *************** *** 892,896 **** } ! return (makeRemark (cursor)); } --- 882,886 ---- } ! return (makeRemark (start, mCursor.getPosition ())); } *************** *** 898,920 **** * Create a remark node based on the current cursor and the one provided. */ ! protected Node makeRemark (Cursor cursor) throws ParserException { int length; - int begin; - int end; Node ret; ! begin = mCursor.getPosition (); ! end = cursor.getPosition (); ! length = end - begin; if (0 != length) { // return tag based on second character, '/', '%', Letter (ch), '!' if (2 > length) // this is an error ! return (makeString (cursor)); ! mCursor = cursor; ! ret = getNodeFactory ().createRemarkNode (this.getPage (), begin, end); } else --- 888,905 ---- * Create a remark node based on the current cursor and the one provided. */ ! protected Node makeRemark (int start, int end) throws ParserException { int length; Node ret; ! length = end - start; if (0 != length) { // return tag based on second character, '/', '%', Letter (ch), '!' if (2 > length) // this is an error ! return (makeString (start, end)); ! ret = getNodeFactory ().createRemarkNode (this.getPage (), start, end); } else *************** *** 930,934 **** * @param cursor The position at which to start scanning. */ ! protected Node parseJsp (Cursor cursor) throws ParserException --- 915,919 ---- * @param cursor The position at which to start scanning. */ ! protected Node parseJsp (int start) throws ParserException *************** *** 952,956 **** while (!done) { ! ch = mPage.getCharacter (cursor); switch (state) { --- 937,941 ---- while (!done) { ! ch = mPage.getCharacter (mCursor); switch (state) { *************** *** 977,987 **** case '=': // <%= case '@': // <%@ ! code = cursor.getPosition (); ! 
attributes.addElement (new PageAttribute (mPage, mCursor.getPosition () + 1, code, -1, -1, (char)0)); state = 2; break; default: // <%x ! code = cursor.getPosition () - 1; ! attributes.addElement (new PageAttribute (mPage, mCursor.getPosition () + 1, code, -1, -1, (char)0)); state = 2; break; --- 962,972 ---- case '=': // <%= case '@': // <%@ ! code = mCursor.getPosition (); ! attributes.addElement (new PageAttribute (mPage, start + 1, code, -1, -1, (char)0)); state = 2; break; default: // <%x ! code = mCursor.getPosition () - 1; ! attributes.addElement (new PageAttribute (mPage, start + 1, code, -1, -1, (char)0)); state = 2; break; *************** *** 1056,1060 **** if (0 != code) { ! state = cursor.getPosition () - 2; // reuse state attributes.addElement (new PageAttribute (mPage, code, state, -1, -1, (char)0)); attributes.addElement (new PageAttribute (mPage, state, state + 1, -1, -1, (char)0)); --- 1041,1045 ---- if (0 != code) { ! state = mCursor.getPosition () - 2; // reuse state attributes.addElement (new PageAttribute (mPage, code, state, -1, -1, (char)0)); attributes.addElement (new PageAttribute (mPage, state, state + 1, -1, -1, (char)0)); *************** *** 1064,1070 **** } else ! return (parseString (cursor, true)); // hmmm, true? ! return (makeTag (cursor, attributes)); } --- 1049,1055 ---- } else ! return (parseString (start, true)); // hmmm, true? ! return (makeTag (start, mCursor.getPosition (), attributes)); } |