[Htmlparser-cvs] htmlparser/src/org/htmlparser/scanners LinkScanner.java,1.38,1.39 TagScanner.java,1.21,1.22
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-05-17 12:12:53
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners
In directory sc8-pr-cvs1:/tmp/cvs-serv30035/org/htmlparser/scanners

Modified Files:
    LinkScanner.java TagScanner.java
Log Message:
Fix tab handling on the suggestion of oyoaha (philippe blanc).
Rewrite some string handling methods to remove gross inefficiencies.

Index: LinkScanner.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/LinkScanner.java,v
retrieving revision 1.38
retrieving revision 1.39
diff -C2 -d -r1.38 -r1.39
*** LinkScanner.java 12 May 2003 01:37:45 -0000 1.38
--- LinkScanner.java 17 May 2003 12:12:49 -0000 1.39
***************
*** 118,140 ****
       * scan has begun, and hence allows us to write scanners that can work with dirty html
       */
!     public boolean evaluate(String s,TagScanner previousOpenScanner)
!     {
!         // Eat up leading blanks
!         s = absorbLeadingBlanks(s);
!         boolean retVal;
!         char ch = s.charAt(0);
!
!         if (s.length()<5) retVal = false; else
!         if ((ch=='a' || ch=='A') && (s.charAt(1)==' ' || s.charAt(1)=='\n' || s.charAt(1)=='\r')) retVal = true; else retVal = false;
!         if (retVal)
!         {
!             if (s.toUpperCase().indexOf("HREF")==-1)
!                 retVal=false;
!         }
-         return retVal;
-
-     }
      /**
       * Extract the link from the given string. The URL of the actual html page is also
--- 118,142 ----
       * scan has begun, and hence allows us to write scanners that can work with dirty html
       */
!     public boolean evaluate (String s, TagScanner previousOpenScanner)
!     {
!         char ch;
!         boolean ret;
!
!         // eat up leading blanks
!         s = absorbLeadingBlanks (s);
!         if (5 > s.length ())
!             ret = false;
!         else
!         {
!             ch = s.charAt (0);
!             if ((ch=='a' || ch=='A') && Character.isWhitespace (s.charAt (1)))
!                 ret = -1 != s.toUpperCase().indexOf ("HREF");
!             else
!                 ret = false;
!         }
!         return (ret);
!     }
      /**
       * Extract the link from the given string. The URL of the actual html page is also

Index: TagScanner.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TagScanner.java,v
retrieving revision 1.21
retrieving revision 1.22
diff -C2 -d -r1.21 -r1.22
*** TagScanner.java 12 May 2003 01:37:45 -0000 1.21
--- TagScanner.java 17 May 2003 12:12:49 -0000 1.22
***************
*** 103,119 ****
      /**
!      * Insert the method's description here.
!      * Creation date: (6/18/2001 2:15:02 AM)
!      * @return java.lang.String
       */
!     public static String absorbLeadingBlanks(String s)
!     {
!         String temp = new String(s);
!         while (temp.length()!=0 && temp.charAt(0)==' ')
!         {
!             temp = temp.substring(1,temp.length());
!         }
!         return temp;
!     }
      /**
--- 103,129 ----
      /**
!      * Remove whitespace from the front of the given string.
!      * @param s The string to trim.
!      * @return Either the same string or a string with whitespace chopped off.
       */
!     public static String absorbLeadingBlanks (String s)
!     {
!         int length;
!         int i;
!         String ret;
!
!         i = 0;
!         length = s.length ();
!         while (i < length && Character.isWhitespace (s.charAt (i)))
!             i++;
!         if (0 == i)
!             ret = s;
!         else if (length == i)
!             ret = "";
!         else
!             ret = s.substring (i);
!
!         return (ret);
!     }
      /**
|