[Htmlparser-cvs] htmlparser/src/org/htmlparser/scanners FormScanner.java,1.47,1.48 LinkScanner.java,1.56,1.57 TagScanner.java,1.44,1.45 TitleScanner.java,1.32,1.33
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-10-27 02:18:38
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs1:/tmp/cvs-serv25308/scanners Modified Files: FormScanner.java LinkScanner.java TagScanner.java TitleScanner.java Log Message: Some speed improvements; passing tags to evaluate, creating strings without string buffers, etc. Index: FormScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/FormScanner.java,v retrieving revision 1.47 retrieving revision 1.48 diff -C2 -d -r1.47 -r1.48 *** FormScanner.java 26 Oct 2003 19:46:19 -0000 1.47 --- FormScanner.java 27 Oct 2003 02:18:04 -0000 1.48 *************** *** 115,126 **** } ! public boolean evaluate(String s, TagScanner previousOpenScanner) { ! if (previousOpenScanner instanceof LinkScanner) { linkScannerAlreadyOpen = true; StringBuffer msg= new StringBuffer(); ! msg.append("<"); ! msg.append(s); ! msg.append(">"); msg.append(PREVIOUS_DIRTY_LINK_MESSAGE); feedback.warning(msg.toString()); --- 115,125 ---- } ! public boolean evaluate(Tag tag, TagScanner previousOpenScanner) { ! if (previousOpenScanner instanceof LinkScanner) ! { linkScannerAlreadyOpen = true; StringBuffer msg= new StringBuffer(); ! msg.append(tag.toHtml ()); msg.append(PREVIOUS_DIRTY_LINK_MESSAGE); feedback.warning(msg.toString()); *************** *** 133,137 **** else linkScannerAlreadyOpen = false; ! return super.evaluate(s, previousOpenScanner); } --- 132,136 ---- else linkScannerAlreadyOpen = false; ! return super.evaluate(tag, previousOpenScanner); } Index: LinkScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/LinkScanner.java,v retrieving revision 1.56 retrieving revision 1.57 diff -C2 -d -r1.56 -r1.57 *** LinkScanner.java 26 Oct 2003 19:46:20 -0000 1.56 --- LinkScanner.java 27 Oct 2003 02:18:04 -0000 1.57 *************** *** 89,117 **** /** ! 
* Template Method, used to decide if this scanner can handle the Link tag type. If ! * the evaluation returns true, the calling side makes a call to scan(). ! * @param s The complete text contents of the Tag. ! * @param previousOpenScanner Indicates any previous scanner which hasnt completed, before the current ! * scan has begun, and hence allows us to write scanners that can work with dirty html */ ! public boolean evaluate (String s, TagScanner previousOpenScanner) { ! char ch; ! boolean ret; ! ! // eat up leading blanks ! s = absorbLeadingBlanks (s); ! if (5 > s.length ()) ! ret = false; ! else ! { ! ch = s.charAt (0); ! if ((ch=='a' || ch=='A') && Character.isWhitespace (s.charAt (1))) ! ret = -1 != s.toUpperCase().indexOf ("HREF"); ! else ! ret = false; ! } ! ! return (ret); } --- 89,100 ---- /** ! * Check if we can handle this tag. ! * @param tag The generic tag with the name A. ! * @param previousOpenScanner Indicates any previous scanner which hasn't ! * completed, before the current scan has begun. */ ! public boolean evaluate (Tag tag, TagScanner previousOpenScanner) { ! return (null != tag.getAttributeEx ("HREF")); } Index: TagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TagScanner.java,v retrieving revision 1.44 retrieving revision 1.45 diff -C2 -d -r1.44 -r1.45 *** TagScanner.java 26 Oct 2003 19:46:21 -0000 1.44 --- TagScanner.java 27 Oct 2003 02:18:04 -0000 1.45 *************** *** 57,61 **** * <br> * If you wish to write your own scanner, then you must implement scan(). ! * You MAY implement evaluate() as well, if your evaluation logic is not based on a simple text match. * You MUST implement getID() - which identifies your scanner uniquely in the hashtable of scanners. * --- 57,62 ---- * <br> * If you wish to write your own scanner, then you must implement scan(). ! * You MAY implement evaluate() as well, if your evaluation logic is not based ! 
* on a match of the tag name. * You MUST implement getID() - which identifies your scanner uniquely in the hashtable of scanners. * *************** *** 136,221 **** } ! /** ! * This method is used to decide if this scanner can handle this tag type. If the ! * evaluation returns true, the calling side makes a call to scan(). ! * <strong>This method has to be implemented meaningfully only if a first-word match with ! * the scanner id does not imply a match (or extra processing needs to be done). ! * Default returns true</strong> ! * @param tagContents The complete text contents of the Tag. ! * @param previousOpenScanner Indicates any previous scanner which hasnt completed, before the current ! * scan has begun, and hence allows us to write scanners that can work with dirty html ! */ ! public boolean evaluate(String tagContents,TagScanner previousOpenScanner) { ! return true; ! } ! ! /** ! * Pull the text between two matching capitalized 'XML' tags. ! * @deprecated This reads ahead on your iterator and doesn't put them back if it's not an XML tag. ! */ ! public static String extractXMLData (Node node, String tagName, NodeIterator iterator) throws ! ParserException ! { ! try ! { ! String xmlData = ""; ! ! boolean xmlTagFound = isXMLTagFound (node, tagName); ! if (xmlTagFound) ! { ! try ! { ! do ! { ! node = iterator.nextNode (); ! if (node!=null) ! { ! if (node instanceof StringNode) ! { ! StringNode stringNode = (StringNode)node; ! if (xmlData.length ()>0) ! xmlData+=" "; ! xmlData += stringNode.getText (); ! } ! else ! if (!(node instanceof Tag && ((Tag)node).isEndTag ())) ! xmlTagFound = false; ! } ! } ! while (node instanceof StringNode); ! ! } ! ! catch (Exception e) ! { ! throw new ParserException ("TagScanner.extractXMLData() : error while trying to find xml tag",e); ! } ! } ! // check end tag matches start tag ! if (xmlTagFound) ! { ! if (node!=null) ! { ! if (node instanceof Tag && ((Tag)node).isEndTag ()) ! { ! Tag endTag = (Tag)node; ! 
if (!endTag.getTagName ().equals (tagName)) ! xmlTagFound = false; ! } ! ! } ! ! } ! if (xmlTagFound) ! return xmlData; ! else ! return null; ! } ! catch (Exception e) ! { ! throw new ParserException ("TagScanner.extractXMLData() : Error occurred while trying to extract xml tag",e); ! } ! } public String getFilter() { --- 137,224 ---- } ! /** ! * This method is used to decide if this scanner can handle this tag type. If the ! * evaluation returns true, the calling side makes a call to scan(). ! * <strong>This method has to be implemented meaningfully only if a first-word match with ! * the scanner id does not imply a match (or extra processing needs to be done). ! * Default returns true</strong> ! * @param tag The tag with a name that matches a value from {@link #getID}. ! * @param previousOpenScanner Indicates any previous scanner which hasn't ! * completed, before the current scan has begun, and hence allows us to ! * write scanners that can work with dirty html. ! */ ! public boolean evaluate (Tag tag, TagScanner previousOpenScanner) ! { ! return (true); ! } ! ! /** ! * Pull the text between two matching capitalized 'XML' tags. ! * @deprecated This reads ahead on your iterator and doesn't put them back if it's not an XML tag. ! */ ! public static String extractXMLData (Node node, String tagName, NodeIterator iterator) throws ! ParserException ! { ! try ! { ! String xmlData = ""; ! ! boolean xmlTagFound = isXMLTagFound (node, tagName); ! if (xmlTagFound) ! { ! try ! { ! do ! { ! node = iterator.nextNode (); ! if (node!=null) ! { ! if (node instanceof StringNode) ! { ! StringNode stringNode = (StringNode)node; ! if (xmlData.length ()>0) ! xmlData+=" "; ! xmlData += stringNode.getText (); ! } ! else ! if (!(node instanceof Tag && ((Tag)node).isEndTag ())) ! xmlTagFound = false; ! } ! } ! while (node instanceof StringNode); ! ! } ! ! catch (Exception e) ! { ! throw new ParserException ("TagScanner.extractXMLData() : error while trying to find xml tag",e); ! } ! } ! 
// check end tag matches start tag ! if (xmlTagFound) ! { ! if (node!=null) ! { ! if (node instanceof Tag && ((Tag)node).isEndTag ()) ! { ! Tag endTag = (Tag)node; ! if (!endTag.getTagName ().equals (tagName)) ! xmlTagFound = false; ! } ! ! } ! ! } ! if (xmlTagFound) ! return xmlData; ! else ! return null; ! } ! catch (Exception e) ! { ! throw new ParserException ("TagScanner.extractXMLData() : Error occurred while trying to extract xml tag",e); ! } ! } public String getFilter() { Index: TitleScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TitleScanner.java,v retrieving revision 1.32 retrieving revision 1.33 diff -C2 -d -r1.32 -r1.33 *** TitleScanner.java 26 Oct 2003 19:46:21 -0000 1.32 --- TitleScanner.java 27 Oct 2003 02:18:04 -0000 1.33 *************** *** 53,60 **** } ! public boolean evaluate(String tagNameBeingChecked, TagScanner previousOpenScanner) { ! absorbLeadingBlanks(tagNameBeingChecked); ! return (tagNameBeingChecked.toUpperCase ().startsWith (MATCH_NAME[0]) && ((null == previousOpenScanner) ! || !previousOpenScanner.getID ()[0].equals ("TITLE"))); } --- 53,59 ---- } ! public boolean evaluate(Tag tag, TagScanner previousOpenScanner) ! { ! return ((null == previousOpenScanner) || !previousOpenScanner.getID ()[0].equals ("TITLE")); } |