[Htmlparser-cvs] htmlparser/src/org/htmlparser/scanners CompositeTagScanner.java,1.75,1.76 FormScanner.java,1.50,1.51 ScriptScanner.java,1.45,1.46 TagScanner.java,1.45,1.46
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-10-28 12:55:11
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs1:/tmp/cvs-serv5437/scanners Modified Files: CompositeTagScanner.java FormScanner.java ScriptScanner.java TagScanner.java Log Message: Remove TagScanner cruft. Index: CompositeTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v retrieving revision 1.75 retrieving revision 1.76 diff -C2 -d -r1.75 -r1.76 *** CompositeTagScanner.java 28 Oct 2003 10:31:02 -0000 1.75 --- CompositeTagScanner.java 28 Oct 2003 12:54:21 -0000 1.76 *************** *** 238,242 **** scanner = parser.getScanner (name); if ((null != scanner) && scanner.evaluate (next, this)) ! node = scanner.createScannedNode (next, lexer.getPage ().getUrl (), lexer); } } --- 238,242 ---- scanner = parser.getScanner (name); if ((null != scanner) && scanner.evaluate (next, this)) ! node = scanner.scan (next, lexer.getPage ().getUrl (), lexer); } } *************** *** 252,255 **** --- 252,256 ---- composite = (CompositeTag)createTag (lexer.getPage (), tag.elementBegin (), endTag.elementEnd (), tag.getAttributesEx (), tag, endTag, nodeList); + composite.setThisScanner (this); for (int i = 0; i < composite.getChildCount (); i++) composite.childAt (i).setParent (composite); Index: FormScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/FormScanner.java,v retrieving revision 1.50 retrieving revision 1.51 diff -C2 -d -r1.50 -r1.51 *** FormScanner.java 28 Oct 2003 10:31:02 -0000 1.50 --- FormScanner.java 28 Oct 2003 12:54:21 -0000 1.51 *************** *** 45,50 **** { private static final String [] MATCH_ID = { "FORM" }; - public static final String PREVIOUS_DIRTY_LINK_MESSAGE="Encountered a form tag after an open link tag.\nThere should have been an end tag for the link before the form tag 
began.\nCorrecting this.."; - private boolean linkScannerAlreadyOpen=false; private static final String [] formTagEnders = {"FORM","HTML","BODY"}; --- 45,48 ---- *************** *** 110,133 **** { return MATCH_ID; - } - - public boolean evaluate(Tag tag, TagScanner previousOpenScanner) - { - if (previousOpenScanner instanceof LinkScanner) - { - linkScannerAlreadyOpen = true; - StringBuffer msg= new StringBuffer(); - msg.append(tag.toHtml ()); - msg.append(PREVIOUS_DIRTY_LINK_MESSAGE); - feedback.warning(msg.toString()); - // This is dirty HTML. Assume the current tag is - // not a new link tag - but an end tag. This is actually a really wild bug - - // Internet Explorer actually parses such tags. - // So - we shall then proceed to fool the scanner into sending an endtag of type </A> - // For this - set the dirty flag to true and return - } - else - linkScannerAlreadyOpen = false; - return super.evaluate(tag, previousOpenScanner); } --- 108,111 ---- Index: ScriptScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** ScriptScanner.java 28 Oct 2003 03:04:18 -0000 1.45 --- ScriptScanner.java 28 Oct 2003 12:54:21 -0000 1.46 *************** *** 169,172 **** --- 169,173 ---- //TODO: use the factory: ret = createTag (lexer.getPage (), tag.elementBegin(), end.elementEnd(), tag.getAttributesEx (), tag, end, new NodeList (last)); + ret.setThisScanner (this); } finally Index: TagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TagScanner.java,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** TagScanner.java 27 Oct 2003 02:18:04 -0000 1.45 --- TagScanner.java 28 Oct 2003 12:54:21 -0000 1.46 *************** *** 70,138 **** Serializable { ! /** ! 
* A filter which is used to associate this tag. The filter contains a string ! * that is used to match which tags are to be allowed to pass through. This can ! * be useful when one wishes to dynamically filter out all tags except one type ! * which may be programmed later than the parser. Is also useful for command line ! * implementations of the parser. ! */ ! protected String filter; ! ! /** ! * HTMLParserFeedback object automatically initialized ! */ ! protected ParserFeedback feedback; ! /** ! * Default Constructor, automatically registers the scanner into a static array of ! * scanners inside Tag ! */ ! public TagScanner() ! { ! this.filter=""; ! } ! /** ! * This constructor automatically registers the scanner, and sets the filter for this ! * tag. ! * @param filter The filter which will allow this tag to pass through. ! */ ! public TagScanner(String filter) ! { ! this.filter=filter; ! } ! /** ! * Insert the method's description here. ! * Creation date: (6/4/2001 11:44:09 AM) ! * @return java.lang.String ! * @param c char */ ! public String absorb(String s,char c) { ! int index = s.indexOf(c); ! if (index!=-1) s=s.substring(index+1,s.length()); ! return s; } /** ! * Remove whitespace from the front of the given string. ! * @param s The string to trim. ! * @return Either the same string or a string with whitespace chopped off. */ ! public static String absorbLeadingBlanks (String s) { ! int length; ! int i; ! String ret; ! ! i = 0; ! length = s.length (); ! while (i < length && Character.isWhitespace (s.charAt (i))) ! i++; ! if (0 == i) ! ret = s; ! else if (length == i) ! ret = ""; ! else ! ret = s.substring (i); ! ! return (ret); } --- 70,99 ---- Serializable { ! /** ! * A filter which is used to associate this tag. The filter contains a string ! * that is used to match which tags are to be allowed to pass through. This can ! * be useful when one wishes to dynamically filter out all tags except one type ! * which may be programmed later than the parser. 
Is also useful for command line ! * implementations of the parser. */ ! protected String filter; ! ! /** ! * Default Constructor, automatically registers the scanner into a static array of ! * scanners inside Tag ! */ ! public TagScanner () ! { ! this (""); } /** ! * This constructor automatically registers the scanner, and sets the filter for this ! * tag. ! * @param filter The filter which will allow this tag to pass through. */ ! public TagScanner (String filter) { ! this.filter=filter; } *************** *** 153,356 **** } ! /** ! * Pull the text between two matching capitalized 'XML' tags. ! * @deprecated This reads ahead on your iterator and doesn't put them back if it's not an XML tag. ! */ ! public static String extractXMLData (Node node, String tagName, NodeIterator iterator) ! throws ! ParserException { - try - { - String xmlData = ""; - - boolean xmlTagFound = isXMLTagFound (node, tagName); - if (xmlTagFound) - { - try - { - do - { - node = iterator.nextNode (); - if (node!=null) - { - if (node instanceof StringNode) - { - StringNode stringNode = (StringNode)node; - if (xmlData.length ()>0) - xmlData+=" "; - xmlData += stringNode.getText (); - } - else - if (!(node instanceof Tag && ((Tag)node).isEndTag ())) - xmlTagFound = false; - } - } - while (node instanceof StringNode); - - } - - catch (Exception e) - { - throw new ParserException ("TagScanner.extractXMLData() : error while trying to find xml tag",e); - } - } - // check end tag matches start tag - if (xmlTagFound) - { - if (node!=null) - { - if (node instanceof Tag && ((Tag)node).isEndTag ()) - { - Tag endTag = (Tag)node; - if (!endTag.getTagName ().equals (tagName)) - xmlTagFound = false; - } - - } - - } - if (xmlTagFound) - return xmlData; - else - return null; - } - catch (Exception e) - { - throw new ParserException ("TagScanner.extractXMLData() : Error occurred while trying to extract xml tag",e); - } - } - - public String getFilter() { return filter; } - public static boolean 
isXMLTagFound(Node node, String tagName) { - boolean xmlTagFound=false; - if (node instanceof Tag) { - Tag tag = (Tag)node; - if (tag.getText().toUpperCase().indexOf(tagName)==0) { - xmlTagFound=true; - } - } - return xmlTagFound; - } - - public final Tag createScannedNode(Tag tag,String url,Lexer lexer) throws ParserException { - Tag thisTag = scan(tag,url,lexer); - thisTag.setThisScanner(this); - thisTag.setAttributesEx(tag.getAttributesEx()); - return thisTag; - } - - /** - * Override this method to create your own tag type - * @param tagData - * @param tag - * @param url - * @return Tag - * @throws ParserException - */ - protected abstract Tag createTag(Page page, int start, int end, Vector attributes, Tag tag, String url) throws ParserException; - /** * Scan the tag and extract the information related to this type. The url of the * initiating scan has to be provided in case relative links are found. The initial * url is then prepended to it to give an absolute link. ! * The NodeReader is provided in order to do a lookahead operation. We assume that * the identification has already been performed using the evaluate() method. ! * @param tag HTML Tag to be scanned for identification ! * @param url The initiating url of the scan (Where the html page lies) ! * @param reader The reader object responsible for reading the html page */ ! public Tag scan(Tag tag,String url,Lexer lexer) throws ParserException ! { ! return (createTag(lexer.getPage (), tag.elementBegin(), tag.elementEnd(), tag.getAttributesEx (), tag, url)); ! } ! ! public String removeChars(String s,String occur) { ! StringBuffer newString = new StringBuffer(); ! int index; ! do { ! index = s.indexOf(occur); ! if (index!=-1) { ! newString.append(s.substring(0,index)); ! s=s.substring(index+occur.length()); ! } ! } ! while (index!=-1); ! newString.append(s); ! return newString.toString(); ! } ! ! public abstract String [] getID(); ! ! public final void setFeedback(ParserFeedback feedback) { ! 
this.feedback = feedback; ! } ! ! public static Map adjustScanners(Parser parser) { ! Map ret; ! ! ret = parser.getScanners(); ! // Remove all existing scanners ! parser.flushScanners(); return (ret); } - public static void restoreScanners(Parser parser, Hashtable tempScanners) - { - // Flush the scanners - parser.setScanners(tempScanners); - } - /** ! * Insert an EndTag in the currentLine, just before the occurence of the provided tag */ ! public String insertEndTagBeforeNode(AbstractNode node, String currentLine) { ! String newLine = currentLine.substring(0,node.elementBegin()); ! newLine += "</A>"; ! newLine += currentLine.substring(node.elementBegin(),currentLine.length()); ! return newLine; ! } ! ! // protected Tag getReplacedEndTag(Tag tag, NodeReader reader, String currentLine) { ! // // Replace tag - it was a <A> tag - replace with </a> ! // String newLine = replaceFaultyTagWithEndTag(tag, currentLine); ! // reader.changeLine(newLine); ! // return new EndTag( ! // new TagData( ! // tag.elementBegin(), ! // tag.elementBegin()+3, ! // tag.getTagName(), ! // currentLine ! // ) ! // ); ! // } ! ! public String replaceFaultyTagWithEndTag(Tag tag, String currentLine) { ! String newLine = currentLine.substring(0,tag.elementBegin()); ! newLine+="</"+tag.getTagName()+">"; ! newLine+=currentLine.substring(tag.elementEnd()+1,currentLine.length()); ! ! return newLine; ! } ! ! // protected Tag getInsertedEndTag(Tag tag, String currentLine) { ! // // Insert end tag ! // String newLine = insertEndTagBeforeNode(tag, currentLine); ! // reader.changeLine(newLine); ! // return new EndTag( ! // new TagData( ! // tag.elementBegin(), ! // tag.elementBegin()+3, ! // tag.getTagName(), ! // currentLine ! // ) ! // ); ! // } ! } --- 114,154 ---- } ! public String getFilter() { return filter; } /** * Scan the tag and extract the information related to this type. The url of the * initiating scan has to be provided in case relative links are found. 
The initial * url is then prepended to it to give an absolute link. ! * The Lexer is provided in order to do a lookahead operation. We assume that * the identification has already been performed using the evaluate() method. ! * @param tag HTML Tag to be scanned for identification. ! * @param url The initiating url of the scan (Where the html page lies). ! * @param lexer Provides html page access. ! * @return The resultant tag (may be unchanged). */ ! public Tag scan (Tag tag, String url, Lexer lexer) throws ParserException { ! Tag ret; ! ! ret = createTag(lexer.getPage (), tag.elementBegin(), tag.elementEnd(), tag.getAttributesEx (), tag, url); ! ret.setThisScanner(this); return (ret); } /** ! * Create a tag. ! * Override this method to create your own tag type. ! * @param tagData ! * @param tag ! * @param url ! * @return Tag ! * @throws ParserException */ ! protected abstract Tag createTag(Page page, int start, int end, Vector attributes, Tag tag, String url) throws ParserException; + public abstract String [] getID(); } |