[Htmlparser-cvs] htmlparser/src/org/htmlparser/scanners CompositeTagScanner.java,1.67,1.68 DoctypeSc
Brought to you by:
derrickoswald
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs1:/tmp/cvs-serv30684/scanners Modified Files: CompositeTagScanner.java DoctypeScanner.java ImageScanner.java JspScanner.java ScriptScanner.java TagScanner.java Log Message: Lexer Integration. Removed old Parser classes. Removed EndTag; this class was replaced by a call to the new isEndTag() method on the Tag class. The StringNode, RemarkNode and tags.Tag classes now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as its own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a Hashtable. Index: CompositeTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v retrieving revision 1.67 retrieving revision 1.68 diff -C2 -d -r1.67 -r1.68 *** CompositeTagScanner.java 22 Sep 2003 02:40:00 -0000 1.67 --- CompositeTagScanner.java 28 Sep 2003 15:33:58 -0000 1.68 *************** *** 33,39 **** import org.htmlparser.Node; ! import org.htmlparser.NodeReader; import org.htmlparser.parserHelper.CompositeTagScannerHelper; - import org.htmlparser.tags.EndTag; import org.htmlparser.tags.Tag; import org.htmlparser.tags.data.CompositeTagData; --- 33,38 ---- import org.htmlparser.Node; ! 
import org.htmlparser.lexer.Lexer; import org.htmlparser.parserHelper.CompositeTagScannerHelper; import org.htmlparser.tags.Tag; import org.htmlparser.tags.data.CompositeTagData; *************** *** 177,183 **** } ! public Tag scan(Tag tag, String url, NodeReader reader,String currLine) throws ParserException { CompositeTagScannerHelper helper = ! new CompositeTagScannerHelper(this,tag,url,reader,currLine,balance_quotes); return helper.scan(); } --- 176,182 ---- } ! public Tag scan (Tag tag, String url, Lexer lexer) throws ParserException { CompositeTagScannerHelper helper = ! new CompositeTagScannerHelper(this, tag, lexer, balance_quotes); return helper.scan(); } *************** *** 199,202 **** --- 198,214 ---- /** + * For composite tags this shouldn't be used and hence throws an exception. + * @param tagData + * @param tag + * @param url + * @return Tag + * @throws ParserException + */ + protected Tag createTag(TagData tagData, Tag tag, String url) throws ParserException + { + throw new IllegalStateException ("composite tags shouldn't be using this"); + } + + /** * You must override this method to create the tag of your choice upon successful parsing. Data required * for construction of your tag can be found within tagData and compositeTagData *************** *** 205,210 **** public final boolean isTagToBeEndedFor(Tag tag) { ! boolean isEndTag = tag instanceof EndTag; String tagName = tag.getTagName(); if ( ( isEndTag && endTagEnderSet.contains(tagName)) || --- 217,224 ---- public final boolean isTagToBeEndedFor(Tag tag) { ! 
boolean isEndTag = tag.isEndTag (); String tagName = tag.getTagName(); + if (isEndTag) + tagName = tagName.substring (1); if ( ( isEndTag && endTagEnderSet.contains(tagName)) || Index: DoctypeScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/DoctypeScanner.java,v retrieving revision 1.27 retrieving revision 1.28 diff -C2 -d -r1.27 -r1.28 *** DoctypeScanner.java 22 Sep 2003 02:40:00 -0000 1.27 --- DoctypeScanner.java 28 Sep 2003 15:33:58 -0000 1.28 *************** *** 59,63 **** String tagContents = tag.getText(); tagContents=tagContents.substring(9,tagContents.length()); ! tagData.setTagContents(tagContents); return new DoctypeTag(tagData); } --- 59,63 ---- String tagContents = tag.getText(); tagContents=tagContents.substring(9,tagContents.length()); ! tagData.setTagContents (tagContents, tag.getAttributesEx (), "" /*url*/, false /*xml_end_tag*/); return new DoctypeTag(tagData); } Index: ImageScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ImageScanner.java,v retrieving revision 1.28 retrieving revision 1.29 diff -C2 -d -r1.28 -r1.29 *** ImageScanner.java 22 Sep 2003 02:40:00 -0000 1.28 --- ImageScanner.java 28 Sep 2003 15:33:58 -0000 1.29 *************** *** 49,53 **** { public static final String IMAGE_SCANNER_ID = "IMG"; - private Hashtable table; private LinkProcessor processor; /** --- 49,52 ---- *************** *** 73,104 **** * @param url URL of web page being parsed. */ ! public String extractImageLocn(Tag tag,String url) throws ParserException { ! String relativeLink=null; ! try { ! table = tag.getAttributes(); ! relativeLink = (String)table.get("SRC"); ! if (relativeLink!=null) { ! relativeLink = ParserUtils.removeChars(relativeLink,'\n'); ! relativeLink = ParserUtils.removeChars(relativeLink,'\r'); ! } ! 
if (relativeLink==null || relativeLink.length()==0) { ! // try fix ! String tagText = tag.getText().toUpperCase(); ! int indexSrc = tagText.indexOf("SRC"); ! if (indexSrc != -1) { ! // There is a missing equals. ! tag.setText(tag.getText().substring(0,indexSrc+3)+"="+tag.getText().substring(indexSrc+3,tag.getText().length())); ! table = tag.redoParseAttributes(); ! relativeLink = (String) table.get("SRC"); ! } } ! if (relativeLink==null) return ""; else ! return processor.extract(relativeLink,url); } ! catch (Exception e) { ! throw new ParserException("HTMLImageScanner.extractImageLocn() : Error in extracting image location, relativeLink = "+relativeLink+", url = "+url,e); } } --- 72,106 ---- * @param url URL of web page being parsed. */ ! public String extractImageLocn (Tag tag,String url) throws ParserException { ! String ret; ! Hashtable table; ! ret = ""; ! try ! { ! table = tag.getAttributes (); ! ret = (String)table.get ("SRC"); ! if (null != ret) ! { ! ret = ParserUtils.removeChars (ret, '\n'); ! ret = ParserUtils.removeChars (ret, '\r'); ! ret = processor.extract (ret, url); } ! else ! ret = ""; } ! catch (Exception e) ! { ! throw new ParserException ( ! "ImageScanner.extractImageLocn() : " ! + "Error in extracting image location, relativeLink = " ! + ret ! + ", url = " ! + url, ! e); } + + return (ret); } Index: JspScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/JspScanner.java,v retrieving revision 1.27 retrieving revision 1.28 diff -C2 -d -r1.27 -r1.28 *** JspScanner.java 22 Sep 2003 02:40:00 -0000 1.27 --- JspScanner.java 28 Sep 2003 15:33:58 -0000 1.28 *************** *** 57,63 **** protected Tag createTag(TagData tagData, Tag tag, String url) ! throws ParserException { ! String tagContents = tagData.getTagContents(); ! 
tagData.setTagContents(tagContents.substring(1,tagContents.length()-1)); return new JspTag(tagData); } --- 57,64 ---- protected Tag createTag(TagData tagData, Tag tag, String url) ! throws ParserException ! { ! tagData.setTagBegin (tagData.getTagBegin () + 1); ! tagData.setTagEnd (tagData.getTagEnd () - 1); return new JspTag(tagData); } Index: ScriptScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** ScriptScanner.java 22 Sep 2003 02:40:00 -0000 1.39 --- ScriptScanner.java 28 Sep 2003 15:33:58 -0000 1.40 *************** *** 32,35 **** --- 32,36 ---- ///////////////////////// import org.htmlparser.*; + import org.htmlparser.lexer.Lexer; import org.htmlparser.parserHelper.*; import org.htmlparser.tags.*; *************** *** 67,75 **** } ! public Tag scan(Tag tag, String url, NodeReader nodeReader, String currLine) throws ParserException { try { ScriptScannerHelper helper = ! new ScriptScannerHelper(tag,url,nodeReader,currLine, this); return helper.scan(); --- 68,76 ---- } ! public Tag scan (Tag tag, Lexer lexer) throws ParserException { try { ScriptScannerHelper helper = ! new ScriptScannerHelper(tag, lexer, this); return helper.scan(); Index: TagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TagScanner.java,v retrieving revision 1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** TagScanner.java 22 Sep 2003 02:40:00 -0000 1.40 --- TagScanner.java 28 Sep 2003 15:33:58 -0000 1.41 *************** *** 34,43 **** import java.util.Hashtable; import java.util.Map; import org.htmlparser.AbstractNode; import org.htmlparser.Node; ! import org.htmlparser.NodeReader; import org.htmlparser.StringNode; ! 
import org.htmlparser.tags.EndTag; import org.htmlparser.tags.Tag; import org.htmlparser.tags.data.TagData; --- 34,44 ---- import java.util.Hashtable; import java.util.Map; + import java.util.Vector; import org.htmlparser.AbstractNode; import org.htmlparser.Node; ! import org.htmlparser.Parser; import org.htmlparser.StringNode; ! import org.htmlparser.lexer.Lexer; import org.htmlparser.tags.Tag; import org.htmlparser.tags.data.TagData; *************** *** 141,186 **** return true; } ! public static String extractXMLData(Node node, String tagName, NodeReader reader) throws ParserException{ ! try { ! String xmlData = ""; ! ! boolean xmlTagFound = isXMLTagFound(node, tagName); ! if (xmlTagFound) { ! try{ ! do { ! node = reader.readElement(); ! if (node!=null) { ! if (node instanceof StringNode) { ! StringNode stringNode = (StringNode)node; ! if (xmlData.length()>0) xmlData+=" "; ! xmlData += stringNode.getText(); ! } else if (!(node instanceof org.htmlparser.tags.EndTag)) ! xmlTagFound = false; ! } ! } ! while (node instanceof StringNode); ! ! } ! ! catch (Exception e) { ! throw new ParserException("HTMLTagScanner.extractXMLData() : error while trying to find xml tag",e); ! } ! } ! if (xmlTagFound) { ! if (node!=null) { ! if (node instanceof org.htmlparser.tags.EndTag) { ! org.htmlparser.tags.EndTag endTag = (org.htmlparser.tags.EndTag)node; ! if (!endTag.getText().equals(tagName)) xmlTagFound = false; ! } ! ! } ! ! } ! if (xmlTagFound) return xmlData; else return null; ! } ! catch (Exception e) { ! throw new ParserException("HTMLTagScanner.extractXMLData() : Error occurred while trying to extract xml tag",e); ! } ! } public String getFilter() { --- 142,188 ---- return true; } ! ! // public static String extractXMLData(Node node, String tagName, NodeReader reader) throws ParserException{ ! // try { ! // String xmlData = ""; ! // ! // boolean xmlTagFound = isXMLTagFound(node, tagName); ! // if (xmlTagFound) { ! // try{ ! // do { ! // node = reader.readElement(); ! 
// if (node!=null) { ! // if (node instanceof StringNode) { ! // StringNode stringNode = (StringNode)node; ! // if (xmlData.length()>0) xmlData+=" "; ! // xmlData += stringNode.getText(); ! // } else if (!(node instanceof org.htmlparser.tags.EndTag)) ! // xmlTagFound = false; ! // } ! // } ! // while (node instanceof StringNode); ! // ! // } ! // ! // catch (Exception e) { ! // throw new ParserException("HTMLTagScanner.extractXMLData() : error while trying to find xml tag",e); ! // } ! // } ! // if (xmlTagFound) { ! // if (node!=null) { ! // if (node instanceof org.htmlparser.tags.EndTag) { ! // org.htmlparser.tags.EndTag endTag = (org.htmlparser.tags.EndTag)node; ! // if (!endTag.getText().equals(tagName)) xmlTagFound = false; ! // } ! // ! // } ! // ! // } ! // if (xmlTagFound) return xmlData; else return null; ! // } ! // catch (Exception e) { ! // throw new ParserException("HTMLTagScanner.extractXMLData() : Error occurred while trying to extract xml tag",e); ! // } ! // } public String getFilter() { *************** *** 199,204 **** } ! public final Tag createScannedNode(Tag tag,String url,NodeReader reader,String currLine) throws ParserException { ! Tag thisTag = scan(tag,url,reader,currLine); thisTag.setThisScanner(this); thisTag.setAttributesEx(tag.getAttributesEx()); --- 201,206 ---- } ! public final Tag createScannedNode(Tag tag,String url,Lexer lexer) throws ParserException { ! Tag thisTag = scan(tag,url,lexer); thisTag.setThisScanner(this); thisTag.setAttributesEx(tag.getAttributesEx()); *************** *** 216,226 **** * @param reader The reader object responsible for reading the html page */ ! public Tag scan(Tag tag,String url,NodeReader reader,String currLine) throws ParserException { ! return createTag(new TagData( tag.elementBegin(), tag.elementEnd(), ! tag.getText(), ! currLine ! ), tag, url); } --- 218,231 ---- * @param reader The reader object responsible for reading the html page */ ! 
public Tag scan(Tag tag,String url,Lexer lexer) throws ParserException ! { ! TagData data; ! ! data = new TagData( ! lexer.getPage (), tag.elementBegin(), tag.elementEnd(), ! new Vector ()); ! return (createTag(data, tag, url)); } *************** *** 246,262 **** } ! public static Map adjustScanners(NodeReader reader) { ! Map tempScanners= new Hashtable(); ! tempScanners = reader.getParser().getScanners(); // Remove all existing scanners ! reader.getParser().flushScanners(); ! return tempScanners; } ! public static void restoreScanners(NodeReader pReader, Hashtable tempScanners) { // Flush the scanners ! pReader.getParser().setScanners(tempScanners); } --- 251,269 ---- } ! public static Map adjustScanners(Parser parser) { ! Map ret; ! ! ret = parser.getScanners(); // Remove all existing scanners ! parser.flushScanners(); ! ! return (ret); } ! public static void restoreScanners(Parser parser, Hashtable tempScanners) { // Flush the scanners ! parser.setScanners(tempScanners); } *************** *** 279,300 **** * @throws ParserException */ ! protected Tag createTag(TagData tagData, Tag tag, String url) throws ParserException { ! return null; ! } ! ! protected Tag getReplacedEndTag(Tag tag, NodeReader reader, String currentLine) { ! // Replace tag - it was a <A> tag - replace with </a> ! String newLine = replaceFaultyTagWithEndTag(tag, currentLine); ! reader.changeLine(newLine); ! return new EndTag( ! new TagData( ! tag.elementBegin(), ! tag.elementBegin()+3, ! tag.getTagName(), ! currentLine ! ) ! ); ! } public String replaceFaultyTagWithEndTag(Tag tag, String currentLine) { --- 286,304 ---- * @throws ParserException */ ! protected abstract Tag createTag(TagData tagData, Tag tag, String url) throws ParserException; ! // protected Tag getReplacedEndTag(Tag tag, NodeReader reader, String currentLine) { ! // // Replace tag - it was a <A> tag - replace with </a> ! // String newLine = replaceFaultyTagWithEndTag(tag, currentLine); ! // reader.changeLine(newLine); ! 
// return new EndTag( ! // new TagData( ! // tag.elementBegin(), ! // tag.elementBegin()+3, ! // tag.getTagName(), ! // currentLine ! // ) ! // ); ! // } public String replaceFaultyTagWithEndTag(Tag tag, String currentLine) { *************** *** 306,322 **** } ! protected Tag getInsertedEndTag(Tag tag, NodeReader reader, String currentLine) { ! // Insert end tag ! String newLine = insertEndTagBeforeNode(tag, currentLine); ! reader.changeLine(newLine); ! return new EndTag( ! new TagData( ! tag.elementBegin(), ! tag.elementBegin()+3, ! tag.getTagName(), ! currentLine ! ) ! ); ! } --- 310,326 ---- } ! // protected Tag getInsertedEndTag(Tag tag, String currentLine) { ! // // Insert end tag ! // String newLine = insertEndTagBeforeNode(tag, currentLine); ! // reader.changeLine(newLine); ! // return new EndTag( ! // new TagData( ! // tag.elementBegin(), ! // tag.elementBegin()+3, ! // tag.getTagName(), ! // currentLine ! // ) ! // ); ! // } |