Thread: [Htmlparser-user] stack overflow in toHtml() after removing attributes
Brought to you by:
derrickoswald
From: dust <du...@i2...> - 2006-04-02 19:15:40
Attachments:
HtmlParser.java
|
Hello, Am I doing something wrong in the attached code? It generates a stack overflow error when run with the default URL found in main. Exception in thread "main" java.lang.StackOverflowError at java.lang.StringBuffer.append(Unknown Source) at org.htmlparser.lexer.InputStreamSource.getCharacters(InputStreamSource.java:641) at org.htmlparser.lexer.Page.getText(Page.java:1021) at org.htmlparser.lexer.PageAttribute.getRawValue(PageAttribute.java:384) at org.htmlparser.Attribute.toString(Attribute.java:730) at org.htmlparser.nodes.TagNode.toHtml(TagNode.java:686) at org.htmlparser.tags.CompositeTag.toHtml(CompositeTag.java:177) at org.htmlparser.tags.CompositeTag.putEndTagInto(CompositeTag.java:167) at org.htmlparser.tags.CompositeTag.toHtml(CompositeTag.java:182) at org.htmlparser.tags.CompositeTag.putEndTagInto(CompositeTag.java:167) at org.htmlparser.tags.CompositeTag.toHtml(CompositeTag.java:182) at org.htmlparser.tags.CompositeTag.putEndTagInto(CompositeTag.java:167) at org.htmlparser.tags.CompositeTag.toHtml(CompositeTag.java:182) at org.htmlparser.tags.CompositeTag.putEndTagInto(CompositeTag.java:167) at org.htmlparser.tags.CompositeTag.toHtml(CompositeTag.java:182) etc, -- |
From: Derrick O. <Der...@Ro...> - 2006-04-03 00:33:46
|
I don't see anything obviously wrong with it... ...it doesn't overflow if you don't remove the attributes? dust wrote: >Hello, > > >Am I doing something wrong in the attached code? > >I generates stack overflow error when run with the >default url found in main. > > > >Exception in thread "main" java.lang.StackOverflowError > at java.lang.StringBuffer.append(Unknown Source) > at >org.htmlparser.lexer.InputStreamSource.getCharacters(InputStreamSource.java:641) > at org.htmlparser.lexer.Page.getText(Page.java:1021) > at org.htmlparser.lexer.PageAttribute.getRawValue(PageAttribute.java:384) > at org.htmlparser.Attribute.toString(Attribute.java:730) > at org.htmlparser.nodes.TagNode.toHtml(TagNode.java:686) > at org.htmlparser.tags.CompositeTag.toHtml(CompositeTag.java:177) > at org.htmlparser.tags.CompositeTag.putEndTagInto(CompositeTag.java:167) > at org.htmlparser.tags.CompositeTag.toHtml(CompositeTag.java:182) > at org.htmlparser.tags.CompositeTag.putEndTagInto(CompositeTag.java:167) > at org.htmlparser.tags.CompositeTag.toHtml(CompositeTag.java:182) > at org.htmlparser.tags.CompositeTag.putEndTagInto(CompositeTag.java:167) > at org.htmlparser.tags.CompositeTag.toHtml(CompositeTag.java:182) > at org.htmlparser.tags.CompositeTag.putEndTagInto(CompositeTag.java:167) > at org.htmlparser.tags.CompositeTag.toHtml(CompositeTag.java:182) > >etc, > >-- > > >------------------------------------------------------------------------ > >import java.util.HashSet; >import java.util.Set; >import java.util.Vector; > >import org.htmlparser.Attribute; >import org.htmlparser.Node; >import org.htmlparser.Parser; >import org.htmlparser.Tag; >import org.htmlparser.util.NodeList; >import org.htmlparser.util.ParserException; >import org.htmlparser.util.SimpleNodeIterator; > >public class HtmlParser >{ > Parser parser; > > public HtmlParser (String link) throws ParserException > { > parser = new Parser (link); > } > > public static void main (String[] args) throws ParserException > { > 
String link="http://mips.gsf.de/projects/fungi/fungi_db.html"; > if(args.length>0) > link=args[0]; > > HtmlParser htmlParser = new HtmlParser (link); > String html = htmlParser.parse(); > System.out.println(html); > } > > private String parse() throws ParserException { > > NodeList list = parser.parse(null); > > recurse(list); > System.err.println("done, trying toHtml()"); > return list.toHtml(); > } > > private NodeList recurse(NodeList list) { > if(list==null) > return null; > Node node; > SimpleNodeIterator iterator = list.elements(); > while(iterator.hasMoreNodes()) > { > node = iterator.nextNode(); > if(node==null) > break; > if(node instanceof Tag) > { > Tag tag = (Tag)node; > removeAttributes(tag); > recurse(node.getChildren()); > } > } > return null; > } > > static private void removeAttributes(Tag tag) { > String[] allowedAttrs = {""}; > Set allowed = new HashSet(); > for(int i=0;i<allowedAttrs.length;i++) > allowed.add(allowedAttrs[i]); > > allowed.add(tag.getRawTagName()); > allowed.add("/"+tag.getRawTagName()); > > Vector attrs = tag.getAttributesEx(); > for(int i=0;i<attrs.size();i++) > { > Attribute attr = (Attribute)attrs.get(i); > if(attr.getName()==null) > continue; > if(!allowed.contains(attr.getName())) > { > tag.removeAttribute(attr.getName()); > System.out.println("Removed attr: "+attr.getName()); > } > } > } >} > > |
From: Derrick O. <Der...@Ro...> - 2006-04-03 00:45:30
|
I think I see the problem now. For an XML tag like <hello />, the end tag is the tag. Change it to: allowed.add(tag.getRawTagName()); allowed.add("/"); and it should work. dust wrote: >Hello, > > >Am I doing something wrong in the attached code? > >I generates stack overflow error when run with the >default url found in main. > > > >Exception in thread "main" java.lang.StackOverflowError > at java.lang.StringBuffer.append(Unknown Source) > at >org.htmlparser.lexer.InputStreamSource.getCharacters(InputStreamSource.java:641) > at org.htmlparser.lexer.Page.getText(Page.java:1021) > at org.htmlparser.lexer.PageAttribute.getRawValue(PageAttribute.java:384) > at org.htmlparser.Attribute.toString(Attribute.java:730) > at org.htmlparser.nodes.TagNode.toHtml(TagNode.java:686) > at org.htmlparser.tags.CompositeTag.toHtml(CompositeTag.java:177) > at org.htmlparser.tags.CompositeTag.putEndTagInto(CompositeTag.java:167) > at org.htmlparser.tags.CompositeTag.toHtml(CompositeTag.java:182) > at org.htmlparser.tags.CompositeTag.putEndTagInto(CompositeTag.java:167) > at org.htmlparser.tags.CompositeTag.toHtml(CompositeTag.java:182) > at org.htmlparser.tags.CompositeTag.putEndTagInto(CompositeTag.java:167) > at org.htmlparser.tags.CompositeTag.toHtml(CompositeTag.java:182) > at org.htmlparser.tags.CompositeTag.putEndTagInto(CompositeTag.java:167) > at org.htmlparser.tags.CompositeTag.toHtml(CompositeTag.java:182) > >etc, > >-- > > >------------------------------------------------------------------------ > >import java.util.HashSet; >import java.util.Set; >import java.util.Vector; > >import org.htmlparser.Attribute; >import org.htmlparser.Node; >import org.htmlparser.Parser; >import org.htmlparser.Tag; >import org.htmlparser.util.NodeList; >import org.htmlparser.util.ParserException; >import org.htmlparser.util.SimpleNodeIterator; > >public class HtmlParser >{ > Parser parser; > > public HtmlParser (String link) throws ParserException > { > parser = new Parser (link); > } > > public 
static void main (String[] args) throws ParserException > { > String link="http://mips.gsf.de/projects/fungi/fungi_db.html"; > if(args.length>0) > link=args[0]; > > HtmlParser htmlParser = new HtmlParser (link); > String html = htmlParser.parse(); > System.out.println(html); > } > > private String parse() throws ParserException { > > NodeList list = parser.parse(null); > > recurse(list); > System.err.println("done, trying toHtml()"); > return list.toHtml(); > } > > private NodeList recurse(NodeList list) { > if(list==null) > return null; > Node node; > SimpleNodeIterator iterator = list.elements(); > while(iterator.hasMoreNodes()) > { > node = iterator.nextNode(); > if(node==null) > break; > if(node instanceof Tag) > { > Tag tag = (Tag)node; > removeAttributes(tag); > recurse(node.getChildren()); > } > } > return null; > } > > static private void removeAttributes(Tag tag) { > String[] allowedAttrs = {""}; > Set allowed = new HashSet(); > for(int i=0;i<allowedAttrs.length;i++) > allowed.add(allowedAttrs[i]); > > allowed.add(tag.getRawTagName()); > allowed.add("/"+tag.getRawTagName()); > > Vector attrs = tag.getAttributesEx(); > for(int i=0;i<attrs.size();i++) > { > Attribute attr = (Attribute)attrs.get(i); > if(attr.getName()==null) > continue; > if(!allowed.contains(attr.getName())) > { > tag.removeAttribute(attr.getName()); > System.out.println("Removed attr: "+attr.getName()); > } > } > } >} > > |