No end tag is found for the FIELDSET tag, although it is present in the HTML
Brought to you by:
derrickoswald
/** Parser instance that every {@link #parse(String, String)} call feeds its input into. */
private Parser parser = null;

/** Filter that rejects nodes matched by the DOCTYPE, !DOCTYPE and script tag-name filters. */
private NodeFilter filter = null;

/**
 * Creates the parser and builds the node filter: the negation of
 * "tag name is DOCTYPE, !DOCTYPE or script".
 */
public HtmlparserTagNameEventer() {
    parser = new Parser();
    filter = new NotFilter(new OrFilter(
            new TagNameFilter[] {
                new TagNameFilter("DOCTYPE"),
                new TagNameFilter("!DOCTYPE"),
                new TagNameFilter("script"),
            }
    ));
}

/**
 * Parses {@code content} and walks the resulting nodes, tracking open tags on a stack
 * and printing the stack each time it changes.
 *
 * <p>Opening tags that report an end tag go onto the main stack; opening tags whose
 * {@code getEndTag()} is {@code null} go onto a separate "problem" stack. Whenever an
 * end tag does not match the top of the main stack, or arrives while the main stack is
 * empty, a diagnostic is printed and the JVM is terminated with exit status 1.
 *
 * <p>NOTE(review): the HTML Parser Javadoc appears to say that {@code Tag.getEndTag()}
 * is only meaningful for composite tags and is {@code null} for every other tag, so a
 * {@code null} result may not mean the closing tag is missing from the document —
 * confirm against the library documentation before relying on the "problem" stack.
 *
 * @param content the HTML text handed to the parser
 * @param url     not used by this method
 * @throws ParserException if the underlying parser fails
 */
public void parse(final String content, String url) throws ParserException {
    parser.setInputHTML(content);
    NodeList nl = parser.parse(filter).extractAllNodesThatMatch(filter);
    nl.visitAllNodesWith(new NodeVisitor() {
        /** Open tags that reported an end tag when visited. */
        Stack<Tag> stack = new Stack<Tag>();
        /** Open tags that reported no end tag when visited. */
        Stack<Tag> stackProblemed = new Stack<Tag>();

        @Override
        public void beginParsing() {
            stack.clear();
            stackProblemed.clear();
            onBefore();
        }

        @Override
        public void finishedParsing() {
            onAfterParse();
        }

        @Override
        public void visitTag(Tag tag) {
            if (tag.getTagName().equalsIgnoreCase("fieldset")) {
                // for debug
                System.err.printf("%s [%s]%s%s",
                        tag.getTagName(),
                        (tag.isEndTag() ? "close" : "open"),
                        (tag.isEmptyXmlTag() ? " [selfclosed]" : ""),
                        (tag.getEndTag() != null
                                ? String.format(" [has end tag: %s]", tag.getEndTag().getTagName())
                                : ""));
            }

            // End tags and self-closed tags are not tracked on either stack.
            if (tag.isEndTag() || tag.isEmptyXmlTag()) {
                return;
            }

            if (tag.getEndTag() != null) {
                System.out.printf("<%s>%n", tag.getTagName());
                stack.push(tag);
                printStack(stack);
            } else {
                stackProblemed.push(tag);
            }
        }

        @Override
        public void visitEndTag(Tag tag) {
            System.out.printf("</%s>%n", tag.getTagName());

            if (stack.isEmpty()) {
                System.err.printf("problem situation [empty stack] </%s>%n", tag.getTagName());
                System.exit(1);
                return;
            }

            // Stack elements are Tag objects, so the name is read from them directly;
            // both stacks are compared case-insensitively.
            if (stack.peek().getTagName().equalsIgnoreCase(tag.getTagName())) {
                stack.pop();
                printStack(stack);
            } else if (!stackProblemed.isEmpty()
                    && stackProblemed.peek().getTagName().equalsIgnoreCase(tag.getTagName())) {
                Tag ppped = stackProblemed.pop();
                System.out.printf("error: tag is not on main stack <%s> popped%n", ppped.getTagName());
                System.exit(1);
                //System.out.printf(" warning: problem tag <%s> popped%n", tag.getTagName());
            } else {
                System.err.printf("problem situation [tag not match] </%s>%n", tag.getTagName());
                System.exit(1);
            }
        }

        /** Prints the tag names on {@code stack}, in iteration order, joined by "->". */
        private void printStack(Stack<Tag> stack) {
            StringBuilder sb = new StringBuilder();
            int k = 0;
            for (Tag tag : stack) {
                sb.append(k > 0 ? "->" : "").append(tag.getTagName());
                k++;
            }
            System.out.println(sb.toString());
        }
    });
}
/*
After running parse() you will see that, when the <fieldset> tag is visited, the condition Tag.getEndTag() == null indicates that it has no end tag (</fieldset>). However, this tag is present in the HTML, as you can see, and moreover it is encountered as a closing tag in visitEndTag(Tag tag).
What I do not understand: does the condition tag.getEndTag() == null mean that the tag has no closing tag, or not? The Javadoc says that it does...
(The content is read from the attached file.)
*/