HTMLTransducer, actually it happens in the
HTMLParserVisitor visit(HtmlDocument.EndTag t). What
happens is the tree Node is not being built properly.
For example, suppose I had an HTML document whose tags
looked like this:
<div>
<div>
<div>
<div>
</div>
<div>
</div>
</div>
<div>
<div>
</div>
<div>
</div>
</div>
<div>
<div>
</div>
<div>
</div>
</div>
<div>
</div>
</div>
</div>
</div>
As you can see above, some <div> elements have child
<div> elements. But when I try to rebuild the HTML document
by walking the Node tree, I get this in return:
<div>
</div>
<div>
</div>
<div>
</div>
<div>
</div>
<div>
</div>
<div>
</div>
<div>
</div>
<div>
</div>
<div>
</div>
<div>
</div>
<div>
</div>
<div>
</div>
<div>
</div>
Notice that none of the <div> elements has a child <div>.
It happens in visit(HtmlDocument.EndTag t). The end tag
looks up the stack to find the first StartTag <div>, but it
does not check whether that start tag already has an end tag
</div> associated with it.
peter_otto@countrywide.com
Logged In: NO
Here is my fix for the problem: I added another Stack to track
whether each start tag already has an end tag associated with it.
class HTMLParserVisitor extends HtmlVisitor {
private INode myContainer;
private Stack stack;
private Stack stack2;
private ArrayList popingList;
private INode nextNode, node, previousNode;
private Vector attributes;
private String tag, comment;
boolean keepgoing;
public HTMLParserVisitor(INode container) {
myContainer = container;
stack = new Stack();
stack2 = new Stack();
popingList = new ArrayList();
}
public void finish() {
// add any nodes on the stack to the container
for (int i=0;i<stack.size();i++) {
nextNode = (INode)stack.get(i);
myContainer.add(nextNode);
}
}
public void visit(HtmlDocument.Tag t) {
////System.out.println("visit Tag: "+t.tagName);
node = contextFactory.getGraph();
if (!node.getID().equals(t.tagName)) {
node.setID(t.tagName);
}
node.setValue(t.tagName);
////System.out.println("setNode Tag: "+t.tagName);
HtmlDocument.Attribute nextAttribute;
attributes = t.attributeList.attributes;
for (int i=0;i<attributes.size();i++) {
nextAttribute =
(HtmlDocument.Attribute)attributes.elementAt(i);
node.setProperty(nextAttribute.name,
nextAttribute.value);
}
node.setProperty(TAG, "true");
//System.out.println("Push Stack start Tag: "+t.tagName);
stack.push(node);
stack2.push("YES");
}
public void visit(HtmlDocument.EndTag t) {
//System.out.println("visit EndTag: "+t.tagName);
// if(t.tagName.equalsIgnoreCase("title")) {
// //System.out.println(" Start going backwords");
// }
String sValue = "";
tag = t.tagName;
popingList.clear();
previousNode = (INode)stack.pop();
sValue = (String)stack2.pop();
//System.out.println(" POP Node in stack " +
previousNode.getID());
keepgoing = true;
while (keepgoing) {
if ((sValue.equalsIgnoreCase("NO")) ||
(!previousNode.getID().equals(tag)) || (null ==
previousNode.getProperty(TAG))) {
popingList.add(previousNode);
if (!stack.empty()) {
previousNode = (INode)stack.pop();
sValue = (String)stack2.pop();
//System.out.println(" POP Node in stack " +
previousNode.getID());
} else {
keepgoing = false;
}
} else {
keepgoing = false;
}
}
if (keepgoing) { // We didn't find the start tag for
this EndTag
// Im assuming we don't want to throw away the bad
end tag, even though its bad
// so we'll add it just like its a tag.
node = contextFactory.getGraph();
if (!node.getID().equals(t.tagName)) {
node.setID(t.tagName);
}
node.setValue(t.tagName);
//System.out.println(" Push #2 "+ t.tagName);
stack2.push("No");
stack.push(node);
} else { // We found the starttag for this
EndTag and its previousNode
for (int i=popingList.size()-1;i>=0;i--) {
previousNode.add((INode)popingList.get(i));
}
//System.out.println(" Push #3 " +
previousNode.getID());
stack2.push("NO");
stack.push(previousNode);
}
popingList.clear();
}
public void visit(HtmlDocument.Comment c) {
comment = "<!" + c.comment + ">";
node = contextFactory.getGraph();
if (!node.getID().equals(comment)) {
node.setID(comment);
}
node.setValue(comment);
node.setProperty(COMMENT, "true");
//System.out.println("Push #5 " + node.getID());
stack2.push("NO");
stack.push(node);
}
public void visit(HtmlDocument.Text t) {
////System.out.println("visit Text: "+t);
node = contextFactory.getGraph();
if (!node.getID().equals(t.text)) {
node.setID(t.text);
}
node.setValue(t.text);
node.setProperty(TEXT, "true");
//System.out.println("Push #4 " + node.getID());
stack2.push("NO");
stack.push(node);
}
public void visit(HtmlDocument.Newline n) {
// //System.out.println("visit Newline: "+n);
}
public void visit(HtmlDocument.Annotation a) {
// //System.out.println("visit Annotation: "+a);
}
}