[Htmlparser-user] encoding question:how to read a GB2312/GBK html use parser and then write a xml f

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

I want to parser a html file with encoding GB2312 or GBK and then write 
a xml file with encoding UTF-8.I use jdom to write the XML file.The 
resource html file didn't have a <meta> to identify the chareset,for 
exmaple:
========================
<!DOCTYPE NETSCAPE-Bookmark-file-1>
<!-- This is an automatically generated file.
It will be read and overwritten.
Do Not Edit! -->
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
    <DT><H3 FOLDED ADD_DATE="1120124714">链接</H3>
    <DL><p>
        <DT><A 
HREF="http://www.microsoft.com/isapi/redir.dll?prd=ie&ar=windowsmedia">Windows 
Media</A>
        <DT><A 
HREF="http://www.microsoft.com/isapi/redir.dll?prd=ie&ar=windows">Windows</A>
        <DT><A 
HREF="http://www.microsoft.com/isapi/redir.dll?prd=ie&ar=hotmail"> 免费 
Hotmail</A>
        <DT><A 
HREF="http://www.microsoft.com/isapi/redir.dll?prd=ie&pver=6&ar=CLinks"> 
自定义链接</A>
    </DL><p>
    <DT><A HREF="http://www.yxcard.com/download.htm">..远兴科技..</A>
    <DT><A 
HREF="http://www.microsoft.com/isapi/redir.dll?prd=ie&pver=6&ar=IStart">MSN</A>
    <DT><A HREF="http://www.yesure.com/storm/sort.php/1">暴风影音</A>
    <DT><A 
HREF="http://www.yesky.com/SoftChannel/72348977504190464/20050411/1934159.shtml">Eclipse 
Yesky</A>
</DL><p>
=======================

the java source code is:
=============================================
package html;

import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.List;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.DefinitionList;
import org.htmlparser.tags.DefinitionListBullet;
import org.htmlparser.tags.HeadingTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
import org.htmlparser.visitors.TagFindingVisitor;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;

public class ChangeHtml2XML {

    private String htmlPath="d:/bookmark.htm";

    private String xmlPath="d:/toXML.xml";

    public Document getFirstMark() throws ParserException{
        Parser parser=new Parser(htmlPath);
        parser.setEncoding("GB2312");       
        String [] tagsToBeFound = {"DL"};
        TagFindingVisitor visitor = new TagFindingVisitor 
(tagsToBeFound);       
        parser.visitAllNodesWith(visitor);
        Node [] nodes=visitor.getTags(0);

        DefinitionList dl=(DefinitionList)nodes[0];

        Element rootElement=new Element("favorite");
        Document userDocument=new Document(rootElement);
        visitEachAndBuild(userDocument,rootElement,dl);
        System.out.println(parser.getEncoding());
        return userDocument;

    }

    public void visitEachAndBuild(Document document,Element 
parentElement,DefinitionList parentDL){
        SimpleNodeIterator iteratorParentDlChildren=parentDL.children();
        while(iteratorParentDlChildren.hasMoreNodes()){
            Node node=iteratorParentDlChildren.nextNode();

            if 
(node.getClass().getName().equals(DefinitionListBullet.class.getName())){
                DefinitionListBullet dt=(DefinitionListBullet)node;
                Node justNode=dt.getChild(0);

                if 
(justNode.getClass().getName().equals(HeadingTag.class.getName())){

                    TextNode tn=(TextNode)dt.getChild(1);
                    Element newElement=new Element("folder");

                    newElement.setAttribute("label",tn.getText());
                    System.out.println(tn.getText());
                    parentElement.addContent(newElement);

                    DefinitionList findTheDL=null;
                    SimpleNodeIterator 
forChildDefinitionList=dt.getChildren().elements();
                    while(forChildDefinitionList.hasMoreNodes()){
                        Node n=forChildDefinitionList.nextNode();
                        if 
(n.getClass().getName().equals(DefinitionList.class.getName())){
                            findTheDL=(DefinitionList)n;
                            break;
                        }
                    }

                    if (findTheDL!=null) 
visitEachAndBuild(document,newElement,findTheDL);
                }else{
                    TextNode tn=(TextNode)dt.getChild(1);
                    LinkTag link=(LinkTag)dt.getChild(0);
                    Element newElement=new Element("address");
                    newElement.setAttribute("lable",tn.getText());
                    System.out.println(tn.getText());
                    newElement.setAttribute("url",link.getLink());
                    newElement.setAttribute("target","blank");
                    parentElement.addContent(newElement);                

                }
            }
        }
    }

    public void saveDocument(Document doc){
        StringBuffer buff = new StringBuffer();

        buff.append(xmlPath);
        try {
            XMLOutputter outputter = new 
XMLOutputter(Format.getPrettyFormat());
            Format format=outputter.getFormat();
            format.setEncoding("UTF-8");
            format.setExpandEmptyElements(true);
            outputter.setFormat(format);

            FileOutputStream fos=new FileOutputStream(buff.toString());
            Writer output=new OutputStreamWriter(fos,"UTF-8");
            outputter.output(doc, output);
            output.close();
            //return true;
        } catch (java.io.IOException e) {
            System.out.println("cant write to file system");
            //throw new Exception(e);
        }
    }

}
===========================
The result XML file cant display the Chinese words correctly,it looks 
like this "&#xD;&#xA;"
What's wrong with me? By the way how to detect a file's charset without 
MetaTag?
Any positive suggestion is welcome.
Thank you!!!!!!!!!!!

[Htmlparser-user] encoding question:how to read a GB2312/GBK html use parser and then write a xml f

[Htmlparser-user] encoding question:how to read a GB2312/GBK html use parser and then write a xml file with UTF-8