[Htmlparser-user] encoding question:how to read a GB2312/GBK html use parser and then write a xml f
Brought to you by:
derrickoswald
|
From: HuangGehua <bo...@gm...> - 2006-01-30 16:31:56
|
I want to parse an HTML file encoded in GB2312 or GBK and then write
an XML file encoded in UTF-8. I use JDOM to write the XML file. The
source HTML file doesn't have a <meta> tag to identify the charset, for
example:
========================
<!DOCTYPE NETSCAPE-Bookmark-file-1>
<!-- This is an automatically generated file.
It will be read and overwritten.
Do Not Edit! -->
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
<DT><H3 FOLDED ADD_DATE="1120124714">链接</H3>
<DL><p>
<DT><A
HREF="http://www.microsoft.com/isapi/redir.dll?prd=ie&ar=windowsmedia">Windows
Media</A>
<DT><A
HREF="http://www.microsoft.com/isapi/redir.dll?prd=ie&ar=windows">Windows</A>
<DT><A
HREF="http://www.microsoft.com/isapi/redir.dll?prd=ie&ar=hotmail"> 免费
Hotmail</A>
<DT><A
HREF="http://www.microsoft.com/isapi/redir.dll?prd=ie&pver=6&ar=CLinks">
自定义链接</A>
</DL><p>
<DT><A HREF="http://www.yxcard.com/download.htm">..远兴科技..</A>
<DT><A
HREF="http://www.microsoft.com/isapi/redir.dll?prd=ie&pver=6&ar=IStart">MSN</A>
<DT><A HREF="http://www.yesure.com/storm/sort.php/1">暴风影音</A>
<DT><A
HREF="http://www.yesky.com/SoftChannel/72348977504190464/20050411/1934159.shtml">Eclipse
Yesky</A>
</DL><p>
=======================
The Java source code is:
=============================================
package html;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.List;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.DefinitionList;
import org.htmlparser.tags.DefinitionListBullet;
import org.htmlparser.tags.HeadingTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
import org.htmlparser.visitors.TagFindingVisitor;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
/**
 * Reads the bookmark HTML file at {@code htmlPath}, turns its first
 * {@code <DL>} tree into a JDOM document and writes that document to
 * {@code xmlPath} as UTF-8 encoded XML.
 */
public class ChangeHtml2XML {
    /** Location of the bookmark HTML file that is parsed. */
    private final String htmlPath = "d:/bookmark.htm";
    /** Location of the XML file that is written. */
    private final String xmlPath = "d:/toXML.xml";

    /**
     * Parses {@code htmlPath} with the encoding forced to GB2312 and builds a
     * document rooted at a {@code favorite} element from the first
     * {@code <DL>} tag the visitor collected.
     *
     * @return the document built from the first definition list
     * @throws ParserException if parsing fails or the file contains no
     *         {@code <DL>} tag
     */
    public Document getFirstMark() throws ParserException {
        Parser parser = new Parser(htmlPath);
        // The file carries no <meta> charset, so the encoding is set explicitly.
        parser.setEncoding("GB2312");
        String[] tagsToBeFound = {"DL"};
        TagFindingVisitor visitor = new TagFindingVisitor(tagsToBeFound);
        parser.visitAllNodesWith(visitor);
        Node[] nodes = visitor.getTags(0);
        if (nodes == null || nodes.length == 0) {
            // Previously this surfaced as an ArrayIndexOutOfBoundsException.
            throw new ParserException("no <DL> tag found in " + htmlPath);
        }
        DefinitionList dl = (DefinitionList) nodes[0];
        Element rootElement = new Element("favorite");
        Document userDocument = new Document(rootElement);
        visitEachAndBuild(userDocument, rootElement, dl);
        System.out.println(parser.getEncoding());
        return userDocument;
    }

    /**
     * Walks the {@code <DT>} children of {@code parentDL} and appends one
     * element per entry to {@code parentElement}: a {@code folder} element
     * when the entry starts with a heading tag (recursing into the first
     * nested {@code <DL>} of that entry, if any), or an {@code address}
     * element when it starts with a link tag. Entries that start with
     * anything else, and empty entries, are skipped.
     *
     * @param document      the document being built (not used by this method)
     * @param parentElement the element that receives the new children
     * @param parentDL      the definition list whose entries are converted
     */
    public void visitEachAndBuild(Document document, Element
            parentElement, DefinitionList parentDL) {
        SimpleNodeIterator entries = parentDL.children();
        while (entries.hasMoreNodes()) {
            Node node = entries.nextNode();
            if (!(node instanceof DefinitionListBullet)) {
                continue;
            }
            DefinitionListBullet dt = (DefinitionListBullet) node;
            if (dt.getChildCount() == 0) {
                continue; // empty <DT>: nothing to convert
            }
            Node first = dt.getChild(0);
            if (first instanceof HeadingTag) {
                String label = labelFor(dt, first);
                Element folder = new Element("folder");
                folder.setAttribute("label", label);
                System.out.println(label);
                parentElement.addContent(folder);
                DefinitionList nested = findNestedList(dt);
                if (nested != null) {
                    visitEachAndBuild(document, folder, nested);
                }
            } else if (first instanceof LinkTag) {
                LinkTag link = (LinkTag) first;
                String label = labelFor(dt, first);
                Element address = new Element("address");
                // Spelled "lable" originally; aligned with the folder element.
                address.setAttribute("label", label);
                System.out.println(label);
                address.setAttribute("url", link.getLink());
                address.setAttribute("target", "blank");
                parentElement.addContent(address);
            }
        }
    }

    /**
     * Picks the label for a {@code <DT>} entry. The text node that follows
     * the leading tag is used when it holds non-blank text; otherwise the
     * plain text of the leading tag itself is used.
     */
    private static String labelFor(DefinitionListBullet dt, Node first) {
        String label = null;
        if (dt.getChildCount() > 1) {
            Node second = dt.getChild(1);
            if (second instanceof TextNode) {
                label = second.getText();
            }
        }
        // NOTE(review): if the parser nests the visible text inside the
        // heading/link tag, the following node is presumably only the line
        // break after the closing tag - confirm against real parser output.
        if (label == null || label.trim().length() == 0) {
            label = first.toPlainTextString();
        }
        return label;
    }

    /** Returns the first {@code <DL>} directly inside {@code dt}, or null. */
    private static DefinitionList findNestedList(DefinitionListBullet dt) {
        SimpleNodeIterator children = dt.children();
        while (children.hasMoreNodes()) {
            Node child = children.nextNode();
            if (child instanceof DefinitionList) {
                return (DefinitionList) child;
            }
        }
        return null;
    }

    /**
     * Writes {@code doc} to {@code xmlPath} as pretty-printed UTF-8 XML with
     * empty elements expanded. An I/O failure is reported on the console and
     * not rethrown.
     *
     * @param doc the document to serialize
     */
    public void saveDocument(Document doc) {
        Format format = Format.getPrettyFormat();
        format.setEncoding("UTF-8");
        format.setExpandEmptyElements(true);
        XMLOutputter outputter = new XMLOutputter(format);
        try {
            FileOutputStream fos = new FileOutputStream(xmlPath);
            try {
                // The writer charset must match the encoding declared above.
                Writer output = new OutputStreamWriter(fos, "UTF-8");
                outputter.output(doc, output);
                // Closing here flushes; a failure reaches the catch below.
                output.close();
            } finally {
                // Releases the file handle even when writing failed;
                // closing an already closed stream has no effect.
                fos.close();
            }
        } catch (java.io.IOException e) {
            System.out.println("cant write to file system");
            // Keep the cause visible instead of dropping it.
            System.out.println(e);
        }
    }
}
===========================
The resulting XML file can't display the Chinese characters correctly; it
looks like this: "
"
What am I doing wrong? By the way, how can I detect a file's charset when
it has no <meta> tag?
Any suggestions are welcome.
Thank you!!!!!!!!!!!
|