[Htmlparser-user] encoding question:how to read a GB2312/GBK html use parser and then write a xml f
Brought to you by:
derrickoswald
From: HuangGehua <bo...@gm...> - 2006-01-30 16:31:56
|
I want to parse an HTML file encoded in GB2312 or GBK and then write an XML file encoded in UTF-8. I use JDOM to write the XML file. The source HTML file does not have a <meta> tag to identify the charset, for example: ======================== <!DOCTYPE NETSCAPE-Bookmark-file-1> <!-- This is an automatically generated file. It will be read and overwritten. Do Not Edit! --> <TITLE>Bookmarks</TITLE> <H1>Bookmarks</H1> <DL><p> <DT><H3 FOLDED ADD_DATE="1120124714">链接</H3> <DL><p> <DT><A HREF="http://www.microsoft.com/isapi/redir.dll?prd=ie&ar=windowsmedia">Windows Media</A> <DT><A HREF="http://www.microsoft.com/isapi/redir.dll?prd=ie&ar=windows">Windows</A> <DT><A HREF="http://www.microsoft.com/isapi/redir.dll?prd=ie&ar=hotmail"> 免费 Hotmail</A> <DT><A HREF="http://www.microsoft.com/isapi/redir.dll?prd=ie&pver=6&ar=CLinks"> 自定义链接</A> </DL><p> <DT><A HREF="http://www.yxcard.com/download.htm">..远兴科技..</A> <DT><A HREF="http://www.microsoft.com/isapi/redir.dll?prd=ie&pver=6&ar=IStart">MSN</A> <DT><A HREF="http://www.yesure.com/storm/sort.php/1">暴风影音</A> <DT><A HREF="http://www.yesky.com/SoftChannel/72348977504190464/20050411/1934159.shtml">Eclipse Yesky</A> </DL><p> ======================= The Java source code is: ============================================= package html; import java.io.FileOutputStream; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.util.List; import org.htmlparser.Node; import org.htmlparser.Parser; import org.htmlparser.nodes.TextNode; import org.htmlparser.tags.DefinitionList; import org.htmlparser.tags.DefinitionListBullet; import org.htmlparser.tags.HeadingTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import org.htmlparser.util.SimpleNodeIterator; import org.htmlparser.visitors.TagFindingVisitor; import org.jdom.Document; import org.jdom.Element; import org.jdom.output.Format; import 
org.jdom.output.XMLOutputter; public class ChangeHtml2XML { private String htmlPath="d:/bookmark.htm"; private String xmlPath="d:/toXML.xml"; public Document getFirstMark() throws ParserException{ Parser parser=new Parser(htmlPath); parser.setEncoding("GB2312"); String [] tagsToBeFound = {"DL"}; TagFindingVisitor visitor = new TagFindingVisitor (tagsToBeFound); parser.visitAllNodesWith(visitor); Node [] nodes=visitor.getTags(0); DefinitionList dl=(DefinitionList)nodes[0]; Element rootElement=new Element("favorite"); Document userDocument=new Document(rootElement); visitEachAndBuild(userDocument,rootElement,dl); System.out.println(parser.getEncoding()); return userDocument; } public void visitEachAndBuild(Document document,Element parentElement,DefinitionList parentDL){ SimpleNodeIterator iteratorParentDlChildren=parentDL.children(); while(iteratorParentDlChildren.hasMoreNodes()){ Node node=iteratorParentDlChildren.nextNode(); if (node.getClass().getName().equals(DefinitionListBullet.class.getName())){ DefinitionListBullet dt=(DefinitionListBullet)node; Node justNode=dt.getChild(0); if (justNode.getClass().getName().equals(HeadingTag.class.getName())){ TextNode tn=(TextNode)dt.getChild(1); Element newElement=new Element("folder"); newElement.setAttribute("label",tn.getText()); System.out.println(tn.getText()); parentElement.addContent(newElement); DefinitionList findTheDL=null; SimpleNodeIterator forChildDefinitionList=dt.getChildren().elements(); while(forChildDefinitionList.hasMoreNodes()){ Node n=forChildDefinitionList.nextNode(); if (n.getClass().getName().equals(DefinitionList.class.getName())){ findTheDL=(DefinitionList)n; break; } } if (findTheDL!=null) visitEachAndBuild(document,newElement,findTheDL); }else{ TextNode tn=(TextNode)dt.getChild(1); LinkTag link=(LinkTag)dt.getChild(0); Element newElement=new Element("address"); newElement.setAttribute("lable",tn.getText()); System.out.println(tn.getText()); newElement.setAttribute("url",link.getLink()); 
newElement.setAttribute("target","blank"); parentElement.addContent(newElement); } } } } public void saveDocument(Document doc){ StringBuffer buff = new StringBuffer(); buff.append(xmlPath); try { XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat()); Format format=outputter.getFormat(); format.setEncoding("UTF-8"); format.setExpandEmptyElements(true); outputter.setFormat(format); FileOutputStream fos=new FileOutputStream(buff.toString()); Writer output=new OutputStreamWriter(fos,"UTF-8"); outputter.output(doc, output); output.close(); //return true; } catch (java.io.IOException e) { System.out.println("cant write to file system"); //throw new Exception(e); } } } =========================== The resulting XML file can't display the Chinese characters correctly; it looks like this: "
" What is wrong with my code? By the way, how can I detect a file's charset when there is no <meta> tag? Any suggestions are welcome. Thank you! |