[Plog4u-devel] org.plog4u.jlibrary/src/org/plog4u/jlibrary/actions/mediawiki/xml/importer WikipediaXmlHandler.java
Status: Beta
Brought to you by:
axelcl
From: Axel C. K. <ax...@us...> - 2006-02-22 20:14:28
|
Update of /cvsroot/plog4u/org.plog4u.jlibrary/src/org/plog4u/jlibrary/actions/mediawiki/xml/importer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9082/src/org/plog4u/jlibrary/actions/mediawiki/xml/importer Modified Files: WikipediaXmlHandler.java Log Message: Test wikipedia dump imports Index: WikipediaXmlHandler.java =================================================================== RCS file: /cvsroot/plog4u/org.plog4u.jlibrary/src/org/plog4u/jlibrary/actions/mediawiki/xml/importer/WikipediaXmlHandler.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** WikipediaXmlHandler.java 22 Feb 2006 06:03:18 -0000 1.1 --- WikipediaXmlHandler.java 22 Feb 2006 20:14:23 -0000 1.2 *************** *** 1,4 **** --- 1,5 ---- package org.plog4u.jlibrary.actions.mediawiki.xml.importer; + import org.eclipse.core.runtime.Status; import org.xml.sax.Attributes; import org.xml.sax.SAXException; *************** *** 6,37 **** /** ! * Uses a SAX parser to read a Wikipedia current XML file and ! * store the text of the latest revision in a Lucene full text ! * index. * @author Marco Schmidt */ ! public class WikipediaXmlHandler extends DefaultHandler ! { private static final int PROGRESS_STEP = 100; private static final String WIKIPEDIA_PAGE = "page"; private static final String WIKIPEDIA_REVISION = "revision"; private static final String WIKIPEDIA_TEXT = "text"; private static final String WIKIPEDIA_TIMESTAMP = "timestamp"; private static final String WIKIPEDIA_TITLE = "title"; private StringBuffer data; private Attributes attributes; private WikipediaArticle article; private WikipediaArticleCreator creator; private long articleCounter; ! public WikipediaXmlHandler(WikipediaArticleCreator articleSink) ! { creator = articleSink; } ! public void characters(char[] ch, int start, int length) throws SAXException ! { ! if (data == null) ! { data = new StringBuffer(length); } --- 7,45 ---- /** ! 
* Uses a SAX parser to read a Wikipedia current XML file and store the text of ! * the latest revision in a Lucene full text index. ! * * @author Marco Schmidt */ ! public class WikipediaXmlHandler extends DefaultHandler { private static final int PROGRESS_STEP = 100; + private static final String WIKIPEDIA_PAGE = "page"; + private static final String WIKIPEDIA_REVISION = "revision"; + private static final String WIKIPEDIA_TEXT = "text"; + private static final String WIKIPEDIA_TIMESTAMP = "timestamp"; + private static final String WIKIPEDIA_TITLE = "title"; + private StringBuffer data; + private Attributes attributes; + private WikipediaArticle article; + private WikipediaArticleCreator creator; + private long articleCounter; ! public WikipediaXmlHandler(WikipediaArticleCreator articleSink) { creator = articleSink; } ! public void characters(char[] ch, int start, int length) ! throws SAXException { ! if (data == null) { data = new StringBuffer(length); } *************** *** 39,66 **** } ! public void endElement(String namespaceURI, String localName, String qName) ! { ! try ! { ! if (WIKIPEDIA_PAGE.equals(qName)) ! { ! if (article != null) ! { creator.add(article); articleCounter++; ! if ((articleCounter % PROGRESS_STEP) == 1) ! { ! System.out.println(articleCounter + ":" + article.getTitle()); } } ! } ! else ! if (WIKIPEDIA_TEXT.equals(qName)) ! { article.setText(getString()); ! } ! else ! if (WIKIPEDIA_TITLE.equals(qName)) ! { article.setTitle(getString()); } --- 47,64 ---- } ! public void endElement(String namespaceURI, String localName, String qName) { ! try { ! if (WIKIPEDIA_PAGE.equals(qName)) { ! if (article != null) { creator.add(article); articleCounter++; ! if ((articleCounter % PROGRESS_STEP) == 1) { ! System.out.println(articleCounter + ":" ! + article.getTitle()); } } ! } else if (WIKIPEDIA_TEXT.equals(qName)) { article.setText(getString()); ! 
} else if (WIKIPEDIA_TITLE.equals(qName)) { article.setTitle(getString()); } *************** *** 68,101 **** attributes = null; ! } ! catch (RuntimeException re) ! { re.printStackTrace(); } } ! private int getInt() ! { return Integer.parseInt(getString()); } ! private long getLong() ! { return Long.parseLong(getString()); } ! public long getNumArticles() ! { return articleCounter; } ! private String getString() ! { ! if (data == null) ! { return null; ! } ! else ! { String s = data.toString(); data = null; --- 66,92 ---- attributes = null; ! } catch (JobCancelledException e) { ! throw e; ! } catch (RuntimeException re) { re.printStackTrace(); } } ! private int getInt() { return Integer.parseInt(getString()); } ! private long getLong() { return Long.parseLong(getString()); } ! public long getNumArticles() { return articleCounter; } ! private String getString() { ! if (data == null) { return null; ! } else { String s = data.toString(); data = null; *************** *** 104,113 **** } ! public void startElement(String namespaceURI, String localName, String qName, Attributes atts) ! { attributes = atts; ! if (WIKIPEDIA_PAGE.equals(qName)) ! { article = new WikipediaArticle(); } --- 95,103 ---- } ! public void startElement(String namespaceURI, String localName, ! String qName, Attributes atts) { attributes = atts; ! if (WIKIPEDIA_PAGE.equals(qName)) { article = new WikipediaArticle(); } |