From: <bi...@us...> - 2009-10-26 22:57:34
|
Revision: 2832 http://archive-access.svn.sourceforge.net/archive-access/?rev=2832&view=rev Author: binzino Date: 2009-10-26 22:57:25 +0000 (Mon, 26 Oct 2009) Log Message: ----------- Fix WAX-67. One-line change to include metadata passed in from Importer. Added Paths: ----------- tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java Added: tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java =================================================================== --- tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java (rev 0) +++ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java 2009-10-26 22:57:25 UTC (rev 2832) @@ -0,0 +1,220 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.oo; + +import java.io.*; +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.List; +import java.util.zip.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.*; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.LogUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.jaxen.*; +import org.jaxen.jdom.JDOMXPath; +import org.jdom.*; +import org.jdom.input.*; + +/** + * Parser for OpenOffice and OpenDocument formats. This should handle + * the following formats: Text, Spreadsheet, Presentation, and + * corresponding templates and "master" documents. + * + * @author Andrzej Bialecki + */ +public class OOParser implements Parser { + public static final Log LOG = LogFactory.getLog(OOParser.class); + + private Configuration conf; + + public OOParser () { + } + + public void setConf(Configuration conf) { + this.conf = conf; + } + + public Configuration getConf() { + return conf; + } + + public ParseResult getParse(Content content) { + String text = null; + String title = null; + Metadata metadata = new Metadata(); + ArrayList outlinks = new ArrayList(); + + try { + byte[] raw = content.getContent(); + String contentLength = content.getMetadata().get("Content-Length"); + if (contentLength != null + && raw.length != Integer.parseInt(contentLength)) { + return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, + "Content truncated at "+raw.length + +" bytes. Parser can't handle incomplete files.").getEmptyParseResult(content.getUrl(), conf); + } + ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(raw)); + ZipEntry ze = null; + while ((ze = zis.getNextEntry()) != null) { + if (ze.getName().equals("content.xml")) { + text = parseContent(ze, zis, outlinks); + } else if (ze.getName().equals("meta.xml")) { + parseMeta(ze, zis, metadata); + } + } + zis.close(); + } catch (Exception e) { // run time exception + e.printStackTrace(LogUtil.getWarnStream(LOG)); + return new ParseStatus(ParseStatus.FAILED, + "Can't be handled as OO document. " + e).getEmptyParseResult(content.getUrl(), conf); + } + + title = metadata.get(Metadata.TITLE); + if (text == null) + text = ""; + + if (title == null) + title = ""; + + Outlink[] links = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]); + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, links, content.getMetadata(), metadata); + return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData)); + } + + // extract as much plain text as possible. + private String parseContent(ZipEntry ze, ZipInputStream zis, ArrayList outlinks) throws Exception { + StringBuffer res = new StringBuffer(); + FilterInputStream fis = new FilterInputStream(zis) { + public void close() {}; + }; + SAXBuilder builder = new SAXBuilder(); + Document doc = builder.build(fis); + Element root = doc.getRootElement(); + // XXX this is expensive for very large documents. In those cases another + // XXX method (direct processing of SAX events, or XMLPull) should be used. + XPath path = new JDOMXPath("//text:span | //text:p | //text:tab | //text:tab-stop | //text:a"); + path.addNamespace("text", root.getNamespace("text").getURI()); + Namespace xlink = Namespace.getNamespace("xlink", "http://www.w3.org/1999/xlink"); + List list = path.selectNodes(doc); + boolean lastp = true; + for (int i = 0; i < list.size(); i++) { + Element el = (Element)list.get(i); + String text = el.getText(); + if (el.getName().equals("p")) { + // skip empty paragraphs + if (!text.equals("")) { + if (!lastp) res.append("\n"); + res.append(text + "\n"); + lastp = true; + } + } else if (el.getName().startsWith("tab")) { + res.append("\t"); + lastp = false; + } else if (el.getName().equals("a")) { + List nl = el.getChildren(); + String a = null; + for (int k = 0; k < nl.size(); k++) { + Element anchor = (Element)nl.get(k); + String nsName = anchor.getNamespacePrefix() + ":" + anchor.getName(); + if (!nsName.equals("text:span")) continue; + a = anchor.getText(); + break; + } + String u = el.getAttributeValue("href", xlink); + if (u == null) u = a; // often anchors are URLs + try { + Outlink o = new Outlink(u, a); + outlinks.add(o); + } catch (MalformedURLException mue) { + // skip + } + if (a != null && !a.equals("")) { + if (!lastp) res.append(' '); + res.append(a); + lastp = false; + } + } else { + if (!text.equals("")) { + if (!lastp) res.append(' '); + res.append(text); + } + lastp = false; + } + } + return res.toString(); + } + + // extract metadata and convert them to Nutch format + private void parseMeta(ZipEntry ze, ZipInputStream zis, Metadata metadata) throws Exception { + FilterInputStream fis = new FilterInputStream(zis) { + public void close() {}; + }; + SAXBuilder builder = new SAXBuilder(); + Document doc = builder.build(fis); + XPath path = new JDOMXPath("/office:document-meta/office:meta/*"); + Element root = doc.getRootElement(); + path.addNamespace("office", root.getNamespace("office").getURI()); + List list = path.selectNodes(doc); + for (int i = 0; i < list.size(); i++) { + Element n = (Element)list.get(i); + String text = n.getText(); + if (text.trim().equals("")) continue; + String name = n.getName(); + if (name.equals("title")) + metadata.add(Metadata.TITLE, text); + else if (name.equals("language")) + metadata.add(Metadata.LANGUAGE, text); + else if (name.equals("creation-date")) + metadata.add(Metadata.DATE, text); + else if (name.equals("print-date")) + metadata.add(Metadata.LAST_PRINTED, text); + else if (name.equals("generator")) + metadata.add(Metadata.APPLICATION_NAME, text); + else if (name.equals("creator")) + metadata.add(Metadata.CREATOR, text); + } + } + + public static void main(String[] args) throws Exception { + OOParser oo = new OOParser(); + Configuration conf = NutchConfiguration.create(); + oo.setConf(conf); + FileInputStream fis = new FileInputStream(args[0]); + byte[] bytes = new byte[fis.available()]; + fis.read(bytes); + fis.close(); + Content c = new Content("local", "local", bytes, "application/vnd.oasis.opendocument.text", new Metadata(), conf); + Parse p = oo.getParse(c).get(c.getUrl()); + System.out.println(p.getData()); + System.out.println("Text: '" + p.getText() + "'"); + /* + // create the test output file + OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream("e:\\ootest.txt"), "UTF-8"); + osw.write(p.getText()); + osw.flush(); + osw.close(); + */ + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |