[Archive-access-cvs] SF.net SVN: archive-access:[2832] tags/nutchwax-0_12_9/archive/src/nutch/ src

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 2832
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2832&view=rev
Author:   binzino
Date:     2009-10-26 22:57:25 +0000 (Mon, 26 Oct 2009)

Log Message:
-----------
Fix WAX-67.  One-line change to include metadata passed in from Importer.

Added Paths:
-----------
    tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/
    tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/
    tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/
    tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/
    tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/
    tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/
    tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/
    tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/
    tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/
    tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java

Added: tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java
===================================================================

--- tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java	                        (rev 0)
+++ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java	2009-10-26 22:57:25 UTC (rev 2832)
@@ -0,0 +1,220 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.oo;
+
+import java.io.*;
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.zip.*;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.LogUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.jaxen.*;
+import org.jaxen.jdom.JDOMXPath;
+import org.jdom.*;
+import org.jdom.input.*;
+
+/**
+ * Parser for OpenOffice and OpenDocument formats. This should handle
+ * the following formats: Text, Spreadsheet, Presentation, and
+ * corresponding templates and "master" documents.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class OOParser implements Parser {
+  public static final Log LOG = LogFactory.getLog(OOParser.class);
+
+  private Configuration conf;
+
+  public OOParser () {
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Parses the raw bytes of <code>content</code> as a zipped
+   * OpenOffice/OpenDocument file.
+   *
+   * <p>The <code>content.xml</code> entry supplies the plain text and the
+   * outlinks, the <code>meta.xml</code> entry supplies the parse metadata
+   * (including the title). All other zip entries are ignored.</p>
+   *
+   * @param content the fetched content to parse
+   * @return a successful parse result keyed by the content URL; or an empty
+   *         result with a FAILED status when the "Content-Length" metadata
+   *         value disagrees with the number of bytes actually present, or
+   *         when any exception is raised while reading the archive
+   */
+  public ParseResult getParse(Content content) {
+    String text = null;
+    String title = null;
+    Metadata metadata = new Metadata();
+    ArrayList outlinks = new ArrayList();
+
+    try {
+      byte[] raw = content.getContent();
+      String contentLength = content.getMetadata().get("Content-Length");
+      if (contentLength != null
+            && raw.length != Integer.parseInt(contentLength)) {
+          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
+                  "Content truncated at "+raw.length
+            +" bytes. Parser can't handle incomplete files.").getEmptyParseResult(content.getUrl(), conf);
+      }
+      ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(raw));
+      try {
+        ZipEntry ze = null;
+        while ((ze = zis.getNextEntry()) != null) {
+          if (ze.getName().equals("content.xml")) {
+            text = parseContent(ze, zis, outlinks);
+          } else if (ze.getName().equals("meta.xml")) {
+            parseMeta(ze, zis, metadata);
+          }
+        }
+      } finally {
+        // release the stream even when one of the entries fails to parse
+        zis.close();
+      }
+    } catch (Exception e) { // run time exception
+      e.printStackTrace(LogUtil.getWarnStream(LOG));
+      return new ParseStatus(ParseStatus.FAILED,
+              "Can't be handled as OO document. " + e).getEmptyParseResult(content.getUrl(), conf);
+    }
+
+    title = metadata.get(Metadata.TITLE);
+    if (text == null)
+      text = "";
+
+    if (title == null)
+      title = "";
+
+    Outlink[] links = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
+    // content.getMetadata() carries the metadata handed in with the content
+    // through to the ParseData; "metadata" holds what was read from meta.xml.
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, links, content.getMetadata(), metadata);
+    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
+  }
+
+  /**
+   * Builds a JDOM document from the zip entry the stream is currently
+   * positioned on.
+   *
+   * <p>The stream is wrapped so that a <code>close()</code> issued through
+   * the wrapper is a no-op, which keeps the caller's entry loop usable after
+   * this entry has been consumed.</p>
+   *
+   * NOTE(review): the documents come from crawled, untrusted sources and the
+   * builder is used with its default settings - confirm whether DTD and
+   * external entity processing should be restricted here.
+   *
+   * @param zis zip stream positioned at the entry to read
+   * @return the parsed document
+   * @throws JDOMException if the entry is not well-formed XML
+   * @throws IOException if the entry cannot be read
+   */
+  private static Document readXml(ZipInputStream zis) throws JDOMException, IOException {
+    InputStream shielded = new FilterInputStream(zis) {
+      public void close() {
+        // intentionally empty: the zip stream is closed by getParse()
+      }
+    };
+    SAXBuilder builder = new SAXBuilder();
+    return builder.build(shielded);
+  }
+
+  /**
+   * Extracts as much plain text as possible from <code>content.xml</code>
+   * and collects the hyperlinks found in it.
+   *
+   * @param ze the zip entry being read (not used by this method)
+   * @param zis zip stream positioned at the entry
+   * @param outlinks list that receives one {@link Outlink} per usable link
+   * @return the extracted text
+   * @throws Exception if the entry cannot be read or parsed, or if its root
+   *         element does not declare the <code>text</code> namespace prefix
+   */
+  private String parseContent(ZipEntry ze, ZipInputStream zis, ArrayList outlinks) throws Exception {
+    StringBuffer res = new StringBuffer();
+    Document doc = readXml(zis);
+    Element root = doc.getRootElement();
+    Namespace textNs = root.getNamespace("text");
+    if (textNs == null) {
+      // fail with a readable message instead of a bare NullPointerException
+      throw new IOException("content.xml root element does not declare the 'text' namespace prefix");
+    }
+    // XXX this is expensive for very large documents. In those cases another
+    // XXX method (direct processing of SAX events, or XMLPull) should be used.
+    XPath path = new JDOMXPath("//text:span | //text:p | //text:tab | //text:tab-stop | //text:a");
+    path.addNamespace("text", textNs.getURI());
+    Namespace xlink = Namespace.getNamespace("xlink", "http://www.w3.org/1999/xlink");
+    List list = path.selectNodes(doc);
+    // true while the output ends with a paragraph break (or is still empty),
+    // i.e. no separator is needed before the next piece of text
+    boolean lastp = true;
+    for (int i = 0; i < list.size(); i++) {
+      Element el = (Element)list.get(i);
+      String name = el.getName();
+      String text = el.getText();
+      if (name.equals("p")) {
+        // skip empty paragraphs
+        if (!text.equals("")) {
+          if (!lastp) res.append("\n");
+          res.append(text + "\n");
+          lastp = true;
+        }
+      } else if (name.startsWith("tab")) {
+        res.append("\t");
+        lastp = false;
+      } else if (name.equals("a")) {
+        List nl = el.getChildren();
+        String a = null;
+        for (int k = 0; k < nl.size(); k++) {
+          Element anchor = (Element)nl.get(k);
+          String nsName = anchor.getNamespacePrefix() + ":" + anchor.getName();
+          if (!nsName.equals("text:span")) continue;
+          a = anchor.getText();
+          break;
+        }
+        // a link without a styled span keeps its label directly inside the
+        // text:a element; use that instead of dropping the anchor text
+        if (a == null) a = text;
+        String u = el.getAttributeValue("href", xlink);
+        if (u == null) u = a; // often anchors are URLs
+        if (u != null && u.length() > 0) {
+          try {
+            Outlink o = new Outlink(u, a);
+            outlinks.add(o);
+          } catch (MalformedURLException mue) {
+            // skip
+          }
+        }
+        if (a != null && !a.equals("")) {
+          if (!lastp) res.append(' ');
+          res.append(a);
+          lastp = false;
+        }
+      } else {
+        if (!text.equals("")) {
+          if (!lastp) res.append(' ');
+          res.append(text);
+        }
+        lastp = false;
+      }
+    }
+    return res.toString();
+  }
+
+  /**
+   * Extracts document metadata from <code>meta.xml</code> and stores the
+   * recognised properties under the corresponding Nutch metadata names.
+   * Elements whose text is blank, and elements with any other name, are
+   * ignored.
+   *
+   * @param ze the zip entry being read (not used by this method)
+   * @param zis zip stream positioned at the entry
+   * @param metadata receives the extracted values
+   * @throws Exception if the entry cannot be read or parsed, or if its root
+   *         element does not declare the <code>office</code> namespace prefix
+   */
+  private void parseMeta(ZipEntry ze, ZipInputStream zis, Metadata metadata) throws Exception {
+    Document doc = readXml(zis);
+    XPath path = new JDOMXPath("/office:document-meta/office:meta/*");
+    Element root = doc.getRootElement();
+    Namespace officeNs = root.getNamespace("office");
+    if (officeNs == null) {
+      // fail with a readable message instead of a bare NullPointerException
+      throw new IOException("meta.xml root element does not declare the 'office' namespace prefix");
+    }
+    path.addNamespace("office", officeNs.getURI());
+    List list = path.selectNodes(doc);
+    for (int i = 0; i < list.size(); i++) {
+      Element n = (Element)list.get(i);
+      String text = n.getText();
+      if (text.trim().equals("")) continue;
+      String name = n.getName();
+      if (name.equals("title"))
+        metadata.add(Metadata.TITLE, text);
+      else if (name.equals("language"))
+        metadata.add(Metadata.LANGUAGE, text);
+      else if (name.equals("creation-date"))
+        metadata.add(Metadata.DATE, text);
+      else if (name.equals("print-date"))
+        metadata.add(Metadata.LAST_PRINTED, text);
+      else if (name.equals("generator"))
+        metadata.add(Metadata.APPLICATION_NAME, text);
+      else if (name.equals("creator"))
+        metadata.add(Metadata.CREATOR, text);
+    }
+  }
+
+  /**
+   * Command-line test driver: parses the file named by the first argument
+   * as an OpenDocument text file and prints the parse data and the
+   * extracted text to standard output.
+   *
+   * @param args <code>args[0]</code> is the path of the file to parse
+   * @throws Exception if the file cannot be read
+   */
+  public static void main(String[] args) throws Exception {
+    if (args.length < 1) {
+      System.err.println("Usage: OOParser <file>");
+      System.exit(-1);
+    }
+    OOParser oo = new OOParser();
+    Configuration conf = NutchConfiguration.create();
+    oo.setConf(conf);
+    // available() is only an estimate and a single read() may return fewer
+    // bytes than requested, so read until end of stream
+    ByteArrayOutputStream bos = new ByteArrayOutputStream();
+    FileInputStream fis = new FileInputStream(args[0]);
+    try {
+      byte[] buf = new byte[4096];
+      int n;
+      while ((n = fis.read(buf)) != -1) {
+        bos.write(buf, 0, n);
+      }
+    } finally {
+      fis.close();
+    }
+    byte[] bytes = bos.toByteArray();
+    Content c = new Content("local", "local", bytes, "application/vnd.oasis.opendocument.text", new Metadata(), conf);
+    Parse p = oo.getParse(c).get(c.getUrl());
+    System.out.println(p.getData());
+    System.out.println("Text: '" + p.getText() + "'");
+  }
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.