[Archive-access-cvs] SF.net SVN: archive-access:[3344] tags/nutchwax-0_13-JIRA-WAX-75/archive/ src

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 3344
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3344&view=rev
Author:   binzino
Date:     2010-11-22 22:44:48 +0000 (Mon, 22 Nov 2010)

Log Message:
-----------
Add PDF parser that uses external 'pdftotext' tool.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml

Added Paths:
-----------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml
===================================================================

--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml	2010-11-19 02:51:57 UTC (rev 3343)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml	2010-11-22 22:44:48 UTC (rev 3344)
@@ -10,7 +10,7 @@
   <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. -->
   <!-- Also, add 'parse-pdf' -->
   <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' -->
-  <value>protocol-http|parse-(tika|text|html)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value>
+  <value>protocol-http|parse-(pdf2|tika|text|html)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value>
 </property>
 
 <!-- 

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml	2010-11-19 02:51:57 UTC (rev 3343)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml	2010-11-22 22:44:48 UTC (rev 3344)
@@ -31,7 +31,7 @@
   </mimeType>
 
   <mimeType name="application/pdf">
-    <plugin id="parse-tika" />
+    <plugin id="parse-pdf2" />
   </mimeType>
   
   <mimeType name="application/vnd.ms-excel">
@@ -152,6 +152,7 @@
     <alias name="parse-ext"          extension-id="ExtParser" />
     <alias name="parse-text"         extension-id="org.apache.nutch.parse.text.TextParser" />
     <alias name="parse-html"         extension-id="org.apache.nutch.parse.html.HtmlParser" />
+    <alias name="parse-pdf2"         extension-id="org.archive.nutchwax.parse.pdf.PDFParser" />
     <!--
     <alias name="parse-js"           extension-id="JSParser" />
     <alias name="parse-msexceld"     extension-id="org.apache.nutch.parse.msexcel.MSExcelParser" />

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml	2010-11-19 02:51:57 UTC (rev 3343)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml	2010-11-22 22:44:48 UTC (rev 3344)
@@ -91,6 +91,7 @@
      <ant dir="query-nutchwax" target="deploy" />
      <ant dir="scoring-nutchwax" target="deploy" />
      <ant dir="urlfilter-nutchwax" target="deploy" />
+     <ant dir="parse-pdf2" target="deploy" />
 
   </target>
 
@@ -202,5 +203,6 @@
     <ant dir="query-nutchwax" target="clean" />
     <ant dir="scoring-nutchwax" target="clean" />
     <ant dir="urlfilter-nutchwax" target="clean" />
+    <ant dir="parse-pdf2" target="clean" />
   </target>
 </project>

Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml	                        (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml	2010-11-22 22:44:48 UTC (rev 3344)
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-pdf2" default="jar-core">
+
+  <!--
+    Ant build file for the parse-pdf2 plugin.  Nothing is defined here;
+    everything comes from the shared ../build-plugin.xml imported below.
+    NOTE(review): that file is not part of this change; it presumably
+    supplies the "jar-core" default target named above as well as the
+    "deploy" and "clean" targets that the parent plugin build invokes
+    on this directory - confirm.
+  -->
+  <import file="../build-plugin.xml"/>
+
+</project>

Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml	                        (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml	2010-11-22 22:44:48 UTC (rev 3344)
@@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright (C) 2010 Internet Archive.
+ 
+ This file is part of the archive-access tools project
+ (http://sourceforge.net/projects/archive-access).
+ 
+ The archive-access tools are free software; you can redistribute them and/or
+ modify them under the terms of the GNU Lesser Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or any
+ later version.
+ 
+ The archive-access tools are distributed in the hope that they will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
+ Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser Public License along with
+ the archive-access tools; if not, write to the Free Software Foundation,
+ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-->
+<plugin
+   id="parse-pdf2"
+   name="External PDF Parser"
+   version="1.0.0"
+   provider-name="archive.org">
+
+   <!--
+     Plugin descriptor for the external-tool PDF parser.  The plugin id
+     "parse-pdf2" is the name used for this plugin in the plugin list in
+     nutch-site.xml and in the application/pdf mapping and alias entries
+     in parse-plugins.xml.
+   -->
+
+   <!-- The jar that holds the plugin classes; all of its names are exported. -->
+   <runtime>
+      <library name="parse-pdf2.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <!-- Registers PDFParser as an implementation of the Parser extension point. -->
+   <extension id="org.archive.nutchwax.parse.pdf"
+              name="NutchWAX External PDF Parser"
+              point="org.apache.nutch.parse.Parser">
+
+     <!--
+       The implementation id is the same string that the "parse-pdf2"
+       alias in parse-plugins.xml refers to as its extension-id.
+     -->
+     <implementation id="org.archive.nutchwax.parse.pdf.PDFParser"
+                     class="org.archive.nutchwax.parse.pdf.PDFParser">
+       <parameter name="contentType" value="application/pdf" />
+       <parameter name="pathSuffix"  value="" />
+     </implementation>
+   </extension>
+   
+</plugin>

Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java	                        (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java	2010-11-22 22:44:48 UTC (rev 3344)
@@ -0,0 +1,180 @@
+/*
+ * Copyright (C) 2010 Internet Archive.
+ * 
+ * This file is part of the archive-access tools project
+ * (http://sourceforge.net/projects/archive-access).
+ * 
+ * The archive-access tools are free software; you can redistribute them and/or
+ * modify them under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or any
+ * later version.
+ * 
+ * The archive-access tools are distributed in the hope that they will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
+ * Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser Public License along with
+ * the archive-access tools; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.nutchwax.parse.pdf;
+
+import java.io.*;
+import java.util.*;
+import java.util.regex.*;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
+import org.apache.nutch.util.LogUtil;
+
+
+/**
+ * Nutch {@link Parser} for PDF documents that delegates the actual text
+ * extraction to the external <code>pdftotext</code> command-line tool.
+ *
+ * <p>The raw document bytes are written to a temporary file and
+ * <code>pdftotext</code> is run on that file twice: once with
+ * <code>-htmlmeta</code> restricted to the first page, to obtain the
+ * document title, and once without it, to obtain the plain text of the
+ * whole document.  No outlinks are extracted.</p>
+ */
+public class PDFParser implements Parser
+{
+  public static final Log LOG = LogFactory.getLog( PDFParser.class );
+
+  /** Absolute path of the external tool that is executed. */
+  private static final String PDFTOTEXT = "/usr/bin/pdftotext";
+
+  /**
+   * Encoding requested from pdftotext (via <code>-enc</code>) and used to
+   * decode its output, so the result does not depend on the tool's or
+   * the JVM's default encoding.
+   */
+  private static final String ENCODING = "UTF-8";
+
+  /** Picks the title out of the HTML header emitted by <code>-htmlmeta</code>. */
+  private static final Pattern TITLE_PATTERN =
+    Pattern.compile( "<html>.*?<title>(.*?)</title>.*?</head>", Pattern.DOTALL );
+
+  private Configuration conf;
+
+  /**
+   * Stores the configuration handed in by the framework.
+   *
+   * @param conf configuration to remember; returned by {@link #getConf()}
+   */
+  public void setConf( Configuration conf )
+  {
+    this.conf = conf;
+  }
+
+  /**
+   * @return the configuration last passed to {@link #setConf(Configuration)},
+   *         or <code>null</code> if none has been set
+   */
+  public Configuration getConf( )
+  {
+    return this.conf;
+  }
+
+  /**
+   * Extracts title and text from a PDF document using <code>pdftotext</code>.
+   *
+   * @param content the fetched document; its raw bytes are the PDF
+   * @return a parse result with status success, the extracted title
+   *         (empty if none was found), the extracted text and no outlinks;
+   *         or <code>null</code> if anything went wrong (the failure is
+   *         logged)
+   */
+  public ParseResult getParse( Content content )
+  {
+    if ( LOG.isDebugEnabled( ) )
+      {
+        LOG.debug( "Parsing PDF with pdftotext: " + content.getUrl( ) );
+      }
+
+    File tmpfile = null;
+    try
+      {
+        // pdftotext reads from a file, so spool the document to disk first.
+        tmpfile = File.createTempFile( "pdf2-", ".pdf" );
+        writeToFile( content.getContent( ), tmpfile );
+
+        // First pass: HTML header of page 1 only, used solely for the title.
+        String head = runPdfToText( "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString( ), "-" );
+
+        // Second pass: plain text of the whole document.
+        String text = runPdfToText( tmpfile.toString( ), "-" );
+
+        String title = "";
+        Matcher m = TITLE_PATTERN.matcher( head );
+        if ( m.find( ) )
+          {
+            title = m.group( 1 );
+          }
+
+        // No outlinks.
+        Outlink[] outlinks = new Outlink[0];
+
+        ParseData parseData = new ParseData( ParseStatus.STATUS_SUCCESS,
+                                             title,
+                                             outlinks,
+                                             content.getMetadata( ),
+                                             new Metadata( ) );
+
+        return ParseResult.createParseResult( content.getUrl( ), new ParseImpl( text, parseData ) );
+      }
+    catch ( Exception e )
+      {
+        // Pass the exception as the cause so the stack trace is logged too.
+        LOG.error( "Failed to parse PDF: " + content.getUrl( ), e );
+      }
+    finally
+      {
+        if ( tmpfile != null && ! tmpfile.delete( ) )
+          {
+            LOG.warn( "Could not delete temporary file: " + tmpfile );
+          }
+      }
+
+    // Failure: callers of the original implementation received null here
+    // as well, so that contract is kept.
+    return null;
+  }
+
+  /**
+   * Writes <code>data</code> to <code>file</code>, closing the stream even
+   * when the write fails.
+   */
+  private static void writeToFile( byte[] data, File file )
+    throws IOException
+  {
+    FileOutputStream fos = new FileOutputStream( file );
+    try
+      {
+        fos.write( data );
+      }
+    finally
+      {
+        fos.close( );
+      }
+  }
+
+  /**
+   * Runs <code>pdftotext -enc UTF-8 args...</code> and returns everything
+   * it wrote to standard output.  Anything written to standard error is
+   * logged as a warning.
+   *
+   * @param args arguments placed after the encoding option
+   * @return the decoded standard output of the tool
+   * @throws IOException if the tool cannot be started, one of its streams
+   *         cannot be read, or the calling thread is interrupted
+   */
+  private String runPdfToText( String... args )
+    throws IOException
+  {
+    List<String> command = new ArrayList<String>( args.length + 3 );
+    command.add( PDFTOTEXT );
+    command.add( "-enc" );
+    command.add( ENCODING );
+    command.addAll( Arrays.asList( args ) );
+
+    Process p = new ProcessBuilder( command ).start( );
+    try
+      {
+        // The tool gets no input from us.
+        p.getOutputStream( ).close( );
+
+        // Drain stderr concurrently.  Reading stdout to EOF first and
+        // stderr afterwards can deadlock: if the tool fills the stderr
+        // pipe it blocks, never closes stdout, and we wait forever.
+        StreamDrainer stderr = new StreamDrainer( p.getErrorStream( ) );
+        stderr.start( );
+
+        String out = suck( new InputStreamReader( p.getInputStream( ), ENCODING ) );
+
+        try
+          {
+            stderr.join( );
+          }
+        catch ( InterruptedException e )
+          {
+            // Keep the interrupt visible to the caller's thread.
+            Thread.currentThread( ).interrupt( );
+            InterruptedIOException iioe = new InterruptedIOException( "Interrupted while waiting for pdftotext" );
+            iioe.initCause( e );
+            throw iioe;
+          }
+
+        byte[] err = stderr.getBytes( );
+        if ( err.length > 0 )
+          {
+            LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) );
+          }
+
+        return out;
+      }
+    finally
+      {
+        // Kill the process if it is still around and release its pipes
+        // now rather than whenever the Process object is collected.
+        p.destroy( );
+        closeQuietly( p.getInputStream( ) );
+        closeQuietly( p.getErrorStream( ) );
+        closeQuietly( p.getOutputStream( ) );
+      }
+  }
+
+  /** Closes <code>c</code>, ignoring any failure to do so. */
+  private static void closeQuietly( Closeable c )
+  {
+    try
+      {
+        c.close( );
+      }
+    catch ( IOException ignored )
+      {
+        // Best-effort cleanup; nothing useful can be done here.
+      }
+  }
+
+  /**
+   * Reads <code>is</code> until end of stream.
+   *
+   * @return all bytes read, possibly an empty array
+   */
+  private static byte[] suck( InputStream is )
+    throws IOException
+  {
+    ByteArrayOutputStream baos = new ByteArrayOutputStream( 4 * 1024 );
+    byte[] buf = new byte[1024 * 4];
+    int c = -1;
+    while ( (c = is.read( buf )) != -1 )
+      {
+        baos.write( buf, 0, c );
+      }
+
+    return baos.toByteArray( );
+  }
+
+  /**
+   * Reads <code>reader</code> until end of stream.
+   *
+   * @return all characters read, possibly an empty string
+   */
+  private static String suck( InputStreamReader reader )
+    throws IOException
+  {
+    StringBuilder sb = new StringBuilder( 1024 * 4 );
+    char[] buf = new char[1024 * 4];
+    int c = -1;
+
+    while ( (c = reader.read( buf )) != -1 )
+      {
+        sb.append( buf, 0, c );
+      }
+
+    return sb.toString( );
+  }
+
+  /**
+   * Background thread that reads one stream to its end and keeps the bytes.
+   * The result must only be fetched after <code>join()</code> has returned.
+   */
+  private static final class StreamDrainer extends Thread
+  {
+    private final InputStream in;
+    private byte[] bytes = new byte[0];
+    private IOException failure;
+
+    StreamDrainer( InputStream in )
+    {
+      super( "pdftotext-stderr" );
+      this.in = in;
+      // Never keep the JVM alive just because a child process is stuck.
+      setDaemon( true );
+    }
+
+    public void run( )
+    {
+      try
+        {
+          bytes = suck( in );
+        }
+      catch ( IOException e )
+        {
+          failure = e;
+        }
+    }
+
+    /**
+     * @return the bytes that were read
+     * @throws IOException the failure hit while reading, if any
+     */
+    byte[] getBytes( )
+      throws IOException
+    {
+      if ( failure != null )
+        {
+          throw failure;
+        }
+      return bytes;
+    }
+  }
+
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.