[Archive-access-cvs] SF.net SVN: archive-access:[3627] tags/nutchwax-0_13-JIRA-WAX-75/archive/src

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 3627
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3627&view=rev
Author:   binzino
Date:     2012-07-02 23:33:10 +0000 (Mon, 02 Jul 2012)
Log Message:
-----------
Use pdfinfo for extracting PDF file metadata.  Also re-wrote sub-process handling to correctly handle stdout and stderr in separate threads.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java
===================================================================

--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java	2012-04-25 02:12:22 UTC (rev 3626)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java	2012-07-02 23:33:10 UTC (rev 3627)
@@ -51,11 +51,26 @@
 {
   public static final Log LOG = LogFactory.getLog( PDFParser.class );
 
-  private Configuration conf;
+  public Configuration conf;
 
+  public String  pdfinfo;
+  public String  pdftotext;
+  public String  lastPage;
+  public boolean raw;
+
   public void setConf( Configuration conf )
   {
     this.conf = conf;
+
+    this.pdfinfo   = this.conf.get( "nutchwax.parse.pdf2.pdfinfo.path", "/usr/bin/pdfinfo" );
+    this.pdftotext = this.conf.get( "nutchwax.parse.pdf2.pdftotext.path", "/usr/bin/pdftotext" );
+    this.lastPage  = String.valueOf( this.conf.getInt( "nutchwax.parse.pdf2.lastPage", 100 ) );
+    this.raw       = this.conf.getBoolean( "nutchwax.parse.pdf2.raw", true );
+
+    LOG.info( "nutchwax.parse.pdf2.pdfinfo.path   = " + this.pdfinfo   );
+    LOG.info( "nutchwax.parse.pdf2.pdftotext.path = " + this.pdftotext );
+    LOG.info( "nutchwax.parse.pdf2.lastPage       = " + this.lastPage  );
+    LOG.info( "nutchwax.parse.pdf2.raw            = " + this.raw       );
   }
   
   public Configuration getConf( )
@@ -69,8 +84,6 @@
     String title = "";
     String text  = "";
     
-    byte[] raw = content.getContent( );
-    
     File tmpfile = null;
     try
       {
@@ -78,51 +91,58 @@
     
         // Write the PDF document to the tmp file.
         FileOutputStream fos = new FileOutputStream( tmpfile );
-        fos.write( raw );
+        fos.write( content.getContent( ) );
         fos.close();
         
-        String exepath  = this.conf.get( "nutchwax.parse.pdf2.pdftotext.path", "/usr/bin/pdftotext" );
-        String lastPage = String.valueOf( this.conf.getInt( "nutchwax.parse.pdf2.lastPage", 100 ) );
+        // Create a Process which calls 'pdfinfo' to extract the metadata.
+        ProcessInfo pinfo = execute( pdfinfo, "-enc", "UTF-8", tmpfile.toString() );
 
-        // Create a Process which calls 'pdftotext' to extract the metadata.  Only get the first page.
-        ProcessBuilder pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" );
-        
-        Process p = pb.start();
-        
-        p.getOutputStream( ).close();
-        String head = suck( new InputStreamReader( p.getInputStream( ), "utf-8" ) );
-        byte[] err  = suck( p.getErrorStream( ) );
+        if ( pinfo.stderr.length() > 0 )
+          {
+            LOG.warn( "Error from pdfinfo: " + pinfo.stderr );
+          }
 
-        if ( err.length > 0 )
+        // Extract the title and other metadata properties from the above call to pdfinfo.
+        String[] properties = pinfo.stdout.split("\n");
+        for ( String property : properties )
           {
-            LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) );
+            String[] kv = property.split( "[:]", 2 );
+            if ( kv.length != 2 ) continue ;
+
+            if ( kv[0].trim().equals( "Title" ) )
+              {
+                title = kv[1].trim();
+              }
+            else if ( kv[0].trim().equals( "Keywords" ) )
+              {
+                content.getMetadata().set( "keywords", kv[1].trim() );
+              }
+            else if ( kv[0].trim().equals( "Subject" ) )
+              {
+                content.getMetadata().set( "subject", kv[1].trim() );
+              }
+            else
+              {
+                content.getMetadata().set( kv[0].trim(), kv[1].trim() );
+              }
           }
 
-        p.destroy( );
-
-        // Extract the title from the HTML-formatted metadata output of the above call to pdftotext.
-        Matcher m = Pattern.compile( "<title>(.+)</title>", Pattern.DOTALL ).matcher( head );
-        if ( m.find( ) )
+        // Create a Process which calls 'pdftotext' to extract the content.
+        if ( raw )
           {
-            title = m.group(1);
+            pinfo = execute( pdftotext, "-q", "-raw", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" );
           }
-       
-        // Create a Process which calls 'pdftotext' to extract the content.
-        pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" );
-        p = pb.start( );
+        else
+          {
+            pinfo = execute( pdftotext, "-q",         "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" );
+          }
 
-        p.getOutputStream( ).close( );
-        text = suck( new InputStreamReader( p.getInputStream( ), "utf-8" ) );
-        err  = suck( p.getErrorStream( ) );
-
-        if ( err.length > 0 )
+        if ( pinfo.stderr.length() > 0 )
           {
-            LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) );
+            LOG.warn( "Error from pdftotext: " + pinfo.stderr );
           }
         
-        p.destroy( );
-
-        // No outlinks, sorry :(
+        // No outlinks with pdftotext, sorry :(
         Outlink[] outlinks  = new Outlink[0];
 
         ParseData parseData = new ParseData( ParseStatus.STATUS_SUCCESS, 
@@ -131,7 +151,7 @@
                                              content.getMetadata(),
                                              metadata );
 
-        return ParseResult.createParseResult( content.getUrl(), new ParseImpl( text, parseData ) );
+        return ParseResult.createParseResult( content.getUrl(), new ParseImpl( pinfo.stdout, parseData ) );
       }
     catch ( Exception e )
       {
@@ -145,37 +165,83 @@
           }
       }
 
-    // TODO!
+    // FIXME: Is there anything better/smarter to do here?
     return null;
   }
-  
-  private byte[] suck( InputStream is )
-    throws IOException
+
+  ProcessInfo execute( String... args ) throws Exception
   {
-    ByteArrayOutputStream baos = new ByteArrayOutputStream( 4* 1024 );
-    byte[] buf = new byte[1024*4];
-    int c = -1;
-    while ( (c = is.read( buf )) != -1 )
-      {
-        baos.write( buf, 0, c );
-      }
+    // Create a Process which calls 'pdfinfo' to extract the metadata.
+    ProcessBuilder pb = new ProcessBuilder( args );
     
-    return baos.toByteArray();
+    Process p = pb.start();
+    
+    // Close the stdin of the child process.
+    p.getOutputStream( ).close();
+    
+    Sucker stdoutSucker = new Sucker( p.getInputStream() );
+    Sucker stderrSucker = new Sucker( p.getErrorStream() );
+    
+    stdoutSucker.start();
+    stderrSucker.start();
+    
+    p.waitFor();
+    
+    ProcessInfo result = new ProcessInfo();
+
+    result.returnCode = p.exitValue();
+    result.stdout     = new String( stdoutSucker.getBytes(), "utf-8" );
+    result.stderr     = new String( stderrSucker.getBytes(), "utf-8" );
+    
+    return result;
   }
 
-  private String suck( InputStreamReader reader )
-    throws IOException
+  /**
+   * Simple struct to hold sub-process return code, stdout and stderr.
+   */
+  static class ProcessInfo
   {
-    StringBuilder sb = new StringBuilder( 1024 * 4 );
-    char[] buf = new char[1024*4];
-    int c = -1;
+    public int    returnCode;
+    public String stdout;
+    public String stderr;
+  }
+  
+  /**
+   * Thread that sucks up the output of a sub-process stdout or stderr.
+   */
+  static class Sucker extends Thread
+  {
+    InputStream is;
+    ByteArrayOutputStream baos;
 
-    while ( (c = reader.read( buf )) != -1 )
-      {
-        sb.append( buf, 0, c );
-      }
+    public Sucker( InputStream is )
+    {
+      this.is = is;
+      this.baos = new ByteArrayOutputStream( 4*1024 );
+    }
 
-    return sb.toString();    
-  }
+    public byte[] getBytes()
+    {
+      return this.baos.toByteArray();
+    }
+
+    public void run( )
+    {
+      try
+        {
+          byte[] buf = new byte[4*1024];
+          int c = -1;
+          while ( (c = is.read( buf )) != -1 )
+            {
+              baos.write( buf, 0, c );
+            }
+        }
+      catch ( IOException ioe )
+        {
+          PDFParser.LOG.warn( "Error reading from sub-process: " + ioe );
+        }
+    }
+
+  };
   
 }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





[Archive-access-cvs] SF.net SVN: archive-access:[3627] tags/nutchwax-0_13-JIRA-WAX-75/archive/src

[Archive-access-cvs] SF.net SVN: archive-access:[3627] tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java