archive-access-cvs Mailing List for Web Archive Access Utilities

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

wayback - Build # 34 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/34/ to view the results.



wayback - Build # 33 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/33/ to view the results.



wayback - Build # 32 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/32/ to view the results.



wayback - Build # 31 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/31/ to view the results.



wayback - Build # 30 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/30/ to view the results.



wayback - Build # 29 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/29/ to view the results.



wayback - Build # 28 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/28/ to view the results.



wayback - Build # 27 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/27/ to view the results.



wayback - Build # 26 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/26/ to view the results.



wayback - Build # 25 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/25/ to view the results.



wayback - Build # 24 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/24/ to view the results.



wayback - Build # 23 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/23/ to view the results.



wayback - Build # 22 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/22/ to view the results.



wayback - Build # 21 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/21/ to view the results.



wayback - Build # 20 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/20/ to view the results.



wayback - Build # 19 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/19/ to view the results.



wayback - Build # 18 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/18/ to view the results.



wayback - Build # 17 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/17/ to view the results.



wayback - Build # 16 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/16/ to view the results.



wayback - Build # 15 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/15/ to view the results.



wayback - Build # 14 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/14/ to view the results.



wayback - Build # 13 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/13/ to view the results.



wayback - Build # 12 - Successful:

Check console output at https://builds.archive.org:1443/job/wayback/12/ to view the results.



Revision: 3628
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3628&view=rev
Author:   binzino
Date:     2012-07-03 00:43:30 +0000 (Tue, 03 Jul 2012)
Log Message:
-----------
Prepend PDF-only metadata property keys with "pdf.".

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java
===================================================================

--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java	2012-07-02 23:33:10 UTC (rev 3627)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java	2012-07-03 00:43:30 UTC (rev 3628)
@@ -123,7 +123,8 @@
               }
             else
               {
-                content.getMetadata().set( kv[0].trim(), kv[1].trim() );
+                // For all other properties, prepend the "pdf." prefix.
+                content.getMetadata().set( "pdf." + kv[0].trim(), kv[1].trim() );
               }
           }
 

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




Revision: 3627
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3627&view=rev
Author:   binzino
Date:     2012-07-02 23:33:10 +0000 (Mon, 02 Jul 2012)
Log Message:
-----------
Use pdfinfo for extracting PDF file metadata.  Also re-wrote sub-process handling to correctly handle stdout and stderr in separate threads.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java	2012-04-25 02:12:22 UTC (rev 3626)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java	2012-07-02 23:33:10 UTC (rev 3627)
@@ -51,11 +51,26 @@
 {
   public static final Log LOG = LogFactory.getLog( PDFParser.class );
 
-  private Configuration conf;
+  public Configuration conf;
 
+  public String  pdfinfo;
+  public String  pdftotext;
+  public String  lastPage;
+  public boolean raw;
+
   public void setConf( Configuration conf )
   {
     this.conf = conf;
+
+    this.pdfinfo   = this.conf.get( "nutchwax.parse.pdf2.pdfinfo.path", "/usr/bin/pdfinfo" );
+    this.pdftotext = this.conf.get( "nutchwax.parse.pdf2.pdftotext.path", "/usr/bin/pdftotext" );
+    this.lastPage  = String.valueOf( this.conf.getInt( "nutchwax.parse.pdf2.lastPage", 100 ) );
+    this.raw       = this.conf.getBoolean( "nutchwax.parse.pdf2.raw", true );
+
+    LOG.info( "nutchwax.parse.pdf2.pdfinfo.path   = " + this.pdfinfo   );
+    LOG.info( "nutchwax.parse.pdf2.pdftotext.path = " + this.pdftotext );
+    LOG.info( "nutchwax.parse.pdf2.lastPage       = " + this.lastPage  );
+    LOG.info( "nutchwax.parse.pdf2.raw            = " + this.raw       );
   }
   
   public Configuration getConf( )
@@ -69,8 +84,6 @@
     String title = "";
     String text  = "";
     
-    byte[] raw = content.getContent( );
-    
     File tmpfile = null;
     try
       {
@@ -78,51 +91,58 @@
     
         // Write the PDF document to the tmp file.
         FileOutputStream fos = new FileOutputStream( tmpfile );
-        fos.write( raw );
+        fos.write( content.getContent( ) );
         fos.close();
         
-        String exepath  = this.conf.get( "nutchwax.parse.pdf2.pdftotext.path", "/usr/bin/pdftotext" );
-        String lastPage = String.valueOf( this.conf.getInt( "nutchwax.parse.pdf2.lastPage", 100 ) );
+        // Create a Process which calls 'pdfinfo' to extract the metadata.
+        ProcessInfo pinfo = execute( pdfinfo, "-enc", "UTF-8", tmpfile.toString() );
 
-        // Create a Process which calls 'pdftotext' to extract the metadata.  Only get the first page.
-        ProcessBuilder pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" );
-        
-        Process p = pb.start();
-        
-        p.getOutputStream( ).close();
-        String head = suck( new InputStreamReader( p.getInputStream( ), "utf-8" ) );
-        byte[] err  = suck( p.getErrorStream( ) );
+        if ( pinfo.stderr.length() > 0 )
+          {
+            LOG.warn( "Error from pdfinfo: " + pinfo.stderr );
+          }
 
-        if ( err.length > 0 )
+        // Extract the title and other metadata properties from the above call to pdfinfo.
+        String[] properties = pinfo.stdout.split("\n");
+        for ( String property : properties )
           {
-            LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) );
+            String[] kv = property.split( "[:]", 2 );
+            if ( kv.length != 2 ) continue ;
+
+            if ( kv[0].trim().equals( "Title" ) )
+              {
+                title = kv[1].trim();
+              }
+            else if ( kv[0].trim().equals( "Keywords" ) )
+              {
+                content.getMetadata().set( "keywords", kv[1].trim() );
+              }
+            else if ( kv[0].trim().equals( "Subject" ) )
+              {
+                content.getMetadata().set( "subject", kv[1].trim() );
+              }
+            else
+              {
+                content.getMetadata().set( kv[0].trim(), kv[1].trim() );
+              }
           }
 
-        p.destroy( );
-
-        // Extract the title from the HTML-formatted metadata output of the above call to pdftotext.
-        Matcher m = Pattern.compile( "<title>(.+)</title>", Pattern.DOTALL ).matcher( head );
-        if ( m.find( ) )
+        // Create a Process which calls 'pdftotext' to extract the content.
+        if ( raw )
           {
-            title = m.group(1);
+            pinfo = execute( pdftotext, "-q", "-raw", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" );
           }
-       
-        // Create a Process which calls 'pdftotext' to extract the content.
-        pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" );
-        p = pb.start( );
+        else
+          {
+            pinfo = execute( pdftotext, "-q",         "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" );
+          }
 
-        p.getOutputStream( ).close( );
-        text = suck( new InputStreamReader( p.getInputStream( ), "utf-8" ) );
-        err  = suck( p.getErrorStream( ) );
-
-        if ( err.length > 0 )
+        if ( pinfo.stderr.length() > 0 )
           {
-            LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) );
+            LOG.warn( "Error from pdftotext: " + pinfo.stderr );
           }
         
-        p.destroy( );
-
-        // No outlinks, sorry :(
+        // No outlinks with pdftotext, sorry :(
         Outlink[] outlinks  = new Outlink[0];
 
         ParseData parseData = new ParseData( ParseStatus.STATUS_SUCCESS, 
@@ -131,7 +151,7 @@
                                              content.getMetadata(),
                                              metadata );
 
-        return ParseResult.createParseResult( content.getUrl(), new ParseImpl( text, parseData ) );
+        return ParseResult.createParseResult( content.getUrl(), new ParseImpl( pinfo.stdout, parseData ) );
       }
     catch ( Exception e )
       {
@@ -145,37 +165,83 @@
           }
       }
 
-    // TODO!
+    // FIXME: Is there anything better/smarter to do here?
     return null;
   }
-  
-  private byte[] suck( InputStream is )
-    throws IOException
+
+  ProcessInfo execute( String... args ) throws Exception
   {
-    ByteArrayOutputStream baos = new ByteArrayOutputStream( 4* 1024 );
-    byte[] buf = new byte[1024*4];
-    int c = -1;
-    while ( (c = is.read( buf )) != -1 )
-      {
-        baos.write( buf, 0, c );
-      }
+    // Create a Process which runs the given command line.
+    ProcessBuilder pb = new ProcessBuilder( args );
     
-    return baos.toByteArray();
+    Process p = pb.start();
+    
+    // Close the stdin of the child process.
+    p.getOutputStream( ).close();
+    
+    Sucker stdoutSucker = new Sucker( p.getInputStream() );
+    Sucker stderrSucker = new Sucker( p.getErrorStream() );
+    
+    stdoutSucker.start();
+    stderrSucker.start();
+    
+    p.waitFor();
+    
+    ProcessInfo result = new ProcessInfo();
+
+    result.returnCode = p.exitValue();
+    result.stdout     = new String( stdoutSucker.getBytes(), "utf-8" );
+    result.stderr     = new String( stderrSucker.getBytes(), "utf-8" );
+    
+    return result;
   }
 
-  private String suck( InputStreamReader reader )
-    throws IOException
+  /**
+   * Simple struct to hold sub-process return code, stdout and stderr.
+   */
+  static class ProcessInfo
   {
-    StringBuilder sb = new StringBuilder( 1024 * 4 );
-    char[] buf = new char[1024*4];
-    int c = -1;
+    public int    returnCode;
+    public String stdout;
+    public String stderr;
+  }
+  
+  /**
+   * Thread that sucks up the output of a sub-process stdout or stderr.
+   */
+  static class Sucker extends Thread
+  {
+    InputStream is;
+    ByteArrayOutputStream baos;
 
-    while ( (c = reader.read( buf )) != -1 )
-      {
-        sb.append( buf, 0, c );
-      }
+    public Sucker( InputStream is )
+    {
+      this.is = is;
+      this.baos = new ByteArrayOutputStream( 4*1024 );
+    }
 
-    return sb.toString();    
-  }
+    public byte[] getBytes()
+    {
+      return this.baos.toByteArray();
+    }
+
+    public void run( )
+    {
+      try
+        {
+          byte[] buf = new byte[4*1024];
+          int c = -1;
+          while ( (c = is.read( buf )) != -1 )
+            {
+              baos.write( buf, 0, c );
+            }
+        }
+      catch ( IOException ioe )
+        {
+          PDFParser.LOG.warn( "Error reading from sub-process: " + ioe );
+        }
+    }
+
+  };
   
 }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





2005	Jan	Feb	Mar	Apr	May	Jun	Jul (1)	Aug (10)	Sep (36)	Oct (339)	Nov (103)	Dec (152)
2006	Jan (141)	Feb (102)	Mar (125)	Apr (203)	May (57)	Jun (30)	Jul (139)	Aug (46)	Sep (64)	Oct (105)	Nov (34)	Dec (162)
2007	Jan (81)	Feb (57)	Mar (141)	Apr (72)	May (9)	Jun (1)	Jul (144)	Aug (88)	Sep (40)	Oct (43)	Nov (34)	Dec (20)
2008	Jan (44)	Feb (45)	Mar (16)	Apr (36)	May (8)	Jun (77)	Jul (177)	Aug (66)	Sep (8)	Oct (33)	Nov (13)	Dec (37)
2009	Jan (2)	Feb (5)	Mar (8)	Apr	May (36)	Jun (19)	Jul (46)	Aug (8)	Sep (1)	Oct (66)	Nov (61)	Dec (10)
2010	Jan (13)	Feb (16)	Mar (38)	Apr (76)	May (47)	Jun (32)	Jul (35)	Aug (45)	Sep (20)	Oct (61)	Nov (24)	Dec (16)
2011	Jan (22)	Feb (34)	Mar (11)	Apr (8)	May (24)	Jun (23)	Jul (11)	Aug (42)	Sep (81)	Oct (48)	Nov (21)	Dec (20)
2012	Jan (30)	Feb (25)	Mar (4)	Apr (6)	May (1)	Jun (5)	Jul (5)	Aug (8)	Sep (6)	Oct (6)	Nov	Dec

archive-access-cvs Mailing List for Web Archive Access Utilities

archive-access-cvs — CVS commits