Revision: 3627
http://archive-access.svn.sourceforge.net/archive-access/?rev=3627&view=rev
Author: binzino
Date: 2012-07-02 23:33:10 +0000 (Mon, 02 Jul 2012)
Log Message:
-----------
Use pdfinfo to extract PDF file metadata. Also rewrote the sub-process handling so that stdout and stderr are each read in a separate thread and both are handled correctly.
Modified Paths:
--------------
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2012-04-25 02:12:22 UTC (rev 3626)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2012-07-02 23:33:10 UTC (rev 3627)
@@ -51,11 +51,26 @@
{
public static final Log LOG = LogFactory.getLog( PDFParser.class );
- private Configuration conf;
+ public Configuration conf;
+ public String pdfinfo;
+ public String pdftotext;
+ public String lastPage;
+ public boolean raw;
+
public void setConf( Configuration conf )
{
this.conf = conf;
+
+ this.pdfinfo = this.conf.get( "nutchwax.parse.pdf2.pdfinfo.path", "/usr/bin/pdfinfo" );
+ this.pdftotext = this.conf.get( "nutchwax.parse.pdf2.pdftotext.path", "/usr/bin/pdftotext" );
+ this.lastPage = String.valueOf( this.conf.getInt( "nutchwax.parse.pdf2.lastPage", 100 ) );
+ this.raw = this.conf.getBoolean( "nutchwax.parse.pdf2.raw", true );
+
+ LOG.info( "nutchwax.parse.pdf2.pdfinfo.path = " + this.pdfinfo );
+ LOG.info( "nutchwax.parse.pdf2.pdftotext.path = " + this.pdftotext );
+ LOG.info( "nutchwax.parse.pdf2.lastPage = " + this.lastPage );
+ LOG.info( "nutchwax.parse.pdf2.raw = " + this.raw );
}
public Configuration getConf( )
@@ -69,8 +84,6 @@
String title = "";
String text = "";
- byte[] raw = content.getContent( );
-
File tmpfile = null;
try
{
@@ -78,51 +91,58 @@
// Write the PDF document to the tmp file.
FileOutputStream fos = new FileOutputStream( tmpfile );
- fos.write( raw );
+ fos.write( content.getContent( ) );
fos.close();
- String exepath = this.conf.get( "nutchwax.parse.pdf2.pdftotext.path", "/usr/bin/pdftotext" );
- String lastPage = String.valueOf( this.conf.getInt( "nutchwax.parse.pdf2.lastPage", 100 ) );
+ // Create a Process which calls 'pdfinfo' to extract the metadata.
+ ProcessInfo pinfo = execute( pdfinfo, "-enc", "UTF-8", tmpfile.toString() );
- // Create a Process which calls 'pdftotext' to extract the metadata. Only get the first page.
- ProcessBuilder pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" );
-
- Process p = pb.start();
-
- p.getOutputStream( ).close();
- String head = suck( new InputStreamReader( p.getInputStream( ), "utf-8" ) );
- byte[] err = suck( p.getErrorStream( ) );
+ if ( pinfo.stderr.length() > 0 )
+ {
+ LOG.warn( "Error from pdfinfo: " + pinfo.stderr );
+ }
- if ( err.length > 0 )
+ // Extract the title and other metadata properties from the above call to pdfinfo.
+ String[] properties = pinfo.stdout.split("\n");
+ for ( String property : properties )
{
- LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) );
+ String[] kv = property.split( "[:]", 2 );
+ if ( kv.length != 2 ) continue ;
+
+ if ( kv[0].trim().equals( "Title" ) )
+ {
+ title = kv[1].trim();
+ }
+ else if ( kv[0].trim().equals( "Keywords" ) )
+ {
+ content.getMetadata().set( "keywords", kv[1].trim() );
+ }
+ else if ( kv[0].trim().equals( "Subject" ) )
+ {
+ content.getMetadata().set( "subject", kv[1].trim() );
+ }
+ else
+ {
+ content.getMetadata().set( kv[0].trim(), kv[1].trim() );
+ }
}
- p.destroy( );
-
- // Extract the title from the HTML-formatted metadata output of the above call to pdftotext.
- Matcher m = Pattern.compile( "<title>(.+)</title>", Pattern.DOTALL ).matcher( head );
- if ( m.find( ) )
+ // Create a Process which calls 'pdftotext' to extract the content.
+ if ( raw )
{
- title = m.group(1);
+ pinfo = execute( pdftotext, "-q", "-raw", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" );
}
-
- // Create a Process which calls 'pdftotext' to extract the content.
- pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" );
- p = pb.start( );
+ else
+ {
+ pinfo = execute( pdftotext, "-q", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" );
+ }
- p.getOutputStream( ).close( );
- text = suck( new InputStreamReader( p.getInputStream( ), "utf-8" ) );
- err = suck( p.getErrorStream( ) );
-
- if ( err.length > 0 )
+ if ( pinfo.stderr.length() > 0 )
{
- LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) );
+ LOG.warn( "Error from pdftotext: " + pinfo.stderr );
}
- p.destroy( );
-
- // No outlinks, sorry :(
+ // No outlinks with pdftotext, sorry :(
Outlink[] outlinks = new Outlink[0];
ParseData parseData = new ParseData( ParseStatus.STATUS_SUCCESS,
@@ -131,7 +151,7 @@
content.getMetadata(),
metadata );
- return ParseResult.createParseResult( content.getUrl(), new ParseImpl( text, parseData ) );
+ return ParseResult.createParseResult( content.getUrl(), new ParseImpl( pinfo.stdout, parseData ) );
}
catch ( Exception e )
{
@@ -145,37 +165,83 @@
}
}
- // TODO!
+ // FIXME: Is there anything better/smarter to do here?
return null;
}
-
- private byte[] suck( InputStream is )
- throws IOException
+
+ ProcessInfo execute( String... args ) throws Exception
{
- ByteArrayOutputStream baos = new ByteArrayOutputStream( 4* 1024 );
- byte[] buf = new byte[1024*4];
- int c = -1;
- while ( (c = is.read( buf )) != -1 )
- {
- baos.write( buf, 0, c );
- }
+ // Create a Process which runs the command line given in 'args'.
+ ProcessBuilder pb = new ProcessBuilder( args );
- return baos.toByteArray();
+ Process p = pb.start();
+
+ // Close the stdin of the child process.
+ p.getOutputStream( ).close();
+
+ Sucker stdoutSucker = new Sucker( p.getInputStream() );
+ Sucker stderrSucker = new Sucker( p.getErrorStream() );
+
+ stdoutSucker.start();
+ stderrSucker.start();
+
+ p.waitFor();
+
+ ProcessInfo result = new ProcessInfo();
+
+ result.returnCode = p.exitValue();
+ result.stdout = new String( stdoutSucker.getBytes(), "utf-8" );
+ result.stderr = new String( stderrSucker.getBytes(), "utf-8" );
+
+ return result;
}
- private String suck( InputStreamReader reader )
- throws IOException
+ /**
+ * Simple struct to hold sub-process return code, stdout and stderr.
+ */
+ static class ProcessInfo
{
- StringBuilder sb = new StringBuilder( 1024 * 4 );
- char[] buf = new char[1024*4];
- int c = -1;
+ public int returnCode;
+ public String stdout;
+ public String stderr;
+ }
+
+ /**
+ * Thread that sucks up the output of a sub-process stdout or stderr.
+ */
+ static class Sucker extends Thread
+ {
+ InputStream is;
+ ByteArrayOutputStream baos;
- while ( (c = reader.read( buf )) != -1 )
- {
- sb.append( buf, 0, c );
- }
+ public Sucker( InputStream is )
+ {
+ this.is = is;
+ this.baos = new ByteArrayOutputStream( 4*1024 );
+ }
- return sb.toString();
- }
+ public byte[] getBytes()
+ {
+ return this.baos.toByteArray();
+ }
+
+ public void run( )
+ {
+ try
+ {
+ byte[] buf = new byte[4*1024];
+ int c = -1;
+ while ( (c = is.read( buf )) != -1 )
+ {
+ baos.write( buf, 0, c );
+ }
+ }
+ catch ( IOException ioe )
+ {
+ PDFParser.LOG.warn( "Error reading from sub-process: " + ioe );
+ }
+ }
+
+ };
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|