Revision: 3627 http://archive-access.svn.sourceforge.net/archive-access/?rev=3627&view=rev Author: binzino Date: 2012-07-02 23:33:10 +0000 (Mon, 02 Jul 2012) Log Message: ----------- Use pdfinfo for extracting PDF file metadata. Also re-wrote sub-process handling to correctly handle stdout and stderr in separate threads. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2012-04-25 02:12:22 UTC (rev 3626) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2012-07-02 23:33:10 UTC (rev 3627) @@ -51,11 +51,26 @@ { public static final Log LOG = LogFactory.getLog( PDFParser.class ); - private Configuration conf; + public Configuration conf; + public String pdfinfo; + public String pdftotext; + public String lastPage; + public boolean raw; + public void setConf( Configuration conf ) { this.conf = conf; + + this.pdfinfo = this.conf.get( "nutchwax.parse.pdf2.pdfinfo.path", "/usr/bin/pdfinfo" ); + this.pdftotext = this.conf.get( "nutchwax.parse.pdf2.pdftotext.path", "/usr/bin/pdftotext" ); + this.lastPage = String.valueOf( this.conf.getInt( "nutchwax.parse.pdf2.lastPage", 100 ) ); + this.raw = this.conf.getBoolean( "nutchwax.parse.pdf2.raw", true ); + + LOG.info( "nutchwax.parse.pdf2.pdfinfo.path = " + this.pdfinfo ); + LOG.info( "nutchwax.parse.pdf2.pdftotext.path = " + this.pdftotext ); + LOG.info( "nutchwax.parse.pdf2.lastPage = " + this.lastPage ); + LOG.info( "nutchwax.parse.pdf2.raw = " + this.raw ); } public Configuration getConf( ) @@ -69,8 +84,6 @@ String title = ""; String text = ""; - byte[] 
raw = content.getContent( ); - File tmpfile = null; try { @@ -78,51 +91,58 @@ // Write the PDF document to the tmp file. FileOutputStream fos = new FileOutputStream( tmpfile ); - fos.write( raw ); + fos.write( content.getContent( ) ); fos.close(); - String exepath = this.conf.get( "nutchwax.parse.pdf2.pdftotext.path", "/usr/bin/pdftotext" ); - String lastPage = String.valueOf( this.conf.getInt( "nutchwax.parse.pdf2.lastPage", 100 ) ); + // Create a Process which calls 'pdfinfo' to extract the metadata. + ProcessInfo pinfo = execute( pdfinfo, "-enc", "UTF-8", tmpfile.toString() ); - // Create a Process which calls 'pdftotext' to extract the metadata. Only get the first page. - ProcessBuilder pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" ); - - Process p = pb.start(); - - p.getOutputStream( ).close(); - String head = suck( new InputStreamReader( p.getInputStream( ), "utf-8" ) ); - byte[] err = suck( p.getErrorStream( ) ); + if ( pinfo.stderr.length() > 0 ) + { + LOG.warn( "Error from pdfinfo: " + pinfo.stderr ); + } - if ( err.length > 0 ) + // Extract the title and other metadata properties from the above call to pdfinfo. + String[] properties = pinfo.stdout.split("\n"); + for ( String property : properties ) { - LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) ); + String[] kv = property.split( "[:]", 2 ); + if ( kv.length != 2 ) continue ; + + if ( kv[0].trim().equals( "Title" ) ) + { + title = kv[1].trim(); + } + else if ( kv[0].trim().equals( "Keywords" ) ) + { + content.getMetadata().set( "keywords", kv[1].trim() ); + } + else if ( kv[0].trim().equals( "Subject" ) ) + { + content.getMetadata().set( "subject", kv[1].trim() ); + } + else + { + content.getMetadata().set( kv[0].trim(), kv[1].trim() ); + } } - p.destroy( ); - - // Extract the title from the HTML-formatted metadata output of the above call to pdftotext. 
- Matcher m = Pattern.compile( "<title>(.+)</title>", Pattern.DOTALL ).matcher( head ); - if ( m.find( ) ) + // Create a Process which calls 'pdftotext' to extract the content. + if ( raw ) { - title = m.group(1); + pinfo = execute( pdftotext, "-q", "-raw", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" ); } - - // Create a Process which calls 'pdftotext' to extract the content. - pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" ); - p = pb.start( ); + else + { + pinfo = execute( pdftotext, "-q", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" ); + } - p.getOutputStream( ).close( ); - text = suck( new InputStreamReader( p.getInputStream( ), "utf-8" ) ); - err = suck( p.getErrorStream( ) ); - - if ( err.length > 0 ) + if ( pinfo.stderr.length() > 0 ) { - LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) ); + LOG.warn( "Error from pdftotext: " + pinfo.stderr ); } - p.destroy( ); - - // No outlinks, sorry :( + // No outlinks with pdftotext, sorry :( Outlink[] outlinks = new Outlink[0]; ParseData parseData = new ParseData( ParseStatus.STATUS_SUCCESS, @@ -131,7 +151,7 @@ content.getMetadata(), metadata ); - return ParseResult.createParseResult( content.getUrl(), new ParseImpl( text, parseData ) ); + return ParseResult.createParseResult( content.getUrl(), new ParseImpl( pinfo.stdout, parseData ) ); } catch ( Exception e ) { @@ -145,37 +165,83 @@ } } - // TODO! + // FIXME: Is there anything better/smarter to do here? return null; } - - private byte[] suck( InputStream is ) - throws IOException + + ProcessInfo execute( String... args ) throws Exception { - ByteArrayOutputStream baos = new ByteArrayOutputStream( 4* 1024 ); - byte[] buf = new byte[1024*4]; - int c = -1; - while ( (c = is.read( buf )) != -1 ) - { - baos.write( buf, 0, c ); - } + // Create a Process which calls 'pdfinfo' to extract the metadata. 
+ ProcessBuilder pb = new ProcessBuilder( args ); - return baos.toByteArray(); + Process p = pb.start(); + + // Close the stdin of the child process. + p.getOutputStream( ).close(); + + Sucker stdoutSucker = new Sucker( p.getInputStream() ); + Sucker stderrSucker = new Sucker( p.getErrorStream() ); + + stdoutSucker.start(); + stderrSucker.start(); + + p.waitFor(); + + ProcessInfo result = new ProcessInfo(); + + result.returnCode = p.exitValue(); + result.stdout = new String( stdoutSucker.getBytes(), "utf-8" ); + result.stderr = new String( stderrSucker.getBytes(), "utf-8" ); + + return result; } - private String suck( InputStreamReader reader ) - throws IOException + /** + * Simple struct to hold sub-process return code, stdout and stderr. + */ + static class ProcessInfo { - StringBuilder sb = new StringBuilder( 1024 * 4 ); - char[] buf = new char[1024*4]; - int c = -1; + public int returnCode; + public String stdout; + public String stderr; + } + + /** + * Thread that sucks up the output of a sub-process stdout or stderr. + */ + static class Sucker extends Thread + { + InputStream is; + ByteArrayOutputStream baos; - while ( (c = reader.read( buf )) != -1 ) - { - sb.append( buf, 0, c ); - } + public Sucker( InputStream is ) + { + this.is = is; + this.baos = new ByteArrayOutputStream( 4*1024 ); + } - return sb.toString(); - } + public byte[] getBytes() + { + return this.baos.toByteArray(); + } + + public void run( ) + { + try + { + byte[] buf = new byte[4*1024]; + int c = -1; + while ( (c = is.read( buf )) != -1 ) + { + baos.write( buf, 0, c ); + } + } + catch ( IOException ioe ) + { + PDFParser.LOG.warn( "Error reading from sub-process: " + ioe ); + } + } + + }; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.