Revision: 3346
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3346&view=rev
Author:   binzino
Date:     2010-11-23 01:22:45 +0000 (Tue, 23 Nov 2010)

Log Message:
-----------
Add params to pdftotext to inhibit error messages and to omit page breaks.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java	2010-11-23 00:31:27 UTC (rev 3345)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java	2010-11-23 01:22:45 UTC (rev 3346)
@@ -80,8 +80,10 @@
  fos.write( raw );
  fos.close();
+ String exepath = this.conf.get( "org.archive.nutchwax.parse.pdf.pdftotext.path", "/usr/bin/pdftotext" );
+
  // Now create a Process to call 'pdftotext' to extract the metadata.
- ProcessBuilder pb = new ProcessBuilder( this.conf.get( "org.archive.nutchwax.parse.pdf.pdftotext.path", "/usr/bin/pdftotext" ), "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" );
+ ProcessBuilder pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" );
  Process p = pb.start();
@@ -96,7 +98,7 @@
  p.destroy( );
- pb = new ProcessBuilder( "/usr/bin/pdftotext", tmpfile.toString(), "-" );
+ pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", tmpfile.toString(), "-" );
  p = pb.start( );
  p.getOutputStream( ).close( );

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.