Revision: 3493
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3493&view=rev
Author:   binzino
Date:     2011-08-02 23:23:25 +0000 (Tue, 02 Aug 2011)

Log Message:
-----------
Fix ARI-2784: Add "utf-8" encoding to pdftotext invocation, as well as when reading the output into Java.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java	2011-08-02 22:40:10 UTC (rev 3492)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java	2011-08-02 23:23:25 UTC (rev 3493)
@@ -83,12 +83,12 @@
 
       String exepath = this.conf.get( "org.archive.nutchwax.parse.pdf.pdftotext.path", "/usr/bin/pdftotext" );
 
       // Now create a Process to call 'pdftotext' to extract the metadata.
-      ProcessBuilder pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" );
+      ProcessBuilder pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" );
       Process p = pb.start();
       p.getOutputStream( ).close();
 
-      String head = suck( new InputStreamReader( p.getInputStream( ) ) );
+      String head = suck( new InputStreamReader( p.getInputStream( ), "utf-8" ) );
       byte[] err = suck( p.getErrorStream( ) );
 
       if ( err.length > 0 )
@@ -98,11 +98,11 @@
 
       p.destroy( );
 
-      pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", tmpfile.toString(), "-" );
+      pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", tmpfile.toString(), "-" );
       p = pb.start( );
       p.getOutputStream( ).close( );
 
-      text = suck( new InputStreamReader( p.getInputStream( ) ) );
+      text = suck( new InputStreamReader( p.getInputStream( ), "utf-8" ) );
       err = suck( p.getErrorStream( ) );
 
       if ( err.length > 0 )

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.