Revision: 3609
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3609&view=rev
Author:   binzino
Date:     2012-01-27 01:44:25 +0000 (Fri, 27 Jan 2012)
Log Message:
-----------
Add lastPage as configurable param for text extraction.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2012-01-26 21:57:22 UTC (rev 3608)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2012-01-27 01:44:25 UTC (rev 3609)
@@ -44,7 +44,8 @@
 /**
- *
+ * Nutch plugin which calls 'pdftotext' command-line utility to parse
+ * PDF documents, as well as extract the title.
  */
 public class PDFParser implements Parser
 {
@@ -80,9 +81,10 @@
 fos.write( raw );
 fos.close();
- String exepath = this.conf.get( "org.archive.nutchwax.parse.pdf.pdftotext.path", "/usr/bin/pdftotext" );
+ String exepath = this.conf.get( "nutchwax.parse.pdf2.pdftotext.path", "/usr/bin/pdftotext" );
+ String lastPage = String.valueOf( this.conf.getInt( "nutchwax.parse.pdf2.lastPage", 100 ) );
- // Now create a Process to call 'pdftotext' to extract the metadata.
+ // Create a Process which calls 'pdftotext' to extract the metadata. Only get the first page.
 ProcessBuilder pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" );
 Process p = pb.start();
@@ -98,7 +100,15 @@
 p.destroy( );
- pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", tmpfile.toString(), "-" );
+ // Extract the title from the HTML-formatted metadata output of the above call to pdftotext.
+ Matcher m = Pattern.compile( "<title>(.+)</title>", Pattern.DOTALL ).matcher( head );
+ if ( m.find( ) )
+ {
+ title = m.group(1);
+ }
+
+ // Create a Process which calls 'pdftotext' to extract the content.
+ pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" );
 p = pb.start( );
 p.getOutputStream( ).close( );
@@ -112,16 +122,7 @@
 p.destroy( );
- Matcher m = Pattern.compile( "<html>.*?<title>(.*?)</title>.*?</head>", Pattern.DOTALL ).matcher( head );
- if ( m.find( ) )
- {
- title = m.group(1);
- }
-
- //System.out.println( "head = " + head );
- //System.out.println( "title = " + title );
-
- // No outlinks.
+ // No outlinks, sorry :(
 Outlink[] outlinks = new Outlink[0];
 ParseData parseData = new ParseData( ParseStatus.STATUS_SUCCESS,

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.