[Archive-access-cvs] SF.net SVN: archive-access:[3609] tags/nutchwax-0_13-JIRA-WAX-75/archive/src

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 3609
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3609&view=rev
Author:   binzino
Date:     2012-01-27 01:44:25 +0000 (Fri, 27 Jan 2012)
Log Message:
-----------
Add lastPage as configurable param for text extraction.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java
===================================================================

--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java	2012-01-26 21:57:22 UTC (rev 3608)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java	2012-01-27 01:44:25 UTC (rev 3609)
@@ -44,7 +44,8 @@
 
 
 /** 
- * 
+ * Nutch plugin which calls 'pdftotext' command-line utility to parse
+ * PDF documents, as well as extract the title.
  */
 public class PDFParser implements Parser
 {
@@ -80,9 +81,10 @@
         fos.write( raw );
         fos.close();
         
-        String exepath = this.conf.get( "org.archive.nutchwax.parse.pdf.pdftotext.path", "/usr/bin/pdftotext" );
+        String exepath  = this.conf.get( "nutchwax.parse.pdf2.pdftotext.path", "/usr/bin/pdftotext" );
+        String lastPage = String.valueOf( this.conf.getInt( "nutchwax.parse.pdf2.lastPage", 100 ) );
 
-        // Now create a Process to call 'pdftotext' to extract the metadata.
+        // Create a Process which calls 'pdftotext' to extract the metadata.  Only get the first page.
         ProcessBuilder pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" );
         
         Process p = pb.start();
@@ -98,7 +100,15 @@
 
         p.destroy( );
 
-        pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", tmpfile.toString(), "-" );
+        // Extract the title from the HTML-formatted metadata output of the above call to pdftotext.
+        Matcher m = Pattern.compile( "<title>(.+)</title>", Pattern.DOTALL ).matcher( head );
+        if ( m.find( ) )
+          {
+            title = m.group(1);
+          }
+       
+        // Create a Process which calls 'pdftotext' to extract the content.
+        pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" );
         p = pb.start( );
 
         p.getOutputStream( ).close( );
@@ -112,16 +122,7 @@
         
         p.destroy( );
 
-        Matcher m = Pattern.compile( "<html>.*?<title>(.*?)</title>.*?</head>", Pattern.DOTALL ).matcher( head );
-        if ( m.find( ) )
-          {
-            title = m.group(1);
-          }
-        
-        //System.out.println( "head  = " + head );
-        //System.out.println( "title = " + title );
-
-        // No outlinks.
+        // No outlinks, sorry :(
         Outlink[] outlinks  = new Outlink[0];
 
         ParseData parseData = new ParseData( ParseStatus.STATUS_SUCCESS, 

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





[Archive-access-cvs] SF.net SVN: archive-access:[3609] tags/nutchwax-0_13-JIRA-WAX-75/archive/src

[Archive-access-cvs] SF.net SVN: archive-access:[3609] tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java