From: <bi...@us...> - 2010-11-22 22:44:55
|
Revision: 3344 http://archive-access.svn.sourceforge.net/archive-access/?rev=3344&view=rev Author: binzino Date: 2010-11-22 22:44:48 +0000 (Mon, 22 Nov 2010) Log Message: ----------- Add PDF parser that uses external 'pdftotext' tool. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-11-19 02:51:57 UTC (rev 3343) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -10,7 +10,7 @@ <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. --> <!-- Also, add 'parse-pdf' --> <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> - <value>protocol-http|parse-(tika|text|html)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> + <value>protocol-http|parse-(pdf2|tika|text|html)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> </property> <!-- Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml 2010-11-19 02:51:57 UTC (rev 3343) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -31,7 +31,7 @@ </mimeType> <mimeType name="application/pdf"> - <plugin id="parse-tika" /> + <plugin id="parse-pdf2" /> </mimeType> <mimeType name="application/vnd.ms-excel"> @@ -152,6 +152,7 @@ <alias name="parse-ext" extension-id="ExtParser" /> <alias name="parse-text" extension-id="org.apache.nutch.parse.text.TextParser" /> <alias name="parse-html" extension-id="org.apache.nutch.parse.html.HtmlParser" /> + <alias name="parse-pdf2" extension-id="org.archive.nutchwax.parse.pdf.PDFParser" /> <!-- <alias name="parse-js" extension-id="JSParser" /> <alias name="parse-msexceld" extension-id="org.apache.nutch.parse.msexcel.MSExcelParser" /> Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2010-11-19 02:51:57 UTC (rev 3343) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -91,6 +91,7 @@ <ant dir="query-nutchwax" target="deploy" /> <ant dir="scoring-nutchwax" target="deploy" /> <ant dir="urlfilter-nutchwax" target="deploy" /> + <ant dir="parse-pdf2" target="deploy" /> </target> @@ -202,5 +203,6 @@ <ant dir="query-nutchwax" target="clean" /> <ant dir="scoring-nutchwax" target="clean" /> <ant dir="urlfilter-nutchwax" target="clean" /> + <ant dir="parse-pdf2" target="clean" /> </target> </project> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="parse-pdf2" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -0,0 +1,49 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Copyright (C) 2010 Internet Archive. + + This file is part of the archive-access tools project + (http://sourceforge.net/projects/archive-access). + + The archive-access tools are free software; you can redistribute them and/or + modify them under the terms of the GNU Lesser Public License as published by + the Free Software Foundation; either version 2.1 of the License, or any + later version. + + The archive-access tools are distributed in the hope that they will be + useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + Public License for more details. + + You should have received a copy of the GNU Lesser Public License along with + the archive-access tools; if not, write to the Free Software Foundation, + Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +--> +<plugin + id="parse-pdf2" + name="External PDF Parser" + version="1.0.0" + provider-name="archive.org"> + + <runtime> + <library name="parse-pdf2.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.archive.nutchwax.parse.pdf" + name="NutchWAX External PDF Parser" + point="org.apache.nutch.parse.Parser"> + + <implementation id="org.archive.nutchwax.parse.pdf.PDFParser" + class="org.archive.nutchwax.parse.pdf.PDFParser"> + <parameter name="contentType" value="application/pdf" /> + <parameter name="pathSuffix" value="" /> + </implementation> + </extension> + +</plugin> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2010-11-22 22:44:48 UTC (rev 3344) @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2010 Internet Archive. + * + * This file is part of the archive-access tools project + * (http://sourceforge.net/projects/archive-access). + * + * The archive-access tools are free software; you can redistribute them and/or + * modify them under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or any + * later version. + * + * The archive-access tools are distributed in the hope that they will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + * Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License along with + * the archive-access tools; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.nutchwax.parse.pdf; + +import java.io.*; +import java.util.*; +import java.util.regex.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; + +import org.apache.nutch.protocol.Content; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.OutlinkExtractor; +import org.apache.nutch.util.LogUtil; + + +/** + * + */ +public class PDFParser implements Parser +{ + public static final Log LOG = LogFactory.getLog( PDFParser.class ); + + private Configuration conf; + + public void setConf( Configuration conf ) + { + this.conf = conf; + } + + public Configuration getConf( ) + { + return this.conf; + } + + public ParseResult getParse( Content content ) + { + System.out.println( "PDFParser" ); + + Metadata metadata = new Metadata(); + String title = ""; + String text = ""; + + byte[] raw = content.getContent( ); + + File tmpfile = null; + try + { + tmpfile = File.createTempFile( "pdf2-", ".pdf" ); + + // Write the PDF document to the tmp file. + FileOutputStream fos = new FileOutputStream( tmpfile ); + fos.write( raw ); + fos.close(); + + // Now create a Process to call 'pdftotext' to extract the metadata. + ProcessBuilder pb = new ProcessBuilder( "/usr/bin/pdftotext", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" ); + + Process p = pb.start(); + + p.getOutputStream( ).close(); + String head = suck( new InputStreamReader( p.getInputStream( ) ) ); + byte[] err = suck( p.getErrorStream( ) ); + + if ( err.length > 0 ) + { + LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) ); + } + + p.destroy( ); + + pb = new ProcessBuilder( "/usr/bin/pdftotext", tmpfile.toString(), "-" ); + p = pb.start( ); + + p.getOutputStream( ).close( ); + text = suck( new InputStreamReader( p.getInputStream( ) ) ); + err = suck( p.getErrorStream( ) ); + + if ( err.length > 0 ) + { + LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) ); + } + + p.destroy( ); + + Matcher m = Pattern.compile( "<html>.*?<title>(.*?)</title>.*?</head>", Pattern.DOTALL ).matcher( head ); + if ( m.find( ) ) + { + title = m.group(1); + } + + //System.out.println( "head = " + head ); + //System.out.println( "title = " + title ); + + // No outlinks. + Outlink[] outlinks = new Outlink[0]; + + ParseData parseData = new ParseData( ParseStatus.STATUS_SUCCESS, + title, + outlinks, + content.getMetadata(), + metadata ); + + return ParseResult.createParseResult( content.getUrl(), new ParseImpl( text, parseData ) ); + } + catch ( Exception e ) + { + LOG.error( e ); + } + finally + { + if ( tmpfile != null ) + { + tmpfile.delete(); + } + } + + // TODO! + return null; + } + + private byte[] suck( InputStream is ) + throws IOException + { + ByteArrayOutputStream baos = new ByteArrayOutputStream( 4* 1024 ); + byte[] buf = new byte[1024*4]; + int c = -1; + while ( (c = is.read( buf )) != -1 ) + { + baos.write( buf, 0, c ); + } + + return baos.toByteArray(); + } + + private String suck( InputStreamReader reader ) + throws IOException + { + StringBuilder sb = new StringBuilder( 1024 * 4 ); + char[] buf = new char[1024*4]; + int c = -1; + + while ( (c = reader.read( buf )) != -1 ) + { + sb.append( buf, 0, c ); + } + + return sb.toString(); + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |