|
From: <bi...@us...> - 2010-10-28 04:31:54
|
Revision: 3320
http://archive-access.svn.sourceforge.net/archive-access/?rev=3320&view=rev
Author: binzino
Date: 2010-10-28 04:31:47 +0000 (Thu, 28 Oct 2010)
Log Message:
-----------
Overrides of Nutch's default parse-tika config. We only want Tika to handle an explicit list of content types, not everything.
Added Paths:
-----------
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/parse-tika/
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/parse-tika/plugin.xml
Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml 2010-10-28 04:31:47 UTC (rev 3320)
@@ -0,0 +1,169 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+ Author : mattmann
+ Description: This xml file represents a natural ordering for which parsing
+ plugin should get called for a particular mimeType.
+-->
+
+<parse-plugins>
+
+ <!-- Explicitly map parse-tika as the parser for *only* the mimeTypes listed
+ below. The '*' (wildcard) contentType, which matches everything, is
+ disabled in the parse-tika plugin's plugin.xml. -->
+ <!-- Office-suite and PDF document types. -->
+ <mimeType name="application/msword">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/pdf">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/vnd.ms-excel">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/vnd.ms-powerpoint">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/vnd.oasis.opendocument.text">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/vnd.oasis.opendocument.text-template">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/vnd.oasis.opendocument.text-master">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/vnd.oasis.opendocument.text-web">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/vnd.oasis.opendocument.presentation">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/vnd.oasis.opendocument.presentation-template">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/vnd.oasis.opendocument.spreadsheet">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/vnd.oasis.opendocument.spreadsheet-template">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/vnd.sun.xml.calc">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/vnd.sun.xml.calc.template">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/vnd.sun.xml.impress">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/vnd.sun.xml.impress.template">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/vnd.sun.xml.writer">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/vnd.sun.xml.writer.template">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/x-kword">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/x-kspread">
+ <plugin id="parse-tika" />
+ </mimeType>
+ <!-- HTML and plain/rich text types. -->
+ <mimeType name="text/html">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="application/xhtml+xml">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="text/plain">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="text/richtext">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="text/rtf">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <!-- Disabled mappings: these two types are not routed to any parser here.
+ <mimeType name="text/sgml">
+ <plugin id="parse-tika" />
+ </mimeType>
+
+ <mimeType name="text/tab-separated-values">
+ <plugin id="parse-tika" />
+ </mimeType>
+ -->
+
+ <!-- Types for parse-ext plugin: required for unit tests to pass. -->
+ <mimeType name="application/vnd.nutch.example.cat">
+ <plugin id="parse-ext" />
+ </mimeType>
+
+ <mimeType name="application/vnd.nutch.example.md5sum">
+ <plugin id="parse-ext" />
+ </mimeType>
+
+ <!-- alias mappings for parse-xxx names to the actual extension implementation
+ ids described in each plugin's plugin.xml file -->
+ <aliases>
+ <alias name="parse-tika" extension-id="org.apache.nutch.parse.tika.Parser" />
+ <alias name="parse-ext" extension-id="ExtParser" />
+ <!-- Aliases below are commented out; none of these plugin ids is mapped to a mimeType in this file.
+ <alias name="parse-html" extension-id="org.apache.nutch.parse.html.HtmlParser" />
+ <alias name="parse-js" extension-id="JSParser" />
+ <alias name="parse-msexcel" extension-id="org.apache.nutch.parse.msexcel.MSExcelParser" />
+ <alias name="parse-mspowerpoint" extension-id="org.apache.nutch.parse.mspowerpoint.MSPowerPointParser" />
+ <alias name="parse-msword" extension-id="org.apache.nutch.parse.msword.MSWordParser" />
+ <alias name="parse-oo" extension-id="org.apache.nutch.parse.oo.OpenDocument.Text" />
+ <alias name="parse-pdf" extension-id="org.apache.nutch.parse.pdf.PdfParser" />
+ <alias name="parse-rss" extension-id="org.apache.nutch.parse.rss.RSSParser" />
+ <alias name="feed" extension-id="org.apache.nutch.parse.feed.FeedParser" />
+ <alias name="parse-swf" extension-id="org.apache.nutch.parse.swf.SWFParser" />
+ <alias name="parse-text" extension-id="org.apache.nutch.parse.text.TextParser" />
+ <alias name="parse-zip" extension-id="org.apache.nutch.parse.zip.ZipParser" />
+ -->
+ </aliases>
+
+</parse-plugins>
Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/parse-tika/plugin.xml
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/parse-tika/plugin.xml (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/parse-tika/plugin.xml 2010-10-28 04:31:47 UTC (rev 3320)
@@ -0,0 +1,68 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="parse-tika"
+ name="Tika Parser Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+ <!-- Libraries declared for this plugin; only parse-tika.jar carries an <export> element. -->
+ <runtime>
+ <library name="parse-tika.jar">
+ <export name="*"/>
+ </library>
+ <!-- Supporting jars, declared without any <export> element. -->
+ <library name="asm-3.1.jar"/>
+ <library name="bcmail-jdk14-136.jar"/>
+ <library name="bcmail-jdk15-1.45.jar"/>
+ <library name="bcprov-jdk14-136.jar"/>
+ <library name="bcprov-jdk15-1.45.jar"/>
+ <library name="commons-compress-1.0.jar"/>
+ <library name="commons-logging-1.1.1.jar"/>
+ <library name="dom4j-1.6.1.jar"/>
+ <library name="fontbox-1.1.0.jar"/>
+ <library name="geronimo-stax-api_1.0_spec-1.0.1.jar"/>
+ <library name="jempbox-1.1.0.jar"/>
+ <library name="metadata-extractor-2.4.0-beta-1.jar"/>
+ <library name="pdfbox-1.1.0.jar"/>
+ <library name="poi-3.6.jar"/>
+ <library name="poi-ooxml-3.6.jar"/>
+ <library name="poi-ooxml-schemas-3.6.jar"/>
+ <library name="poi-scratchpad-3.6.jar"/>
+ <library name="tagsoup-1.2.jar"/>
+ <library name="tika-parsers-0.7.jar"/>
+ <library name="xml-apis-1.0.b2.jar"/>
+ <library name="xmlbeans-2.3.0.jar"/>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <!-- Registers the TikaParser class at the org.apache.nutch.parse.Parser extension point. -->
+ <extension point="org.apache.nutch.parse.Parser"
+ id="org.apache.nutch.parse.tika"
+ name="TikaParser">
+ <!-- contentType is left empty instead of the "*" wildcard: the types Tika handles are listed explicitly in conf/parse-plugins.xml. -->
+ <implementation id="org.apache.nutch.parse.tika.Parser"
+ class="org.apache.nutch.parse.tika.TikaParser">
+ <parameter name="contentType" value=""/>
+ </implementation>
+
+ </extension>
+
+</plugin>
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|