From: <my...@us...> - 2011-11-10 10:54:45
|
Revision: 2578 http://aperture.svn.sourceforge.net/aperture/?rev=2578&view=rev Author: mylka Date: 2011-11-10 10:54:38 +0000 (Thu, 10 Nov 2011) Log Message: ----------- [2850026] inside RtfExtractor, switched to using the Tika RtfParser Modified Paths: -------------- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/rtf/RtfExtractor.java aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/rtf/RtfExtractorTest.java Added Paths: ----------- aperture/trunk/core/src/test/resources/org/semanticdesktop/aperture/docs/rtf-tika777.rtf Modified: aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/rtf/RtfExtractor.java =================================================================== --- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/rtf/RtfExtractor.java 2011-11-09 14:53:48 UTC (rev 2577) +++ aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/rtf/RtfExtractor.java 2011-11-10 10:54:38 UTC (rev 2578) @@ -14,6 +14,12 @@ import javax.swing.text.Document; import javax.swing.text.rtf.RTFEditorKit; +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.rtf.RTFParser; +import org.apache.tika.sax.ToTextContentHandler; import org.ontoware.rdf2go.model.node.URI; import org.ontoware.rdf2go.vocabulary.RDF; import org.semanticdesktop.aperture.extractor.Extractor; @@ -23,6 +29,7 @@ import org.semanticdesktop.aperture.vocabulary.NIE; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.xml.sax.SAXException; public class RtfExtractor implements Extractor { @@ -37,21 +44,21 @@ public void extract(URI id, InputStream stream, Charset charset, String mimeType, RDFContainer result) throws ExtractorException { - RTFEditorKit rtfParser = new RTFEditorKit(); - Document document = rtfParser.createDefaultDocument(); try { - rtfParser.read(stream, document, 0); - String text = document.getText(0, document.getLength()); + ToTextContentHandler handler = new ToTextContentHandler(); + RTFParser p = new RTFParser(); + Metadata m = new Metadata(); + p.parse(stream, handler, m, null); + String text = handler.toString(); result.add(NIE.plainTextContent, text); result.add(RDF.type,NFO.TextDocument); } - catch (BadLocationException e) { - // problem relates to the file contents: just log and ignore - Logger logger = LoggerFactory.getLogger(getClass()); - logger.warn("Bad RTF location", e); - } catch (IOException e) { throw new ExtractorException(e); + } catch (SAXException e) { + throw new ExtractorException(e); + } catch (TikaException e) { + throw new ExtractorException(e); } } } Modified: aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/rtf/RtfExtractorTest.java =================================================================== --- aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/rtf/RtfExtractorTest.java 2011-11-09 14:53:48 UTC (rev 2577) +++ aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/rtf/RtfExtractorTest.java 2011-11-10 10:54:38 UTC (rev 2578) @@ -19,6 +19,7 @@ import org.semanticdesktop.aperture.extractor.ExtractorFactory; import org.semanticdesktop.aperture.rdf.RDFContainer; import org.semanticdesktop.aperture.rdf.ValueFactory; +import org.semanticdesktop.aperture.test.ApertureTestBase; import org.semanticdesktop.aperture.test.extractor.ExtractorTestBase; import org.semanticdesktop.aperture.vocabulary.NIE; @@ -53,6 +54,14 @@ container.dispose(); } } + + public void testTika777Extraction() throws Exception { + RDFContainer container = getStatements(DOCS_PATH + "rtf-tika777.rtf"); + String text = container.getString(NIE.plainTextContent); + // this is supposed to be a proper Russian text + assertTrue(text.contains("\u0423\u0432\u0430\u0436\u0430\u0435\u043c\u044b\u0439\u0020\u043a\u043b\u0438\u0435\u043d\u0442\u0021")); + container.dispose(); + } private RDFContainer getStatements(String resourceName) throws ExtractorException, IOException { // apply the extractor on a text file containing a null character Added: aperture/trunk/core/src/test/resources/org/semanticdesktop/aperture/docs/rtf-tika777.rtf =================================================================== --- aperture/trunk/core/src/test/resources/org/semanticdesktop/aperture/docs/rtf-tika777.rtf (rev 0) +++ aperture/trunk/core/src/test/resources/org/semanticdesktop/aperture/docs/rtf-tika777.rtf 2011-11-10 10:54:38 UTC (rev 2578) @@ -0,0 +1,7 @@ +{\rtf1\ansi\ansicpg1252\fromtext \fbidis \deff0 +{\fonttbl + +{\f0\fswiss\fcharset0 Arial;} {\f1\fswiss\fcharset204 Arial;} +} +\par{\f1\fs20 \'d3\'e2\'e0\'e6\'e0\'e5\'ec\'fb\'e9 \'ea\'eb\'e8\'e5\'ed\'f2!\f0}\par +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |