From: <my...@us...> - 2011-06-14 15:08:57
|
Revision: 2506 http://aperture.svn.sourceforge.net/aperture/?rev=2506&view=rev Author: mylka Date: 2011-06-14 15:08:50 +0000 (Tue, 14 Jun 2011) Log Message: ----------- [3043080] implemented Arjohn's idea to apply the POIFSContainerDetector inside OfficeExtractor Modified Paths: -------------- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/excel/ExcelExtractor.java aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/impl/DefaultExtractorRegistry.java aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/microsoft/util/PoiUtil.java aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractor.java aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractorFactory.java aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/bundle/OfficeExtractorActivator.java aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/powerpoint/PowerPointExtractor.java aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/tika/ApertureDetector.java aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/excel/ExcelExtractorTest.java aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractorTest.java aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/powerpoint/PowerPointExtractorTest.java aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/presentations/PresentationsExtractorTest.java aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/tika/TikaMimeTypeIdentifierTest.java Modified: aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/excel/ExcelExtractor.java =================================================================== --- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/excel/ExcelExtractor.java 2011-06-13 22:24:23 UTC (rev 2505) +++ aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/excel/ExcelExtractor.java 2011-06-14 15:08:50 UTC (rev 2506) @@ -25,7 +25,7 @@ import org.apache.poi.hssf.record.chart.SeriesTextRecord; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DocumentInputStream; -import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.ontoware.rdf2go.model.node.URI; import org.ontoware.rdf2go.vocabulary.RDF; import org.semanticdesktop.aperture.extractor.Extractor; @@ -49,7 +49,7 @@ result.add(RDF.type,NFO.Spreadsheet); } - public static String getText(DirectoryNode dirNode, POIFSFileSystem fs) throws IOException { + public static String getText(DirectoryNode dirNode, NPOIFSFileSystem fs) throws IOException { // get the stream containing the Workbook DocumentInputStream docStream = dirNode.createDocumentInputStream("Workbook"); @@ -80,7 +80,7 @@ private static class ExcelTextExtractor implements TextExtractor { - public String getText(DirectoryNode dirNode, POIFSFileSystem fs) throws IOException { + public String getText(DirectoryNode dirNode, NPOIFSFileSystem fs) throws IOException { return ExcelExtractor.getText(dirNode, fs); } } Modified: aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/impl/DefaultExtractorRegistry.java =================================================================== --- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/impl/DefaultExtractorRegistry.java 2011-06-13 22:24:23 UTC (rev 2505) +++ aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/impl/DefaultExtractorRegistry.java 2011-06-14 15:08:50 UTC (rev 2506) @@ -9,11 +9,14 @@ import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; import java.util.Map; import javax.xml.parsers.ParserConfigurationException; import org.semanticdesktop.aperture.extractor.ExtractorFactory; +import org.semanticdesktop.aperture.extractor.ExtractorRegistry; import org.semanticdesktop.aperture.extractor.FileExtractorFactory; import org.semanticdesktop.aperture.util.ResourceUtil; import org.semanticdesktop.aperture.util.SimpleSAXAdapter; @@ -102,8 +105,18 @@ className = className.trim(); if (!className.equals("")) { try { - Class clazz = Class.forName(className); - Object instance = clazz.newInstance(); + Class<?> clazz = Class.forName(className); + Constructor<?> c = null; + Object instance = null; + try { + c = clazz.getConstructor(); + instance = clazz.newInstance(); + } + catch (NoSuchMethodException e) { + c = clazz.getConstructor(ExtractorRegistry.class); + instance = c.newInstance(DefaultExtractorRegistry.this); + } + if (instance instanceof ExtractorFactory) { ExtractorFactory factory = (ExtractorFactory) instance; add(factory); @@ -125,6 +138,15 @@ logger.warn("unable to cast instance to " + ExtractorFactory.class.getName() + ", ignoring", e); } + catch (NoSuchMethodException e) { + logger.warn("unable to instantiate class " + className + ", ignoring", e); + } + catch (IllegalArgumentException e) { + logger.warn("unable to instantiate class " + className + ", ignoring", e); + } + catch (InvocationTargetException e) { + logger.warn("unable to instantiate class " + className + ", ignoring", e); + } } } } Modified: aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/microsoft/util/PoiUtil.java =================================================================== --- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/microsoft/util/PoiUtil.java 2011-06-13 22:24:23 UTC (rev 2505) +++ aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/microsoft/util/PoiUtil.java 2011-06-14 15:08:50 UTC (rev 2506) @@ -12,7 +12,8 @@ import org.apache.poi.hpsf.SummaryInformation; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DocumentInputStream; -import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.tika.io.TikaInputStream; import org.ontoware.rdf2go.model.Model; import org.ontoware.rdf2go.model.node.Resource; import org.ontoware.rdf2go.model.node.URI; @@ -59,7 +60,7 @@ * @return A populated SummaryInformation, or 'null' when the relevant document parts could not be * located. */ - public static SummaryInformation getSummaryInformation(POIFSFileSystem poiFileSystem) { + public static SummaryInformation getSummaryInformation(NPOIFSFileSystem poiFileSystem) { return getSummaryInformation(poiFileSystem.getRoot()); } @@ -89,7 +90,7 @@ * @return A populated SummaryInformation, or 'null' when the relevant document parts could not be * located. */ - public static DocumentSummaryInformation getDocumentSummaryInformation(POIFSFileSystem poiFileSystem) { + public static DocumentSummaryInformation getDocumentSummaryInformation(NPOIFSFileSystem poiFileSystem) { return getDocumentSummaryInformation(poiFileSystem.getRoot()); } @@ -133,7 +134,7 @@ stream.mark(bufferSize); } - POIFSFileSystem fileSystem = new POIFSFileSystem(new NonCloseableStream(stream)); + NPOIFSFileSystem fileSystem = new NPOIFSFileSystem(new NonCloseableStream(stream)); extractMetadata(fileSystem, container); if (resetStream) { @@ -149,7 +150,7 @@ * @param poiFileSystem The POI file system to obtain the metadata from. * @param container The RDFContainer to store the created RDF statements in. */ - public static void extractMetadata(POIFSFileSystem poiFileSystem, RDFContainer container) { + public static void extractMetadata(NPOIFSFileSystem poiFileSystem, RDFContainer container) { extractMetadata(poiFileSystem.getRoot(), container); } @@ -275,7 +276,14 @@ try { // try to create a POI file system - POIFSFileSystem fileSystem = new POIFSFileSystem(new NonCloseableStream(stream)); + NPOIFSFileSystem fileSystem = null; + + if (TikaInputStream.isTikaInputStream(stream) && + TikaInputStream.get(stream).getOpenContainer() instanceof NPOIFSFileSystem) { + fileSystem = (NPOIFSFileSystem)TikaInputStream.get(stream).getOpenContainer(); + } else { + fileSystem = new NPOIFSFileSystem(new NonCloseableStream(stream)); + } // try to extract the text, ignoring any exceptions as metadata extraction may still succeed try { @@ -375,7 +383,7 @@ * @return A String containing the full-text of the document. * @throws IOException whenever access to the directory node caused an IOException. */ - public String getText(DirectoryNode dir, POIFSFileSystem fs) throws IOException; + public String getText(DirectoryNode dir, NPOIFSFileSystem fs) throws IOException; } public static class NonCloseableStream extends FilterInputStream { Modified: aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractor.java =================================================================== --- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractor.java 2011-06-13 22:24:23 UTC (rev 2505) +++ aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractor.java 2011-06-14 15:08:50 UTC (rev 2506) @@ -6,16 +6,26 @@ */ package org.semanticdesktop.aperture.extractor.office; +import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; +import java.util.Set; +import org.apache.tika.detect.POIFSContainerDetector; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; import org.ontoware.rdf2go.model.node.URI; import org.ontoware.rdf2go.vocabulary.RDF; import org.semanticdesktop.aperture.extractor.Extractor; import org.semanticdesktop.aperture.extractor.ExtractorException; +import org.semanticdesktop.aperture.extractor.ExtractorFactory; +import org.semanticdesktop.aperture.extractor.ExtractorRegistry; import org.semanticdesktop.aperture.extractor.microsoft.util.PoiUtil; import org.semanticdesktop.aperture.rdf.RDFContainer; +import org.semanticdesktop.aperture.tika.TikaMimeTypeIdentifier; import org.semanticdesktop.aperture.vocabulary.NFO; +import org.semanticdesktop.aperture.vocabulary.NIE; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -28,10 +38,45 @@ private Logger logger = LoggerFactory.getLogger(getClass()); + private POIFSContainerDetector detector; + + private ExtractorRegistry registry; + + public OfficeExtractor(ExtractorRegistry reg) { + this.detector = new POIFSContainerDetector(); + this.registry = reg; + } + public void extract(URI id, InputStream stream, Charset charset, String mimeType, RDFContainer result) throws ExtractorException { - // do not specify a TextExtractor, PoiUtil will fall-back on using a StringExtractor - PoiUtil.extractAll(stream, null, result, logger); - result.add(RDF.type,NFO.Document); + + TikaInputStream tis = TikaInputStream.get(stream); + Metadata md = new Metadata(); + String fileName = result.getString(NFO.fileName); + + fileName = TikaMimeTypeIdentifier.getFileName(fileName, result.getDescribedUri()); + md.set(Metadata.RESOURCE_NAME_KEY, fileName); + + try { + MediaType mt = detector.detect(tis, md); + if (mt == null || POIFSContainerDetector.OLE.equals(mt) || registry == null) { + // do not specify a TextExtractor, PoiUtil will fall-back on using a StringExtractor + PoiUtil.extractAll(tis, null, result, logger); + result.add(RDF.type,NFO.Document); + } else { + String mtString = mt.toString(); + result.remove(NIE.mimeType); + result.put(NIE.mimeType, mtString); + Set set = registry.getExtractorFactories(mtString); + if (set != null && !set.isEmpty()) { + ExtractorFactory fac = (ExtractorFactory)set.iterator().next(); + Extractor ex = fac.get(); + ex.extract(id, tis, charset, mimeType, result); + } + } + } + catch (IOException e) { + throw new ExtractorException(e); + } } } Modified: aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractorFactory.java =================================================================== --- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractorFactory.java 2011-06-13 22:24:23 UTC (rev 2505) +++ aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractorFactory.java 2011-06-14 15:08:50 UTC (rev 2506) @@ -12,6 +12,7 @@ import org.semanticdesktop.aperture.extractor.Extractor; import org.semanticdesktop.aperture.extractor.ExtractorFactory; +import org.semanticdesktop.aperture.extractor.ExtractorRegistry; public class OfficeExtractorFactory implements ExtractorFactory { @@ -23,11 +24,15 @@ set.add("application/x-tika-msoffice"); MIME_TYPES = Collections.unmodifiableSet(set); } + + private ExtractorRegistry reg; + public OfficeExtractorFactory(ExtractorRegistry reg) { + this.reg = reg; + } - public Extractor get() { - return new OfficeExtractor(); + return new OfficeExtractor(reg); } public Set getSupportedMimeTypes() { Modified: aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/bundle/OfficeExtractorActivator.java =================================================================== --- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/bundle/OfficeExtractorActivator.java 2011-06-13 22:24:23 UTC (rev 2505) +++ aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/bundle/OfficeExtractorActivator.java 2011-06-14 15:08:50 UTC (rev 2506) @@ -26,7 +26,7 @@ public void start(BundleContext context) throws Exception { registrations.add(context.registerService(ExtractorFactory.class - .getName(), new OfficeExtractorFactory(), new Hashtable())); + .getName(), new OfficeExtractorFactory(null), new Hashtable())); registrations.add(context.registerService(ExtractorFactory.class .getName(), new ExcelExtractorFactory(), new Hashtable())); registrations.add(context.registerService(ExtractorFactory.class Modified: aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/powerpoint/PowerPointExtractor.java =================================================================== --- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/powerpoint/PowerPointExtractor.java 2011-06-13 22:24:23 UTC (rev 2505) +++ aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/powerpoint/PowerPointExtractor.java 2011-06-14 15:08:50 UTC (rev 2506) @@ -11,7 +11,7 @@ import java.nio.charset.Charset; import org.apache.poi.poifs.filesystem.DirectoryNode; -import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.ontoware.rdf2go.model.node.URI; import org.ontoware.rdf2go.vocabulary.RDF; import org.semanticdesktop.aperture.extractor.Extractor; @@ -33,14 +33,14 @@ result.add(RDF.type,NFO.Presentation); } - public static String getText(DirectoryNode dir, POIFSFileSystem fs) throws IOException { - org.apache.poi.hslf.extractor.PowerPointExtractor extractor = new org.apache.poi.hslf.extractor.PowerPointExtractor(dir, fs); + public static String getText(DirectoryNode dir, NPOIFSFileSystem fs) throws IOException { + org.apache.poi.hslf.extractor.PowerPointExtractor extractor = new org.apache.poi.hslf.extractor.PowerPointExtractor(dir); return extractor.getText(true, true); } private static class PowerPointTextExtractor implements TextExtractor { - public String getText(DirectoryNode dir, POIFSFileSystem fs) throws IOException { + public String getText(DirectoryNode dir, NPOIFSFileSystem fs) throws IOException { return PowerPointExtractor.getText(dir,fs); } } Modified: aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/tika/ApertureDetector.java =================================================================== --- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/tika/ApertureDetector.java 2011-06-13 22:24:23 UTC (rev 2505) +++ aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/tika/ApertureDetector.java 2011-06-14 15:08:50 UTC (rev 2506) @@ -24,21 +24,20 @@ private Detector zipDetector; - private Detector poifsDetector; +// private Detector poifsDetector; public ApertureDetector(MimeTypes fallbackDetector) { this.fallbackDetector = fallbackDetector; - poifsDetector = new POIFSContainerDetector(); +// poifsDetector = new POIFSContainerDetector(); zipDetector = new ZipContainerDetector(); } public MediaType detect(InputStream input, Metadata metadata) throws IOException { MediaType type = zipDetector.detect(input, metadata); - if (MediaType.OCTET_STREAM.equals(type)) { - type = poifsDetector.detect(input, metadata); - } - +// if (MediaType.OCTET_STREAM.equals(type)) { +// type = poifsDetector.detect(input, metadata); +// } MediaType fallbackType = fallbackDetector.detect(input, metadata); if (fallbackType == null) { Modified: aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/excel/ExcelExtractorTest.java =================================================================== --- aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/excel/ExcelExtractorTest.java 2011-06-13 22:24:23 UTC (rev 2505) +++ aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/excel/ExcelExtractorTest.java 2011-06-14 15:08:50 UTC (rev 2506) @@ -13,6 +13,10 @@ import org.semanticdesktop.aperture.extractor.Extractor; import org.semanticdesktop.aperture.extractor.ExtractorException; import org.semanticdesktop.aperture.extractor.ExtractorFactory; +import org.semanticdesktop.aperture.extractor.ExtractorRegistry; +import org.semanticdesktop.aperture.extractor.impl.ExtractorRegistryImpl; +import org.semanticdesktop.aperture.extractor.office.OfficeExtractorFactory; +import org.semanticdesktop.aperture.extractor.presentations.PresentationsExtractorFactory; import org.semanticdesktop.aperture.extractor.word.WordExtractorFactory; import org.semanticdesktop.aperture.rdf.RDFContainer; import org.semanticdesktop.aperture.rdf.RDFContainerFactory; @@ -45,6 +49,27 @@ container.dispose(); } + public void testExtractionViaOfficeExtractor() throws ExtractorException, IOException, ModelException { + // apply the extractor on an example file + ExtractorRegistry reg = new ExtractorRegistryImpl(); + reg.add(new ExcelExtractorFactory()); + ExtractorFactory factory = new OfficeExtractorFactory(reg); + Extractor extractor = factory.get(); + RDFContainer container = extract(DOCS_PATH + "microsoft-excel-2000.xls", extractor); + + // check the extraction results + checkStatement(NIE.plainTextContent, "spreadsheet", container); + checkStatement(NIE.title, "Excel", container); + checkStatement(NIE.subject, "document", container); + checkStatement(NIE.description, "comments", container); + checkStatement(NIE.generator, "Excel", container); + checkSimpleContact(NCO.creator, "Christiaan Fluit", container); + checkStatement(NIE.keyword, "test", container); + checkStatement(NIE.keyword, "rdf", container); + validate(container); + container.dispose(); + } + public void testEncryptedExcel2003Document() throws Exception { ExtractorFactory factory = new ExcelExtractorFactory(); Extractor extractor = factory.get(); Modified: aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractorTest.java =================================================================== --- aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractorTest.java 2011-06-13 22:24:23 UTC (rev 2505) +++ aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractorTest.java 2011-06-14 15:08:50 UTC (rev 2506) @@ -12,6 +12,8 @@ import org.semanticdesktop.aperture.extractor.Extractor; import org.semanticdesktop.aperture.extractor.ExtractorException; import org.semanticdesktop.aperture.extractor.ExtractorFactory; +import org.semanticdesktop.aperture.extractor.ExtractorRegistry; +import org.semanticdesktop.aperture.extractor.impl.DefaultExtractorRegistry; import org.semanticdesktop.aperture.rdf.RDFContainer; import org.semanticdesktop.aperture.test.extractor.ExtractorTestBase; import org.semanticdesktop.aperture.vocabulary.NCO; @@ -21,7 +23,7 @@ public void testExtraction() throws ExtractorException, IOException, ModelException { // apply the extractor on an example file - ExtractorFactory factory = new OfficeExtractorFactory(); + ExtractorFactory factory = new OfficeExtractorFactory(null); Extractor extractor = factory.get(); RDFContainer container = extract(DOCS_PATH + "microsoft-word-2000.doc", extractor); Modified: aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/powerpoint/PowerPointExtractorTest.java =================================================================== --- aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/powerpoint/PowerPointExtractorTest.java 2011-06-13 22:24:23 UTC (rev 2505) +++ aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/powerpoint/PowerPointExtractorTest.java 2011-06-14 15:08:50 UTC (rev 2506) @@ -12,6 +12,9 @@ import org.semanticdesktop.aperture.extractor.Extractor; import org.semanticdesktop.aperture.extractor.ExtractorException; import org.semanticdesktop.aperture.extractor.ExtractorFactory; +import org.semanticdesktop.aperture.extractor.ExtractorRegistry; +import org.semanticdesktop.aperture.extractor.impl.ExtractorRegistryImpl; +import org.semanticdesktop.aperture.extractor.office.OfficeExtractorFactory; import org.semanticdesktop.aperture.rdf.RDFContainer; import org.semanticdesktop.aperture.rdf.RDFContainerFactory; import org.semanticdesktop.aperture.rdf.impl.RDFContainerFactoryImpl; @@ -42,8 +45,31 @@ checkStatement(NIE.keyword, "rdf", container); validate(container); container.dispose(); - } + } + public void testExtractionViaOfficeExtractor() throws ExtractorException, IOException, ModelException { + // apply the extractor on an example file + ExtractorRegistry reg = new ExtractorRegistryImpl(); + reg.add(new PowerPointExtractorFactory()); + ExtractorFactory factory = new OfficeExtractorFactory(reg); + Extractor extractor = factory.get(); + RDFContainer container = extract(DOCS_PATH + "microsoft-powerpoint-2000.ppt", extractor); + + // check the extraction results + checkStatement(NIE.plainTextContent, "presentation", container); + checkStatement(NIE.plainTextContent, "2000", container); + checkStatement(NIE.plainTextContent, "notes", container); + checkStatement(NIE.title, "Example", container); + checkStatement(NIE.subject, "document", container); + checkStatement(NIE.description, "comments", container); + checkStatement(NIE.generator, "PowerPoint", container); + checkSimpleContact(NCO.creator, "Christiaan Fluit", container); + checkStatement(NIE.keyword, "test", container); + checkStatement(NIE.keyword, "rdf", container); + validate(container); + container.dispose(); + } + /** * Tests the files gathered in the course of investigating the issue 1976336 * @throws Exception Modified: aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/presentations/PresentationsExtractorTest.java =================================================================== --- aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/presentations/PresentationsExtractorTest.java 2011-06-13 22:24:23 UTC (rev 2505) +++ aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/extractor/presentations/PresentationsExtractorTest.java 2011-06-14 15:08:50 UTC (rev 2506) @@ -12,6 +12,9 @@ import org.semanticdesktop.aperture.extractor.Extractor; import org.semanticdesktop.aperture.extractor.ExtractorException; import org.semanticdesktop.aperture.extractor.ExtractorFactory; +import org.semanticdesktop.aperture.extractor.ExtractorRegistry; +import org.semanticdesktop.aperture.extractor.impl.ExtractorRegistryImpl; +import org.semanticdesktop.aperture.extractor.office.OfficeExtractorFactory; import org.semanticdesktop.aperture.rdf.RDFContainer; import org.semanticdesktop.aperture.test.extractor.ExtractorTestBase; import org.semanticdesktop.aperture.vocabulary.NCO; @@ -50,4 +53,25 @@ validate(container); container.dispose(); } + + // tests full-text and metadata extraction + public void testPureOfficeBasedExtraction() throws ExtractorException, IOException, ModelException { + // apply the extractor on an example file + ExtractorRegistry reg = new ExtractorRegistryImpl(); + reg.add(new PresentationsExtractorFactory()); + ExtractorFactory factory = new OfficeExtractorFactory(reg); + Extractor extractor = factory.get(); + RDFContainer container = extract(DOCS_PATH + "corel-presentations-x3.shw", extractor); + + // check the extraction results + checkStatement(NIE.plainTextContent, "Presentation", container); + checkStatement(NIE.title, "Example", container); + checkStatement(NIE.subject, "Testing", container); + checkStatement(NIE.description, "comments", container); + checkSimpleContact(NCO.creator, "Christiaan Fluit", container); + checkStatement(NIE.keyword, "test", container); + checkStatement(NIE.keyword, "rdf", container); + validate(container); + container.dispose(); + } } Modified: aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/tika/TikaMimeTypeIdentifierTest.java =================================================================== --- aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/tika/TikaMimeTypeIdentifierTest.java 2011-06-13 22:24:23 UTC (rev 2505) +++ aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/tika/TikaMimeTypeIdentifierTest.java 2011-06-14 15:08:50 UTC (rev 2506) @@ -6,23 +6,39 @@ */ package org.semanticdesktop.aperture.tika; +import static org.junit.Assert.assertEquals; +import info.aduna.io.ResourceUtil; + +import java.io.BufferedInputStream; +import java.io.InputStream; +import java.util.Set; + import org.junit.Before; import org.junit.Test; +import org.ontoware.rdf2go.RDF2Go; +import org.semanticdesktop.aperture.extractor.Extractor; +import org.semanticdesktop.aperture.extractor.ExtractorFactory; +import org.semanticdesktop.aperture.extractor.impl.DefaultExtractorRegistry; import org.semanticdesktop.aperture.mime.identifier.AbstractIdentificationTest; import org.semanticdesktop.aperture.mime.identifier.MimeTypeIdentifier; +import org.semanticdesktop.aperture.rdf.RDFContainer; +import org.semanticdesktop.aperture.rdf.impl.RDFContainerImpl; +import org.semanticdesktop.aperture.util.IOUtil; +import org.semanticdesktop.aperture.vocabulary.NIE; public class TikaMimeTypeIdentifierTest extends AbstractIdentificationTest { private MimeTypeIdentifier identifier; + private DefaultExtractorRegistry reg; @Before public void setUp() { this.identifier = new TikaMimeTypeIdentifier(); + reg = new DefaultExtractorRegistry(); } @Test - public void testIdentification() throws Exception { - + public void testIdentification() throws Exception { t("bzip2-txt-bziptest.txt.bz2", "application/x-bzip", "application/x-bzip2"); t("compress-txt-compresstest.txt.Z", "application/x-compress", "application/x-compress"); t("corel-presentations-3.0.shw", "application/vnd.wordperfect","application/vnd.wordperfect"); // better @@ -141,11 +157,11 @@ "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); t("microsoft-word-history-blair.doc", "application/msword", "application/msword"); - t("microsoft-word-illegal-unicode-characters.doc", "application/x-tika-msoffice", + t("microsoft-word-illegal-unicode-characters.doc", "application/msword", "application/msword"); - t("microsoft-word-testdoc-comments.doc", "application/msword", // wrong + t("microsoft-word-testdoc-comments.doc", "application/msword", "application/msword"); - t("microsoft-word-testdoc-nocomments.doc","application/msword", // wrong + t("microsoft-word-testdoc-nocomments.doc","application/msword", "application/msword"); t("microsoft-works-spreadsheet-3.0.wks", "application/x-123", "application/x-123"); // wrong @@ -270,4 +286,35 @@ test(identifier, mimeTypeWithoutName, "/org/semanticdesktop/aperture/docs/" + name, false); test(identifier, mimeTypeWithName, "/org/semanticdesktop/aperture/docs/" + name, true); } + + protected void test(MimeTypeIdentifier mimeTypeIdentifier, String desiredMimeType, String path, boolean withPath) throws Exception { + InputStream stream = new BufferedInputStream(ResourceUtil.getInputStream(path)); + int minimumArrayLength = mimeTypeIdentifier.getMinArrayLength(); + stream.mark(minimumArrayLength + 10); // add some for safety + byte[] bytes = IOUtil.readBytes(stream, minimumArrayLength); + String mimeType = null; + if (withPath) { + mimeType = mimeTypeIdentifier.identify(bytes, path, null); + } else { + mimeType = mimeTypeIdentifier.identify(bytes, null, null); + } + stream.reset(); + + if (mimeType != null) { + RDFContainer c = new RDFContainerImpl(RDF2Go.getModelFactory().createModel().open(), "uri:dummy"); + c.put(NIE.mimeType, mimeType); + Set set = reg.getExtractorFactories(mimeType); + if (set != null && !set.isEmpty()) { + ExtractorFactory fac = (ExtractorFactory)set.iterator().next(); + Extractor ex = fac.get(); + ex.extract(c.getDescribedUri(), stream, null, mimeType, c); + } + mimeType = c.getString(NIE.mimeType); + c.dispose(); + } + + + assertEquals(desiredMimeType, mimeType); + + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |