From: <my...@us...> - 2011-07-19 13:38:20
|
Revision: 2520 http://aperture.svn.sourceforge.net/aperture/?rev=2520&view=rev Author: mylka Date: 2011-07-19 13:38:14 +0000 (Tue, 19 Jul 2011) Log Message: ----------- updated the x2r version and improved the office extractor; it will fall back to the default string extractor for all unsupported file types Modified Paths: -------------- aperture/trunk/core/pom.xml aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractor.java Modified: aperture/trunk/core/pom.xml =================================================================== --- aperture/trunk/core/pom.xml 2011-07-19 10:45:50 UTC (rev 2519) +++ aperture/trunk/core/pom.xml 2011-07-19 13:38:14 UTC (rev 2520) @@ -533,7 +533,7 @@ <dependency> <groupId>pl.edu.agh.x2r</groupId> <artifactId>x2r-core</artifactId> - <version>0.0.2</version> + <version>0.0.3-SNAPSHOT</version> <exclusions> <exclusion> <artifactId>jldap</artifactId> Modified: aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractor.java =================================================================== --- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractor.java 2011-07-19 10:45:50 UTC (rev 2519) +++ aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractor.java 2011-07-19 13:38:14 UTC (rev 2520) @@ -32,7 +32,8 @@ /** * An Extractor implementation that can be used to process MS Office documents when we don't know its specific * subtype (e.g. Word, Excel, PowerPoint) or when we don't have an Extractor for that particular subtype. This - * extractor is capable of extracting all metadata but not the textual contents. + * extractor first tries to guess the actual mime type of the file (with the help of Tika {@link POIFSContainerDetector}). 
+ * If it succeeds, */ public class OfficeExtractor implements Extractor { @@ -59,21 +60,29 @@ try { MediaType mt = detector.detect(tis, md); - if (mt == null || POIFSContainerDetector.OLE.equals(mt) || registry == null) { - // do not specify a TextExtractor, PoiUtil will fall-back on using a StringExtractor - PoiUtil.extractAll(tis, null, result, logger); - result.add(RDF.type,NFO.Document); - } else { + if (mt != null && !POIFSContainerDetector.OLE.equals(mt)) { String mtString = mt.toString(); + + // this is necessary because some RDFContainer implementations we are forced + // to use in some of our apps, don't actually implement the Set contract + // properly result.remove(NIE.mimeType); result.put(NIE.mimeType, mtString); - Set set = registry.getExtractorFactories(mtString); - if (set != null && !set.isEmpty()) { - ExtractorFactory fac = (ExtractorFactory)set.iterator().next(); - Extractor ex = fac.get(); - ex.extract(id, tis, charset, mimeType, result); + + if (registry != null) { + Set<?> set = registry.getExtractorFactories(mtString); + if (set != null && !set.isEmpty()) { + ExtractorFactory fac = (ExtractorFactory)set.iterator().next(); + Extractor ex = fac.get(); + ex.extract(id, tis, charset, mimeType, result); + return; + } } } + + // do not specify a TextExtractor, PoiUtil will fall-back on using a StringExtractor + PoiUtil.extractAll(tis, null, result, logger); + result.add(RDF.type,NFO.Document); } catch (IOException e) { throw new ExtractorException(e); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |