[aperture-commit] SF.net SVN: aperture:[2520] aperture/trunk/core

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Revision: 2520
          http://aperture.svn.sourceforge.net/aperture/?rev=2520&view=rev
Author:   mylka
Date:     2011-07-19 13:38:14 +0000 (Tue, 19 Jul 2011)

Log Message:
-----------
Updated the x2r version and improved the office extractor: it now falls back to the default string extractor for all unsupported file types.

Modified Paths:
--------------
    aperture/trunk/core/pom.xml
    aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractor.java

Modified: aperture/trunk/core/pom.xml
===================================================================

--- aperture/trunk/core/pom.xml	2011-07-19 10:45:50 UTC (rev 2519)
+++ aperture/trunk/core/pom.xml	2011-07-19 13:38:14 UTC (rev 2520)
@@ -533,7 +533,7 @@
 		  	<dependency>
 				<groupId>pl.edu.agh.x2r</groupId>
 				<artifactId>x2r-core</artifactId>
-				<version>0.0.2</version>
+				<version>0.0.3-SNAPSHOT</version>
 				<exclusions>
 					<exclusion>
 						<artifactId>jldap</artifactId>

Modified: aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractor.java
===================================================================
--- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractor.java	2011-07-19 10:45:50 UTC (rev 2519)
+++ aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/extractor/office/OfficeExtractor.java	2011-07-19 13:38:14 UTC (rev 2520)
@@ -32,7 +32,8 @@
 /**
  * An Extractor implementation that can be used to process MS Office documents when we don't know its specific
  * subtype (e.g. Word, Excel, PowerPoint) or when we don't have an Extractor for that particular subtype. This
- * extractor is capable of extracting all metadata but not the textual contents.
+ * extractor first tries to guess the actual mime type of the file (with the help of Tika {@link POIFSContainerDetector}).
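+ * If it succeeds and the registry provides an extractor for that type, it delegates to that extractor; otherwise it falls back on PoiUtil with the default string extractor.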
  */
 public class OfficeExtractor implements Extractor {
 
@@ -59,21 +60,29 @@
 	    
 	    try {
             MediaType mt = detector.detect(tis, md);
-            if (mt == null || POIFSContainerDetector.OLE.equals(mt) || registry == null) {
-                // do not specify a TextExtractor, PoiUtil will fall-back on using a StringExtractor
-                PoiUtil.extractAll(tis, null, result, logger);
-                result.add(RDF.type,NFO.Document);
-            } else {
+            if (mt != null && !POIFSContainerDetector.OLE.equals(mt)) {
                 String mtString = mt.toString();
+                
+                // removing the old value before the put is necessary because some RDFContainer
+                // implementations we are forced to use in some of our apps don't actually
+                // implement the Set contract properly
                 result.remove(NIE.mimeType);
                 result.put(NIE.mimeType, mtString);
-                Set set = registry.getExtractorFactories(mtString);
-                if (set != null && !set.isEmpty()) {
-                    ExtractorFactory fac = (ExtractorFactory)set.iterator().next();
-                    Extractor ex = fac.get();
-                    ex.extract(id, tis, charset, mimeType, result);
+                
+                if (registry != null) {
+                    Set<?> set = registry.getExtractorFactories(mtString);
+                    if (set != null && !set.isEmpty()) {
+                        ExtractorFactory fac = (ExtractorFactory)set.iterator().next();
+                        Extractor ex = fac.get();
+                        ex.extract(id, tis, charset, mimeType, result);
+                        return;
+                    }
                 }
             }
+            
+            // do not specify a TextExtractor, PoiUtil will fall-back on using a StringExtractor
+            PoiUtil.extractAll(tis, null, result, logger);
+            result.add(RDF.type,NFO.Document);
         }
         catch (IOException e) {
             throw new ExtractorException(e);


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.