[aperture-commit] SF.net SVN: aperture:[2295] aperture/trunk/core/src

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Revision: 2295
          http://aperture.svn.sourceforge.net/aperture/?rev=2295&view=rev
Author:   mylka
Date:     2010-03-12 10:52:52 +0000 (Fri, 12 Mar 2010)

Log Message:
-----------
2969249 - added a new method that extracts information from a URL with the help of accessors. This fixes the problem because HTTPAccessor reads the Charset from HTTP headers, something which the normal HtmlExtractor can't do.

Modified Paths:
--------------
    aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/runtime/ApertureRuntime.java
    aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/runtime/ApertureRuntimeTest.java

Modified: aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/runtime/ApertureRuntime.java
===================================================================

--- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/runtime/ApertureRuntime.java	2010-03-12 09:02:32 UTC (rev 2294)
+++ aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/runtime/ApertureRuntime.java	2010-03-12 10:52:52 UTC (rev 2295)
@@ -11,6 +11,7 @@
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.MalformedURLException;
 import java.util.Set;
 import java.util.regex.Pattern;
 
@@ -18,6 +19,15 @@
 import org.ontoware.rdf2go.model.Model;
 import org.ontoware.rdf2go.model.node.URI;
 import org.ontoware.rdf2go.model.node.impl.URIImpl;
+import org.semanticdesktop.aperture.accessor.DataAccessor;
+import org.semanticdesktop.aperture.accessor.DataAccessorFactory;
+import org.semanticdesktop.aperture.accessor.DataAccessorRegistry;
+import org.semanticdesktop.aperture.accessor.DataObject;
+import org.semanticdesktop.aperture.accessor.FileDataObject;
+import org.semanticdesktop.aperture.accessor.RDFContainerFactory;
+import org.semanticdesktop.aperture.accessor.UrlNotFoundException;
+import org.semanticdesktop.aperture.accessor.base.RDFContainerFactoryImpl;
+import org.semanticdesktop.aperture.accessor.impl.DefaultDataAccessorRegistry;
 import org.semanticdesktop.aperture.extractor.Extractor;
 import org.semanticdesktop.aperture.extractor.ExtractorException;
 import org.semanticdesktop.aperture.extractor.ExtractorFactory;
@@ -29,6 +39,9 @@
 import org.semanticdesktop.aperture.mime.identifier.magic.MagicMimeTypeIdentifier;
 import org.semanticdesktop.aperture.rdf.RDFContainer;
 import org.semanticdesktop.aperture.rdf.impl.RDFContainerImpl;
+import org.semanticdesktop.aperture.subcrawler.SubCrawlerRegistry;
+import org.semanticdesktop.aperture.subcrawler.SubCrawlerUtil;
+import org.semanticdesktop.aperture.subcrawler.impl.DefaultSubCrawlerRegistry;
 import org.semanticdesktop.aperture.util.HttpClientUtil;
 import org.semanticdesktop.aperture.util.IOUtil;
 import org.slf4j.Logger;
@@ -44,15 +57,124 @@
     
     private Logger logger = LoggerFactory.getLogger(ApertureRuntime.class);
     
+    private DataAccessorRegistry accessorRegistry;
+    private SubCrawlerRegistry subCrawlerRegistry;
     private ExtractorRegistry extractorRegistry;
     private MimeTypeIdentifier identifier;
         
     public ApertureRuntime() {
         this.extractorRegistry = new DefaultExtractorRegistry();
         this.identifier = new MagicMimeTypeIdentifier();
+        this.accessorRegistry = new DefaultDataAccessorRegistry();
+        this.subCrawlerRegistry = new DefaultSubCrawlerRegistry();
     }
+    
+    /**
+     * Tries to extract as much information from the given URI as possible.
+     * 
+     * @param uriString the URI from which information is to be extracted. Only URIs with schemes supported by
+     *            the {@link DefaultDataAccessorRegistry} registry and {@link DefaultSubCrawlerRegistry} can be accessed and extracted. In most cases 
+     *            the resulting InputStream will be read in its entirety, and the method may try to download the
+     *            content into a temporary file (created with {@link File#createTempFile(String, String)}). The
+     *            file will be deleted before this method returns, yet there must be enough room on the partition
+     *            that houses the temporary folder. 
+     *            
+     * @return an {@link RDFContainer} instance containing the data extracted from the URL. It must be disposed
+     *            properly by the caller of this method, with a call to {@link RDFContainer#dispose()}. This method
+     *            may return null if an object with this URI has not been found.
+     *             
+     * @throws IllegalArgumentException if the uriString is invalid
+     * @throws IOException if an I/O error occurs during processing
+     */
+    public RDFContainer extractFrom(String uriString) throws IOException {
+    	if (uriString == null) {
+    		throw new NullPointerException("The URL cannot be null");
+    	}
+    	RDFContainerFactory fac = new RDFContainerFactoryImpl();
+    	URI uri = new URIImpl(uriString);
+    	DataObject obj = null;
+    	if (SubCrawlerUtil.isSubcrawledObjectUri(uri)) {
+    		URI topLevelUri = SubCrawlerUtil.getRootObjectUri(uri);
+    		DataObject rootObj = null;
+    		try {
+	    		rootObj = accessUri(topLevelUri,fac);
+	    		if (rootObj == null) {
+	    			return null;
+	    		} else if (!(rootObj instanceof FileDataObject)) {
+	    			return null;
+	    		}
+	    		FileDataObject fobj = (FileDataObject)rootObj;
+	    		InputStream contentStream = fobj.getContent();
+	    		if (contentStream == null) {
+	    			return null;
+	    		}
+	    		obj = SubCrawlerUtil.getDataObject(uri, contentStream, null, null, null, new RDFContainerFactoryImpl(), subCrawlerRegistry);
+    		} catch (Exception e) {
+    			if (rootObj != null) {
+    				rootObj.dispose();
+    			}
+    			return null;
+    		}
+    	} else {
+    		obj = accessUri(uri,fac);
+    	}
+    	
+    	if (obj == null) {
+    		return null;
+    	} else if (!(obj instanceof FileDataObject)) {
+    		return obj.getMetadata();
+    	} else {
+    		FileDataObject fobj = (FileDataObject)obj;
+    		InputStream stream = fobj.getContent();
+    		RDFContainer container = fobj.getMetadata();
+    		if (stream == null) {
+    			return container;
+    		}
+    		tryToApplyExtractors(stream, uri, container);
+    		return container;
+    	}
+    }
+    
+    private void tryToApplyExtractors(InputStream stream, URI uri, RDFContainer container) throws IOException {
+    	boolean ok = false;
+        try {
+            String mimeType = identifyMimeType(stream, uri);
+            
+            ok = applyExtractor(uri, stream, mimeType, container);
+            if (ok) {
+                return;
+            }
+             
+            ok = applyFileExtractor(uri, stream, mimeType, container);
+            if (ok) {
+                return;
+            }
+        }
+        catch (Exception e) { // this should cover both ExtractorExceptions and IOExceptions
+            logger.warn("Couldn't extract information from: " + uri.toString(), e);
+        } 
+    }
 
-    /**
+    private DataObject accessUri(URI topLevelUri, RDFContainerFactory fac) throws UrlNotFoundException, IOException {
+    	int colonIndex = topLevelUri.toString().indexOf(":");
+		if (colonIndex < 0) {
+			throw new IllegalArgumentException("The URI " + topLevelUri + "doesn't contain a colon");
+		}
+		String scheme = topLevelUri.toString().substring(0, colonIndex);
+		Set dafSet = accessorRegistry.get(scheme);
+		if (dafSet == null || dafSet.isEmpty()) {
+			return null;
+		}
+		DataAccessorFactory daf = (DataAccessorFactory)dafSet.iterator().next();
+		if (daf == null) {
+			return null;
+		}
+		DataAccessor da = daf.get();
+		DataObject dob = da.getDataObject(topLevelUri.toString(), null, null, fac);
+		return dob;
+	}
+
+	/**
      * Tries to extract as much information from the given input stream as possible.
      * 
      * @param stream the stream from which information is to be extracted. In most cases the stream will be

Modified: aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/runtime/ApertureRuntimeTest.java
===================================================================
--- aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/runtime/ApertureRuntimeTest.java	2010-03-12 09:02:32 UTC (rev 2294)
+++ aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/runtime/ApertureRuntimeTest.java	2010-03-12 10:52:52 UTC (rev 2295)
@@ -81,6 +81,22 @@
             FileUtil.deltree(tmpDir);
         }
     }
+    
+    public void testGetInfoFromFileUrl() throws Exception {
+        File tmpDir = new File(System.getProperty("java.io.tmpdir"), "ApertureRuntimeTest.tmpDir").getCanonicalFile();
+        try {
+            FileUtil.deltree(tmpDir);
+            tmpDir.mkdir();
+            checkFullTextFromFileUrl(tmpDir,"plain-text.txt", "normal plain text");
+            checkFullTextFromFileUrl(tmpDir,"plain-text-without-extension", "normal plain text");
+            checkFullTextFromFileUrl(tmpDir,"html-handwritten.html", "example text.");
+            checkFullTextFromFileUrl(tmpDir,"xml-handwritten.xml", "handwritten XML");
+            checkFullTextFromFileUrl(tmpDir,"rtf-word-2000.rtf", "example RTF");
+            checkMP3FromUrl(tmpDir,"jingle3.mp3","The Aperture test album");
+        } finally {
+            FileUtil.deltree(tmpDir);
+        }
+    }
 
     private void checkFullTextFromStream(String filename, String fullTextPart) throws IOException {
         InputStream stream = ResourceUtil.getInputStream(DOCS_PATH + filename, ApertureTestBase.class);
@@ -100,6 +116,16 @@
         cont.dispose();
     }
     
+    private void checkFullTextFromFileUrl(File tempFolder, String filename, String fullTextPart) throws IOException {
+        InputStream stream = ResourceUtil.getInputStream(DOCS_PATH + filename, ApertureTestBase.class);
+        File newFile = new File(tempFolder,filename);
+        IOUtil.writeStream(stream, newFile);
+        RDFContainer cont = ar.extractFrom(newFile.toURI().toString());
+        String text = cont.getString(NIE.plainTextContent);
+        assertTrue(text.contains(fullTextPart));
+        cont.dispose();
+    }
+    
     private void checkMP3(File tempFolder, String filename, String album) throws IOException {
         InputStream stream = ResourceUtil.getInputStream(DOCS_PATH + filename, ApertureTestBase.class);
         File newFile = new File(tempFolder,filename);
@@ -110,6 +136,16 @@
         cont.dispose();
     }
     
+    private void checkMP3FromUrl(File tempFolder, String filename, String album) throws IOException {
+        InputStream stream = ResourceUtil.getInputStream(DOCS_PATH + filename, ApertureTestBase.class);
+        File newFile = new File(tempFolder,filename);
+        IOUtil.writeStream(stream, newFile);
+        RDFContainer cont = ar.extractFrom(newFile.toURI().toString());
+        String fileAlbum = cont.getString(NID3.albumTitle);
+        assertEquals(fileAlbum,album);
+        cont.dispose();
+    }
+    
     private void checkMimeType(String filename, String mimetype) throws IOException {
         assertEquals(mimetype, ar.identifyMimeType(
                 ResourceUtil.getInputStream(DOCS_PATH + filename, ApertureTestBase.class),null));


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.