From: <my...@us...> - 2010-03-12 10:52:58
|
Revision: 2295 http://aperture.svn.sourceforge.net/aperture/?rev=2295&view=rev Author: mylka Date: 2010-03-12 10:52:52 +0000 (Fri, 12 Mar 2010) Log Message: ----------- 2969249 - added a new method, that extracts information from a URL, with the help of accessors. This fixes the problem because HTTPAccessor reads the Charset from HTTP headers, something which the normal HtmlExtractor can't Modified Paths: -------------- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/runtime/ApertureRuntime.java aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/runtime/ApertureRuntimeTest.java Modified: aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/runtime/ApertureRuntime.java =================================================================== --- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/runtime/ApertureRuntime.java 2010-03-12 09:02:32 UTC (rev 2294) +++ aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/runtime/ApertureRuntime.java 2010-03-12 10:52:52 UTC (rev 2295) @@ -11,6 +11,7 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.net.MalformedURLException; import java.util.Set; import java.util.regex.Pattern; @@ -18,6 +19,15 @@ import org.ontoware.rdf2go.model.Model; import org.ontoware.rdf2go.model.node.URI; import org.ontoware.rdf2go.model.node.impl.URIImpl; +import org.semanticdesktop.aperture.accessor.DataAccessor; +import org.semanticdesktop.aperture.accessor.DataAccessorFactory; +import org.semanticdesktop.aperture.accessor.DataAccessorRegistry; +import org.semanticdesktop.aperture.accessor.DataObject; +import org.semanticdesktop.aperture.accessor.FileDataObject; +import org.semanticdesktop.aperture.accessor.RDFContainerFactory; +import org.semanticdesktop.aperture.accessor.UrlNotFoundException; +import org.semanticdesktop.aperture.accessor.base.RDFContainerFactoryImpl; +import 
org.semanticdesktop.aperture.accessor.impl.DefaultDataAccessorRegistry; import org.semanticdesktop.aperture.extractor.Extractor; import org.semanticdesktop.aperture.extractor.ExtractorException; import org.semanticdesktop.aperture.extractor.ExtractorFactory; @@ -29,6 +39,9 @@ import org.semanticdesktop.aperture.mime.identifier.magic.MagicMimeTypeIdentifier; import org.semanticdesktop.aperture.rdf.RDFContainer; import org.semanticdesktop.aperture.rdf.impl.RDFContainerImpl; +import org.semanticdesktop.aperture.subcrawler.SubCrawlerRegistry; +import org.semanticdesktop.aperture.subcrawler.SubCrawlerUtil; +import org.semanticdesktop.aperture.subcrawler.impl.DefaultSubCrawlerRegistry; import org.semanticdesktop.aperture.util.HttpClientUtil; import org.semanticdesktop.aperture.util.IOUtil; import org.slf4j.Logger; @@ -44,15 +57,124 @@ private Logger logger = LoggerFactory.getLogger(ApertureRuntime.class); + private DataAccessorRegistry accessorRegistry; + private SubCrawlerRegistry subCrawlerRegistry; private ExtractorRegistry extractorRegistry; private MimeTypeIdentifier identifier; public ApertureRuntime() { this.extractorRegistry = new DefaultExtractorRegistry(); this.identifier = new MagicMimeTypeIdentifier(); + this.accessorRegistry = new DefaultDataAccessorRegistry(); + this.subCrawlerRegistry = new DefaultSubCrawlerRegistry(); } + + /** + * Tries to extract as much information from the given URI as possible. + * + * @param uriString the URI from which information is to be extracted. Only URIs with schemes supported by + * the {@link DefaultDataAccessorRegistry} registry and {@link DefaultSubCrawlerRegistry} can be accessed and extracted. In most cases + * the resulting InputStream will be read in its entirety, and the method may try to download the + * content into a temporary file (created with {@link File#createTempFile(String, String)}).
The + * file will be deleted before this method returns, yet there must be enough room on the partition + * that houses the temporary folder. + * + * @return an {@link RDFContainer} instance containing the data extracted from the URL. It must be disposed + * properly by the caller of this method, with a call to {@link RDFContainer#dispose()}. This method + * may return null if an object with this URI has not been found. + * + * @throws IllegalArgumentException if the uriString is invalid + * @throws IOException if an I/O error occurs during processing + */ + public RDFContainer extractFrom(String uriString) throws IOException { + if (uriString == null) { + throw new NullPointerException("The URL cannot be null"); + } + RDFContainerFactory fac = new RDFContainerFactoryImpl(); + URI uri = new URIImpl(uriString); + DataObject obj = null; + if (SubCrawlerUtil.isSubcrawledObjectUri(uri)) { + URI topLevelUri = SubCrawlerUtil.getRootObjectUri(uri); + DataObject rootObj = null; + try { + rootObj = accessUri(topLevelUri,fac); + if (rootObj == null) { + return null; + } else if (!(rootObj instanceof FileDataObject)) { + return null; + } + FileDataObject fobj = (FileDataObject)rootObj; + InputStream contentStream = fobj.getContent(); + if (contentStream == null) { + return null; + } + obj = SubCrawlerUtil.getDataObject(uri, contentStream, null, null, null, new RDFContainerFactoryImpl(), subCrawlerRegistry); + } catch (Exception e) { + if (rootObj != null) { + rootObj.dispose(); + } + return null; + } + } else { + obj = accessUri(uri,fac); + } + + if (obj == null) { + return null; + } else if (!(obj instanceof FileDataObject)) { + return obj.getMetadata(); + } else { + FileDataObject fobj = (FileDataObject)obj; + InputStream stream = fobj.getContent(); + RDFContainer container = fobj.getMetadata(); + if (stream == null) { + return container; + } + tryToApplyExtractors(stream, uri, container); + return container; + } + } + + private void tryToApplyExtractors(InputStream stream, 
URI uri, RDFContainer container) throws IOException { + boolean ok = false; + try { + String mimeType = identifyMimeType(stream, uri); + + ok = applyExtractor(uri, stream, mimeType, container); + if (ok) { + return; + } + + ok = applyFileExtractor(uri, stream, mimeType, container); + if (ok) { + return; + } + } + catch (Exception e) { // this should cover both ExtractorExceptions and IOExceptions + logger.warn("Couldn't extract information from: " + uri.toString(), e); + } + } - /** + private DataObject accessUri(URI topLevelUri, RDFContainerFactory fac) throws UrlNotFoundException, IOException { + int colonIndex = topLevelUri.toString().indexOf(":"); + if (colonIndex < 0) { + throw new IllegalArgumentException("The URI " + topLevelUri + "doesn't contain a colon"); + } + String scheme = topLevelUri.toString().substring(0, colonIndex); + Set dafSet = accessorRegistry.get(scheme); + if (dafSet == null || dafSet.isEmpty()) { + return null; + } + DataAccessorFactory daf = (DataAccessorFactory)dafSet.iterator().next(); + if (daf == null) { + return null; + } + DataAccessor da = daf.get(); + DataObject dob = da.getDataObject(topLevelUri.toString(), null, null, fac); + return dob; + } + + /** * Tries to extract as much information from the given input stream as possible. * * @param stream the stream from which information is to be extracted. 
In most cases the stream will be Modified: aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/runtime/ApertureRuntimeTest.java =================================================================== --- aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/runtime/ApertureRuntimeTest.java 2010-03-12 09:02:32 UTC (rev 2294) +++ aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/runtime/ApertureRuntimeTest.java 2010-03-12 10:52:52 UTC (rev 2295) @@ -81,6 +81,22 @@ FileUtil.deltree(tmpDir); } } + + public void testGetInfoFromFileUrl() throws Exception { + File tmpDir = new File(System.getProperty("java.io.tmpdir"), "ApertureRuntimeTest.tmpDir").getCanonicalFile(); + try { + FileUtil.deltree(tmpDir); + tmpDir.mkdir(); + checkFullTextFromFileUrl(tmpDir,"plain-text.txt", "normal plain text"); + checkFullTextFromFileUrl(tmpDir,"plain-text-without-extension", "normal plain text"); + checkFullTextFromFileUrl(tmpDir,"html-handwritten.html", "example text."); + checkFullTextFromFileUrl(tmpDir,"xml-handwritten.xml", "handwritten XML"); + checkFullTextFromFileUrl(tmpDir,"rtf-word-2000.rtf", "example RTF"); + checkMP3FromUrl(tmpDir,"jingle3.mp3","The Aperture test album"); + } finally { + FileUtil.deltree(tmpDir); + } + } private void checkFullTextFromStream(String filename, String fullTextPart) throws IOException { InputStream stream = ResourceUtil.getInputStream(DOCS_PATH + filename, ApertureTestBase.class); @@ -100,6 +116,16 @@ cont.dispose(); } + private void checkFullTextFromFileUrl(File tempFolder, String filename, String fullTextPart) throws IOException { + InputStream stream = ResourceUtil.getInputStream(DOCS_PATH + filename, ApertureTestBase.class); + File newFile = new File(tempFolder,filename); + IOUtil.writeStream(stream, newFile); + RDFContainer cont = ar.extractFrom(newFile.toURI().toString()); + String text = cont.getString(NIE.plainTextContent); + assertTrue(text.contains(fullTextPart)); + cont.dispose(); + } + private void 
checkMP3(File tempFolder, String filename, String album) throws IOException { InputStream stream = ResourceUtil.getInputStream(DOCS_PATH + filename, ApertureTestBase.class); File newFile = new File(tempFolder,filename); @@ -110,6 +136,16 @@ cont.dispose(); } + private void checkMP3FromUrl(File tempFolder, String filename, String album) throws IOException { + InputStream stream = ResourceUtil.getInputStream(DOCS_PATH + filename, ApertureTestBase.class); + File newFile = new File(tempFolder,filename); + IOUtil.writeStream(stream, newFile); + RDFContainer cont = ar.extractFrom(newFile.toURI().toString()); + String fileAlbum = cont.getString(NID3.albumTitle); + assertEquals(fileAlbum,album); + cont.dispose(); + } + private void checkMimeType(String filename, String mimetype) throws IOException { assertEquals(mimetype, ar.identifyMimeType( ResourceUtil.getInputStream(DOCS_PATH + filename, ApertureTestBase.class),null)); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |