Thread: [aperture-commit] SF.net SVN: aperture:[2129] aperture-addons/trunk/src/main/java/org/ semanticdesk

Brought to you by: cfmfluit, leo_sauermann, mylka, reuschling

aperture-commit

[aperture-commit] SF.net SVN: aperture:[2129] aperture-addons/trunk/src/main/java/org/ semanticdesktop/aperture

From: <my...@us...> - 2009-11-26 16:11:36

Revision: 2129
          http://aperture.svn.sourceforge.net/aperture/?rev=2129&view=rev
Author:   mylka
Date:     2009-11-26 16:11:22 +0000 (Thu, 26 Nov 2009)

Log Message:
-----------
updated the addon crawlers to report the error cause

Modified Paths:
--------------
    aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/webdav/crawler/WebdavCrawler.java
    aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/websites/bibsonomy/BibsonomyCrawler.java

Modified: aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/webdav/crawler/WebdavCrawler.java
===================================================================
--- aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/webdav/crawler/WebdavCrawler.java	2009-11-19 16:17:16 UTC (rev 2128)
+++ aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/webdav/crawler/WebdavCrawler.java	2009-11-26 16:11:22 UTC (rev 2129)
@@ -127,8 +127,7 @@
 	protected ExitCode crawlObjects() {
 		DataSource dataSource = getDataSource();
 		if(!(dataSource instanceof WebDataSource)){
-			logger.error("wrong data source type");
-			return ExitCode.FATAL_ERROR;
+		    return reportFatalErrorCause("wrong data source type");
 		}
 		source = (WebDataSource)dataSource;
 		try {
@@ -152,13 +151,10 @@
 				//sets the path relative to root
 				root.setPath(root.getPath() + path);
 			} else {
-				logger.error("Unknown Protocol");
-				return ExitCode.FATAL_ERROR;
+			    return reportFatalErrorCause("Unknown Protocol: " + url.getProtocol());
 			}
 		} catch (Exception e) {
-			logger.error("WebdavResource Accessfailure");
-			e.printStackTrace();
-			return ExitCode.FATAL_ERROR;
+		    return reportFatalErrorCause("WebdavResource Accessfailure",e);
 		} 
 		
 

Modified: aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/websites/bibsonomy/BibsonomyCrawler.java
===================================================================
--- aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/websites/bibsonomy/BibsonomyCrawler.java	2009-11-19 16:17:16 UTC (rev 2128)
+++ aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/websites/bibsonomy/BibsonomyCrawler.java	2009-11-26 16:11:22 UTC (rev 2129)
@@ -99,16 +99,17 @@
 						}
 					}
 				} else {
-					logger.warn("Couldn't get data from Bibsonomy. Status code: "
+				    return reportFatalErrorCause("Couldn't get data from Bibsonomy. Status code: "
 							+ q.getHttpStatusCode()
 							+ " error: "
 							+ q.getError());
-					return ExitCode.FATAL_ERROR;
 				}
 			} catch (IllegalStateException e) {
-				logger.warn("Couldn't get data from Bibsonomy", e);
+			    reportFatalErrorCause("Couldn't get data from Bibsonomy", e);
+			    return ExitCode.FATAL_ERROR;
 			} catch (ErrorPerformingRequestException e) {
-				logger.warn("Couldn't get data from Bibsonomy", e);
+			    reportFatalErrorCause("Couldn't get data from Bibsonomy", e);
+			    return ExitCode.FATAL_ERROR;
 			}
 		} while (found == 50);
 


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[aperture-commit] SF.net SVN: aperture:[2381] aperture-addons/trunk/src/main/java/org/ semanticdesktop/aperture

From: <my...@us...> - 2010-07-05 23:09:15

Revision: 2381
          http://aperture.svn.sourceforge.net/aperture/?rev=2381&view=rev
Author:   mylka
Date:     2010-07-05 21:43:10 +0000 (Mon, 05 Jul 2010)

Log Message:
-----------
committed an expanded version of the ApertureRuntime tests, I did this in the aperture-addons folder to gather some feedback before thinking about committing it to the core

Added Paths:
-----------
    aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/
    aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntime.java
    aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntimeMain.java

Added: aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntime.java
===================================================================
--- aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntime.java	                        (rev 0)
+++ aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntime.java	2010-07-05 21:43:10 UTC (rev 2381)
@@ -0,0 +1,494 @@
+/**
+ * Copyright (c) 2010 Aduna and Deutsches Forschungszentrum fuer Kuenstliche Intelligenz DFKI GmbH.
+ * All rights reserved.
+ * 
+ * Licensed under the Aperture BSD-style license.
+ */
+package org.semanticdesktop.aperture.util;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.ontoware.rdf2go.RDF2Go;
+import org.ontoware.rdf2go.model.Model;
+import org.ontoware.rdf2go.model.node.URI;
+import org.ontoware.rdf2go.model.node.impl.URIImpl;
+import org.ontoware.rdf2go.vocabulary.RDF;
+import org.semanticdesktop.aperture.accessor.DataAccessor;
+import org.semanticdesktop.aperture.accessor.DataAccessorFactory;
+import org.semanticdesktop.aperture.accessor.DataAccessorRegistry;
+import org.semanticdesktop.aperture.accessor.DataObject;
+import org.semanticdesktop.aperture.accessor.FileDataObject;
+import org.semanticdesktop.aperture.accessor.RDFContainerFactory;
+import org.semanticdesktop.aperture.accessor.UrlNotFoundException;
+import org.semanticdesktop.aperture.accessor.base.DataObjectBase;
+import org.semanticdesktop.aperture.accessor.base.RDFContainerFactoryImpl;
+import org.semanticdesktop.aperture.accessor.impl.DefaultDataAccessorRegistry;
+import org.semanticdesktop.aperture.extractor.Extractor;
+import org.semanticdesktop.aperture.extractor.ExtractorException;
+import org.semanticdesktop.aperture.extractor.ExtractorFactory;
+import org.semanticdesktop.aperture.extractor.ExtractorRegistry;
+import org.semanticdesktop.aperture.extractor.FileExtractor;
+import org.semanticdesktop.aperture.extractor.FileExtractorFactory;
+import org.semanticdesktop.aperture.extractor.impl.DefaultExtractorRegistry;
+import org.semanticdesktop.aperture.mime.identifier.MimeTypeIdentifier;
+import org.semanticdesktop.aperture.mime.identifier.magic.MagicMimeTypeIdentifier;
+import org.semanticdesktop.aperture.rdf.RDFContainer;
+import org.semanticdesktop.aperture.rdf.impl.RDFContainerImpl;
+import org.semanticdesktop.aperture.runtime.ApertureRuntime;
+import org.semanticdesktop.aperture.subcrawler.SubCrawler;
+import org.semanticdesktop.aperture.subcrawler.SubCrawlerException;
+import org.semanticdesktop.aperture.subcrawler.SubCrawlerFactory;
+import org.semanticdesktop.aperture.subcrawler.SubCrawlerHandler;
+import org.semanticdesktop.aperture.subcrawler.SubCrawlerRegistry;
+import org.semanticdesktop.aperture.subcrawler.SubCrawlerUtil;
+import org.semanticdesktop.aperture.subcrawler.impl.DefaultSubCrawlerRegistry;
+import org.semanticdesktop.aperture.vocabulary.NID3;
+import org.semanticdesktop.aperture.vocabulary.NIE;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author Antoni
+ *
+ */
+public class ExpandedApertureRuntime {
+
+    /** Matches strings that already start with a lower-case URI scheme followed by a colon. */
+    private static final Pattern URI_START_PATTERN = Pattern.compile("^[a-z]+:.*$");
+    
+    // log under this class' own name, so that messages are not attributed to ApertureRuntime
+    private Logger logger = LoggerFactory.getLogger(ExpandedApertureRuntime.class);
+    
+    private DataAccessorRegistry accessorRegistry;
+    private SubCrawlerRegistry subCrawlerRegistry;
+    private ExtractorRegistry extractorRegistry;
+    private MimeTypeIdentifier identifier;
+    private RDFContainerFactory factory;
+     
+    /**
+     * Callback that receives the plain text extracted for a single URI.
+     */
+    public interface TextProcessor {
+
+        /**
+         * Called with a piece of extracted text.
+         * 
+         * @param uri the URI the text has been extracted from
+         * @param text the extracted text
+         */
+        void process(URI uri, String text);
+    }
+    
+    /**
+     * Creates a runtime backed by the default extractor, accessor and subcrawler registries, the magic
+     * MIME type identifier and the default RDF container factory.
+     */
+    public ExpandedApertureRuntime() {
+        extractorRegistry = new DefaultExtractorRegistry();
+        identifier = new MagicMimeTypeIdentifier();
+        accessorRegistry = new DefaultDataAccessorRegistry();
+        subCrawlerRegistry = new DefaultSubCrawlerRegistry();
+        factory = new RDFContainerFactoryImpl();
+    }
+    
+    /**
+     * Tries to extract as much information from the given URI as possible.
+     * 
+     * @param uriString the URI from which information is to be extracted. Only URIs with schemes supported by
+     *            the {@link DefaultDataAccessorRegistry} registry and {@link DefaultSubCrawlerRegistry} can be accessed and extracted. In most cases 
+     *            the resulting InputStream will be read in its entirety, and the method may try to download the
+     *            content into a temporary file (created with {@link File#createTempFile(String, String)}). The
+     *            file will be deleted before this method returns, yet there must be enough room on the partition
+     *            that houses the temporary folder. 
+     *            
+     * @return an {@link RDFContainer} instance containing the data extracted from the URL. It must be disposed
+     *            properly by the caller of this method, with a call to {@link RDFContainer#dispose()}. This method
+     *            may return null if an object with this URI has not been found.
+     *             
+     * @throws NullPointerException if the uriString is null
+     * @throws IllegalArgumentException if the uriString is invalid
+     * @throws IOException if an I/O error occurs during processing
+     */
+    public RDFContainer extractFrom(String uriString) throws IOException {
+    	if (uriString == null) {
+    		throw new NullPointerException("The URL cannot be null");
+    	}
+    	RDFContainerFactory fac = new RDFContainerFactoryImpl();
+    	URI uri = new URIImpl(uriString);
+    	DataObject obj = null;
+    	if (SubCrawlerUtil.isSubcrawledObjectUri(uri)) {
+    		// a subcrawled object lives inside another object: access the root object first
+    		// and let SubCrawlerUtil locate the requested object within the root's content
+    		URI topLevelUri = SubCrawlerUtil.getRootObjectUri(uri);
+    		DataObject rootObj = null;
+    		try {
+	    		rootObj = accessUri(topLevelUri,fac);
+	    		// NOTE(review): rootObj is not disposed on the two early returns below that follow a
+	    		// successful access, nor after a successful lookup - confirm who owns it
+	    		if (rootObj == null) {
+	    			return null;
+	    		} else if (!(rootObj instanceof FileDataObject)) {
+	    			return null;
+	    		}
+	    		FileDataObject fobj = (FileDataObject)rootObj;
+	    		InputStream contentStream = fobj.getContent();
+	    		if (contentStream == null) {
+	    			return null;
+	    		}
+	    		obj = SubCrawlerUtil.getDataObject(uri, contentStream, null, null, null, new RDFContainerFactoryImpl(), subCrawlerRegistry);
+    		} catch (Exception e) {
+    			// any failure on this path is reported as "not found"
+    			// NOTE(review): the exception is neither logged nor rethrown
+    			if (rootObj != null) {
+    				rootObj.dispose();
+    			}
+    			return null;
+    		}
+    	} else {
+    		obj = accessUri(uri,fac);
+    	}
+    	
+    	if (obj == null) {
+    		return null;
+    	} else if (!(obj instanceof FileDataObject)) {
+    		// no content to extract from, return the metadata provided by the accessor
+    		RDFContainer containerToReturn = obj.getMetadata();
+    		safelyDispose(obj);
+    		return containerToReturn;
+    	} else {
+    		FileDataObject fobj = (FileDataObject)obj;
+    		InputStream stream = fobj.getContent();
+    		RDFContainer container = fobj.getMetadata();
+    		if (stream == null) {
+    			// NOTE(review): unlike the other branches, obj is not disposed here - confirm
+    			return container;
+    		}
+            String mimeType = identifyMimeType(stream, uri);
+    		tryToApplyExtractors(stream, uri, container, mimeType);
+    		safelyDispose(obj);
+    		return container;
+    	}
+    }
+    
+    /**
+     * Tries to extract as much information from the given input stream as possible.
+     * 
+     * @param stream the stream to extract information from. It is wrapped in a {@link BufferedInputStream}
+     *            if it doesn't support mark/reset. In most cases it will be read in its entirety, so it will
+     *            probably be unusable after this method returns. The content may get copied to a temporary
+     *            file.
+     * @param nameOrPathOrUrl (obligatory) a file name, a path or a URL. It aids the MIME type detection and
+     *            serves as the basis for the URI of the returned RDFContainer. A string that already starts
+     *            with a URI scheme is used as it is, otherwise a 'file:' prefix is added. The conversion is
+     *            heuristic, so the resulting URI should always be checked by calling
+     *            {@link RDFContainer#getDescribedUri()} on the returned instance.
+     * @return an {@link RDFContainer} instance containing the data extracted from the InputStream.
+     * @throws NullPointerException if any of the arguments is null
+     * @throws IOException if an I/O error occurs in the process.
+     */
+    public RDFContainer extractFrom(InputStream stream, String nameOrPathOrUrl) throws IOException {
+        if (stream == null) {
+            throw new NullPointerException("stream cannot be null");
+        }
+        if (nameOrPathOrUrl == null) {
+            throw new NullPointerException("nameOrPathOrUrl cannot be null");
+        }
+
+        // the MIME type identification needs mark/reset to put the examined bytes back
+        InputStream markable = stream.markSupported() ? stream : new BufferedInputStream(stream);
+        URI describedUri = tryToConvertToUri(nameOrPathOrUrl);
+
+        Model backingModel = RDF2Go.getModelFactory().createModel();
+        backingModel.open();
+        RDFContainer result = new RDFContainerImpl(backingModel, describedUri);
+
+        String detectedMimeType = identifyMimeType(markable, describedUri);
+        tryToApplyExtractors(markable, describedUri, result, detectedMimeType);
+        return result;
+    }
+    
+    /**
+     * Tries to extract as much information from the given file as possible.
+     * 
+     * @param file the file from which information is to be extracted
+     * @return an {@link RDFContainer} instance containing the data extracted from the file. 
+     * @throws IOException if an I/O error occurs in the process.
+     */
+    public RDFContainer extractFrom(File file) throws IOException {
+        if (file == null) {
+            throw new NullPointerException("stream cannot be null");
+        }
+        
+        if (!file.canRead()) {
+            throw new IOException("The file: " + file.getAbsolutePath() + " is unreadable");
+        }
+
+        InputStream stream = new FileInputStream(file);
+        return extractFrom(stream, file.toURI().toString());
+    }
+
+    public void extractAllText(InputStream stream, String nameOrPathOrUrl, TextProcessor processor) throws IOException, SubCrawlerException {
+        if (stream == null) {
+            throw new NullPointerException("stream cannot be null");
+        }
+        
+        if (processor == null) {
+        	throw new NullPointerException("processor cannot be null");
+        }
+        
+        if (nameOrPathOrUrl == null) {
+            throw new NullPointerException("nameOrPathOrUrl cannot be null");
+        }
+        
+        if (!stream.markSupported()) {
+            stream = new BufferedInputStream(stream);
+        }
+        
+        URI uri = tryToConvertToUri(nameOrPathOrUrl);
+        
+        Model model = RDF2Go.getModelFactory().createModel();
+        model.open();
+        RDFContainer container = new RDFContainerImpl(model,uri);
+        String mimeType = identifyMimeType(stream, uri);
+        boolean ok = tryToApplyExtractors(stream, uri, container, mimeType);
+        if (ok) {
+        	String text = getFullText(container);
+        	if (text != null) {
+        		processor.process(uri, text);
+        	}
+        	container.dispose();
+        } else {
+        	Set subcrawlers = subCrawlerRegistry.get(mimeType);
+            if (!subcrawlers.isEmpty()) {
+            	SubCrawlerFactory factory = (SubCrawlerFactory) subcrawlers.iterator().next();
+            	SubCrawler subcrawler = factory.get();
+                subcrawler.subCrawl(uri, stream, new TextExtractingSubCrawlerHandler(processor), null, null, null, mimeType, container);
+                String text = getFullText(container);
+            	if (text != null) {
+            		processor.process(uri, text);
+            	}
+            	container.dispose();
+            }
+        }
+    }
+    
+    /**
+     * A subcrawler handler that extracts the text from every file data object found by the subcrawler
+     * and passes it to the wrapped processor.
+     */
+    private class TextExtractingSubCrawlerHandler implements SubCrawlerHandler {
+        
+        private TextProcessor processor;
+        
+        public TextExtractingSubCrawlerHandler(TextProcessor processor) {
+            this.processor = processor;
+        }
+        
+        public RDFContainerFactory getRDFContainerFactory(String url) {return factory;}
+        
+        /**
+         * Extracts the text from the content of the given object (if it is a {@link FileDataObject}) and
+         * disposes the object afterwards. Checked exceptions are wrapped in a RuntimeException.
+         */
+        public void objectNew(DataObject object) {
+            try {
+                if (object instanceof FileDataObject) {
+                    InputStream contentStream = ((FileDataObject)object).getContent();
+                    extractAllText(contentStream, object.getID().toString(), processor);
+                }
+            } catch (RuntimeException e) {
+                throw e;
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            } finally {
+                // dispose the object even if the extraction has failed
+                object.dispose();
+            }
+        }
+        
+        public void objectNotModified(String url) {} // can't happen
+        public void objectChanged(DataObject object) {} // can't happen
+    }
+
+    /**
+     * Tries to convert a string into something resembling an URI. No guarantees are made. The goals were:
+     * <ul>
+     * <li>strings that already resemble URIs (they start with a lower-case scheme and a colon) keep their
+     * scheme</li>
+     * <li>simple file names and file paths get a 'file:' prefix</li>
+     * <li>backslashes are replaced with slashes and the result is URL-encoded, some typical URL characters
+     * are left alone</li>
+     * </ul>
+     * 
+     * Package access has been chosen to allow for unit testing.
+     * 
+     * @param nameOrPathOrUrl the string to convert
+     * @return the resulting URI
+     */
+    URI tryToConvertToUri(String nameOrPathOrUrl) {
+        // backslashes don't occur in proper URIs, but this fixes most of the normal windows paths
+        String candidate = nameOrPathOrUrl.replace('\\', '/');
+
+        // anything that doesn't begin with a uri scheme is treated as a file name or a path
+        if (!URI_START_PATTERN.matcher(candidate).matches()) {
+            candidate = "file:" + candidate;
+        }
+
+        try {
+            // the typical URL characters are left alone, so that proper URLs come out unchanged
+            return new URIImpl(HttpClientUtil.formUrlEncode(candidate, "/:%!?&+.="));
+        }
+        catch (Exception e) {
+            // the lenient encoding didn't yield a valid URI, fall back to encoding everything
+            return new URIImpl("file:" + HttpClientUtil.formUrlEncode(candidate));
+        }
+    }
+
+    /**
+     * Tries to identify the MIME type of the given input stream.
+     * 
+     * @param stream the input stream to identify. If the stream supports mark() and reset() (its
+     *            {@link InputStream#markSupported()} method returns true), it is marked before reading and
+     *            reset afterwards. Otherwise the bytes that have been read are lost, so wrap the stream in a
+     *            {@link BufferedInputStream} if you would like to use it afterwards. Closing the stream is
+     *            the responsibility of the caller. This argument may be null, in which case the identifier
+     *            gets only the name, path or url to work with.
+     * @param nameOrPathOrUrl an optional argument, that may help with checking. Many MIME types can be
+     *            detected by looking at the extension of the file. It can be the name of the file, its path
+     *            or an URL.
+     * 
+     * @return a string with the MIME type or null if the MIME type has not been recognized
+     * @throws IOException if an I/O error occurs while reading from the stream
+     */
+    public String identifyMimeType(InputStream stream, String nameOrPathOrUrl) throws IOException {
+        URI uri = (nameOrPathOrUrl == null) ? null : tryToConvertToUri(nameOrPathOrUrl);
+        return identifyMimeType(stream, uri);
+    }
+    
+    /**
+     * Accesses the given URI with the first data accessor registered for its scheme.
+     * 
+     * @param topLevelUri the URI to access
+     * @param fac the factory of the RDF containers, passed to the accessor
+     * @return the data object returned by the accessor, or null if no accessor factory is registered for
+     *         the scheme of the URI
+     * @throws IllegalArgumentException if the URI doesn't contain a colon
+     * @throws UrlNotFoundException declared by the accessor call
+     * @throws IOException declared by the accessor call
+     */
+    private DataObject accessUri(URI topLevelUri, RDFContainerFactory fac) throws UrlNotFoundException, IOException {
+        String uriString = topLevelUri.toString();
+        int colonIndex = uriString.indexOf(":");
+        if (colonIndex < 0) {
+            // note the space before "doesn't", it used to be glued to the URI
+            throw new IllegalArgumentException("The URI " + uriString + " doesn't contain a colon");
+        }
+        String scheme = uriString.substring(0, colonIndex);
+        Set dafSet = accessorRegistry.get(scheme);
+        if (dafSet == null || dafSet.isEmpty()) {
+            return null;
+        }
+        DataAccessorFactory daf = (DataAccessorFactory)dafSet.iterator().next();
+        if (daf == null) {
+            return null;
+        }
+        DataAccessor da = daf.get();
+        return da.getDataObject(uriString, null, null, fac);
+    }
+    
+    
+
+    /**
+     * Applies a stream-based extractor or, if none is registered for the MIME type, a file-based one.
+     * Failures are logged, not propagated.
+     * 
+     * @return true if an extractor has been applied successfully, false otherwise
+     */
+    private boolean tryToApplyExtractors(InputStream stream, URI uri, RDFContainer container, String mimeType) throws IOException {
+        try {
+            // the file extractor is tried only if there was no stream extractor for this MIME type
+            return applyExtractor(uri, stream, mimeType, container)
+                    || applyFileExtractor(uri, stream, mimeType, container);
+        }
+        catch (Exception e) { // this should cover both ExtractorExceptions and IOExceptions
+            logger.warn("Couldn't extract information from: " + uri.toString(), e);
+            return false;
+        }
+    }
+    
+    /**
+     * Identifies the MIME type from the first bytes of the stream (if there is one) and from the URI.
+     * A stream that supports mark/reset is rewound afterwards.
+     */
+    private String identifyMimeType(InputStream stream, URI uri) throws IOException {
+        byte[] header = null;
+        if (stream != null) {
+            int headerLength = identifier.getMinArrayLength();
+            boolean resettable = stream.markSupported();
+            if (resettable) {
+                stream.mark(headerLength + 10); // add some for safety
+            }
+            header = IOUtil.readBytes(stream, headerLength);
+            if (resettable) {
+                stream.reset();
+            }
+        }
+        return identifier.identify(header, null, uri);
+    }
+    
+    /**
+     * Applies the first stream-based extractor registered for the given MIME type.
+     * 
+     * @return true if an extractor has been applied, false if none is registered for the MIME type
+     */
+    @SuppressWarnings("unchecked")
+    private boolean applyExtractor(URI id, InputStream contentStream, String mimeType, RDFContainer metadata)
+            throws ExtractorException {
+        Set candidates = extractorRegistry.getExtractorFactories(mimeType);
+        if (candidates.isEmpty()) {
+            return false;
+        }
+        ExtractorFactory firstFactory = (ExtractorFactory) candidates.iterator().next();
+        firstFactory.get().extract(id, contentStream, null, mimeType, metadata);
+        return true;
+    }
+    
+    /**
+     * Applies the first file-based extractor registered for the given MIME type. The stream is copied to
+     * a temporary file first, the file is deleted afterwards.
+     * 
+     * @return true if an extractor has been applied, false if none is registered for the MIME type
+     */
+    @SuppressWarnings("unchecked")
+    private boolean applyFileExtractor(URI id, InputStream stream, String mimeType, RDFContainer metadata)
+            throws ExtractorException, IOException {
+        Set candidates = extractorRegistry.getFileExtractorFactories(mimeType);
+        if (candidates.isEmpty()) {
+            return false;
+        }
+        FileExtractorFactory firstFactory = (FileExtractorFactory) candidates.iterator().next();
+        FileExtractor fileExtractor = firstFactory.get();
+        // if the creation fails there is nothing to clean up, so it can stay outside the try block
+        File scratchFile = File.createTempFile("aperture", "tmp");
+        try {
+            IOUtil.writeStream(stream, scratchFile);
+            fileExtractor.extract(id, scratchFile, null, mimeType, metadata);
+            return true;
+        }
+        finally {
+            scratchFile.delete();
+        }
+    }
+
+    /**
+     * Closes the given stream, ignoring null and any exception thrown while closing.
+     */
+    private void closeClosable(InputStream stream) {
+        if (stream == null) {
+            return;
+        }
+        try {
+            stream.close();
+        } catch (Exception e) {
+            // nothing to be done...
+        }
+    }
+    
+    /**
+     * Returns the first non-null value among nie:plainTextContent, nmo:plainTextMessageContent and
+     * nid3:unsynchronizedTextContent of the given container, or null if none of them is set.
+     */
+    private String getFullText(RDFContainer container) {
+        String text = container.getString(NIE.plainTextContent);
+        if (text != null) {
+            return text;
+        }
+        text = container.getString(org.semanticdesktop.aperture.vocabulary.NMO.plainTextMessageContent);
+        if (text != null) {
+            return text;
+        }
+        return container.getString(NID3.unsynchronizedTextContent);
+    }
+    
+    /**
+     * Disposes the given data object after swapping its metadata for a throw-away container. This is a
+     * hack that allows us to hand the original metadata container to the caller while the parent data
+     * object is disposed properly and doesn't cause warnings.
+     * 
+     * @param ob the data object to dispose, it must be a {@link DataObjectBase}
+     */
+    private void safelyDispose(DataObject ob) {
+        DataObjectBase base = (DataObjectBase)ob;
+        RDFContainer placeholder = new RDFContainerImpl(null, RDF.Bag, true);
+        base.setMetadata(placeholder);
+        base.dispose();
+    }
+    
+    /**
+     * Gives access to the registry of extractors used by this runtime.
+     * 
+     * @return the extractor registry.
+     */
+    public ExtractorRegistry getExtractorRegistry() {
+        return extractorRegistry;
+    }
+    
+    /**
+     * Gives access to the MIME type identifier used by this runtime.
+     * 
+     * @return the mime type identifier.
+     */
+    public MimeTypeIdentifier getMimeTypeIdentifier() {
+        return identifier;
+    }
+	
+}


Property changes on: aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntime.java
___________________________________________________________________
Added: svn:mime-type
   + text/plain

Added: aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntimeMain.java
===================================================================
--- aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntimeMain.java	                        (rev 0)
+++ aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntimeMain.java	2010-07-05 21:43:10 UTC (rev 2381)
@@ -0,0 +1,47 @@
+/**
+ * Copyright (c) 2010 Aduna and Deutsches Forschungszentrum fuer Kuenstliche Intelligenz DFKI GmbH.
+ * All rights reserved.
+ * 
+ * Licensed under the Aperture BSD-style license.
+ */
+package org.semanticdesktop.aperture.util;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+
+import org.ontoware.rdf2go.model.node.URI;
+import org.semanticdesktop.aperture.subcrawler.SubCrawlerException;
+import org.semanticdesktop.aperture.util.ExpandedApertureRuntime.TextProcessor;
+
+/**
+ * A command line tool that prints all the text the {@link ExpandedApertureRuntime} can extract from
+ * the file given as the first argument.
+ * 
+ * @author Antoni
+ */
+public class ExpandedApertureRuntimeMain {
+    
+    /**
+     * @param args the path of the file to crawl is expected as the first element
+     * @throws IOException if the file can't be opened or read
+     * @throws SubCrawlerException if a subcrawler fails
+     */
+    public static void main(String [] args) throws IOException, SubCrawlerException {
+        if (args.length < 1) {
+            usage();
+            return; // usage() exits the VM, this only makes sure args[0] is never touched
+        }
+        File file = new File(args[0]);
+        FileInputStream st = new FileInputStream(file);
+        try {
+            ExpandedApertureRuntime rt = new ExpandedApertureRuntime();
+            rt.extractAllText(st, file.toURI().toString(), new TextProcessor() {
+                
+                public void process(URI uri, String text) {
+                    System.out.println("----------------------");
+                    System.out.println(uri.toString());
+                    System.out.println("----------------------");
+                    System.out.println(text);
+                    System.out.println("----------------------");
+                }
+            });
+        } finally {
+            // extractAllText doesn't close the stream it is given
+            st.close();
+        }
+    }
+
+    /** Prints the usage message and exits with the status -1. */
+    private static void usage() {
+        System.out.println("Usage: java -cp <classpath> <package>.ExpandedApertureRuntimeMain <fileToCrawl>");
+        System.exit(-1);
+    }
+}


Property changes on: aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntimeMain.java
___________________________________________________________________
Added: svn:mime-type
   + text/plain


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[aperture-commit] SF.net SVN: aperture:[2407] aperture-addons/trunk/src/main/java/org/ semanticdesktop/aperture

From: <my...@us...> - 2010-08-10 18:04:20

Revision: 2407
          http://aperture.svn.sourceforge.net/aperture/?rev=2407&view=rev
Author:   mylka
Date:     2010-08-10 18:04:14 +0000 (Tue, 10 Aug 2010)

Log Message:
-----------
[3035317] applied the patch to ExpandedApertureRuntime submitted by cbamford

Modified Paths:
--------------
    aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/accessor/impl/defaults.xml
    aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntime.java
    aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntimeMain.java

Modified: aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/accessor/impl/defaults.xml
===================================================================
--- aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/accessor/impl/defaults.xml	2010-08-10 10:47:41 UTC (rev 2406)
+++ aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/accessor/impl/defaults.xml	2010-08-10 18:04:14 UTC (rev 2407)
@@ -5,6 +5,7 @@
 		<name>org.semanticdesktop.aperture.accessor.file.FileAccessorFactory</name>
 	</dataAccessorFactory>
 	<dataAccessorFactory>
-		<name>dfki.km.medico.aperture.accessor.webdav.WebdavAccessorFactory</name>
+        <!--name>dfki.km.medico.aperture.accessor.webdav.WebdavAccessorFactory</name -->
+        <name>org.semanticdesktop.aperture.webdav.accessor.WebdavAccessorFactory</name>
 	</dataAccessorFactory>
 </dataAccessorFactories>
\ No newline at end of file

Modified: aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntime.java
===================================================================
--- aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntime.java	2010-08-10 10:47:41 UTC (rev 2406)
+++ aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntime.java	2010-08-10 18:04:14 UTC (rev 2407)
@@ -211,7 +211,7 @@
         return extractFrom(stream, file.toURI().toString());
     }
 
-    public void extractAllText(InputStream stream, String nameOrPathOrUrl, TextProcessor processor) throws IOException, SubCrawlerException {
+    public void extractAllText(InputStream stream, String nameOrPathOrUrl, TextProcessor processor, String mimeType) throws IOException, SubCrawlerException {
         if (stream == null) {
             throw new NullPointerException("stream cannot be null");
         }
@@ -233,7 +233,9 @@
         Model model = RDF2Go.getModelFactory().createModel();
         model.open();
         RDFContainer container = new RDFContainerImpl(model,uri);
-        String mimeType = identifyMimeType(stream, uri);
+        if (mimeType == null) {
+            mimeType = identifyMimeType(stream, uri);
+        }
         boolean ok = tryToApplyExtractors(stream, uri, container, mimeType);
         if (ok) {
         	String text = getFullText(container);
@@ -268,7 +270,7 @@
 			if (object instanceof FileDataObject) {
 				try {
 					InputStream contentStream = ((FileDataObject)object).getContent();
-	                extractAllText(contentStream, object.getID().toString(), processor);
+	                extractAllText(contentStream, object.getID().toString(), processor, null);
                 } catch (RuntimeException e) {
 	                throw e;
                 } catch (Exception e) {

Modified: aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntimeMain.java
===================================================================
--- aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntimeMain.java	2010-08-10 10:47:41 UTC (rev 2406)
+++ aperture-addons/trunk/src/main/java/org/semanticdesktop/aperture/util/ExpandedApertureRuntimeMain.java	2010-08-10 18:04:14 UTC (rev 2407)
@@ -36,7 +36,7 @@
 				System.out.println(text);
 				System.out.println("----------------------");
 			}
-		});
+		}, null);
 		
 	}
 


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.