From: <my...@us...> - 2010-03-26 15:49:32
|
Revision: 2300 http://aperture.svn.sourceforge.net/aperture/?rev=2300&view=rev Author: mylka Date: 2010-03-26 15:49:26 +0000 (Fri, 26 Mar 2010) Log Message: ----------- made the vcard subcrawler report the photos and sounds as separate data objects Modified Paths: -------------- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/subcrawler/vcard/VcardSubCrawler.java aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/subcrawler/TestSubCrawlerUtilIntegration.java aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/subcrawler/vcard/VcardSubCrawlerTest.java aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/test/ApertureTestBase.java aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/test/subcrawler/SubCrawlerTestBase.java Modified: aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/subcrawler/vcard/VcardSubCrawler.java =================================================================== --- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/subcrawler/vcard/VcardSubCrawler.java 2010-03-25 09:12:14 UTC (rev 2299) +++ aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/subcrawler/vcard/VcardSubCrawler.java 2010-03-26 15:49:26 UTC (rev 2300) @@ -6,11 +6,13 @@ */ package org.semanticdesktop.aperture.subcrawler.vcard; +import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; +import java.io.StringWriter; import java.nio.charset.Charset; import java.util.Date; import java.util.List; @@ -33,12 +35,14 @@ import net.fortuna.ical4j.vcard.property.BDay; import net.fortuna.ical4j.vcard.property.Email; import net.fortuna.ical4j.vcard.property.Geo; +import net.fortuna.ical4j.vcard.property.Key; import net.fortuna.ical4j.vcard.property.Logo; import net.fortuna.ical4j.vcard.property.N; import net.fortuna.ical4j.vcard.property.Nickname; import 
net.fortuna.ical4j.vcard.property.Org; import net.fortuna.ical4j.vcard.property.Photo; import net.fortuna.ical4j.vcard.property.Revision; +import net.fortuna.ical4j.vcard.property.Sound; import net.fortuna.ical4j.vcard.property.Telephone; import org.apache.commons.codec.DecoderException; @@ -53,12 +57,14 @@ import org.semanticdesktop.aperture.accessor.DataObject; import org.semanticdesktop.aperture.accessor.RDFContainerFactory; import org.semanticdesktop.aperture.accessor.base.DataObjectBase; +import org.semanticdesktop.aperture.accessor.base.FileDataObjectBase; import org.semanticdesktop.aperture.datasource.DataSource; import org.semanticdesktop.aperture.rdf.RDFContainer; import org.semanticdesktop.aperture.rdf.util.ModelUtil; import org.semanticdesktop.aperture.subcrawler.SubCrawler; import org.semanticdesktop.aperture.subcrawler.SubCrawlerException; import org.semanticdesktop.aperture.subcrawler.SubCrawlerHandler; +import org.semanticdesktop.aperture.subcrawler.SubCrawlerUtil; import org.semanticdesktop.aperture.subcrawler.base.AbstractSubCrawler; import org.semanticdesktop.aperture.util.DateUtil; import org.semanticdesktop.aperture.util.StringUtil; @@ -71,13 +77,13 @@ import org.slf4j.LoggerFactory; /** - * An Extractor Implementation working with VCard documents. + * A {@link SubCrawler} Implementation working with VCard documents. * <p> * Known issues: * <ul> * <li>The preferred contact media aren't marked as such in the output, because the NCO doesn't cover this * <li>Theoretically the email addresses can have the TYPE=x400, this is not supported, all email addresses - * are treated as internet addresses. + * are treated as Internet addresses. * <li>The VCARD specification doesn't distinguish between private and business email addresses, so this * extractor doesn't do it either. * <li>The REV property defined in RFC 2426 sec. 
3.6.4 doesn't have any direct equivalent in NCO, therefore @@ -85,16 +91,16 @@ * <li>NCO doesn't allow to preserve the order of the additional names, so this crawler discards that order. * Every additional name receives a separate nco:nameAdditional triple and the triples themselves are * unordered by definition.</li> - * <li>The above consideration also applies to nicknames. Nicknames can be ordered in the vcard but they are - * left unordered in the rdf data extracted from it.</li> - * <li>The ORG type in the vcard can specify an entity within an organization at an arbitrary level of nesting. + * <li>The above consideration also applies to nicknames. Nicknames can be ordered in the VCard but they are + * left unordered in the RDF data extracted from it.</li> + * <li>The ORG type in the VCard can specify an entity within an organization at an arbitrary level of nesting. * E.g a team within a project, within a department, within a division, within a company within a corporation. * NCO only allows for a single nco:department property of the affiliation, therefore supporting only a single * level of nesting. If more than one organizational unit is specified in the ORG element, the information about - * which unit is nested within which is lost, all units are recorded in the rdf at the same level with separate + * which unit is nested within which is lost, all units are recorded in the RDF at the same level with separate * nco:department triples attached to the affiliation resource. 
* </li> - * <li>Other elements of the vcard specification that aren't supported by NCO: (they are supported by JPIM) + * <li>Other elements of the VCard specification that aren't supported by NCO: (they are supported by JPIM) * <ul> * <li>ACCESS</li> * <li>CATEGORY</li> @@ -106,8 +112,8 @@ * <p> * <b>URIs for VCARDS</b><br/><br/> This crawler uses following conventions to generate URIS: * <ol> - * <li>If the UID parameter is present, it is concatenated to the stream id (preceeded by a hash)</li> - * <li>If it's not, then the contact is serialized to a string and a hash of that string is contactenated. to + * <li>If the UID parameter is present, it is concatenated to the stream id (preceded by a hash)</li> + * <li>If it's not, then the contact is serialized to a string and a hash of that string is concatenated to * the stream id.</li> * </ol> * This guarantees that an unmodified contact will be detected and reported as unmodified. (Which is not the @@ -147,7 +153,7 @@ List<VCard> cards = builder.buildAll(); VCardOutputter outputter = new VCardOutputter(false); if (cards.size() == 1) { - processContact(cards.get(0), parentMetadata.getModel(), parentMetadata.getDescribedUri()); + processContact(cards.get(0), parentMetadata, parentMetadata.getDescribedUri(), handler, accessData, dataSource, outputter); } else { processAddressBook(cards, parentMetadata, handler, outputter, accessData, dataSource); @@ -196,7 +202,7 @@ URI contactUri = generateURIForContact(contact, parentMetadata, contactHash); RDFContainerFactory factory = handler.getRDFContainerFactory(contactUri.toString()); RDFContainer container = factory.getRDFContainer(contactUri); - processContact(contact, container.getModel(), contactUri); + processContact(contact, container, contactUri, handler, accessData, source, out); parentMetadata.add(NCO.containsContact, contactUri); container.add(RDF.type, NCO.ContactListDataObject); passMetadataToHandler(container, handler, contactHash, accessData, source); @@ -225,17
+231,50 @@ } } } + + private void passAttachmentToHandler(RDFContainer container, SubCrawlerHandler handler, + String attachmentHash, AccessData accessData, DataSource source, byte [] bytes) { + URI uri = container.getDescribedUri(); + DataObject object = new FileDataObjectBase(uri, source, container, new ByteArrayInputStream(bytes)); + if (accessData == null) { + handler.objectNew(object); + } else if (!accessData.isKnownId(uri.toString())) { + accessData.put(uri.toString(), OBJECT_HASH_KEY, attachmentHash); + handler.objectNew(object); + } else { + String oldHash = accessData.get(uri.toString(), OBJECT_HASH_KEY); + if (oldHash == null || !oldHash.equals(attachmentHash)) { + accessData.put(uri.toString(), OBJECT_HASH_KEY, attachmentHash); + handler.objectChanged(object); + } else { + handler.objectNotModified(uri.toString()); + } + } + } - private void processContact(VCard contact, Model model, Resource contactResource) { + private void processContact(VCard contact, RDFContainer container, URI contactResource, SubCrawlerHandler handler, + AccessData accessData, DataSource dataSource, VCardOutputter out) { + Model model = container.getModel(); model.addStatement(contactResource, RDF.type, NCO.Contact); - processPersonalIdentity(contact, model, contactResource); - Resource affiliationResource = processOrganizationIdentity(contact, model, contactResource); - processCommonProperties(contact, model, contactResource, affiliationResource); + processPersonalIdentity(contact, container, contactResource, handler, accessData, dataSource); + Resource affiliationResource = processOrganizationIdentity(contact, container, contactResource, handler, accessData, dataSource); + processCommonProperties(contact, container, contactResource, affiliationResource, handler, accessData, dataSource); + // add the fulltext + StringWriter sw = new StringWriter(); + try { + out.output(contact, sw); + container.add(NIE.plainTextContent, sw.toString()); + } catch (Exception e) { + 
logger.warn("Couldn't serialize the vcard",e); + } + } - private void processPersonalIdentity(VCard vc, Model model, - Resource contactResource) { + private void processPersonalIdentity(VCard vc, RDFContainer parentMetadata, + URI contactResource, SubCrawlerHandler handler, AccessData accessData, DataSource dataSource) { + + Model model = parentMetadata.getModel(); // this property is present in all contacts, regardless of whether they are PersonContacts // or OrganizationContacts, the presence of this property cannot tell us anything interesting @@ -274,8 +313,9 @@ } Photo photo = (Photo)vc.getProperty(Id.PHOTO); if (photo != null) { - processImage(model, contactResource, NCO.photo, - getParameterValue(photo, net.fortuna.ical4j.vcard.Parameter.Id.TYPE)); + processImage(photo.getBinary(), parentMetadata, contactResource, NCO.photo, + getParameterValue(photo, net.fortuna.ical4j.vcard.Parameter.Id.TYPE), handler, + accessData, dataSource); } for (int i = 0; i < length(name.getPrefixes()); i++) { String prefix = name.getPrefixes()[i]; @@ -288,9 +328,11 @@ } - private Resource processOrganizationIdentity(VCard organizationalIdentity, Model model, - Resource contactResource) { + private Resource processOrganizationIdentity(VCard organizationalIdentity, RDFContainer parentMetadata, + URI contactResource, SubCrawlerHandler handler, AccessData accessData, DataSource dataSource) { + Model model = parentMetadata.getModel(); + // first some sanity checking if (organizationalIdentity == null) { return null; @@ -323,15 +365,16 @@ } // now we know we have to create an organization resource - Resource organizationResource = ModelUtil.generateRandomResource(model); + URI organizationResource = ModelUtil.generateRandomURI(model); model.addStatement(organizationResource, RDF.type, NCO.OrganizationContact); model.addStatement(affiliationResource, NCO.org, organizationResource); Logo logo = (Logo)organizationalIdentity.getProperty(Id.LOGO); if (logo != null) { - processImage(model, 
organizationResource, NCO.logo, - getParameterValue(logo, net.fortuna.ical4j.vcard.Parameter.Id.TYPE)); + processImage(logo.getBinary(), parentMetadata, organizationResource, NCO.logo, + getParameterValue(logo, net.fortuna.ical4j.vcard.Parameter.Id.TYPE), handler, + accessData, dataSource); } Org org = (Org) organizationalIdentity.getProperty(Id.ORG); @@ -349,11 +392,12 @@ return affiliationResource; } - private void processCommonProperties(VCard contact, Model model, Resource contactResource, - Resource affiliationResource) { + private void processCommonProperties(VCard contact, RDFContainer parentMetadata, URI contactResource, + Resource affiliationResource, SubCrawlerHandler handler, AccessData accessData, + DataSource dataSource) { // so, first the addresses List<Property> adrs = contact.getProperties(Id.ADR); - + Model model = parentMetadata.getModel(); for (Property address : adrs) { // let's hope this simple comparison will work as desired... String type = getParameterValue(address, net.fortuna.ical4j.vcard.Parameter.Id.TYPE); @@ -370,13 +414,17 @@ processGeographicalInformation(model, contactResource, NCO.hasLocation, contact); Property key = contact.getProperty(Id.KEY); if (key != null) { - processPublicKey(model, contactResource, NCO.key, - getParameterValue(key, net.fortuna.ical4j.vcard.Parameter.Id.TYPE)); + Key keyProp = (Key)key; + processPublicKey(keyProp.getBinary(), parentMetadata, contactResource, NCO.key, + getParameterValue(key, net.fortuna.ical4j.vcard.Parameter.Id.TYPE), + handler, accessData, dataSource); } Property sound = contact.getProperty(Id.SOUND); if (sound != null) { - processSound(model, contactResource, NCO.sound, - getParameterValue(sound, net.fortuna.ical4j.vcard.Parameter.Id.TYPE)); + Sound soundProp = (Sound)sound; + processSound(soundProp.getBinary(), parentMetadata, contactResource, NCO.sound, + getParameterValue(key, net.fortuna.ical4j.vcard.Parameter.Id.TYPE), + handler, accessData, dataSource); } // and then the simple 
properties @@ -390,7 +438,7 @@ } } - private void processAddress(Model model, Address address, Resource contactResource, + private void processAddress(Model model, Address address, Resource contactResource, Resource affiliationResource, boolean preferred) { if (address != null) { Resource addressResource = ModelUtil.generateRandomResource(model); @@ -602,28 +650,57 @@ } } - private void processImage(Model model, Resource contactResource, URI property, String mimeType) { - Resource imageResource = ModelUtil.generateRandomResource(model); - model.addStatement(imageResource, RDF.type, NEXIF.Photo); - model.addStatement(imageResource, RDF.type, NFO.Attachment); - model.addStatement(contactResource, property, imageResource); - addStringProperty(model, imageResource, NIE.mimeType, mimeType); + private void processImage(byte [] bytes, RDFContainer parentMetadata, URI contactResource, + URI property, String mimeType, SubCrawlerHandler handler, AccessData accessData, + DataSource source) { + String attachmentHash = StringUtil.sha1Hash(bytes); + URI attachmentUri = generateURIForAttachment(contactResource, attachmentHash); + RDFContainerFactory factory = handler.getRDFContainerFactory(attachmentUri.toString()); + RDFContainer container = factory.getRDFContainer(attachmentUri); + parentMetadata.add(property, attachmentUri); + container.add(NIE.isPartOf, contactResource); + container.add(RDF.type, NCO.ContactListDataObject); + container.add(RDF.type, NEXIF.Photo); + container.add(RDF.type, NFO.Attachment); + addStringProperty(container.getModel(), container.getDescribedUri(), NIE.mimeType, mimeType); + passAttachmentToHandler(container, handler, attachmentHash, accessData, source, bytes); + } - private void processSound(Model model, Resource contactResource, URI property, String mimeType) { - Resource soundResource = ModelUtil.generateRandomResource(model); - model.addStatement(soundResource, RDF.type, NFO.Audio); - model.addStatement(soundResource, RDF.type, NFO.Attachment); 
- model.addStatement(contactResource, property, soundResource); - addStringProperty(model, soundResource, NIE.mimeType, mimeType); + private URI generateURIForAttachment(URI contactResource, String attachmentHash) { + if (!SubCrawlerUtil.isSubcrawledObjectUri(contactResource)) { + return createChildUri(contactResource, attachmentHash); + } else { + return new org.ontoware.rdf2go.model.node.impl.URIImpl(contactResource.toString() + "/" + attachmentHash); + } + } + + private void processSound(byte [] bytes, RDFContainer parentMetadata, URI contactResource, + URI property, String mimeType, SubCrawlerHandler handler, AccessData accessData, + DataSource source) { + String attachmentHash = StringUtil.sha1Hash(bytes); + URI attachmentUri = generateURIForAttachment(contactResource, attachmentHash); + RDFContainerFactory factory = handler.getRDFContainerFactory(attachmentUri.toString()); + RDFContainer container = factory.getRDFContainer(attachmentUri); + parentMetadata.add(property, attachmentUri); + container.add(RDF.type, NFO.Audio); + container.add(RDF.type, NFO.Attachment); + addStringProperty(container.getModel(), container.getDescribedUri(), NIE.mimeType, mimeType); + passAttachmentToHandler(container, handler, attachmentHash, accessData, source, bytes); } - private void processPublicKey(Model model, Resource contactResource, URI property, String mimeType) { - Resource keyResource = ModelUtil.generateRandomResource(model); - model.addStatement(keyResource, RDF.type, NIE.InformationElement); - model.addStatement(keyResource, RDF.type, NFO.Attachment); - model.addStatement(contactResource, property, keyResource); - addStringProperty(model, keyResource, NIE.mimeType, mimeType); + private void processPublicKey(byte [] bytes, RDFContainer parentMetadata, URI contactResource, + URI property, String mimeType, SubCrawlerHandler handler, AccessData accessData, + DataSource source) { + String attachmentHash = StringUtil.sha1Hash(bytes); + URI attachmentUri = 
generateURIForAttachment(contactResource, attachmentHash); + RDFContainerFactory factory = handler.getRDFContainerFactory(attachmentUri.toString()); + RDFContainer container = factory.getRDFContainer(attachmentUri); + parentMetadata.add(property, attachmentUri); + container.add(RDF.type, NIE.InformationElement); + container.add(RDF.type, NFO.Attachment); + addStringProperty(container.getModel(), container.getDescribedUri(), NIE.mimeType, mimeType); + passAttachmentToHandler(container, handler, attachmentHash, accessData, source, bytes); } private void addStringProperty(Model model, Resource resource, URI property, String value) { @@ -632,13 +709,6 @@ } } - private void addDateProperty(Model model, Resource resource, URI property, Date date) { - if (date != null) { - String dateString = DateUtil.date2String(date); - model.addStatement(resource, property, model.createDatatypeLiteral(dateString, XSD._date)); - } - } - private void addDateTimeProperty(Model model, Resource resource, URI property, Date date) { if (date != null) { String dateString = DateUtil.dateTime2String(date); Modified: aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/subcrawler/TestSubCrawlerUtilIntegration.java =================================================================== --- aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/subcrawler/TestSubCrawlerUtilIntegration.java 2010-03-25 09:12:14 UTC (rev 2299) +++ aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/subcrawler/TestSubCrawlerUtilIntegration.java 2010-03-26 15:49:26 UTC (rev 2300) @@ -53,6 +53,25 @@ } } + public void testVCardAttachment() throws Exception { + InputStream stream = ResourceUtil.getInputStream(DOCS_PATH + "vcard-antoni-kontact.vcf", + getClass()); + URI uri = new URIImpl( + "vcard:" + + "file:///C:/somefolder/somevcard.vcf" + + "!/d6bb8c38b78663b2aeef0b30538968660caf95c9"); + TestRDFContainerFactory fac = new TestRDFContainerFactory(); + DataObject obj = 
SubCrawlerUtil.getDataObject(uri, stream, null, null, null, fac, + new DefaultSubCrawlerRegistry()); + assertNotNull(obj); + assertTrue(obj instanceof FileDataObject); + assertMimeType("image/jpeg", uri, ((FileDataObject)obj).getContent()); + obj.dispose(); + for (Map.Entry<String, RDFContainer> entry : fac.returnedContainers.entrySet()) { + assertFalse(entry.getValue().getModel().isOpen()); + } + } + /** * Tests if the method can extract a file whose name contains a space from inside a ZIP archive. * @throws Exception Modified: aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/subcrawler/vcard/VcardSubCrawlerTest.java =================================================================== --- aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/subcrawler/vcard/VcardSubCrawlerTest.java 2010-03-25 09:12:14 UTC (rev 2299) +++ aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/subcrawler/vcard/VcardSubCrawlerTest.java 2010-03-26 15:49:26 UTC (rev 2300) @@ -26,6 +26,7 @@ import org.ontoware.rdf2go.vocabulary.XSD; import org.semanticdesktop.aperture.accessor.AccessData; import org.semanticdesktop.aperture.accessor.base.AccessDataImpl; +import org.semanticdesktop.aperture.extractor.impl.DefaultExtractorRegistry; import org.semanticdesktop.aperture.rdf.RDFContainer; import org.semanticdesktop.aperture.rdf.impl.RDFContainerImpl; import org.semanticdesktop.aperture.test.subcrawler.SubCrawlerTestBase; @@ -88,8 +89,14 @@ VcardSubCrawler subCrawler = new VcardSubCrawler(); metadata = subCrawl(DOCS_PATH + "vcard-antoni-kontact.vcf", subCrawler); // note that NO additional data objects have been reported, this - // file contains only one contact - assertNewModUnmod(handler, 0, 0, 0); + // file contains only one contact, but this contact has a photo, + // which is returned as a separate object + assertNewModUnmod(handler, 1, 0, 0); + + // we should get the fulltext too + String fullText = metadata.getString(NIE.plainTextContent); + 
assertTrue(fullText.contains("Antoni")); + validate(metadata); metadata.dispose(); metadata = null; @@ -420,7 +427,7 @@ private RDFContainer subCrawl(String string, VcardSubCrawler subCrawler) throws Exception { InputStream stream = org.semanticdesktop.aperture.util.ResourceUtil.getInputStream(string, this.getClass()); - handler = new TestBasicSubCrawlerHandler(); + handler = new TestBasicSubCrawlerHandler(new DefaultExtractorRegistry()); RDFContainer parentMetadata = new RDFContainerImpl(handler.getModel(),new URIImpl("uri:dummyuri")); subCrawler.subCrawl(null, stream, handler, null, null, null, null, parentMetadata); return parentMetadata; Modified: aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/test/ApertureTestBase.java =================================================================== --- aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/test/ApertureTestBase.java 2010-03-25 09:12:14 UTC (rev 2299) +++ aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/test/ApertureTestBase.java 2010-03-26 15:49:26 UTC (rev 2300) @@ -612,7 +612,7 @@ stream.mark(minimumArrayLength + 10); // add some for safety byte[] bytes = IOUtil.readBytes(stream, minimumArrayLength); String mimeType = mimeTypeIdentifier.identify(bytes, null, uri); - assertEquals(mimeType, desiredMimeType); + assertEquals(desiredMimeType, mimeType); stream.reset(); } Modified: aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/test/subcrawler/SubCrawlerTestBase.java =================================================================== --- aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/test/subcrawler/SubCrawlerTestBase.java 2010-03-25 09:12:14 UTC (rev 2299) +++ aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/test/subcrawler/SubCrawlerTestBase.java 2010-03-26 15:49:26 UTC (rev 2300) @@ -127,9 +127,9 @@ */ public void assertNewModUnmod(TestBasicSubCrawlerHandler handler, int newObjects, int changedObjects, int unchangedObjects) 
{ - assertEquals(handler.getNewObjects().size(), newObjects); - assertEquals(handler.getChangedObjects().size(), changedObjects); - assertEquals(handler.getUnchangedObjects().size(), unchangedObjects); + assertEquals(newObjects, handler.getNewObjects().size()); + assertEquals(changedObjects, handler.getChangedObjects().size()); + assertEquals(unchangedObjects, handler.getUnchangedObjects().size()); } protected class CompressorSubCrawlerHandler extends TestBasicSubCrawlerHandler { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |