From: <my...@us...> - 2011-09-15 13:01:11
|
Revision: 2549 http://aperture.svn.sourceforge.net/aperture/?rev=2549&view=rev Author: mylka Date: 2011-09-15 13:00:59 +0000 (Thu, 15 Sep 2011) Log Message: ----------- changed X2RSubCrawlerUtil.registerXMLDatatype into registerXMLDatatypes. One method call can allow you to register more than one mapping stored in a single file. Modified Paths: -------------- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/subcrawler/impl/DefaultSubCrawlerRegistry.java aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/tika/TikaMimeTypeIdentifier.java aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/x2r/X2RSubCrawlerUtil.java aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/x2r/X2RSubCrawlerUtilTest.java Added Paths: ----------- aperture/trunk/core/src/test/resources/org/semanticdesktop/aperture/x2r/two-mappings.ttl Modified: aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/subcrawler/impl/DefaultSubCrawlerRegistry.java =================================================================== --- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/subcrawler/impl/DefaultSubCrawlerRegistry.java 2011-09-13 19:40:46 UTC (rev 2548) +++ aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/subcrawler/impl/DefaultSubCrawlerRegistry.java 2011-09-15 13:00:59 UTC (rev 2549) @@ -122,7 +122,7 @@ } try { String mapping = IOUtil.readString(mappingStream); - X2RSubCrawlerUtil.registerXMLDatatype( + X2RSubCrawlerUtil.registerXMLDatatypes( DefaultSubCrawlerRegistry.this, null, mapping); } catch (Exception e) { logger.warn("cannot initialize the xml subcrawler mapping: " + text,e); Modified: aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/tika/TikaMimeTypeIdentifier.java =================================================================== --- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/tika/TikaMimeTypeIdentifier.java 2011-09-13 19:40:46 UTC (rev 2548) +++ aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/tika/TikaMimeTypeIdentifier.java 2011-09-15 13:00:59 UTC (rev 2549) @@ -318,7 +318,7 @@ } try { String mapping = IOUtil.readString(mappingStream); - X2RSubCrawlerUtil.registerXMLDatatype( + X2RSubCrawlerUtil.registerXMLDatatypes( null, TikaMimeTypeIdentifier.this, mapping); } catch (Exception e) { logger.warn("cannot initialize the xml subcrawler mapping: " + text,e); Modified: aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/x2r/X2RSubCrawlerUtil.java =================================================================== --- aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/x2r/X2RSubCrawlerUtil.java 2011-09-13 19:40:46 UTC (rev 2548) +++ aperture/trunk/core/src/main/java/org/semanticdesktop/aperture/x2r/X2RSubCrawlerUtil.java 2011-09-15 13:00:59 UTC (rev 2549) @@ -7,7 +7,13 @@ package org.semanticdesktop.aperture.x2r; import java.io.StringReader; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.LinkedList; import java.util.Locale; +import java.util.Set; import org.apache.tika.mime.MimeTypeException; import org.ontoware.aifbcommons.collection.ClosableIterator; @@ -100,50 +106,102 @@ } } - public static void registerXMLDatatype(SubCrawlerRegistry registry, - TikaMimeTypeIdentifier identifier, String mapping) + public static void registerXMLDatatypes(SubCrawlerRegistry registry, + TikaMimeTypeIdentifier identifier, String mappings) throws X2RSubCrawlerUtilException { Model model = RDF2Go.getModelFactory().createModel().open(); try { try { - model.readFrom(new StringReader(mapping),Syntax.Turtle); + model.readFrom(new StringReader(mappings),Syntax.Turtle); } catch (Exception e1) { throw new X2RSubCrawlerUtilException(e1); } + Collection<Resource> mappingResources = findMappingResources(model); + for (Resource mappingResource : mappingResources) {; - Resource mappingResource = findMappingResource(model); - String mimeTypeString = findPropertyValue(model, mappingResource, AX.MIMETYPE); + // that's a crappy hack, but without it I'd have to implement + // it in X2R, which I don't want + String singleMappingString = getSingleMappingString(model, mappingResource); - if (mimeTypeString != null) { - mimeTypeString = mimeTypeString.toLowerCase(Locale.US); + String mimeTypeString = findPropertyValue(model, mappingResource, AX.MIMETYPE); + + if (mimeTypeString != null) { + mimeTypeString = mimeTypeString.toLowerCase(Locale.US); + } + + String rootElementName = findPropertyValue(model, mappingResource, AX.ROOTELEMENTNAME); + String rootElementNameSpace = findPropertyValue(model, mappingResource, AX.ROOTELEMENTNS); + + if (mimeTypeString != null && registry != null) { + registry.add(new X2RSubCrawlerFactory(mimeTypeString, singleMappingString)); + } + + if (mimeTypeString != null && identifier != null) { + try { + identifier.addNewDefinition( + mimeTypeString, + "<" + rootElementName, + rootElementNameSpace, + rootElementName, + null); + } catch (MimeTypeException e) { + throw new X2RSubCrawlerUtilException(e); + } + } } - - String rootElementName = findPropertyValue(model, mappingResource, AX.ROOTELEMENTNAME); - String rootElementNameSpace = findPropertyValue(model, mappingResource, AX.ROOTELEMENTNS); - - if (mimeTypeString != null && registry != null) { - registry.add(new X2RSubCrawlerFactory(mimeTypeString, mapping)); - } - - if (mimeTypeString != null && identifier != null) { - try { - identifier.addNewDefinition( - mimeTypeString, - "<" + rootElementName, - rootElementNameSpace, - rootElementName, - null); - } catch (MimeTypeException e) { - throw new X2RSubCrawlerUtilException(e); - } - } } finally { model.close(); } } - /** + private static String getSingleMappingString(Model model, + Resource mappingResource) { + Model newModel = RDF2Go.getModelFactory().createModel().open(); + LinkedList<Resource> queue = new LinkedList<Resource>(); + queue.add(mappingResource); + Set<Resource> visitedResources = new HashSet<Resource>(); + visitedResources.add(mappingResource); + while (!queue.isEmpty()) { + Resource r =queue.remove(0); + ClosableIterator<Statement> i1 = + model.findStatements(r, Variable.ANY, Variable.ANY); + while (i1.hasNext()) { + Statement s = i1.next(); + if (!newModel.contains(s)) { + newModel.addStatement(s); + Node n = s.getObject(); + if (n instanceof Resource && !visitedResources.contains(n)) { + queue.add((Resource)n); + visitedResources.add((Resource)n); + } + } + } + i1.close(); + i1 = model.findStatements(Variable.ANY, Variable.ANY, r); + while (i1.hasNext()) { + Statement s = i1.next(); + if (!newModel.contains(s)) { + newModel.addStatement(s); + Resource subject = s.getSubject(); + if (!visitedResources.contains(subject)) { + queue.add(subject); + visitedResources.add(subject); + } + } + } + } + + StringWriter sw = new StringWriter(); + try { + model.writeTo(sw, Syntax.Turtle); + } catch (Exception e) { + throw new RuntimeException(e); // will not happen + } + return sw.toString(); + } + + /** * Returns the value of the property. There can be at most once value * and it has to be a literal. * @@ -174,17 +232,16 @@ return result; } - private static Resource findMappingResource(Model model) throws X2RSubCrawlerUtilException { + private static Collection<Resource> findMappingResources(Model model) throws X2RSubCrawlerUtilException { ClosableIterator<Statement> iter = model.findStatements(Variable.ANY, RDF.type, model.createURI(XML2R.MAPPING.toString())); - Resource result = null; - if (iter.hasNext()) { + Collection<Resource> result = new ArrayList<Resource>(); + while (iter.hasNext()) { Statement st = iter.next(); - result = st.getSubject(); - if (iter.hasNext()) { - throw new X2RSubCrawlerUtilException("Mapping string contains more than one mapping"); - } - } else { + Resource r = st.getSubject(); + result.add(r); + } + if (result.isEmpty()) { throw new X2RSubCrawlerUtilException("No mapping found in the mapping string"); } return result; Modified: aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/x2r/X2RSubCrawlerUtilTest.java =================================================================== --- aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/x2r/X2RSubCrawlerUtilTest.java 2011-09-13 19:40:46 UTC (rev 2548) +++ aperture/trunk/core/src/test/java/org/semanticdesktop/aperture/x2r/X2RSubCrawlerUtilTest.java 2011-09-15 13:00:59 UTC (rev 2549) @@ -49,6 +49,24 @@ performWikipediaTest(id, reg, exReg); } + public void testWikipediaOneFileTwoMappings() throws Exception { + TikaMimeTypeIdentifier id = new TikaMimeTypeIdentifier(); + SubCrawlerRegistry reg = new DefaultSubCrawlerRegistry(); + ExtractorRegistry exReg = new DefaultExtractorRegistry(); + + X2RSubCrawlerUtil.registerXMLDatatypes( + reg, + id, + IOUtil.readString(getClass().getResourceAsStream( + "two-mappings.ttl"))); + + performWikipediaTest(id, reg, exReg); + + assertFalse(reg.get("application/x-somethingelse").isEmpty()); + + } + + public void testWikipediaUpperCaseInMimeType() throws Exception { TikaMimeTypeIdentifier id = new TikaMimeTypeIdentifier(); SubCrawlerRegistry reg = new DefaultSubCrawlerRegistry(); @@ -73,7 +91,7 @@ SubCrawlerRegistry reg = new DefaultSubCrawlerRegistry(); ExtractorRegistry exReg = new DefaultExtractorRegistry(); - X2RSubCrawlerUtil.registerXMLDatatype( + X2RSubCrawlerUtil.registerXMLDatatypes( reg, id, IOUtil.readString(getClass().getResourceAsStream( @@ -87,7 +105,7 @@ SubCrawlerRegistry reg = new DefaultSubCrawlerRegistry(); ExtractorRegistry exReg = new DefaultExtractorRegistry(); - X2RSubCrawlerUtil.registerXMLDatatype( + X2RSubCrawlerUtil.registerXMLDatatypes( reg, id, IOUtil.readString(getClass().getResourceAsStream( @@ -133,7 +151,9 @@ // there should be exactly two (there are two entries in the file) // the first entry doesn't contain any reasonable content (a test for markup removal) // the second entry does, it's from the "Jimmy Carter" page + assertTrue(sts.hasNext()); Statement st1 = sts.next(); + assertTrue(sts.hasNext()); Statement st2 = sts.next(); assertFalse(sts.hasNext()); Added: aperture/trunk/core/src/test/resources/org/semanticdesktop/aperture/x2r/two-mappings.ttl =================================================================== --- aperture/trunk/core/src/test/resources/org/semanticdesktop/aperture/x2r/two-mappings.ttl (rev 0) +++ aperture/trunk/core/src/test/resources/org/semanticdesktop/aperture/x2r/two-mappings.ttl 2011-09-15 13:00:59 UTC (rev 2549) @@ -0,0 +1,75 @@ +@prefix xml2r: <http://fivo.cyf-kr.edu.pl/trac/wiki/X2R/xml/mapping#> . +@prefix ax: <http://aperture.sourceforge.net/2011/07/x2rsubcrawler#> . +@prefix dc: <http://purl.org/dc/elements/1.1/> . +@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . +@prefix nie: <http://www.semanticdesktop.org/ontologies/2007/01/19/nie#> . +@prefix nfo: <http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#> . +@prefix : <uri:test:dblp1#> . + +:wikipediaMapping a xml2r:Mapping ; + xml2r:namespaceDefinition [ + xml2r:namespacePrefix "xsi"; + xml2r:namespaceUri "http://www.w3.org/2001/XMLSchema-instance" + ] ; + xml2r:namespaceDefinition [ + xml2r:namespacePrefix "mw" ; + xml2r:namespaceUri "http://www.mediawiki.org/xml/export-0.5/" + ] ; + ax:mimeType "application/x-mediawiki-xml-export" ; + ax:rootElementName "mediawiki" ; + ax:rootElementNameSpace "http://www.mediawiki.org/xml/export-0.5/" . + +:mediawikiElementClassMap a xml2r:ClassMap ; + xml2r:belongsToMapping :wikipediaMapping; + xml2r:nodeXPath "/mw:mediawiki/mw:*" . + +:siteinfoMap a xml2r:ClassMap ; + xml2r:delegatedFrom :mediawikiElementClassMap; + xml2r:onElementName "siteinfo" ; + xml2r:setVariable [ + xml2r:variableName "baseUri" ; + xml2r:pattern "${fn:replace(mw:base/text(),\"/[^/]*$\",\"/\")}" + ] . + +:publicationMap a xml2r:ClassMap ; + xml2r:delegatedFrom :mediawikiElementClassMap; + xml2r:onElementName "page" ; + xml2r:uriPattern "&{baseUri}${mw:title/text()||mwurlify}" ; + xml2r:class nfo:TextDocument . + +:titleBridge a xml2r:PropertyBridge ; + xml2r:belongsToClassMap :publicationMap ; + xml2r:property nie:title ; + xml2r:pattern "${mw:title/text()}" . + +#contributor is difficult because we can't easily create Contact instances +#:contributorBridge a xml2r:PropertyBridge ; +# xml2r:belongsToClassMap :publicationMap ; +# xml2r:property dc:contributor ; +# xml2r:pattern "${mw:revision/mw:contributor/mw:username/text()}" . + +:textBridge a xml2r:PropertyBridge ; + xml2r:belongsToClassMap :publicationMap ; + xml2r:property nie:plainTextContent ; + xml2r:pattern "${mw:revision/mw:text/text()||mediawiki}" . + +:m a xml2r:Mapping ; + ax:mimeType "application/x-somethingelse" ; + ax:rootElementName "object" ; + ax:rootElementNameSpace "http://www.example.org/xml/example/" . + +:publicationMap a xml2r:ClassMap ; + xml2r:belongsToMapping :m ; + xml2r:nodeXPath "/publication" ; + xml2r:uriPattern "${parentUri}" ; + xml2r:class <http://some.cool.ontology/2008/ont#Publication> . + +:titleBridge a xml2r:PropertyBridge ; + xml2r:belongsToClassMap :publicationMap ; + xml2r:property dc:title ; + xml2r:pattern "${title/text()}" . + +:authorBridge a xml2r:PropertyBridge ; + xml2r:belongsToClassMap :publicationMap ; + xml2r:property dc:creator ; + xml2r:pattern "${author/text()}" . \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |