From: <dfl...@us...> - 2013-10-29 13:23:48
Revision: 4132
          http://sourceforge.net/p/dl-learner/code/4132
Author:   dfleischhacker
Date:     2013-10-29 13:23:45 +0000 (Tue, 29 Oct 2013)
Log Message:
-----------
Bible test case activated

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java
    trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-10-29 08:55:58 UTC (rev 4131)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-10-29 13:23:45 UTC (rev 4132)
@@ -55,7 +55,7 @@
         addSubsequencesWordNet(entity, text);
 
         for (String alternativeText : nameGenerator.getAlternativeText(text)) {
-            addEntry(alternativeText, entity, text);
+            addEntry(alternativeText.toLowerCase(), entity, text);
         }
     }
 }
@@ -101,36 +101,46 @@
         }
 
         // generate subsequences starting at the given start index of the given size
-        Set<String> allPossibleSubsequences = getAllPossibleSubsequences(wordnetTokens);
+        Set<String[]> allPossibleSubsequences = getAllPossibleSubsequences(tokens, wordnetTokens);
 
-        for (String s : allPossibleSubsequences) {
-            addEntry(s, entity);
+        for (String[] s : allPossibleSubsequences) {
+            addEntry(s[0], entity, s[1]);
         }
     }
 }
 
-    private static Set<String> getAllPossibleSubsequences(List<String>[] wordnetTokens) {
-        ArrayList<String> res = new ArrayList<String>();
+    private static Set<String[]> getAllPossibleSubsequences(String[] originalTokens, List<String>[] wordnetTokens) {
+        ArrayList<String[]> res = new ArrayList<String[]>();
 
         for (int size = 1; size < wordnetTokens.length + 1; size++) {
            for (int start = 0; start < wordnetTokens.length - size + 1; start++) {
-                getPossibleSubsequencesRec(res, new ArrayList<String>(), wordnetTokens, 0, size);
+                getPossibleSubsequencesRec(originalTokens, res, new ArrayList<String>(), new ArrayList<String>(),
+                        wordnetTokens, 0, size);
            }
        }
 
-        return new HashSet<String>(res);
+        return new HashSet<String[]>(res);
    }
 
-    private static void getPossibleSubsequencesRec(List<String> allSubsequences, List<String> currentSubsequence, List<String>[] wordnetTokens,
-                                                   int curStart, int maxLength) {
+
+    private static void getPossibleSubsequencesRec(String[] originalTokens, List<String[]> allSubsequences,
+                                                   List<String> currentSubsequence,
+                                                   List<String> currentOriginalSubsequence,
+                                                   List<String>[] wordnetTokens,
+                                                   int curStart, int maxLength) {
+
        if (currentSubsequence.size() == maxLength) {
-            allSubsequences.add(StringUtils.join(currentSubsequence, " "));
+            allSubsequences.add(new String[]{StringUtils.join(currentSubsequence, " ").toLowerCase(), StringUtils
+                    .join(currentOriginalSubsequence, " ").toLowerCase()});
            return;
        }
        for (String w : wordnetTokens[curStart]) {
            ArrayList<String> tmpSequence = new ArrayList<String>(currentSubsequence);
+            ArrayList<String> tmpOriginalSequence = new ArrayList<String>(currentOriginalSubsequence);
            tmpSequence.add(w);
-            getPossibleSubsequencesRec(allSubsequences, tmpSequence, wordnetTokens, curStart + 1, maxLength);
+            tmpOriginalSequence.add(originalTokens[curStart]);
+            getPossibleSubsequencesRec(originalTokens, allSubsequences, tmpSequence, tmpOriginalSequence, wordnetTokens,
+                    curStart + 1, maxLength);
        }
    }
@@ -183,7 +193,7 @@
        List<String> termsList = new ArrayList<String>(trieMap.keySet());
        Collections.sort(termsList);
        for (String key : termsList) {
-            output += key + ":\n";
+            output += key + " (" + trieMap.get(key).getFullToken() + ") :\n";
            for (Entity candidate: trieMap.get(key).getEntitySet()) {
                output += "\t"+candidate+"\n";
            }
@@ -207,10 +217,10 @@
        }
 
        // generate subsequences starting at the given start index of the given size
-        Set<String> allPossibleSubsequences = getAllPossibleSubsequences(wordnetTokens);
+        Set<String[]> allPossibleSubsequences = getAllPossibleSubsequences(tokens, wordnetTokens);
 
-        for (String s : allPossibleSubsequences) {
-            System.out.println(s);
+        for (String[] s : allPossibleSubsequences) {
+            System.out.println(String.format("%s - %s", s[0], s[1]));
        }
    }
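
For readers skimming the diff: the subsequence handling in SimpleEntityCandidatesTrie now tracks pairs of strings, where the first element is the WordNet-expanded surface form and the second is the surface form rebuilt from the original tokens, both lowercased before being added to the trie. The following standalone sketch is not part of the commit; class and method names are made up for illustration, and it enumerates subsequences at every start offset, whereas the committed version wires the recursion through the trie's own addEntry calls, so details differ:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Standalone sketch: enumerate contiguous token subsequences, pairing each
// WordNet-expanded surface form with the surface form built from the original
// tokens, both lowercased.
public class SubsequenceSketch {

    public static Set<String[]> allPossibleSubsequences(String[] originalTokens,
                                                        List<String>[] wordnetTokens) {
        Set<String[]> result = new HashSet<String[]>();
        for (int size = 1; size <= wordnetTokens.length; size++) {
            for (int start = 0; start <= wordnetTokens.length - size; start++) {
                collect(result, originalTokens, wordnetTokens, start, size,
                        new ArrayList<String>(), new ArrayList<String>());
            }
        }
        return result;
    }

    // Recursively pick one WordNet variant per position, remembering the
    // original token at the same position.
    private static void collect(Set<String[]> result, String[] originalTokens,
                                List<String>[] wordnetTokens, int pos, int remaining,
                                List<String> expanded, List<String> original) {
        if (remaining == 0) {
            result.add(new String[]{join(expanded), join(original)});
            return;
        }
        for (String variant : wordnetTokens[pos]) {
            List<String> nextExpanded = new ArrayList<String>(expanded);
            List<String> nextOriginal = new ArrayList<String>(original);
            nextExpanded.add(variant);
            nextOriginal.add(originalTokens[pos]);
            collect(result, originalTokens, wordnetTokens, pos + 1, remaining - 1,
                    nextExpanded, nextOriginal);
        }
    }

    private static String join(List<String> tokens) {
        StringBuilder sb = new StringBuilder();
        for (String t : tokens) {
            if (sb.length() > 0) sb.append(' ');
            sb.append(t);
        }
        return sb.toString().toLowerCase();
    }

    public static void main(String[] args) {
        // Hypothetical example: two tokens with one WordNet alternative each.
        String[] original = {"Holy", "Book"};
        @SuppressWarnings("unchecked")
        List<String>[] variants = new List[]{
                Arrays.asList("Holy", "Sacred"),
                Arrays.asList("Book", "Bible")
        };
        for (String[] pair : allPossibleSubsequences(original, variants)) {
            System.out.println(pair[0] + " -> " + pair[1]);
        }
    }
}

Running the hypothetical main method prints pairs such as "sacred bible -> holy book", i.e. every WordNet variant combination mapped back to the original token sequence it came from, which is what the two-element String arrays in the committed code carry.
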
Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java 2013-10-29 08:55:58 UTC (rev 4131)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java 2013-10-29 13:23:45 UTC (rev 4132)
@@ -42,7 +42,7 @@
         this.ontology = ontology;
         this.annotationProperty = annotationProperty;
         this.searchField = searchField;
-        
+
         schemaEntities = new HashSet<OWLEntity>();
         schemaEntities.addAll(ontology.getClassesInSignature());
         schemaEntities.addAll(ontology.getObjectPropertiesInSignature());

Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java
===================================================================
--- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-10-29 08:55:58 UTC (rev 4131)
+++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-10-29 13:23:45 UTC (rev 4132)
@@ -6,14 +6,12 @@
 import com.google.common.base.Charsets;
 import com.google.common.base.Joiner;
 import com.google.common.io.Files;
-import com.hp.hpl.jena.vocabulary.RDFS;
-
 import org.dllearner.algorithms.celoe.CELOE;
 import org.dllearner.algorithms.isle.index.*;
 import org.dllearner.algorithms.isle.index.semantic.SemanticIndex;
 import org.dllearner.algorithms.isle.index.semantic.simple.SimpleSemanticIndex;
-import org.dllearner.algorithms.isle.index.syntactic.OWLOntologyLuceneSyntacticIndexCreator;
 import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex;
+import org.dllearner.algorithms.isle.index.syntactic.TextDocumentSyntacticIndexCreator;
 import org.dllearner.algorithms.isle.metrics.PMIRelevanceMetric;
 import org.dllearner.algorithms.isle.metrics.RelevanceMetric;
 import org.dllearner.algorithms.isle.metrics.RelevanceUtils;
@@ -32,17 +30,12 @@
 import org.junit.Before;
 import org.junit.Test;
 import org.semanticweb.owlapi.apibinding.OWLManager;
-import org.semanticweb.owlapi.model.IRI;
-import org.semanticweb.owlapi.model.OWLDataFactory;
-import org.semanticweb.owlapi.model.OWLEntity;
-import org.semanticweb.owlapi.model.OWLOntology;
-import org.semanticweb.owlapi.model.OWLOntologyManager;
-import org.semanticweb.owlapi.vocab.OWLRDFVocabulary;
-
+import org.semanticweb.owlapi.model.*;
 import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl;
 
 import java.io.File;
 import java.io.IOException;
+import java.net.URL;
 import java.text.DecimalFormat;
 import java.util.HashSet;
 import java.util.Map;
@@ -80,10 +73,10 @@
         manager = OWLManager.createOWLOntologyManager();
         ontology = manager.loadOntologyFromOntologyDocument(new File(testFolder + "ontology.owl"));
         textRetriever = new RDFSLabelEntityTextRetriever(ontology);
-        syntacticIndex = new OWLOntologyLuceneSyntacticIndexCreator(ontology, df.getRDFSLabel(), searchField).buildIndex();
-        
-        
-    }
+        RemoteDataProvider chapterIndexProvider = new RemoteDataProvider(
+                new URL("http://gold.linkeddata.org/data/bible/chapter_index.zip"));
+        syntacticIndex = TextDocumentSyntacticIndexCreator.loadIndex(chapterIndexProvider.getLocalDirectory());
+    }
 
     private Set<TextDocument> createDocuments(){
         Set<TextDocument> documents = new HashSet<TextDocument>();
@@ -100,10 +93,27 @@
         }
         return documents;
     }
-    
-    
-    /**
+    
+    private Set<TextDocument> createBibleDocuments() throws IOException {
+        Set<TextDocument> documents = new HashSet<TextDocument>();
+        RemoteDataProvider bibleByChapter = new RemoteDataProvider(
+                new URL("http://gold.linkeddata.org/data/bible/split_by_chapter.zip"));
+        File folder = bibleByChapter.getLocalDirectory();
+        for (File file : folder.listFiles()) {
+            if(!file.isDirectory() && !file.isHidden()){
+                try {
+                    String text = Files.toString(file, Charsets.UTF_8);
+                    documents.add(new TextDocument(text));
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+        return documents;
+    }
+
+
+    /**
      * @throws java.lang.Exception
      */
     @Before
@@ -153,7 +163,7 @@
         lp.init();
 
         semanticIndex = new SimpleSemanticIndex(ontology, syntacticIndex);
-        semanticIndex.buildIndex(createDocuments());
+        semanticIndex.buildIndex(createBibleDocuments());
 
         relevance = new PMIRelevanceMetric(semanticIndex);
 
@@ -209,10 +219,10 @@
         ClassLearningProblem lp = new ClassLearningProblem(reasoner);
         lp.setClassToDescribe(cls);
         lp.init();
+
+        semanticIndex = new SimpleSemanticIndex(ontology, syntacticIndex, false);
+        semanticIndex.buildIndex(createBibleDocuments());
 
-        semanticIndex = new SimpleSemanticIndex(ontology, syntacticIndex, false);
-        semanticIndex.buildIndex(createDocuments());
-        
         relevance = new PMIRelevanceMetric(semanticIndex);
 
         Map<Entity, Double> entityRelevance = RelevanceUtils.getRelevantEntities(cls, ontology, relevance);
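
As the test changes show, the setup no longer builds its syntactic index from the ontology via OWLOntologyLuceneSyntacticIndexCreator; instead it downloads a pre-built chapter index and the chapter-split Bible text from gold.linkeddata.org through RemoteDataProvider and feeds the resulting text documents into the semantic index. RemoteDataProvider itself is not part of this diff, so the following is only a rough standalone sketch assuming it behaves like a "download a zip once and unpack it into a local directory" helper; the class name, constructor, and caching behaviour here are guesses for illustration:

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

// Hypothetical remote-corpus fetcher, similar in spirit to the
// RemoteDataProvider used in the test: fetch a zip archive once and unpack it
// into a local working directory that callers can then iterate over.
public class RemoteZipFetcher {

    private final File localDirectory;

    public RemoteZipFetcher(URL zipUrl, File targetDirectory) throws IOException {
        this.localDirectory = targetDirectory;
        // naive caching: only download if the target directory does not exist yet
        if (!targetDirectory.exists()) {
            targetDirectory.mkdirs();
            download(zipUrl, targetDirectory);
        }
    }

    public File getLocalDirectory() {
        return localDirectory;
    }

    private static void download(URL zipUrl, File targetDirectory) throws IOException {
        ZipInputStream zip = new ZipInputStream(zipUrl.openStream());
        try {
            ZipEntry entry;
            byte[] buffer = new byte[8192];
            while ((entry = zip.getNextEntry()) != null) {
                File out = new File(targetDirectory, entry.getName());
                if (entry.isDirectory()) {
                    out.mkdirs();
                    continue;
                }
                out.getParentFile().mkdirs();
                FileOutputStream fos = new FileOutputStream(out);
                int read;
                while ((read = zip.read(buffer)) != -1) {
                    fos.write(buffer, 0, read);
                }
                fos.close();
            }
        } finally {
            zip.close();
        }
    }
}

A caller in the spirit of createBibleDocuments() could then iterate over getLocalDirectory().listFiles() and read each chapter file into a document. The sketch does no cache invalidation or zip-path sanitisation, which a real implementation would need.
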