From: <dfl...@us...> - 2013-09-06 11:36:39
|
Revision: 4093 http://sourceforge.net/p/dl-learner/code/4093 Author: dfleischhacker Date: 2013-09-06 11:36:33 +0000 (Fri, 06 Sep 2013) Log Message: ----------- Extend ontology words by synonyms Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java 2013-09-06 10:01:53 UTC (rev 4092) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java 2013-09-06 11:36:33 UTC (rev 4093) @@ -93,7 +93,7 @@ // IndexWord iw = dict.getMorphologicalProcessor().lookupBaseForm(pos, s); if (iw != null) { Synset[] synsets = iw.getSenses(); - for (int i = 0; i < n; i++) { + for (int i = 0; i < Math.min(n, synsets.length); i++) { for (Word word : synsets[i].getWords()) { String c = word.getLemma(); if (!c.equals(s) && !c.contains(" ")) { Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-09-06 10:01:53 UTC (rev 4092) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-09-06 11:36:33 UTC (rev 4093) @@ -36,9 +36,13 @@ for (int i = 0; i < camelCase.length(); i++) { // we just ignore characters not matching the defined pattern char curChar = camelCase.charAt(i); - if (!Character.isLetter(curChar)) { + if (Character.isWhitespace(curChar)) { + sb.append(" "); continue; } + else if (!Character.isLetter(curChar)) { + continue; + } if (Character.isUpperCase(curChar)) { // found a new upper case letter resultingWords.add(sb.toString()); sb = new StringBuilder(); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-09-06 10:01:53 UTC (rev 4092) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-09-06 11:36:33 UTC (rev 4093) @@ -1,5 +1,6 @@ package org.dllearner.algorithms.isle.index; +import org.apache.commons.lang.StringUtils; import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever; import org.dllearner.core.owl.Entity; import org.dllearner.utilities.datastructures.PrefixTrie; @@ -11,28 +12,62 @@ PrefixTrie<Set<Entity>> trie; EntityTextRetriever entityTextRetriever; - + + /** + * Initialize the trie with strings from the provided ontology using a no-op name generator, i.e., only the + * actual ontology strings are added and no expansion is done. + * + * @param entityTextRetriever the text retriever to use + * @param ontology the ontology to get strings from + */ public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology) { - this.entityTextRetriever = entityTextRetriever; - buildTrie(ontology); + this(entityTextRetriever, ontology, new DummyNameGenerator()); } + + /** + * Initialize the trie with strings from the provided ontology and use the given entity name generator + * for generating alternative words. + * + * @param entityTextRetriever the text retriever to use + * @param ontology the ontology to get strings from + * @param nameGenerator the name generator to use for generating alternative words + */ + public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology, + NameGenerator nameGenerator) { + this.entityTextRetriever = entityTextRetriever; + buildTrie(ontology, nameGenerator); + } - public void buildTrie(OWLOntology ontology) { + public void buildTrie(OWLOntology ontology, NameGenerator nameGenerator) { this.trie = new PrefixTrie<Set<Entity>>(); Map<Entity, Set<String>> relevantText = entityTextRetriever.getRelevantText(ontology); for (Entity entity : relevantText.keySet()) { + for (String text : relevantText.get(entity)) { - addEntry(text, entity); - // Adds also composing words, e.g. for "has child", "has" and "child" are also added - if (text.contains(" ")) { - for (String subtext : text.split(" ")) { - addEntry(subtext, entity); - //System.out.println("trie.add("+subtext+","++")"); - } - } - } - } + text = StringUtils.join(LinguisticUtil.getWordsFromCamelCase(text), " "); + text = StringUtils.join(LinguisticUtil.getWordsFromUnderscored(text), " "); + if (text.trim().isEmpty()) { + continue; + } + addEntry(text, entity); + for (String alternativeText : nameGenerator.getAlternativeText(text)) { +// System.out.println("New alternative text for " + text + " --> " + alternativeText); + addEntry(alternativeText, entity); + } + // Adds also composing words, e.g. for "has child", "has" and "child" are also added + if (text.contains(" ")) { + for (String subtext : text.split(" ")) { + addEntry(subtext, entity); + for (String alternativeText : nameGenerator.getAlternativeText(subtext)) { +// System.out.println("New alternative text for " + subtext + " --> " + alternativeText); + addEntry(alternativeText, entity); + } + //System.out.println("trie.add("+subtext+","++")"); + } + } + } + } } @Override @@ -62,7 +97,7 @@ public String toString() { String output = ""; Map<String,Set<Entity>> trieMap = trie.toMap(); - List<String> termsList = new ArrayList(trieMap.keySet()); + List<String> termsList = new ArrayList<String>(trieMap.keySet()); Collections.sort(termsList); for (String key : termsList) { output += key + ":\n"; @@ -78,4 +113,68 @@ } + public static interface NameGenerator { + /** + * Returns a list of possible alternative words for the given word + * + * @param text the text to return alternative words for + * @return alternative words for given word + */ + List<String> getAlternativeText(String text); + } + + public static class DummyNameGenerator implements NameGenerator { + @Override + public List<String> getAlternativeText(String word) { + return Collections.singletonList(word); + } + } + + /** + * Generates alternative texts by using WordNet synonyms. + */ + public static class WordNetNameGenerator implements NameGenerator { + private int maxNumberOfSenses = 5; + + /** + * Sets up the generator for returning the lemmas of the top {@code maxNumberOfSenses} senses. + * @param maxNumberOfSenses the maximum number of senses to aggregate word lemmas from + */ + public WordNetNameGenerator(int maxNumberOfSenses) { + this.maxNumberOfSenses = maxNumberOfSenses; + } + + @Override + public List<String> getAlternativeText(String word) { + return Arrays.asList(LinguisticUtil.getTopSynonymsForWord(word, maxNumberOfSenses)); + } + } + + /** + * Generates alternative texts by using WordNet synonym and lemmatizing of the original words + */ + public static class LemmatizingWordNetNameGenerator implements NameGenerator { + private int maxNumberOfSenses = 5; + + /** + * Sets up the generator for returning the lemmas of the top {@code maxNumberOfSenses} senses. + * @param maxNumberOfSenses the maximum number of senses to aggregate word lemmas from + */ + public LemmatizingWordNetNameGenerator(int maxNumberOfSenses) { + this.maxNumberOfSenses = maxNumberOfSenses; + } + + @Override + public List<String> getAlternativeText(String word) { + ArrayList<String> res = new ArrayList<String>(); + res.add(LinguisticUtil.getNormalizedForm(word)); + + for (String w : LinguisticUtil + .getTopSynonymsForWord(LinguisticUtil.getNormalizedForm(word), maxNumberOfSenses)) { + res.add(w.replaceAll("_", " ")); + } + + return res; + } + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-09-06 10:01:53 UTC (rev 4092) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-09-06 11:36:33 UTC (rev 4093) @@ -5,36 +5,36 @@ /** * Annotates a document using a prefix trie + * * @author Andre Melo - * */ public class TrieLinguisticAnnotator implements LinguisticAnnotator { - - EntityCandidatesTrie candidatesTrie; - - public TrieLinguisticAnnotator(EntityCandidatesTrie candidatesTrie) { - this.candidatesTrie = candidatesTrie; - } - - /** - * Generates annotation based on trie's longest matching strings - * @param document - * @return - */ - @Override - public Set<Annotation> annotate(Document document) { - String content = document.getContent(); - Set<Annotation> annotations = new HashSet<Annotation>(); - for (int i=0; i<content.length(); i++) { - String unparsed = content.substring(i); - String match = candidatesTrie.getLongestMatch(unparsed); - if (match!=null && !match.isEmpty()) { - Annotation annotation = new Annotation(document, i, match.length()); - annotations.add(annotation); - i += match.length()-1; - } - } - return annotations; - } + EntityCandidatesTrie candidatesTrie; + public TrieLinguisticAnnotator(EntityCandidatesTrie candidatesTrie) { + this.candidatesTrie = candidatesTrie; + } + + /** + * Generates annotation based on trie's longest matching strings + * + * @param document the document to get annotations for + * @return the set of annotation for the given document + */ + @Override + public Set<Annotation> annotate(Document document) { + String content = document.getContent(); + Set<Annotation> annotations = new HashSet<Annotation>(); + for (int i = 0; i < content.length(); i++) { + String unparsed = content.substring(i); + String match = candidatesTrie.getLongestMatch(unparsed); + if (match != null && !match.isEmpty()) { + Annotation annotation = new Annotation(document, i, match.length()); + annotations.add(annotation); + i += match.length() - 1; + } + } + return annotations; + } + } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-06 10:01:53 UTC (rev 4092) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-06 11:36:33 UTC (rev 4093) @@ -30,7 +30,8 @@ */ public SimpleSemanticIndex(OWLOntology ontology, SyntacticIndex syntacticIndex) { super(ontology); - SimpleEntityCandidatesTrie trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), ontology); + SimpleEntityCandidatesTrie trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), + ontology, new SimpleEntityCandidatesTrie.LemmatizingWordNetNameGenerator(5)); // trie.printTrie(); setSemanticAnnotator(new SemanticAnnotator( new SimpleWordSenseDisambiguation(ontology), This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |