From: <dfl...@us...> - 2013-12-09 14:40:08
|
Revision: 4195
http://sourceforge.net/p/dl-learner/code/4195
Author: dfleischhacker
Date: 2013-12-09 14:40:04 +0000 (Mon, 09 Dec 2013)

Log Message:
-----------
WordNet alternative forms

Modified Paths:
--------------
trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java
trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java
trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java
trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java
trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java
trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-12-09 14:36:38 UTC (rev 4194)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-12-09 14:40:04 UTC (rev 4195)
@@ -97,6 +97,20 @@
 }

 /**
+ * Returns an array of all synonyms for the given word. Only synonyms for the POS in {@link #RELEVANT_POS} are
+ * returned.
+ *
+ * @param word the word to retrieve synonyms for
+ * @return synonyms for the given word
+ */
+ public String[] getSynonymsForWord(String word, POS pos) {
+ ArrayList<String> synonyms = new ArrayList<String>();
+
+ synonyms.addAll(wn.getAllSynonyms(pos, word));
+ return synonyms.toArray(new String[synonyms.size()]);
+ }
+
+ /**
 * Returns an array of the lemmas of the top {@code n} synonyms for the given word. Only synonyms for the POS in
 * {@link #RELEVANT_POS} are returned.
* Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-09 14:36:38 UTC (rev 4194) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-09 14:40:04 UTC (rev 4195) @@ -1,9 +1,8 @@ package org.dllearner.algorithms.isle.index; -import org.apache.commons.lang.StringUtils; +import net.didion.jwnl.data.POS; import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever; import org.dllearner.core.owl.Entity; -import org.dllearner.utilities.datastructures.PrefixTrie; import org.semanticweb.owlapi.model.OWLOntology; import java.util.*; @@ -11,7 +10,6 @@ public class SimpleEntityCandidatesTrie implements EntityCandidatesTrie { TokenTree tree; - PrefixTrie<FullTokenEntitySetPair> trie; EntityTextRetriever entityTextRetriever; // /** @@ -31,15 +29,13 @@ * * @param entityTextRetriever the text retriever to use * @param ontology the ontology to get strings from - * @param nameGenerator the name generator to use for generating alternative words */ - public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology, - NameGenerator nameGenerator) { + public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology) { this.entityTextRetriever = entityTextRetriever; - buildTrie(ontology, nameGenerator); + buildTrie(ontology); } - public void buildTrie(OWLOntology ontology, NameGenerator nameGenerator) { + public void buildTrie(OWLOntology ontology) { this.tree = new TokenTree(); Map<Entity, Set<List<Token>>> entity2TokenSet = entityTextRetriever.getRelevantText(ontology); @@ -48,12 +44,9 @@ Entity entity = entry.getKey(); Set<List<Token>> tokenSet = entry.getValue(); for (List<Token> tokens : tokenSet) { + 
addAlternativeFormsFromWordNet(tokens); addEntry(tokens, entity); addSubsequences(entity, tokens); -// addSubsequencesWordNet(entity, text); -// for (String alternativeText : nameGenerator.getAlternativeText(text)) { -// addEntry(alternativeText.toLowerCase(), entity, text); -// } } } } @@ -76,65 +69,33 @@ } } -// private void addSubsequencesWordNet(Entity entity, String text) { -// if (text.contains(" ")) { -// String[] tokens = text.split(" "); -// -// List<String>[] wordnetTokens = (ArrayList<String>[]) new ArrayList[tokens.length]; -// -// // generate list of lemmatized wordnet synonyms for each token -// for (int i = 0; i < tokens.length; i++) { -// wordnetTokens[i] = new ArrayList<String>(); -// wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(tokens[i].toLowerCase())); -// for (String w : LinguisticUtil.getInstance().getTopSynonymsForWord(tokens[i], 5)) { -// wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(w).toLowerCase()); -// } -// } -// -// // generate subsequences starting at the given start index of the given size -// Set<String[]> allPossibleSubsequences = getAllPossibleSubsequences(tokens, wordnetTokens); -// -// for (String[] s : allPossibleSubsequences) { -// addEntry(s[0], entity, s[1]); -// } -// } -// } + private void addAlternativeFormsFromWordNet(List<Token> tokens) { + for (Token t : tokens) { + POS wordnetPos = null; + String posTag = t.getPOSTag(); + if (posTag.startsWith("N")) {//nouns + wordnetPos = POS.NOUN; + } + else if (posTag.startsWith("V")) {//verbs + wordnetPos = POS.VERB; + } + else if (posTag.startsWith("J")) {//adjectives + wordnetPos = POS.ADJECTIVE; + } + else if (posTag.startsWith("R")) {//adverbs + wordnetPos = POS.ADVERB; + } + if (wordnetPos == null) { + continue; + } + String[] synonyms = LinguisticUtil.getInstance().getSynonymsForWord(t.getRawForm(), wordnetPos); - private static Set<String[]> getAllPossibleSubsequences(String[] originalTokens, List<String>[] wordnetTokens) { - 
ArrayList<String[]> res = new ArrayList<String[]>(); - - for (int size = 1; size < wordnetTokens.length + 1; size++) { - for (int start = 0; start < wordnetTokens.length - size + 1; start++) { - getPossibleSubsequencesRec(originalTokens, res, new ArrayList<String>(), new ArrayList<String>(), - wordnetTokens, 0, size); + for (String synonym : synonyms) { + t.addAlternativeForm(LinguisticUtil.getInstance().getNormalizedForm(synonym)); } } - - return new HashSet<String[]>(res); } - - private static void getPossibleSubsequencesRec(String[] originalTokens, List<String[]> allSubsequences, - List<String> currentSubsequence, - List<String> currentOriginalSubsequence, - List<String>[] wordnetTokens, - int curStart, int maxLength) { - - if (currentSubsequence.size() == maxLength) { - allSubsequences.add(new String[]{StringUtils.join(currentSubsequence, " ").toLowerCase(), StringUtils - .join(currentOriginalSubsequence, " ").toLowerCase()}); - return; - } - for (String w : wordnetTokens[curStart]) { - ArrayList<String> tmpSequence = new ArrayList<String>(currentSubsequence); - ArrayList<String> tmpOriginalSequence = new ArrayList<String>(currentOriginalSubsequence); - tmpSequence.add(w); - tmpOriginalSequence.add(originalTokens[curStart]); - getPossibleSubsequencesRec(originalTokens, allSubsequences, tmpSequence, tmpOriginalSequence, wordnetTokens, - curStart + 1, maxLength); - } - } - @Override public void addEntry(List<Token> s, Entity e) { tree.add(s, e); @@ -177,111 +138,10 @@ wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(w).replaceAll("_", " ")); } } - - // generate subsequences starting at the given start index of the given size - Set<String[]> allPossibleSubsequences = getAllPossibleSubsequences(tokens, wordnetTokens); - - for (String[] s : allPossibleSubsequences) { - System.out.println(String.format("%s - %s", s[0], s[1])); - } } public void printTrie() { System.out.println(this.toString()); - + } - - public static interface NameGenerator { - 
/** - * Returns a list of possible alternative words for the given word - * - * @param text the text to return alternative words for - * @return alternative words for given word - */ - List<String> getAlternativeText(String text); - } - - public static class DummyNameGenerator implements NameGenerator { - @Override - public List<String> getAlternativeText(String word) { - return Collections.singletonList(word); - } - } - - /** - * Generates alternative texts by using WordNet synonyms. - */ - public static class WordNetNameGenerator implements NameGenerator { - private int maxNumberOfSenses = 5; - - /** - * Sets up the generator for returning the lemmas of the top {@code maxNumberOfSenses} senses. - * @param maxNumberOfSenses the maximum number of senses to aggregate word lemmas from - */ - public WordNetNameGenerator(int maxNumberOfSenses) { - this.maxNumberOfSenses = maxNumberOfSenses; - } - - @Override - public List<String> getAlternativeText(String word) { - return Arrays.asList(LinguisticUtil.getInstance().getTopSynonymsForWord(word, maxNumberOfSenses)); - } - } - - /** - * Generates alternative texts by using WordNet synonym and lemmatizing of the original words - */ - public static class LemmatizingWordNetNameGenerator implements NameGenerator { - private int maxNumberOfSenses = 5; - - /** - * Sets up the generator for returning the lemmas of the top {@code maxNumberOfSenses} senses. 
- * @param maxNumberOfSenses the maximum number of senses to aggregate word lemmas from - */ - public LemmatizingWordNetNameGenerator(int maxNumberOfSenses) { - this.maxNumberOfSenses = maxNumberOfSenses; - } - - @Override - public List<String> getAlternativeText(String word) { - ArrayList<String> res = new ArrayList<String>(); - res.add(LinguisticUtil.getInstance().getNormalizedForm(word)); - - for (String w : LinguisticUtil.getInstance().getTopSynonymsForWord(word, maxNumberOfSenses)) { - res.add(LinguisticUtil.getInstance().getNormalizedForm(w.replaceAll("_", " "))); - } - - return res; - } - } - - /** - * Pair of the actual word and the word after processing. - */ - public static class ActualModifiedWordPair { - private String actualString; - private String modifiedString; - - public String getActualString() { - return actualString; - } - - public void setActualString(String actualString) { - this.actualString = actualString; - } - - public String getModifiedString() { - return modifiedString; - } - - public void setModifiedString(String modifiedString) { - this.modifiedString = modifiedString; - } - - public ActualModifiedWordPair(String actualString, String modifiedString) { - - this.actualString = actualString; - this.modifiedString = modifiedString; - } - } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-09 14:36:38 UTC (rev 4194) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-09 14:40:04 UTC (rev 4195) @@ -3,13 +3,13 @@ */ package org.dllearner.algorithms.isle.index; +import com.google.common.collect.ComparisonChain; + import java.io.Serializable; import java.util.Collections; import java.util.HashSet; import java.util.Set; -import com.google.common.collect.ComparisonChain; - /** * @author Lorenz 
Buehmann * Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-09 14:36:38 UTC (rev 4194) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-09 14:40:04 UTC (rev 4195) @@ -13,13 +13,13 @@ * @author Daniel Fleischhacker */ public class TokenTree { - private HashMap<Token, TokenTree> children; + private LinkedHashMap<Token, TokenTree> children; private Set<Entity> entities; private List<Token> originalTokens; private boolean ignoreStopWords = true; public TokenTree() { - this.children = new HashMap<>(); + this.children = new LinkedHashMap<>(); this.entities = new HashSet<>(); this.originalTokens = new ArrayList<>(); } @@ -73,7 +73,7 @@ } /** - * Returns the set of entities located by the given list of tokens. + * Returns the set of entities located by the given list of tokens. This method does not consider alternative forms. * * @param tokens tokens to locate the information to get * @return located set of entities or null if token sequence not contained in tree @@ -101,7 +101,7 @@ TokenTree curNode = this; for (Token t : tokens) { - TokenTree nextNode = curNode.children.get(t); + TokenTree nextNode = getNextTokenTree(curNode, t); if (nextNode == null) { return fallbackTokenList; } @@ -111,6 +111,19 @@ return fallbackTokenList; } + private TokenTree getNextTokenTree(TokenTree current, Token t) { + TokenTree next = current.children.get(t); + if (next != null) { + return next; + } + for (Map.Entry<Token, TokenTree> child : current.children.entrySet()) { + if (child.getKey().equalsWithAlternativeForms(t)) { + return child.getValue(); + } + } + return null; + } + /** * Returns the set of entities assigned to the longest matching token subsequence of the given token sequence. 
* @param tokens token sequence to search for longest match @@ -121,7 +134,7 @@ TokenTree curNode = this; for (Token t : tokens) { - TokenTree nextNode = curNode.children.get(t); + TokenTree nextNode = getNextTokenTree(curNode, t); if (nextNode == null) { return fallback == null ? null : fallback.entities; } @@ -142,7 +155,7 @@ TokenTree curNode = this; for (Token t : tokens) { - TokenTree nextNode = curNode.children.get(t); + TokenTree nextNode = getNextTokenTree(curNode, t); if (nextNode == null) { return fallback == null ? null : fallback.originalTokens; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java 2013-12-09 14:36:38 UTC (rev 4194) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java 2013-12-09 14:40:04 UTC (rev 4195) @@ -1,38 +1,22 @@ package org.dllearner.algorithms.isle.index.semantic; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.util.HashSet; -import java.util.Set; - +import com.google.common.hash.HashCode; +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; import org.apache.log4j.Logger; import org.dllearner.algorithms.isle.EntityCandidateGenerator; import org.dllearner.algorithms.isle.TextDocumentGenerator; -import org.dllearner.algorithms.isle.index.AnnotatedDocument; -import org.dllearner.algorithms.isle.index.LinguisticAnnotator; -import org.dllearner.algorithms.isle.index.SemanticAnnotator; -import org.dllearner.algorithms.isle.index.SimpleEntityCandidatesTrie; -import org.dllearner.algorithms.isle.index.TextDocument; -import 
org.dllearner.algorithms.isle.index.TrieEntityCandidateGenerator; -import org.dllearner.algorithms.isle.index.TrieLinguisticAnnotator; +import org.dllearner.algorithms.isle.index.*; import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; import org.dllearner.algorithms.isle.wsd.StructureBasedWordSenseDisambiguation; import org.dllearner.algorithms.isle.wsd.WindowBasedContextExtractor; import org.dllearner.algorithms.isle.wsd.WordSenseDisambiguation; import org.dllearner.core.owl.Entity; -import org.semanticweb.owlapi.model.OWLAnnotation; -import org.semanticweb.owlapi.model.OWLAnnotationProperty; -import org.semanticweb.owlapi.model.OWLEntity; -import org.semanticweb.owlapi.model.OWLLiteral; -import org.semanticweb.owlapi.model.OWLOntology; +import org.semanticweb.owlapi.model.*; -import com.google.common.hash.HashCode; -import com.google.common.hash.HashFunction; -import com.google.common.hash.Hashing; +import java.io.*; +import java.util.HashSet; +import java.util.Set; /** * Interface for an index which is able to resolve a given entity's URI to the set of documents containing @@ -86,14 +70,8 @@ public static SemanticIndex generateIndex(Set<String> documents, OWLOntology ontology, boolean useWordNormalization){ SimpleEntityCandidatesTrie trie; - if (useWordNormalization) { - trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), - ontology, new SimpleEntityCandidatesTrie.LemmatizingWordNetNameGenerator(5)); - } - else { - trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), - ontology, new SimpleEntityCandidatesTrie.DummyNameGenerator()); - } + trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), + ontology); trie.printTrie(); TrieLinguisticAnnotator linguisticAnnotator = new TrieLinguisticAnnotator(trie); @@ -142,7 +120,10 @@ logger.info("Creating semantic index..."); SemanticIndex index = new SemanticIndex(); for (String document : documents) { - 
TextDocument textDocument = TextDocumentGenerator.getInstance().generateDocument(document); + if (document.isEmpty()) { + continue; + } + TextDocument textDocument = TextDocumentGenerator.getInstance().generateDocument(document); logger.debug("Processing document:" + textDocument); AnnotatedDocument annotatedDocument = semanticAnnotator.processDocument(textDocument); for (Entity entity : annotatedDocument.getContainedEntities()) { Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-12-09 14:36:38 UTC (rev 4194) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-12-09 14:40:04 UTC (rev 4195) @@ -3,26 +3,11 @@ */ package org.dllearner.algorithms.isle; -import java.io.File; -import java.io.IOException; -import java.net.URL; -import java.text.DecimalFormat; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - +import com.google.common.base.Charsets; +import com.google.common.base.Joiner; +import com.google.common.io.Files; import org.dllearner.algorithms.celoe.CELOE; -import org.dllearner.algorithms.isle.index.AnnotatedDocument; -import org.dllearner.algorithms.isle.index.EntityCandidatesTrie; -import org.dllearner.algorithms.isle.index.LinguisticAnnotator; -import org.dllearner.algorithms.isle.index.RemoteDataProvider; -import org.dllearner.algorithms.isle.index.SemanticAnnotator; -import org.dllearner.algorithms.isle.index.SimpleEntityCandidatesTrie; -import org.dllearner.algorithms.isle.index.TextDocument; -import org.dllearner.algorithms.isle.index.Token; -import org.dllearner.algorithms.isle.index.TrieEntityCandidateGenerator; -import org.dllearner.algorithms.isle.index.TrieLinguisticAnnotator; +import org.dllearner.algorithms.isle.index.*; import 
org.dllearner.algorithms.isle.index.semantic.SemanticIndex; import org.dllearner.algorithms.isle.index.semantic.SemanticIndexGenerator; import org.dllearner.algorithms.isle.metrics.PMIRelevanceMetric; @@ -43,17 +28,17 @@ import org.junit.Before; import org.junit.Test; import org.semanticweb.owlapi.apibinding.OWLManager; -import org.semanticweb.owlapi.model.IRI; -import org.semanticweb.owlapi.model.OWLDataFactory; -import org.semanticweb.owlapi.model.OWLEntity; -import org.semanticweb.owlapi.model.OWLOntology; -import org.semanticweb.owlapi.model.OWLOntologyManager; - +import org.semanticweb.owlapi.model.*; import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl; -import com.google.common.base.Charsets; -import com.google.common.base.Joiner; -import com.google.common.io.Files; +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.text.DecimalFormat; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; /** * Some tests for the ISLE algorithm. 
@@ -184,8 +169,7 @@

 @Test
 public void testEntityLinkingWithLemmatizing() throws Exception {
- EntityCandidatesTrie ect = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), ontology,
- new SimpleEntityCandidatesTrie.LemmatizingWordNetNameGenerator(5));
+ EntityCandidatesTrie ect = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), ontology);
 LinguisticAnnotator linguisticAnnotator = new TrieLinguisticAnnotator(ect);
 WordSenseDisambiguation wsd = new SimpleWordSenseDisambiguation(ontology);
 EntityCandidateGenerator ecg = new TrieEntityCandidateGenerator(ontology, ect);
@@ -200,8 +184,7 @@

 @Test
 public void testEntityLinkingWithSimpleStringMatching() throws Exception {
- EntityCandidatesTrie ect = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), ontology,
- new SimpleEntityCandidatesTrie.DummyNameGenerator());
+ EntityCandidatesTrie ect = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), ontology);
 TrieLinguisticAnnotator linguisticAnnotator = new TrieLinguisticAnnotator(ect);
 linguisticAnnotator.setNormalizeWords(false);
 WordSenseDisambiguation wsd = new SimpleWordSenseDisambiguation(ontology);

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |