From: <dfl...@us...> - 2013-10-24 13:56:30
|
Revision: 4130 http://sourceforge.net/p/dl-learner/code/4130 Author: dfleischhacker Date: 2013-10-24 13:56:26 +0000 (Thu, 24 Oct 2013) Log Message: ----------- Get the ISLE pipeline working * Ability to resolve match in trie to the producing string (the pre-wordnet one) * Add NormalizedTextMapper for mapping normalized words to their original documents * Activate structure based WSD Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/FullTokenEntitySetPair.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-10-24 13:47:58 UTC (rev 4129) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-10-24 13:56:26 UTC (rev 4130) @@ -1,11 +1,9 @@ package org.dllearner.algorithms.isle.index; -import java.util.Map.Entry; +import org.dllearner.core.owl.Entity; + import java.util.Set; -import org.dllearner.core.owl.Entity; -import org.dllearner.utilities.datastructures.PrefixTrie; - public interface EntityCandidatesTrie { /** @@ -22,14 +20,22 @@ * @return */ public Set<Entity> getCandidateEntities(String s); - - + + /** - * Gets the longest matching string - * @param s - * @return + * Returns the string on which this entry is based on. This is used e.g. for storing the original + * ontology string when the parameter string has been added to the trie after generation by using + * WordNet or other additional methods. + * + * @param s the string to search in the trie + * @return string generating the path of the longest match in the trie */ - public String getLongestMatch(String s); - - + public String getGeneratingStringForLongestMatch(String s); + + /** + * Gets the longest matching string + * @param s + * @return + */ + public String getLongestMatchingText(String s); } Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/FullTokenEntitySetPair.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/FullTokenEntitySetPair.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/FullTokenEntitySetPair.java 2013-10-24 13:56:26 UTC (rev 4130) @@ -0,0 +1,31 @@ +package org.dllearner.algorithms.isle.index; + +import org.dllearner.core.owl.Entity; + +import java.util.HashSet; +import java.util.Set; + +/** + * A pair consisting of a full string token and the corresponding entities + */ +public class FullTokenEntitySetPair { + private String fullToken; + private Set<Entity> entitySet; + + public FullTokenEntitySetPair(String fullToken) { + this.fullToken = fullToken; + this.entitySet = new HashSet<Entity>(); + } + + public String getFullToken() { + return fullToken; + } + + public Set<Entity> getEntitySet() { + return entitySet; + } + + public void addEntity(Entity entity) { + entitySet.add(entity); + } +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java 2013-10-24 13:56:26 UTC (rev 4130) @@ -0,0 +1,140 @@ +package org.dllearner.algorithms.isle.index; + +import java.util.ArrayList; + +/** + * Provides text normalization and mapping of normalized ranges to the original ones. + */ +public class NormalizedTextMapper { + private Document originalDocument; + private String originalText; + private String normalizedText; + + private ArrayList<OccurenceMappingPair> normalizedIndexToOriginalIndex; + + public NormalizedTextMapper(Document original) { + this.originalDocument = original; + this.originalText = original.getContent(); + this.normalizedIndexToOriginalIndex = new ArrayList<OccurenceMappingPair>(); + + StringBuilder sb = new StringBuilder(); + int currentOriginalIndex = 0; + for (String originalWord : originalText.split(" ")) { + String normalizedWord = getNormalizedWord(originalWord); + normalizedIndexToOriginalIndex + .add(new OccurenceMappingPair(currentOriginalIndex, originalWord.length(), sb.length(), + normalizedWord.length())); + currentOriginalIndex += originalWord.length() + 1; + sb.append(normalizedWord); + sb.append(" "); + } + normalizedText = sb.toString(); + } + + public String getOriginalText() { + return originalText; + } + + public String getNormalizedText() { + return normalizedText; + } + + /** + * Returns the annotation for the original text matching the given position and length in the normalized + * text. + * + * @param position position in the normalized text to get annotation for + * @param length length of the text to get annotation for + * @return + */ + public Annotation getOriginalAnnotationForPosition(int position, int length) { + int curNormalizedLength = 0; + int originalStart = -1; + int curOriginalLength = 0; + + for (OccurenceMappingPair p : normalizedIndexToOriginalIndex) { + if (p.getNormalizedIndex() == position) { + originalStart = p.getOriginalIndex(); + } + if (originalStart != -1) { + curNormalizedLength += p.getNormalizedLength(); + curOriginalLength += p.getOriginalLength(); + if (curNormalizedLength >= length) { + return new Annotation(originalDocument, originalStart, curOriginalLength); + } + + // include space + curNormalizedLength += 1; + curOriginalLength += 1; + } + } + + return null; + } + + /** + * Returns the normalized form of the given word. Word must not contain any spaces or the like. + * @param word + * @return + */ + private String getNormalizedWord(String word) { + return LinguisticUtil.getInstance().getNormalizedForm(word); + } + + public static void main(String[] args) { + NormalizedTextMapper n = new NormalizedTextMapper(new TextDocument("This is a testing text using letters")); + System.out.println(n.getOriginalText()); + System.out.println(n.getNormalizedText()); + for (OccurenceMappingPair p : n.normalizedIndexToOriginalIndex) { + System.out.println(p); + } + System.out.println(n.getOriginalAnnotationForPosition(7,6)); + System.out.println(n.getOriginalAnnotationForPosition(23,6)); + System.out.println(n.getOriginalAnnotationForPosition(7,1)); + System.out.println(n.getOriginalAnnotationForPosition(14,15)); + } + + /** + * Maps words identified by index and length in the normalized texts to the original word. + */ + private class OccurenceMappingPair { + private int originalIndex; + private int originalLength; + private int normalizedIndex; + private int normalizedLength; + + private OccurenceMappingPair(int originalIndex, int originalLength, int normalizedIndex, int normalizedLength) { + + this.originalIndex = originalIndex; + this.originalLength = originalLength; + this.normalizedIndex = normalizedIndex; + this.normalizedLength = normalizedLength; + } + + private int getNormalizedIndex() { + return normalizedIndex; + } + + private int getNormalizedLength() { + return normalizedLength; + } + + private int getOriginalLength() { + return originalLength; + } + + private int getOriginalIndex() { + return originalIndex; + } + + @Override + public String toString() { + return "OccurenceMappingPair{" + + "originalIndex=" + originalIndex + + ", originalLength=" + originalLength + + ", normalizedIndex=" + normalizedIndex + + ", normalizedLength=" + normalizedLength + + '}'; + } + } +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-10-24 13:47:58 UTC (rev 4129) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-10-24 13:56:26 UTC (rev 4130) @@ -10,7 +10,7 @@ public class SimpleEntityCandidatesTrie implements EntityCandidatesTrie { - PrefixTrie<Set<Entity>> trie; + PrefixTrie<FullTokenEntitySetPair> trie; EntityTextRetriever entityTextRetriever; // /** @@ -39,7 +39,7 @@ } public void buildTrie(OWLOntology ontology, NameGenerator nameGenerator) { - this.trie = new PrefixTrie<Set<Entity>>(); + this.trie = new PrefixTrie<FullTokenEntitySetPair>(); Map<Entity, Set<String>> relevantText = entityTextRetriever.getRelevantText(ontology); for (Entity entity : relevantText.keySet()) { @@ -55,7 +55,7 @@ addSubsequencesWordNet(entity, text); for (String alternativeText : nameGenerator.getAlternativeText(text)) { - addEntry(alternativeText, entity); + addEntry(alternativeText, entity, text); } } } @@ -136,37 +136,55 @@ @Override public void addEntry(String s, Entity e) { - Set<Entity> candidates; + FullTokenEntitySetPair candidates; if (trie.contains(s)) candidates = trie.get(s); else - candidates = new HashSet<Entity>(); + candidates = new FullTokenEntitySetPair(s); - candidates.add(e); + candidates.addEntity(e); trie.put(s, candidates); } + public void addEntry(String s, Entity e, String originalString) { + FullTokenEntitySetPair candidates; + if (trie.contains(s)) + candidates = trie.get(s); + else + candidates = new FullTokenEntitySetPair(originalString); + + candidates.addEntity(e); + + trie.put(s, candidates); + } + @Override public Set<Entity> getCandidateEntities(String s) { - Set<Entity> res = trie.get(s); - return res == null ? new HashSet<Entity>() : trie.get(s); + FullTokenEntitySetPair res = trie.get(s); + return res == null ? new HashSet<Entity>() : trie.get(s).getEntitySet(); } @Override - public String getLongestMatch(String s) { + public String getGeneratingStringForLongestMatch(String s) { CharSequence match = trie.getLongestMatch(s); - return (match!=null) ? match.toString() : null; + return (match!=null) ? trie.get(match).getFullToken() : null; } + + @Override + public String getLongestMatchingText(String s) { + CharSequence match = trie.getLongestMatch(s); + return (match!=null) ? match.toString() : null; + } public String toString() { String output = ""; - Map<String,Set<Entity>> trieMap = trie.toMap(); + Map<String,FullTokenEntitySetPair> trieMap = trie.toMap(); List<String> termsList = new ArrayList<String>(trieMap.keySet()); Collections.sort(termsList); for (String key : termsList) { output += key + ":\n"; - for (Entity candidate: trieMap.get(key)) { + for (Entity candidate: trieMap.get(key).getEntitySet()) { output += "\t"+candidate+"\n"; } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-10-24 13:47:58 UTC (rev 4129) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-10-24 13:56:26 UTC (rev 4130) @@ -25,24 +25,23 @@ */ @Override public Set<Annotation> annotate(Document document) { - String content = document.getContent(); Set<Annotation> annotations = new HashSet<Annotation>(); + NormalizedTextMapper mapper = new NormalizedTextMapper(document); + String content = mapper.getNormalizedText(); for (int i = 0; i < content.length(); i++) { if (Character.isWhitespace(content.charAt(i))) { continue; } String unparsed = content.substring(i); - if (normalizeWords) { - unparsed = LinguisticUtil.getInstance().getNormalizedForm(unparsed); - } - String match = candidatesTrie.getLongestMatch(unparsed); + String match = candidatesTrie.getLongestMatchingText(unparsed); if (match != null && !match.isEmpty()) { - - //TODO: here we are losing the original offset and index... - Annotation annotation = new Annotation(document, i, match.length()); + Annotation annotation = mapper.getOriginalAnnotationForPosition(i, match.length()); annotations.add(annotation); i += match.length() - 1; } + while (!Character.isWhitespace(content.charAt(i)) && i < content.length()) { + i++; + } } return annotations; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-10-24 13:47:58 UTC (rev 4129) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-10-24 13:56:26 UTC (rev 4130) @@ -10,7 +10,8 @@ import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex; import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; -import org.dllearner.algorithms.isle.wsd.SimpleWordSenseDisambiguation; +import org.dllearner.algorithms.isle.wsd.StructureBasedWordSenseDisambiguation; +import org.dllearner.algorithms.isle.wsd.WindowBasedContextExtractor; import org.semanticweb.owlapi.model.OWLOntology; /** @@ -56,7 +57,7 @@ TrieLinguisticAnnotator linguisticAnnotator = new TrieLinguisticAnnotator(trie); linguisticAnnotator.setNormalizeWords(useWordNormalization); setSemanticAnnotator(new SemanticAnnotator( - new SimpleWordSenseDisambiguation(ontology), + new StructureBasedWordSenseDisambiguation(new WindowBasedContextExtractor(), ontology), new TrieEntityCandidateGenerator(ontology, trie), linguisticAnnotator)); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |