You can subscribe to this list here.
2007 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
(120) |
Sep
(36) |
Oct
(116) |
Nov
(17) |
Dec
(44) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2008 |
Jan
(143) |
Feb
(192) |
Mar
(74) |
Apr
(84) |
May
(105) |
Jun
(64) |
Jul
(49) |
Aug
(120) |
Sep
(159) |
Oct
(156) |
Nov
(51) |
Dec
(28) |
2009 |
Jan
(17) |
Feb
(55) |
Mar
(33) |
Apr
(57) |
May
(54) |
Jun
(28) |
Jul
(6) |
Aug
(16) |
Sep
(38) |
Oct
(30) |
Nov
(26) |
Dec
(52) |
2010 |
Jan
(7) |
Feb
(91) |
Mar
(65) |
Apr
(2) |
May
(14) |
Jun
(25) |
Jul
(38) |
Aug
(48) |
Sep
(80) |
Oct
(70) |
Nov
(75) |
Dec
(77) |
2011 |
Jan
(68) |
Feb
(53) |
Mar
(51) |
Apr
(35) |
May
(65) |
Jun
(101) |
Jul
(29) |
Aug
(230) |
Sep
(95) |
Oct
(49) |
Nov
(110) |
Dec
(63) |
2012 |
Jan
(41) |
Feb
(42) |
Mar
(25) |
Apr
(46) |
May
(51) |
Jun
(44) |
Jul
(45) |
Aug
(29) |
Sep
(12) |
Oct
(9) |
Nov
(17) |
Dec
(2) |
2013 |
Jan
(12) |
Feb
(14) |
Mar
(7) |
Apr
(16) |
May
(54) |
Jun
(27) |
Jul
(11) |
Aug
(5) |
Sep
(85) |
Oct
(27) |
Nov
(37) |
Dec
(32) |
2014 |
Jan
(8) |
Feb
(29) |
Mar
(5) |
Apr
(3) |
May
(22) |
Jun
(3) |
Jul
(4) |
Aug
(3) |
Sep
|
Oct
|
Nov
|
Dec
|
From: <dfl...@us...> - 2013-12-02 15:19:09
|
Revision: 4185 http://sourceforge.net/p/dl-learner/code/4185 Author: dfleischhacker Date: 2013-12-02 15:19:06 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Add getTokensStartingAtToken without numerOfToken parameter Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-12-02 14:59:36 UTC (rev 4184) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-12-02 15:19:06 UTC (rev 4185) @@ -99,6 +99,31 @@ return tokens; } + /** + * Returns a list containing all successive tokens from this document starting at the given start + * token. If {@code ignorePunctuation} is set, tokens which represent punctuation are added to the result but not + * counted for the number of tokens. + * + * @param start token to start collecting tokens from the document + * @param ignorePunctuation if true, punctuation are not counted towards the number of tokens to return + * @return list containing all relevant tokens, depending in the value of ignorePunctuation, the + * list might contain additional non-relevant (punctuation) tokens + */ + public List<Token> getTokensStartingAtToken(Token start, boolean ignorePunctuation) { + ArrayList<Token> tokens = new ArrayList<Token>(); + + boolean found = false; + + for (int i = 0; i < this.size(); i++) { + Token t = this.get(i); + if (t == start) { + return this.subList(i, this.size()); + } + } + + return tokens; + } + private String getStringForLevel(Token t, SurfaceFormLevel l) { switch (l) { case RAW: This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-12-02 14:59:39
|
Revision: 4184 http://sourceforge.net/p/dl-learner/code/4184 Author: dfleischhacker Date: 2013-12-02 14:59:36 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Adapt to new Token implementation Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-12-02 14:52:33 UTC (rev 4183) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-12-02 14:59:36 UTC (rev 4184) @@ -2,6 +2,7 @@ import org.dllearner.core.owl.Entity; +import java.util.List; import java.util.Set; public interface EntityCandidatesTrie { @@ -11,7 +12,7 @@ * @param s * @param e */ - public void addEntry(String s, Entity e); + public void addEntry(List<Token> s, Entity e); /** Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 14:52:33 UTC (rev 4183) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 14:59:36 UTC (rev 4184) @@ -3,7 +3,6 @@ import org.apache.commons.lang.StringUtils; import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever; import org.dllearner.core.owl.Entity; -import org.dllearner.utilities.MapUtils; import org.dllearner.utilities.datastructures.PrefixTrie; import org.semanticweb.owlapi.model.OWLOntology; @@ -11,7 +10,7 @@ import java.util.Map.Entry; public class SimpleEntityCandidatesTrie implements EntityCandidatesTrie { - + TokenTree tree; PrefixTrie<FullTokenEntitySetPair> trie; EntityTextRetriever entityTextRetriever; @@ -41,7 +40,7 @@ } public void buildTrie(OWLOntology ontology, NameGenerator nameGenerator) { - this.trie = new PrefixTrie<FullTokenEntitySetPair>(); + this.tree = new TokenTree(); Map<Entity, Set<List<Token>>> entity2TokenSet = entityTextRetriever.getRelevantText(ontology); @@ -62,51 +61,45 @@ /** * Adds the subsequences of a test * @param entity - * @param text + * @param tokens */ - private void addSubsequences(Entity entity, String text) { - if (text.contains(" ")) { - String[] tokens = text.split(" "); - for (int size=1; size<tokens.length; size++) { - - for (int start=0; start<tokens.length-size+1; start++) { - String subsequence = ""; - for (int i=0; i<size; i++) { - subsequence += tokens[start+i] + " "; - } - subsequence = subsequence.trim(); - - addEntry(subsequence, entity); - } - - } - } - } - - private void addSubsequencesWordNet(Entity entity, String text) { - if (text.contains(" ")) { - String[] tokens = text.split(" "); - - List<String>[] wordnetTokens = (ArrayList<String>[]) new ArrayList[tokens.length]; - - // generate list of lemmatized wordnet synonyms for each token - for (int i = 0; i < tokens.length; i++) { - wordnetTokens[i] = new ArrayList<String>(); - wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(tokens[i].toLowerCase())); - for (String w : LinguisticUtil.getInstance().getTopSynonymsForWord(tokens[i], 5)) { - wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(w).toLowerCase()); + private void addSubsequences(Entity entity, List<Token> tokens) { + tree.add(tokens, entity); + for (int size = 1; size < tokens.size(); size++) { + for (int start = 0; start < tokens.size() - size + 1; start++) { + ArrayList<Token> subsequence = new ArrayList<>(); + for (int i = 0; i < size; i++) { + subsequence.add(tokens.get(start + i)); } + addEntry(subsequence, entity); } - - // generate subsequences starting at the given start index of the given size - Set<String[]> allPossibleSubsequences = getAllPossibleSubsequences(tokens, wordnetTokens); - - for (String[] s : allPossibleSubsequences) { - addEntry(s[0], entity, s[1]); - } } } +// private void addSubsequencesWordNet(Entity entity, String text) { +// if (text.contains(" ")) { +// String[] tokens = text.split(" "); +// +// List<String>[] wordnetTokens = (ArrayList<String>[]) new ArrayList[tokens.length]; +// +// // generate list of lemmatized wordnet synonyms for each token +// for (int i = 0; i < tokens.length; i++) { +// wordnetTokens[i] = new ArrayList<String>(); +// wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(tokens[i].toLowerCase())); +// for (String w : LinguisticUtil.getInstance().getTopSynonymsForWord(tokens[i], 5)) { +// wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(w).toLowerCase()); +// } +// } +// +// // generate subsequences starting at the given start index of the given size +// Set<String[]> allPossibleSubsequences = getAllPossibleSubsequences(tokens, wordnetTokens); +// +// for (String[] s : allPossibleSubsequences) { +// addEntry(s[0], entity, s[1]); +// } +// } +// } + private static Set<String[]> getAllPossibleSubsequences(String[] originalTokens, List<String>[] wordnetTokens) { ArrayList<String[]> res = new ArrayList<String[]>(); @@ -143,30 +136,12 @@ } @Override - public void addEntry(String s, Entity e) { - s = s.trim(); - FullTokenEntitySetPair candidates; - if (trie.contains(s)) - candidates = trie.get(s); - else - candidates = new FullTokenEntitySetPair(s); - - candidates.addEntity(e); - - trie.put(s, candidates); + public void addEntry(List<Token> s, Entity e) { + tree.add(s, e); } - public void addEntry(String s, Entity e, String originalString) { - s = s.trim(); - FullTokenEntitySetPair candidates; - if (trie.contains(s)) - candidates = trie.get(s); - else - candidates = new FullTokenEntitySetPair(originalString); - - candidates.addEntity(e); - - trie.put(s, candidates); + public void addEntry(List<Token> s, Entity e, List<Token> originalTokens) { + tree.add(s, e, originalTokens); } @Override Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 14:52:33 UTC (rev 4183) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 14:59:36 UTC (rev 4184) @@ -15,10 +15,12 @@ public class TokenTree { private HashMap<Token, TokenTree> children; private Set<Entity> entities; + private List<Token> originalTokens; public TokenTree() { this.children = new HashMap<>(); this.entities = new HashSet<>(); + this.originalTokens = new ArrayList<>(); } /** @@ -27,7 +29,7 @@ * @param tokens tokens to locate insertion point for entities * @param entities entities to add */ - public void add(List<Token> tokens, Set<Entity> entities) { + public void add(List<Token> tokens, Set<Entity> entities, List<Token> originalTokens) { TokenTree curNode = this; for (Token t : tokens) { TokenTree nextNode = curNode.children.get(t); @@ -38,8 +40,13 @@ curNode = nextNode; } curNode.entities.addAll(entities); + curNode.originalTokens = new ArrayList<>(originalTokens); } + public void add(List<Token> tokens, Set<Entity> entities) { + add(tokens, entities, tokens); + } + /** * Adds the given entity to the tree. * @@ -50,6 +57,10 @@ add(tokens, Collections.singleton(entity)); } + public void add(List<Token> tokens, Entity entity, List<Token> originalTokens) { + add(tokens, Collections.singleton(entity), originalTokens); + } + /** * Returns the set of entities located by the given list of tokens. * @@ -112,6 +123,27 @@ return fallback == null ? Collections.<Entity>emptySet() : fallback.entities; } + /** + * Returns the original token for the longest match + */ + public List<Token> getOriginalTokensForLongestMatch(List<Token> tokens) { + TokenTree fallback = this.entities.isEmpty() ? null : this; + TokenTree curNode = this; + + for (Token t : tokens) { + TokenTree nextNode = curNode.children.get(t); + if (nextNode == null) { + return fallback == null ? null : fallback.originalTokens; + } + curNode = nextNode; + if (!curNode.entities.isEmpty()) { + fallback = curNode; + } + } + + return fallback == null ? Collections.<Token>emptyList() : fallback.originalTokens; + } + public static void main(String[] args) throws Exception { List<Token> tokens1 = Lists.newLinkedList(); for (String s : Splitter.on(" ").split("this is a token tree")) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-12-02 14:52:35
|
Revision: 4183 http://sourceforge.net/p/dl-learner/code/4183 Author: lorenz_b Date: 2013-12-02 14:52:33 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Refactoring. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/EntityTextRetriever.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 14:41:21 UTC (rev 4182) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 14:52:33 UTC (rev 4183) @@ -42,26 +42,21 @@ public void buildTrie(OWLOntology ontology, NameGenerator nameGenerator) { this.trie = new PrefixTrie<FullTokenEntitySetPair>(); - Map<Entity, Set<String>> relevantText = entityTextRetriever.getRelevantText(ontology); + Map<Entity, Set<List<Token>>> entity2TokenSet = entityTextRetriever.getRelevantText(ontology); - for (Entity entity : relevantText.keySet()) { - - for (String text : relevantText.get(entity)) { - text = StringUtils.join(LinguisticUtil.getInstance().getWordsFromCamelCase(text), " "); - text = StringUtils.join(LinguisticUtil.getInstance().getWordsFromUnderscored(text), " "); - if (text.trim().isEmpty()) { - continue; - } - - addEntry(text, entity); - addSubsequencesWordNet(entity, text); - - for (String alternativeText : nameGenerator.getAlternativeText(text)) { - addEntry(alternativeText.toLowerCase(), entity, text); - } - } - } + for (Entry<Entity, Set<List<Token>>> entry : entity2TokenSet.entrySet()) { + Entity entity = entry.getKey(); + Set<List<Token>> tokenSet = entry.getValue(); + for (List<Token> tokens : tokenSet) { + addEntry(tokens, entity); + addSubsequences(entity, tokens); +// addSubsequencesWordNet(entity, text); +// for (String alternativeText : nameGenerator.getAlternativeText(text)) { +// addEntry(alternativeText.toLowerCase(), entity, text); +// } + } + } } /** Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 14:41:21 UTC (rev 4182) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 14:52:33 UTC (rev 4183) @@ -127,6 +127,15 @@ tree.add(tokens1, new NamedClass("TokenTree")); tree.add(tokens2, new NamedClass("TokenizedTree")); System.out.println(tree); + + System.out.println(tree.getEntitiesForLongestMatch(tokens1)); + System.out.println(tree.getLongestMatch(tokens1)); + + List<Token> tokens3 = Lists.newLinkedList(); + for (String s : Splitter.on(" ").split("this is a very nice tokenized tree")) { + tokens3.add(new Token(s, s, s, false, false)); + }; + System.out.println(tree.getLongestMatch(tokens3)); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-02 14:41:21 UTC (rev 4182) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-02 14:52:33 UTC (rev 4183) @@ -5,10 +5,13 @@ import java.util.HashMap; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; +import org.dllearner.algorithms.isle.TextDocumentGenerator; import org.dllearner.algorithms.isle.index.LinguisticUtil; +import org.dllearner.algorithms.isle.index.Token; import org.dllearner.core.owl.Entity; import org.dllearner.kb.OWLAPIOntology; import org.dllearner.utilities.owl.OWLAPIConverter; @@ -75,8 +78,8 @@ * @see org.dllearner.algorithms.isle.EntityTextRetriever#getRelevantText(org.dllearner.core.owl.Entity) */ @Override - public Map<String, Double> getRelevantText(Entity entity) { - Map<String, Double> textWithWeight = new HashMap<String, Double>(); + public Map<List<Token>, Double> getRelevantText(Entity entity) { + Map<List<Token>, Double> textWithWeight = new HashMap<List<Token>, Double>(); OWLEntity e = OWLAPIConverter.getOWLAPIEntity(entity); @@ -87,7 +90,7 @@ OWLLiteral val = (OWLLiteral) annotation.getValue(); if (val.hasLang(language)) { String label = val.getLiteral().trim(); - textWithWeight.put(label, weight); + textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(label), weight); } } } @@ -97,7 +100,7 @@ String shortForm = sfp.getShortForm(IRI.create(entity.getURI())); shortForm = Joiner.on(" ").join(LinguisticUtil.getInstance().getWordsFromCamelCase(shortForm)); shortForm = Joiner.on(" ").join(LinguisticUtil.getInstance().getWordsFromUnderscored(shortForm)).trim(); - textWithWeight.put(shortForm, weight); + textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(shortForm), weight); } return textWithWeight; @@ -108,8 +111,8 @@ * @return */ @Override - public Map<Entity, Set<String>> getRelevantText(OWLOntology ontology) { - Map<Entity, Set<String>> entity2RelevantText = new HashMap<Entity, Set<String>>(); + public Map<Entity, Set<List<Token>>> getRelevantText(OWLOntology ontology) { + Map<Entity, Set<List<Token>>> entity2RelevantText = new HashMap<>(); Set<OWLEntity> schemaEntities = new HashSet<OWLEntity>(); schemaEntities.addAll(ontology.getClassesInSignature()); @@ -117,7 +120,7 @@ schemaEntities.addAll(ontology.getDataPropertiesInSignature()); schemaEntities.remove(OWL_THING); - Map<String, Double> relevantText; + Map<List<Token>, Double> relevantText; for (OWLEntity owlEntity : schemaEntities) { Entity entity = OWLAPIConverter.getEntity(owlEntity); relevantText = getRelevantText(entity); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/EntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/EntityTextRetriever.java 2013-12-02 14:41:21 UTC (rev 4182) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/EntityTextRetriever.java 2013-12-02 14:52:33 UTC (rev 4183) @@ -19,9 +19,11 @@ package org.dllearner.algorithms.isle.textretrieval; +import java.util.List; import java.util.Map; import java.util.Set; +import org.dllearner.algorithms.isle.index.Token; import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; @@ -45,8 +47,8 @@ * @param entity The entity to handle. * @return A weighted set of strings. For a value x, we need to have 0 <= x <= 1. */ - public Map<String, Double> getRelevantText(Entity entity); + public Map<List<Token>, Double> getRelevantText(Entity entity); - public Map<Entity, Set<String>> getRelevantText(OWLOntology ontology); + public Map<Entity, Set<List<Token>>> getRelevantText(OWLOntology ontology); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java 2013-12-02 14:41:21 UTC (rev 4182) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java 2013-12-02 14:52:33 UTC (rev 4183) @@ -4,12 +4,14 @@ package org.dllearner.algorithms.isle.textretrieval; import java.io.File; +import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; +import org.dllearner.algorithms.isle.index.Token; import org.dllearner.core.owl.Entity; import org.dllearner.kb.OWLAPIOntology; import org.semanticweb.owlapi.apibinding.OWLManager; @@ -43,13 +45,13 @@ OWLOntology ontology = man.loadOntology(IRI.create("http://www.semanticbible.com/2006/11/NTNames.owl")); RDFSLabelEntityTextRetriever labelRetriever = new RDFSLabelEntityTextRetriever(ontology); - Map<Entity, Set<String>> relevantText = labelRetriever.getRelevantText(ontology); + Map<Entity, Set<List<Token>>> relevantText = labelRetriever.getRelevantText(ontology); SortedMap<String, String> uri2Labels = new TreeMap<String, String>(); - for (Entry<Entity, Set<String>> entry : relevantText.entrySet()) { + for (Entry<Entity, Set<List<Token>>> entry : relevantText.entrySet()) { Entity key = entry.getKey(); - Set<String> value = entry.getValue(); - uri2Labels.put(key.getName(), value.iterator().next()); + Set<List<Token>> value = entry.getValue(); + uri2Labels.put(key.getName(), value.iterator().next().get(0).getRawForm()); } StringBuilder csv = new StringBuilder(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-12-02 14:41:24
|
Revision: 4182 http://sourceforge.net/p/dl-learner/code/4182 Author: dfleischhacker Date: 2013-12-02 14:41:21 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Prevent possible NPE Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 14:34:42 UTC (rev 4181) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 14:41:21 UTC (rev 4182) @@ -109,7 +109,7 @@ } } - return fallback.entities; + return fallback == null ? Collections.<Entity>emptySet() : fallback.entities; } public static void main(String[] args) throws Exception { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-12-02 14:34:45
|
Revision: 4181 http://sourceforge.net/p/dl-learner/code/4181 Author: dfleischhacker Date: 2013-12-02 14:34:42 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Add toString to TokenTree Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 14:30:18 UTC (rev 4180) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 14:34:42 UTC (rev 4181) @@ -1,11 +1,10 @@ package org.dllearner.algorithms.isle.index; +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; import org.dllearner.core.owl.Entity; import org.dllearner.core.owl.NamedClass; -import com.google.common.base.Splitter; -import com.google.common.collect.Lists; - import java.util.*; /** @@ -112,7 +111,7 @@ return fallback.entities; } - + public static void main(String[] args) throws Exception { List<Token> tokens1 = Lists.newLinkedList(); for (String s : Splitter.on(" ").split("this is a token tree")) { @@ -127,5 +126,28 @@ TokenTree tree = new TokenTree(); tree.add(tokens1, new NamedClass("TokenTree")); tree.add(tokens2, new NamedClass("TokenizedTree")); - } + System.out.println(tree); + } + + + public String toString() { + return "TokenTree\n"+ toString(0); + } + + public String toString(int indent) { + StringBuilder indentStringBuilder = new StringBuilder(); + for (int i = 0; i < indent; i++) { + indentStringBuilder.append(" "); + } + String indentString = indentStringBuilder.toString(); + StringBuilder sb = new StringBuilder(); + for (Map.Entry<Token, TokenTree> e : children.entrySet()) { + sb.append(indentString).append(e.getKey().toString()); + sb.append("\n"); + sb.append(e.getValue().toString(indent + 1)); + } + return sb.toString(); + } + + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-12-02 14:30:21
|
Revision: 4180 http://sourceforge.net/p/dl-learner/code/4180 Author: lorenz_b Date: 2013-12-02 14:30:18 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Updated toString. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-02 14:27:30 UTC (rev 4179) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-02 14:30:18 UTC (rev 4180) @@ -95,9 +95,7 @@ */ @Override public String toString() { - return "\n[Word: " + rawForm + "\n" - + "Stemmed word: " + stemmedForm + "\n" - + "POS tag: " + posTag + "]"; + return "[Word: " + rawForm + " | Stemmed word: " + stemmedForm + " | POS tag: " + posTag + "]"; } @Override This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-12-02 14:27:33
|
Revision: 4179 http://sourceforge.net/p/dl-learner/code/4179 Author: lorenz_b Date: 2013-12-02 14:27:30 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Added main method to TokenTree. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 13:36:13 UTC (rev 4178) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 14:27:30 UTC (rev 4179) @@ -1,7 +1,11 @@ package org.dllearner.algorithms.isle.index; import org.dllearner.core.owl.Entity; +import org.dllearner.core.owl.NamedClass; +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; + import java.util.*; /** @@ -108,4 +112,20 @@ return fallback.entities; } + + public static void main(String[] args) throws Exception { + List<Token> tokens1 = Lists.newLinkedList(); + for (String s : Splitter.on(" ").split("this is a token tree")) { + tokens1.add(new Token(s, s, s, false, false)); + }; + + List<Token> tokens2 = Lists.newLinkedList(); + for (String s : Splitter.on(" ").split("this is a tokenized tree")) { + tokens2.add(new Token(s, s, s, false, false)); + }; + + TokenTree tree = new TokenTree(); + tree.add(tokens1, new NamedClass("TokenTree")); + tree.add(tokens2, new NamedClass("TokenizedTree")); + } } Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java 2013-12-02 13:36:13 UTC (rev 4178) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java 2013-12-02 14:27:30 UTC (rev 4179) @@ -87,7 +87,7 @@ String text = Files.toString(file, Charsets.UTF_8); // String posTagged = getPOSTaggedText(text); // Files.write(posTagged, new File(taggedFolder, file.getName() + ".tagged"), Charsets.UTF_8); - documents.add(TextDocumentGenerator.getInstance().generateDocument(text)); +// documents.add(TextDocumentGenerator.getInstance().generateDocument(text)); } catch (IOException e) { e.printStackTrace(); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-12-02 13:36:19
|
Revision: 4178 http://sourceforge.net/p/dl-learner/code/4178 Author: dfleischhacker Date: 2013-12-02 13:36:13 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Add equals and hashCode to Token class Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-02 12:51:30 UTC (rev 4177) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-02 13:36:13 UTC (rev 4178) @@ -99,4 +99,32 @@ + "Stemmed word: " + stemmedForm + "\n" + "POS tag: " + posTag + "]"; } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + Token token = (Token) o; + + if (!posTag.equals(token.posTag)) { + return false; + } + if (!stemmedForm.equals(token.stemmedForm)) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = stemmedForm.hashCode(); + result = 31 * result + posTag.hashCode(); + return result; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-12-02 12:51:33
|
Revision: 4177 http://sourceforge.net/p/dl-learner/code/4177 Author: dfleischhacker Date: 2013-12-02 12:51:30 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Add TokenTree class Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 12:51:30 UTC (rev 4177) @@ -0,0 +1,111 @@ +package org.dllearner.algorithms.isle.index; + +import org.dllearner.core.owl.Entity; + +import java.util.*; + +/** + * Tree for finding longest matching Token sequence + * + * @author Daniel Fleischhacker + */ +public class TokenTree { + private HashMap<Token, TokenTree> children; + private Set<Entity> entities; + + public TokenTree() { + this.children = new HashMap<>(); + this.entities = new HashSet<>(); + } + + /** + * Adds all given entities to the end of the path resulting from the given tokens. + * + * @param tokens tokens to locate insertion point for entities + * @param entities entities to add + */ + public void add(List<Token> tokens, Set<Entity> entities) { + TokenTree curNode = this; + for (Token t : tokens) { + TokenTree nextNode = curNode.children.get(t); + if (nextNode == null) { + nextNode = new TokenTree(); + curNode.children.put(t, nextNode); + } + curNode = nextNode; + } + curNode.entities.addAll(entities); + } + + /** + * Adds the given entity to the tree. + * + * @param tokens tokens to locate insertion point for entities + * @param entity entity to add + */ + public void add(List<Token> tokens, Entity entity) { + add(tokens, Collections.singleton(entity)); + } + + /** + * Returns the set of entities located by the given list of tokens. + * + * @param tokens tokens to locate the information to get + * @return located set of entities or null if token sequence not contained in tree + */ + public Set<Entity> get(List<Token> tokens) { + TokenTree curNode = this; + for (Token t : tokens) { + TokenTree nextNode = curNode.children.get(t); + if (nextNode == null) { + return null; + } + curNode = nextNode; + } + return curNode.entities; + } + + /** + * Returns the list of tokens which are the longest match with entities assigned in this tree. + * + * @param tokens list of tokens to check for longest match + * @return list of tokens being the longest match, sublist of {@code tokens} anchored at the first token + */ + public List<Token> getLongestMatch(List<Token> tokens) { + List<Token> fallbackTokenList = new ArrayList<>(); + TokenTree curNode = this; + + for (Token t : tokens) { + TokenTree nextNode = curNode.children.get(t); + if (nextNode == null) { + return fallbackTokenList; + } + curNode = nextNode; + fallbackTokenList.add(t); + } + return fallbackTokenList; + } + + /** + * Returns the set of entities assigned to the longest matching token subsequence of the given token sequence. + * @param tokens token sequence to search for longest match + * @return set of entities assigned to the longest matching token subsequence of the given token sequence + */ + public Set<Entity> getEntitiesForLongestMatch(List<Token> tokens) { + TokenTree fallback = this.entities.isEmpty() ? null : this; + TokenTree curNode = this; + + for (Token t : tokens) { + TokenTree nextNode = curNode.children.get(t); + if (nextNode == null) { + return fallback == null ? null : fallback.entities; + } + curNode = nextNode; + if (!curNode.entities.isEmpty()) { + fallback = curNode; + } + } + + return fallback.entities; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <ki...@us...> - 2013-11-27 14:41:08
|
Revision: 4176 http://sourceforge.net/p/dl-learner/code/4176 Author: kirdie Date: 2013-11-27 14:41:05 +0000 (Wed, 27 Nov 2013) Log Message: ----------- changed index from interface to abstract class, thus removing much redundant code. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/reasoning/SPARQLReasoner.java trunk/components-ext/src/main/java/org/dllearner/common/index/HierarchicalIndex.java trunk/components-ext/src/main/java/org/dllearner/common/index/Index.java trunk/components-ext/src/main/java/org/dllearner/common/index/SOLRIndex.java trunk/components-ext/src/main/java/org/dllearner/common/index/SPARQLIndex.java Modified: trunk/components-core/src/main/java/org/dllearner/reasoning/SPARQLReasoner.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/reasoning/SPARQLReasoner.java 2013-11-25 14:20:13 UTC (rev 4175) +++ trunk/components-core/src/main/java/org/dllearner/reasoning/SPARQLReasoner.java 2013-11-27 14:41:05 UTC (rev 4176) @@ -331,8 +331,8 @@ return dataPropertyPopularityMap.get(dp); } else { String queryTemplate = "SELECT (COUNT(*) AS ?cnt) WHERE {?s <%s> ?o}"; - - ResultSet rs = executeSelectQuery(String.format(queryTemplate, dp.getName())); +String query = String.format(queryTemplate, dp.getName()); + ResultSet rs = executeSelectQuery(query); int cnt = rs.next().getLiteral("cnt").getInt(); dataPropertyPopularityMap.put(dp, cnt); return cnt; Modified: trunk/components-ext/src/main/java/org/dllearner/common/index/HierarchicalIndex.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/common/index/HierarchicalIndex.java 2013-11-25 14:20:13 UTC (rev 4175) +++ trunk/components-ext/src/main/java/org/dllearner/common/index/HierarchicalIndex.java 2013-11-27 14:41:05 UTC (rev 4176) @@ -3,11 +3,9 @@ import java.util.ArrayList; import java.util.List; -public class HierarchicalIndex implements Index{ +public class HierarchicalIndex extends Index +{ - private static final int DEFAULT_LIMIT = 10; - private static final int DEFAULT_OFFSET = 0; - private Index primaryIndex; private Index secondaryIndex; @@ -23,18 +21,8 @@ public Index getSecondaryIndex() { return secondaryIndex; } - - @Override - public List<String> getResources(String queryString) { - return getResources(queryString, DEFAULT_LIMIT); - } @Override - public List<String> getResources(String queryString, int limit) { - return getResources(queryString, limit, DEFAULT_OFFSET); - } - - @Override public List<String> getResources(String queryString, int limit, int offset) { List<String> resources = new ArrayList<String>(); resources = primaryIndex.getResources(queryString, limit, offset); @@ -50,11 +38,6 @@ } @Override - public IndexResultSet getResourcesWithScores(String queryString, int limit) { - return getResourcesWithScores(queryString, limit, DEFAULT_OFFSET); - } - - @Override public IndexResultSet getResourcesWithScores(String queryString, int limit, int offset) { IndexResultSet rs = primaryIndex.getResourcesWithScores(queryString, limit, offset); if(rs.getItems().size() < limit){ @@ -63,4 +46,4 @@ return rs; } -} +} \ No newline at end of file Modified: trunk/components-ext/src/main/java/org/dllearner/common/index/Index.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/common/index/Index.java 2013-11-25 14:20:13 UTC (rev 4175) +++ trunk/components-ext/src/main/java/org/dllearner/common/index/Index.java 2013-11-27 14:41:05 UTC (rev 4176) @@ -1,13 +1,16 @@ package org.dllearner.common.index; import java.util.List; -import java.util.Map; -public interface Index { - List<String> getResources(String queryString); - List<String> getResources(String queryString, int limit); - List<String> getResources(String queryString, int limit, int offset); - IndexResultSet getResourcesWithScores(String queryString); - IndexResultSet getResourcesWithScores(String queryString, int limit); - IndexResultSet getResourcesWithScores(String queryString, int limit, int offset); -} +public abstract class Index +{ + static final int DEFAULT_LIMIT = 10; + + public List<String> getResources(String queryString) {return getResources(queryString,DEFAULT_LIMIT);} + public List<String> getResources(String queryString, int limit) {return getResources(queryString,DEFAULT_LIMIT,0);} + abstract public List<String> getResources(String queryString, int limit, int offset); + + public IndexResultSet getResourcesWithScores(String queryString) {return getResourcesWithScores(queryString,DEFAULT_LIMIT);} + public IndexResultSet getResourcesWithScores(String queryString, int limit) {return getResourcesWithScores(queryString,DEFAULT_LIMIT,0);} + abstract public IndexResultSet getResourcesWithScores(String queryString, int limit, int offset); +} \ No newline at end of file Modified: trunk/components-ext/src/main/java/org/dllearner/common/index/SOLRIndex.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/common/index/SOLRIndex.java 2013-11-25 14:20:13 UTC (rev 4175) +++ trunk/components-ext/src/main/java/org/dllearner/common/index/SOLRIndex.java 2013-11-27 14:41:05 UTC (rev 4176) @@ -13,13 +13,10 @@ import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.params.ModifiableSolrParams; -public class SOLRIndex implements Index{ +public class SOLRIndex extends Index{ private HttpSolrServer server; - private static final int DEFAULT_LIMIT = 10; - private static final int DEFAULT_OFFSET = 0; - private String primarySearchField; private String secondarySearchField; @@ -52,16 +49,6 @@ } @Override - public List<String> getResources(String queryString) { - return getResources(queryString, DEFAULT_LIMIT); - } - - @Override - public List<String> getResources(String queryString, int limit) { - return getResources(queryString, limit, DEFAULT_OFFSET); - } - - @Override public List<String> getResources(String queryString, int limit, int offset) { List<String> resources = new ArrayList<String>(); QueryResponse response; @@ -82,16 +69,6 @@ } @Override - public IndexResultSet getResourcesWithScores(String queryString) { - return getResourcesWithScores(queryString, DEFAULT_LIMIT); - } - - @Override - public IndexResultSet getResourcesWithScores(String queryString, int limit) { - return getResourcesWithScores(queryString, limit, DEFAULT_OFFSET); - } - - @Override public IndexResultSet getResourcesWithScores(String queryString, int limit, int offset) { IndexResultSet rs = new IndexResultSet(); @@ -148,4 +125,4 @@ this.sortField = sortField; } -} +} \ No newline at end of file Modified: trunk/components-ext/src/main/java/org/dllearner/common/index/SPARQLIndex.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/common/index/SPARQLIndex.java 2013-11-25 14:20:13 UTC (rev 4175) +++ trunk/components-ext/src/main/java/org/dllearner/common/index/SPARQLIndex.java 2013-11-27 14:41:05 UTC (rev 4176) @@ -16,11 +16,8 @@ import com.hp.hpl.jena.rdf.model.RDFNode; import com.hp.hpl.jena.sparql.engine.http.QueryEngineHTTP; -public class SPARQLIndex implements Index{ +public class SPARQLIndex extends Index{ - private static final int DEFAULT_LIMIT = 10; - private static final int DEFAULT_OFFSET = 0; - private SparqlEndpoint endpoint; private ExtractionDBCache cache; @@ -65,16 +62,6 @@ } @Override - public List<String> getResources(String searchTerm) { - return getResources(searchTerm, DEFAULT_LIMIT); - } - - @Override - public List<String> getResources(String searchTerm, int limit) { - return getResources(searchTerm, limit, DEFAULT_OFFSET); - } - - @Override public List<String> getResources(String searchTerm, int limit, int offset) { List<String> resources = new ArrayList<String>(); @@ -92,18 +79,8 @@ } return resources; } - - @Override - public IndexResultSet getResourcesWithScores(String searchTerm) { - return getResourcesWithScores(searchTerm, DEFAULT_LIMIT); - } @Override - public IndexResultSet getResourcesWithScores(String searchTerm, int limit) { - return getResourcesWithScores(searchTerm, limit, DEFAULT_OFFSET); - } - - @Override public IndexResultSet getResourcesWithScores(String searchTerm, int limit, int offset) { IndexResultSet irs = new IndexResultSet(); @@ -151,4 +128,4 @@ return model; } -} +} \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-11-25 14:20:16
|
Revision: 4175 http://sourceforge.net/p/dl-learner/code/4175 Author: lorenz_b Date: 2013-11-25 14:20:13 +0000 (Mon, 25 Nov 2013) Log Message: ----------- Fixed bug. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/reasoning/SPARQLReasoner.java Modified: trunk/components-core/src/main/java/org/dllearner/reasoning/SPARQLReasoner.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/reasoning/SPARQLReasoner.java 2013-11-25 09:47:35 UTC (rev 4174) +++ trunk/components-core/src/main/java/org/dllearner/reasoning/SPARQLReasoner.java 2013-11-25 14:20:13 UTC (rev 4175) @@ -125,6 +125,8 @@ classPopularityMap = new HashMap<NamedClass, Integer>(); objectPropertyPopularityMap = new HashMap<ObjectProperty, Integer>(); + dataPropertyPopularityMap = new HashMap<DatatypeProperty, Integer>(); + individualPopularityMap = new HashMap<Individual, Integer>(); if(ks.isRemote()){ SparqlEndpoint endpoint = ks.getEndpoint(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-11-25 09:47:39
|
Revision: 4174 http://sourceforge.net/p/dl-learner/code/4174 Author: lorenz_b Date: 2013-11-25 09:47:35 +0000 (Mon, 25 Nov 2013) Log Message: ----------- ISLE. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java 2013-11-25 09:42:56 UTC (rev 4173) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java 2013-11-25 09:47:35 UTC (rev 4174) @@ -60,6 +60,7 @@ curNormalizedLength += p.getNormalizedLength(); curOriginalLength += p.getOriginalLength(); if (curNormalizedLength >= length) { + //TODO refactoring // return new Annotation(originalDocument, originalStart, curOriginalLength); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-11-25 09:42:56 UTC (rev 4173) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-11-25 09:47:35 UTC (rev 4174) @@ -1,6 +1,8 @@ package org.dllearner.algorithms.isle.index; +import com.google.common.collect.Lists; import com.google.common.collect.Sets; + import org.dllearner.algorithms.isle.EntityCandidateGenerator; import org.dllearner.algorithms.isle.StopWordFilter; import org.dllearner.core.owl.Entity; @@ -39,6 +41,7 @@ public void postProcess(HashMap<Annotation,Set<Entity>> candidatesMap, int window, StopWordFilter stopWordFilter) { Set<Annotation> annotations = candidatesMap.keySet(); List<Annotation> sortedAnnotations = new ArrayList<Annotation>(annotations); + //TODO refactoring /** @@ -108,17 +111,10 @@ } private Annotation mergeAnnotations(Annotation annotation_i, Annotation annotation_j) { - return null; -// int offset; -// int length; -// if (annotation_i.getOffset() < annotation_j.getOffset()) { -// offset = annotation_i.getOffset(); -// length = annotation_j.getOffset() - offset + annotation_j.getLength(); -// } else { -// offset = annotation_j.getOffset(); -// length = annotation_i.getOffset() - offset + annotation_i.getLength(); -// } -// return new Annotation(annotation_i.getReferencedDocument(), offset, length); + List<Token> tokens = Lists.newArrayList(); + tokens.addAll(annotation_i.getTokens()); + tokens.addAll(annotation_j.getTokens()); + return new Annotation(annotation_i.getReferencedDocument(), tokens); } @Override This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-11-25 09:43:00
|
Revision: 4173 http://sourceforge.net/p/dl-learner/code/4173 Author: lorenz_b Date: 2013-11-25 09:42:56 +0000 (Mon, 25 Nov 2013) Log Message: ----------- ISLE. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java trunk/components-core/src/main/java/org/dllearner/reasoning/SPARQLReasoner.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java 2013-11-22 12:44:10 UTC (rev 4172) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java 2013-11-25 09:42:56 UTC (rev 4173) @@ -60,7 +60,7 @@ curNormalizedLength += p.getNormalizedLength(); curOriginalLength += p.getOriginalLength(); if (curNormalizedLength >= length) { - return new Annotation(originalDocument, originalStart, curOriginalLength); +// return new Annotation(originalDocument, originalStart, curOriginalLength); } // include space @@ -82,16 +82,16 @@ } public static void main(String[] args) { - NormalizedTextMapper n = new NormalizedTextMapper(new TextDocument("This is a testing text using letters")); - System.out.println(n.getOriginalText()); - System.out.println(n.getNormalizedText()); - for (OccurenceMappingPair p : n.normalizedIndexToOriginalIndex) { - System.out.println(p); - } - System.out.println(n.getOriginalAnnotationForPosition(7,6)); - System.out.println(n.getOriginalAnnotationForPosition(23,6)); - System.out.println(n.getOriginalAnnotationForPosition(7,1)); - System.out.println(n.getOriginalAnnotationForPosition(14,15)); +// NormalizedTextMapper n = new NormalizedTextMapper(new TextDocument("This is a testing text using letters")); +// System.out.println(n.getOriginalText()); +// System.out.println(n.getNormalizedText()); +// for (OccurenceMappingPair p : n.normalizedIndexToOriginalIndex) { +// System.out.println(p); +// } +// System.out.println(n.getOriginalAnnotationForPosition(7,6)); +// System.out.println(n.getOriginalAnnotationForPosition(23,6)); +// System.out.println(n.getOriginalAnnotationForPosition(7,1)); +// System.out.println(n.getOriginalAnnotationForPosition(14,15)); } /** Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-11-22 12:44:10 UTC (rev 4172) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-11-25 09:42:56 UTC (rev 4173) @@ -39,6 +39,8 @@ public void postProcess(HashMap<Annotation,Set<Entity>> candidatesMap, int window, StopWordFilter stopWordFilter) { Set<Annotation> annotations = candidatesMap.keySet(); List<Annotation> sortedAnnotations = new ArrayList<Annotation>(annotations); + /** + // Sort annotations by offset in ascending order Collections.sort(sortedAnnotations, new Comparator<Annotation>(){ @@ -102,20 +104,21 @@ } - + */ } private Annotation mergeAnnotations(Annotation annotation_i, Annotation annotation_j) { - int offset; - int length; - if (annotation_i.getOffset() < annotation_j.getOffset()) { - offset = annotation_i.getOffset(); - length = annotation_j.getOffset() - offset + annotation_j.getLength(); - } else { - offset = annotation_j.getOffset(); - length = annotation_i.getOffset() - offset + annotation_i.getLength(); - } - return new Annotation(annotation_i.getReferencedDocument(), offset, length); + return null; +// int offset; +// int length; +// if (annotation_i.getOffset() < annotation_j.getOffset()) { +// offset = annotation_i.getOffset(); +// length = annotation_j.getOffset() - offset + annotation_j.getLength(); +// } else { +// offset = annotation_j.getOffset(); +// length = annotation_i.getOffset() - offset + annotation_i.getLength(); +// } +// return new Annotation(annotation_i.getReferencedDocument(), offset, length); } @Override Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-11-22 12:44:10 UTC (rev 4172) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-11-25 09:42:56 UTC (rev 4173) @@ -7,10 +7,12 @@ import org.apache.log4j.Logger; import org.dllearner.algorithms.isle.EntityCandidateGenerator; +import org.dllearner.algorithms.isle.TextDocumentGenerator; import org.dllearner.algorithms.isle.index.AnnotatedDocument; import org.dllearner.algorithms.isle.index.LinguisticAnnotator; import org.dllearner.algorithms.isle.index.SemanticAnnotator; import org.dllearner.algorithms.isle.index.TextDocument; +import org.dllearner.algorithms.isle.index.Token; import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex; import org.dllearner.algorithms.isle.wsd.WordSenseDisambiguation; import org.dllearner.core.owl.Entity; @@ -108,7 +110,7 @@ } } if (label != null) { - documents.add(new TextDocument(label)); + documents.add(TextDocumentGenerator.getInstance().generateDocument(label)); } } buildIndex(documents); Modified: trunk/components-core/src/main/java/org/dllearner/reasoning/SPARQLReasoner.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/reasoning/SPARQLReasoner.java 2013-11-22 12:44:10 UTC (rev 4172) +++ trunk/components-core/src/main/java/org/dllearner/reasoning/SPARQLReasoner.java 2013-11-25 09:42:56 UTC (rev 4173) @@ -108,6 +108,7 @@ private Map<NamedClass, Integer> classPopularityMap; private Map<ObjectProperty, Integer> objectPropertyPopularityMap; private Map<DatatypeProperty, Integer> dataPropertyPopularityMap; + private Map<Individual, Integer> individualPopularityMap; private boolean prepared = false; @@ -156,6 +157,8 @@ classPopularityMap = new HashMap<NamedClass, Integer>(); objectPropertyPopularityMap = new HashMap<ObjectProperty, Integer>(); + dataPropertyPopularityMap = new HashMap<DatatypeProperty, Integer>(); + individualPopularityMap = new HashMap<Individual, Integer>(); if(ks.isRemote()){ SparqlEndpoint endpoint = ks.getEndpoint(); @@ -176,6 +179,8 @@ classPopularityMap = new HashMap<NamedClass, Integer>(); objectPropertyPopularityMap = new HashMap<ObjectProperty, Integer>(); + dataPropertyPopularityMap = new HashMap<DatatypeProperty, Integer>(); + individualPopularityMap = new HashMap<Individual, Integer>(); } public void precomputePopularity(){ @@ -330,7 +335,19 @@ dataPropertyPopularityMap.put(dp, cnt); return cnt; } + } + + public int getPopularity(Individual ind){ + if(individualPopularityMap != null && individualPopularityMap.containsKey(ind)){ + return individualPopularityMap.get(ind); + } else { + String queryTemplate = "SELECT (COUNT(*) AS ?cnt) WHERE {<%s> ?p ?o}"; + ResultSet rs = executeSelectQuery(String.format(queryTemplate, ind.getName())); + int cnt = rs.next().getLiteral("cnt").getInt(); + individualPopularityMap.put(ind, cnt); + return cnt; + } } public final ClassHierarchy prepareSubsumptionHierarchy() { Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-11-22 12:44:10 UTC (rev 4172) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-11-25 09:42:56 UTC (rev 4173) @@ -85,7 +85,7 @@ if(!file.isDirectory() && !file.isHidden()){ try { String text = Files.toString(file, Charsets.UTF_8); - documents.add(new TextDocument(text)); + documents.add(TextDocumentGenerator.getInstance().generateDocument(text)); } catch (IOException e) { e.printStackTrace(); } @@ -103,7 +103,7 @@ if(!file.isDirectory() && !file.isHidden()){ try { String text = Files.toString(file, Charsets.UTF_8); - documents.add(new TextDocument(text)); + documents.add(TextDocumentGenerator.getInstance().generateDocument(text)); } catch (IOException e) { e.printStackTrace(); } Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java 2013-11-22 12:44:10 UTC (rev 4172) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java 2013-11-25 09:42:56 UTC (rev 4173) @@ -87,7 +87,7 @@ String text = Files.toString(file, Charsets.UTF_8); // String posTagged = getPOSTaggedText(text); // Files.write(posTagged, new File(taggedFolder, file.getName() + ".tagged"), Charsets.UTF_8); - documents.add(new TextDocument(text)); + documents.add(TextDocumentGenerator.getInstance().generateDocument(text)); } catch (IOException e) { e.printStackTrace(); } @@ -98,9 +98,9 @@ } catch (IOException e) { e.printStackTrace(); } - - documents = Sets.newHashSet(new TextDocument("and in that day seven women shall take hold of one man saying we will eat our own bread and wear our own apparel only let us be called by thy name to take away our reproach in that day shall the branch of the lord be beautiful and glorious and the fruit of the earth excellent and comely for them that are escaped of israel and it shall come to pass left in zion and remaineth in jerusalem shall be called holy every one that is written among the living in jerusalem when the lord shall have washed away the filth of the daughters of zion and shall have purged the blood of jerusalem from the midst thereof by the spirit of judgment and by the spirit of burning and the lord will create upon every dwelling place of mount zion and upon her assemblies a cloud and smoke by day and the shining of a flaming fire by night for upon all the glory a defence and there shall be a tabernacle for a shadow in the daytime from the heat and for a place of refuge and for a covert from storm and from rain")); - + documents.clear(); + TextDocument doc = TextDocumentGenerator.getInstance().generateDocument("and in that day seven women shall take hold of one man saying we will eat our own bread and wear our own apparel only let us be called by thy name to take away our reproach in that day shall the branch of the lord be beautiful and glorious and the fruit of the earth excellent and comely for them that are escaped of israel and it shall come to pass left in zion and remaineth in jerusalem shall be called holy every one that is written among the living in jerusalem when the lord shall have washed away the filth of the daughters of zion and shall have purged the blood of jerusalem from the midst thereof by the spirit of judgment and by the spirit of burning and the lord will create upon every dwelling place of mount zion and upon her assemblies a cloud and smoke by day and the shining of a flaming fire by night for upon all the glory a defence and there shall be a tabernacle for a shadow in the daytime from the heat and for a place of refuge and for a covert from storm and from rain"); + documents.add(doc); return documents; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-11-22 12:44:13
|
Revision: 4172 http://sourceforge.net/p/dl-learner/code/4172 Author: lorenz_b Date: 2013-11-22 12:44:10 +0000 (Fri, 22 Nov 2013) Log Message: ----------- Added constructor to set search field. Modified Paths: -------------- trunk/components-ext/src/main/java/org/dllearner/common/index/SOLRIndex.java Modified: trunk/components-ext/src/main/java/org/dllearner/common/index/SOLRIndex.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/common/index/SOLRIndex.java 2013-11-21 20:44:41 UTC (rev 4171) +++ trunk/components-ext/src/main/java/org/dllearner/common/index/SOLRIndex.java 2013-11-22 12:44:10 UTC (rev 4172) @@ -32,6 +32,12 @@ server.setRequestWriter(new BinaryRequestWriter()); } + public SOLRIndex(String solrServerURL, String primarySearchField){ + server = new HttpSolrServer(solrServerURL); + server.setRequestWriter(new BinaryRequestWriter()); + this.primarySearchField = primarySearchField; + } + public void setSearchFields(String primarySearchField, String secondarySearchField){ this.primarySearchField = primarySearchField; this.secondarySearchField = secondarySearchField; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-11-21 20:44:43
|
Revision: 4171 http://sourceforge.net/p/dl-learner/code/4171 Author: lorenz_b Date: 2013-11-21 20:44:41 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Updated CONFIG. Modified Paths: -------------- trunk/test/fuzzydll/CONFIG Modified: trunk/test/fuzzydll/CONFIG =================================================================== --- trunk/test/fuzzydll/CONFIG 2013-11-21 15:24:32 UTC (rev 4170) +++ trunk/test/fuzzydll/CONFIG 2013-11-21 20:44:41 UTC (rev 4171) @@ -1,6 +1,4 @@ -MILP_SOLVER = /Users/josue/Documents/PhD/AKSW/fuzzySemanticTools/FuzzyDLMacOSX/FuzzyDL/fuzzyDLcbc -EPSILON = 0.01 -solver = z -max_individuals = -1 debugPrint = false -CBC = 1 +epsilon = 0.001 +maxIndividuals = -1 +showVersion = true \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-11-21 15:24:35
|
Revision: 4170 http://sourceforge.net/p/dl-learner/code/4170 Author: dfleischhacker Date: 2013-11-21 15:24:32 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Remove unused offset-based methods Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java 2013-11-21 13:53:20 UTC (rev 4169) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java 2013-11-21 15:24:32 UTC (rev 4170) @@ -3,10 +3,10 @@ */ package org.dllearner.algorithms.isle.index; +import org.dllearner.core.owl.Entity; + import java.util.Set; -import org.dllearner.core.owl.Entity; - /** * @author Lorenz Buehmann * @@ -25,14 +25,6 @@ */ Set<SemanticAnnotation> getAnnotations(); - /** - * Returns the annotation at the given position(offset) of given length. - * @param offset - * @param length - * @return - */ - SemanticAnnotation getAnnotation(int offset, int length); - /** * Returns the number of occurrences of the given entity in this document. * Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java 2013-11-21 13:53:20 UTC (rev 4169) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java 2013-11-21 15:24:32 UTC (rev 4170) @@ -3,11 +3,11 @@ */ package org.dllearner.algorithms.isle.index; +import org.dllearner.core.owl.Entity; + import java.util.HashSet; import java.util.Set; -import org.dllearner.core.owl.Entity; - /** * @author Lorenz Buehmann * @@ -70,19 +70,6 @@ } /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.index.AnnotatedDocument#getAnnotation(int, int) - */ - @Override - public SemanticAnnotation getAnnotation(int offset, int length) { - for (SemanticAnnotation annotation : annotations) { - if(annotation.getOffset() == offset && annotation.getLength() == length){ - return annotation; - } - } - return null; - } - - /* (non-Javadoc) * @see org.dllearner.algorithms.isle.index.AnnotatedDocument#getEntityFrequency(org.dllearner.core.owl.Entity) */ @Override This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-11-21 13:53:23
|
Revision: 4169 http://sourceforge.net/p/dl-learner/code/4169 Author: lorenz_b Date: 2013-11-21 13:53:20 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Refactored annotation class. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotation.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotation.java 2013-11-21 13:53:13 UTC (rev 4168) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotation.java 2013-11-21 13:53:20 UTC (rev 4169) @@ -14,14 +14,9 @@ private Entity entity; public SemanticAnnotation(Annotation annotation, Entity entity) { - super(annotation.getReferencedDocument(), annotation.getOffset(), annotation.getLength()); + super(annotation.getReferencedDocument(), annotation.getTokens()); this.entity = entity; } - - public SemanticAnnotation(Document getReferencedDocument, Entity entity, int offset, int length) { - super(getReferencedDocument, offset, length); - this.entity = entity; - } public Entity getEntity() { return entity; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-11-21 13:53:16
|
Revision: 4168 http://sourceforge.net/p/dl-learner/code/4168 Author: dfleischhacker Date: 2013-11-21 13:53:13 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Remove unused NGramGeneratingAnnotator Removed Paths: ------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java 2013-11-21 13:52:45 UTC (rev 4167) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java 2013-11-21 13:53:13 UTC (rev 4168) @@ -1,76 +0,0 @@ -package org.dllearner.algorithms.isle.index; - -import java.util.ArrayList; -import java.util.HashSet; -import java.util.Set; -import java.util.regex.Pattern; - -/** - * Generates word n-grams - * @author Daniel Fleischhacker - */ -public class NGramGeneratingAnnotator implements LinguisticAnnotator { - private int length; - - /** - * Initializes the annotator to generate word n-grams of the given length ({@code length} words per n-gram) - * @param length length of the single n-grams - */ - public NGramGeneratingAnnotator(int length) { - this.length = length; - } - - @Override - public Set<Annotation> annotate(Document document) { - String text = document.getContent(); - - Pattern legalChars = Pattern.compile("[A-Za-z]"); - - // clean up all texts - int curWordStartPosition = 0; - StringBuilder curWord = new StringBuilder(); - ArrayList<String> wordsInText = new ArrayList<String>(); - ArrayList<Integer> wordStart = new ArrayList<Integer>(); - ArrayList<Integer> wordEnd = new ArrayList<Integer>(); - - int i = 0; - while (i < text.length()) { - Character curChar = text.charAt(i); - if (!legalChars.matcher(curChar.toString()).matches()) { - if (curWord.length() == 0) { - curWordStartPosition = i + 1; - i++; - continue; - } - // current word finished - wordsInText.add(curWord.toString()); - wordStart.add(curWordStartPosition); - wordEnd.add(i); - curWord = new StringBuilder(); - curWordStartPosition = i + 1; - } - else { - curWord.append(curChar); - } - i++; - } - - HashSet<Annotation> annotations = new HashSet<Annotation>(); - - i = 0; - while (i < wordsInText.size() - (length-1)) { - StringBuilder sb = new StringBuilder(); - int curStart = wordStart.get(i); - int lastEnd = wordEnd.get(i); - for (int j = 1; j < length; j++) { - sb.append(wordsInText.get(i + j)); - lastEnd = wordEnd.get(i + j); - } - String nGram = sb.toString().trim(); - annotations.add(new Annotation(document, curStart, lastEnd - curStart)); - i++; - } - - return annotations; - } -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-11-21 13:52:48
|
Revision: 4167 http://sourceforge.net/p/dl-learner/code/4167 Author: dfleischhacker Date: 2013-11-21 13:52:45 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Remove SimpleLinguisticAnnotator Removed Paths: ------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java 2013-11-21 13:40:31 UTC (rev 4166) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java 2013-11-21 13:52:45 UTC (rev 4167) @@ -1,62 +0,0 @@ -package org.dllearner.algorithms.isle.index; - -import java.io.IOException; -import java.io.StringReader; -import java.util.HashSet; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.core.StopFilter; -import org.apache.lucene.analysis.en.PorterStemFilter; -import org.apache.lucene.analysis.standard.StandardTokenizer; -import org.apache.lucene.util.Version; -import org.dllearner.algorithms.isle.StopWordFilter; - -/** - * - * @author Jens Lehmann - * - */ -public class SimpleLinguisticAnnotator implements LinguisticAnnotator { - - private StopWordFilter stopWordFilter = new StopWordFilter(); - NGramGeneratingAnnotator nGramAnnotator = new NGramGeneratingAnnotator(2); - - @Override - public Set<Annotation> annotate(Document document) { - String s = document.getContent().trim(); - System.out.println("Document:" + s); -// s = stopWordFilter.removeStopWords(s); - Set<Annotation> annotations = new HashSet<Annotation>(); - Pattern pattern = Pattern.compile("(\\u0020)+"); - Matcher matcher = pattern.matcher(s); - // Check all occurrences - int start = 0; - while (matcher.find()) { - int end = matcher.start(); - annotations.add(new Annotation(document, start, end - start)); - start = matcher.end(); - } - if(start < s.length()-1){ - annotations.add(new Annotation(document, start, s.length() - start)); - } - annotations.addAll(nGramAnnotator.annotate(document)); -// stopWordFilter.removeStopWordAnnotations(annotations); - return annotations; - } - - public static void main(String[] args) throws Exception { - String s = "male person least 1 child"; - Pattern pattern = Pattern.compile("(\\u0020)+"); - Matcher matcher = pattern.matcher(s); - int start = 0; - while (matcher.find()) { - int end = matcher.start(); - System.out.println(s.substring(start, end)); - start = matcher.end(); - } - } - -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-11-21 13:40:34
|
Revision: 4166 http://sourceforge.net/p/dl-learner/code/4166 Author: dfleischhacker Date: 2013-11-21 13:40:31 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Add method for getting a number of tokens starting at a given token Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 13:39:34 UTC (rev 4165) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 13:40:31 UTC (rev 4166) @@ -1,6 +1,8 @@ package org.dllearner.algorithms.isle.index; +import java.util.ArrayList; import java.util.LinkedList; +import java.util.List; /** * A simple text document without further formatting or markup. @@ -8,6 +10,16 @@ * @author Daniel Fleischhacker */ public class TextDocument extends LinkedList<Token> implements Document { + public static void main(String[] args) { + TextDocument t = new TextDocument(); + String s = "This is a very long, nice text for testing our new implementation of TextDocument."; + for (String e : s.split(" ")) { + t.add(new Token(e)); + } + + System.out.println(t.getRawContent()); + } + @Override public String getContent() { return getContentStartingAtToken(this.getFirst(), SurfaceFormLevel.STEMMED); @@ -28,7 +40,7 @@ * surface forms according to {@code level} are used to build the string. * * @param start token to start building the string at, i.e., the first token in the returned string - * @param l level of surface forms to use + * @param l level of surface forms to use * @return built string */ public String getContentStartingAtToken(Token start, SurfaceFormLevel l) { @@ -51,6 +63,42 @@ return sb.toString(); } + /** + * Returns a list containing {@code numberOfTokens} successive tokens from this document starting at the given start + * token. If {@code ignorePunctuation} is set, tokens which represent punctuation are added to the result but not + * counted for the number of tokens. + * + * @param start token to start collecting tokens from the document + * @param numberOfTokens number of tokens to collect from the document + * @param ignorePunctuation if true, punctuation are not counted towards the number of tokens to return + * @return list containing the given number of relevant tokens, depending in the value of ignorePunctuation, the + * list might contain additional non-relevant (punctuation) tokens + */ + public List<Token> getTokensStartingAtToken(Token start, int numberOfTokens, boolean ignorePunctuation) { + ArrayList<Token> tokens = new ArrayList<Token>(); + + int relevantTokens = 0; + boolean found = false; + + for (Token t : this) { + if (found) { + tokens.add(t); + if (!ignorePunctuation || !t.isPunctuation()) { + relevantTokens++; + } + } + else if (t == start) { + found = true; + tokens.add(t); + } + if (relevantTokens == numberOfTokens) { + break; + } + } + + return tokens; + } + private String getStringForLevel(Token t, SurfaceFormLevel l) { switch (l) { case RAW: @@ -63,14 +111,4 @@ return null; } - - public static void main(String[] args) { - TextDocument t = new TextDocument(); - String s = "This is a very long, nice text for testing our new implementation of TextDocument."; - for (String e : s.split(" ")) { - t.add(new Token(e)); - } - - System.out.println(t.getRawContent()); - } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-11-21 13:39:37
|
Revision: 4165 http://sourceforge.net/p/dl-learner/code/4165 Author: lorenz_b Date: 2013-11-21 13:39:34 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Refactored context extractors. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StopWordFilter.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StopWordFilter.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StopWordFilter.java 2013-11-21 13:38:03 UTC (rev 4164) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StopWordFilter.java 2013-11-21 13:39:34 UTC (rev 4165) @@ -45,8 +45,7 @@ public void removeStopWordAnnotations(Set<Annotation> annotations) { for (Iterator<Annotation> iter = annotations.iterator(); iter.hasNext();) { Annotation annotation = iter.next(); - String content = annotation.getReferencedDocument().getContent(); - String token = content.substring(annotation.getOffset(), annotation.getOffset()+annotation.getLength()); + String token = annotation.getTokens().get(0).getRawForm(); if(stopWords.contains(token)){ iter.remove(); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-11-21 13:38:25
|
Revision: 4164 http://sourceforge.net/p/dl-learner/code/4164 Author: lorenz_b Date: 2013-11-21 13:38:03 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Refactored context extractors. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-11-21 13:16:13 UTC (rev 4163) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-11-21 13:38:03 UTC (rev 4164) @@ -37,7 +37,7 @@ return instance; } - public TextDocument tag(String text) { + public TextDocument generateDocument(String text) { TextDocument document = new TextDocument(); // create an empty Annotation just with the given text Annotation annotatedDocument = new Annotation(text); @@ -72,7 +72,7 @@ } public static void main(String[] args) throws Exception { - TextDocument document = TextDocumentGenerator.getInstance().tag("And he said, Amos, what seest thou? And I said, A basket of summer fruit. Then said the LORD unto me, The end is come upon my people of Israel; I will not again pass by them any more. "); + TextDocument document = TextDocumentGenerator.getInstance().generateDocument("And he said, Amos, what seest thou? And I said, A basket of summer fruit. Then said the LORD unto me, The end is come upon my people of Israel; I will not again pass by them any more. "); System.out.println(document); } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-11-21 13:16:13 UTC (rev 4163) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-11-21 13:38:03 UTC (rev 4164) @@ -34,6 +34,13 @@ public Document getReferencedDocument() { return referencedDocument; } + + /** + * @return the tokens + */ + public ArrayList<Token> getTokens() { + return tokens; + } public String getString(){ StringBuilder sb = new StringBuilder(); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-11-21 13:16:13 UTC (rev 4163) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-11-21 13:38:03 UTC (rev 4164) @@ -16,6 +16,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; +import org.dllearner.algorithms.isle.TextDocumentGenerator; import org.dllearner.algorithms.isle.index.TextDocument; import java.io.File; @@ -61,7 +62,7 @@ ScoreDoc[] result = searcher.search(query, getSize()).scoreDocs; for (int i = 0; i < result.length; i++) { Document doc = searcher.doc(result[i].doc); - documents.add(new TextDocument(doc.get(searchField))); + documents.add(TextDocumentGenerator.getInstance().generateDocument(doc.get(searchField))); } } catch (ParseException e) { e.printStackTrace(); @@ -85,7 +86,7 @@ try { Document doc = indexReader.document(i); String content = doc.get(searchField); - documents.add(new TextDocument(content)); + documents.add(TextDocumentGenerator.getInstance().generateDocument(content)); } catch (IOException e) { e.printStackTrace(); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java 2013-11-21 13:16:13 UTC (rev 4163) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java 2013-11-21 13:38:03 UTC (rev 4164) @@ -3,6 +3,14 @@ */ package org.dllearner.algorithms.isle.wsd; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +import org.dllearner.algorithms.isle.TextDocumentGenerator; +import org.dllearner.algorithms.isle.index.Token; + import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; @@ -10,12 +18,7 @@ import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; import edu.stanford.nlp.util.CoreMap; -import org.dllearner.algorithms.isle.index.TextDocument; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; - /** * @author Lorenz Buehmann * @@ -36,26 +39,29 @@ @Override public List<String> extractContext(org.dllearner.algorithms.isle.index.Annotation annotation) { //split text into sentences - List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getContent()); + List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getRawContent()); //find the sentence containing the token of the annotation - int tokenStart = annotation.getOffset(); - int index = 0; + Token firstToken = annotation.getTokens().get(0); for (CoreMap sentence : sentences) { - String s = sentence.toString(); - if (index <= tokenStart && s.length() > tokenStart) { + boolean found = false; + for (CoreLabel label : sentence.get(TokensAnnotation.class)) { + // this is the text of the token + String word = label.get(TextAnnotation.class); + if(word.equals(firstToken.getRawForm())){ + found = true; + break; + } + } + if(found){ List<String> context = new ArrayList<String>(); for (CoreLabel label : sentence.get(TokensAnnotation.class)) { // this is the text of the token String word = label.get(TextAnnotation.class); - - if(!word.isEmpty() && !word.matches("\\p{Punct}")){ - context.add(word); - } + context.add(word); } return context; } - index += s.length(); } throw new RuntimeException("Token " + annotation.getString() + " not found in text " + annotation.getReferencedDocument().getRawContent()); } @@ -79,9 +85,8 @@ String s = "International Business Machines Corporation, or IBM, is an American multinational services technology and consulting corporation, with headquarters in Armonk, New York, United States. IBM manufactures and markets computer hardware and software," + " and offers infrastructure, hosting and consulting services in areas ranging from mainframe computers to nanotechnology."; - String token = "services"; SentenceBasedContextExtractor extractor = new SentenceBasedContextExtractor(); - List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(new TextDocument(s), s.indexOf(token), token.length())); + List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(TextDocumentGenerator.getInstance().generateDocument(s), Arrays.asList(new Token("American")))); System.out.println(context); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java 2013-11-21 13:16:13 UTC (rev 4163) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java 2013-11-21 13:38:03 UTC (rev 4164) @@ -3,6 +3,14 @@ */ package org.dllearner.algorithms.isle.wsd; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +import org.dllearner.algorithms.isle.TextDocumentGenerator; +import org.dllearner.algorithms.isle.index.Token; + import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; @@ -10,18 +18,13 @@ import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; import edu.stanford.nlp.util.CoreMap; -import org.dllearner.algorithms.isle.index.TextDocument; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; - /** * @author Lorenz Buehmann - * + * */ -public class WindowBasedContextExtractor implements ContextExtractor{ - +public class WindowBasedContextExtractor implements ContextExtractor { + private StanfordCoreNLP pipeline; private int tokensLeft = 10; private int tokensRight = 10; @@ -29,57 +32,66 @@ public WindowBasedContextExtractor(int tokensLeft, int tokensRight) { this.tokensLeft = tokensLeft; this.tokensRight = tokensRight; - + Properties props = new Properties(); props.put("annotators", "tokenize, ssplit"); pipeline = new StanfordCoreNLP(props); } - + public WindowBasedContextExtractor(int tokensLeftRight) { tokensLeft = tokensLeftRight; tokensRight = tokensLeftRight; - + Properties props = new Properties(); props.put("annotators", "tokenize, ssplit"); pipeline = new StanfordCoreNLP(props); } - + public WindowBasedContextExtractor() { Properties props = new Properties(); props.put("annotators", "tokenize, ssplit"); pipeline = new StanfordCoreNLP(props); } - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.wsd.ContextExtractor#extractContext(java.lang.String, java.lang.String) + /* + * (non-Javadoc) + * + * @see + * org.dllearner.algorithms.isle.wsd.ContextExtractor#extractContext(java + * .lang.String, java.lang.String) */ @Override public List<String> extractContext(org.dllearner.algorithms.isle.index.Annotation annotation) { // split text into sentences - List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getContent()); + List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getRawContent()); // find the sentence containing the token of the annotation - int tokenStart = annotation.getOffset(); - int index = 0; + Token firstToken = annotation.getTokens().get(0); for (CoreMap sentence : sentences) { - String s = sentence.toString(); - if (index <= tokenStart && s.length() > tokenStart) { + boolean found = false; + for (CoreLabel label : sentence.get(TokensAnnotation.class)) { + // this is the text of the token + String word = label.get(TextAnnotation.class); + if (word.equals(firstToken.getRawForm())) { + found = true; + break; + } + } + if (found) { List<String> context = new ArrayList<String>(); for (CoreLabel label : sentence.get(TokensAnnotation.class)) { // this is the text of the token String word = label.get(TextAnnotation.class); - context.add(word); } return context; } - index += s.length(); } - throw new RuntimeException("Token " + annotation + " not found in text " - + annotation.getReferencedDocument().getContent()); + throw new RuntimeException("Token " + annotation.getString() + " not found in text " + + annotation.getReferencedDocument().getRawContent()); } - + private List<CoreMap> getSentences(String document) { // create an empty Annotation just with the given text Annotation annotation = new Annotation(document); @@ -94,14 +106,14 @@ return sentences; } - + public static void main(String[] args) throws Exception { String s = "International Business Machines Corporation, or IBM, is an American multinational services technology and consulting corporation, with headquarters in Armonk, New York, United States. IBM manufactures and markets computer hardware and software," + " and offers infrastructure, hosting and consulting services in areas ranging from mainframe computers to nanotechnology."; - + String token = "services"; WindowBasedContextExtractor extractor = new WindowBasedContextExtractor(); - List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(new TextDocument(s), s.indexOf(token), token.length())); + List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(TextDocumentGenerator.getInstance().generateDocument(s), Arrays.asList(new Token("American")))); System.out.println(context); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-11-21 13:16:16
|
Revision: 4163 http://sourceforge.net/p/dl-learner/code/4163 Author: dfleischhacker Date: 2013-11-21 13:16:13 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Annotation refactoring Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SimpleWordSenseDisambiguation.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-11-21 13:00:33 UTC (rev 4162) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-11-21 13:16:13 UTC (rev 4163) @@ -4,6 +4,9 @@ package org.dllearner.algorithms.isle.index; +import java.util.ArrayList; +import java.util.List; + /** * A (non-semantic) annotation which represents an entity in a document by its offset and length. * @author Lorenz Buehmann @@ -12,8 +15,7 @@ public class Annotation { private Document referencedDocument; - private int offset; - private int length; + private ArrayList<Token> tokens; private String matchedString; public String getMatchedString() { @@ -24,64 +26,64 @@ this.matchedString = matchedString; } - public Annotation(Document referencedDocument, int offset, int length) { + public Annotation(Document referencedDocument, List<Token> tokens) { this.referencedDocument = referencedDocument; - this.offset = offset; - this.length = length; - } + this.tokens = new ArrayList<Token>(tokens); + } public Document getReferencedDocument() { return referencedDocument; } - public int getOffset() { - return offset; - } + public String getString(){ + StringBuilder sb = new StringBuilder(); + for (Token t : tokens) { + if (sb.length() > 0) { + sb.append(" "); + } + sb.append(t.getStemmedForm()); + } + return sb.toString(); + } - public int getLength() { - return length; - } - - public String getToken(){ - return referencedDocument.getContent().substring(offset, offset + length); - } + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } - @Override - public int hashCode() { - final int prime = 31; - int result = 1; - result = prime * result + ((referencedDocument == null) ? 0 : referencedDocument.hashCode()); - result = prime * result + length; - result = prime * result + offset; - return result; - } + Annotation that = (Annotation) o; + if (matchedString != null ? !matchedString.equals(that.matchedString) : that.matchedString != null) { + return false; + } + if (referencedDocument != null ? !referencedDocument.equals(that.referencedDocument) : + that.referencedDocument != null) { + return false; + } + if (tokens != null ? !tokens.equals(that.tokens) : that.tokens != null) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = referencedDocument != null ? referencedDocument.hashCode() : 0; + result = 31 * result + (tokens != null ? tokens.hashCode() : 0); + result = 31 * result + (matchedString != null ? matchedString.hashCode() : 0); + return result; + } + + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - Annotation other = (Annotation) obj; - if (referencedDocument == null) { - if (other.referencedDocument != null) - return false; - } else if (!referencedDocument.equals(other.referencedDocument)) - return false; - if (length != other.length) - return false; - if (offset != other.offset) - return false; - return true; - } - - /* (non-Javadoc) - * @see java.lang.Object#toString() - */ - @Override public String toString() { - return "\"" + referencedDocument.getContent().substring(offset, offset+length) + "\" at position " + offset; - } + return getString(); + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-11-21 13:00:33 UTC (rev 4162) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-11-21 13:16:13 UTC (rev 4163) @@ -54,7 +54,7 @@ Annotation annotation_i = sortedAnnotations.get(i); int begin_i = annotation_i.getOffset(); int end_i = begin_i + annotation_i.getLength()-1; - String token_i = annotation_i.getToken(); + String token_i = annotation_i.getString(); Set<Entity> candidates_i = getCandidates(annotation_i); Set<Entity> newCandidates_i = new HashSet<Entity>(); @@ -68,7 +68,7 @@ for (int j=windowStart; j<sortedAnnotations.size() && j<windowEnd; j++) { if (j!=i) { Annotation annotation_j = sortedAnnotations.get(j); - String token_j = annotation_j.getToken(); + String token_j = annotation_j.getString(); Set<Entity> candidates_j = getCandidates(annotation_j); Set<Entity> intersection = Sets.intersection(candidates_i, candidates_j); Set<Entity> newCandidates_ij = new HashSet<Entity>(); @@ -83,7 +83,7 @@ if (!newCandidates_ij.isEmpty()) { Annotation mergedAnnotation = mergeAnnotations(annotation_i,annotation_j); // If there's no punctuation in the merged annotation - if (!Pattern.matches("\\p{Punct}", mergedAnnotation.getToken())) { + if (!Pattern.matches("\\p{Punct}", mergedAnnotation.getString())) { candidatesMap.put(mergedAnnotation, newCandidates_ij); candidatesMap.remove(annotation_i); candidatesMap.remove(annotation_j); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-11-21 13:00:33 UTC (rev 4162) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-11-21 13:16:13 UTC (rev 4163) @@ -48,6 +48,7 @@ public SemanticIndex(OWLOntology ontology) { this.ontology = ontology; + } /** Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java 2013-11-21 13:00:33 UTC (rev 4162) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java 2013-11-21 13:16:13 UTC (rev 4163) @@ -57,7 +57,7 @@ } index += s.length(); } - throw new RuntimeException("Token " + annotation.getToken() + " not found in text " + annotation.getReferencedDocument().getRawContent()); + throw new RuntimeException("Token " + annotation.getString() + " not found in text " + annotation.getReferencedDocument().getRawContent()); } private List<CoreMap> getSentences(String document) { Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SimpleWordSenseDisambiguation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SimpleWordSenseDisambiguation.java 2013-11-21 13:00:33 UTC (rev 4162) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SimpleWordSenseDisambiguation.java 2013-11-21 13:16:13 UTC (rev 4163) @@ -50,7 +50,7 @@ public SemanticAnnotation disambiguate(Annotation annotation, Set<Entity> candidateEntities) { logger.debug("Linguistic annotations:\n" + annotation); logger.debug("Candidate entities:" + candidateEntities); - String token = annotation.getToken().trim(); + String token = annotation.getString().trim(); //check if annotation token matches label of entity or the part behind #(resp. /) for (Entity entity : candidateEntities) { Set<String> labels = getLabels(entity); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-11-21 13:00:36
|
Revision: 4162 http://sourceforge.net/p/dl-learner/code/4162 Author: dfleischhacker Date: 2013-11-21 13:00:33 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Ignore punctuation in stemmed text Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 12:57:10 UTC (rev 4161) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 13:00:33 UTC (rev 4162) @@ -37,7 +37,10 @@ for (Token t : this) { if (found) { sb.append(" "); - sb.append(getStringForLevel(t, l)); + String surfaceForm = getStringForLevel(t, l); + if (surfaceForm != null) { + sb.append(surfaceForm); + } } else if (t == start) { found = true; @@ -55,9 +58,19 @@ case POS_TAGGED: return t.getPOSTag(); case STEMMED: - return t.getStemmedForm(); + return t.isPunctuation() ? null : t.getStemmedForm(); } return null; } + + public static void main(String[] args) { + TextDocument t = new TextDocument(); + String s = "This is a very long, nice text for testing our new implementation of TextDocument."; + for (String e : s.split(" ")) { + t.add(new Token(e)); + } + + System.out.println(t.getRawContent()); + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-11-21 12:57:14
|
Revision: 4161 http://sourceforge.net/p/dl-learner/code/4161 Author: lorenz_b Date: 2013-11-21 12:57:10 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Cont. text document generator. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-11-21 12:51:05 UTC (rev 4160) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-11-21 12:57:10 UTC (rev 4161) @@ -19,7 +19,10 @@ public class TextDocumentGenerator { private static TextDocumentGenerator instance; + private StanfordCoreNLP pipeline; + private final String punctuationPattern = "\\p{Punct}"; + private final StopWordFilter stopWordFilter = new StopWordFilter(); private TextDocumentGenerator(){ Properties props = new Properties(); @@ -54,14 +57,22 @@ String pos = label.get(PartOfSpeechAnnotation.class); //this is the POS tag of the token String lemma = label.get(LemmaAnnotation.class); + //check if token is punctuation + boolean isPunctuation = word.matches(punctuationPattern); + //check if it is a stop word + boolean isStopWord = stopWordFilter.isStopWord(word); - Token token = new Token(word); - token.setPOSTag(pos); - token.setStemmedForm(lemma); + Token token = new Token(word, lemma, pos, isPunctuation, isStopWord); + document.add(token); } } return document; } + + public static void main(String[] args) throws Exception { + TextDocument document = TextDocumentGenerator.getInstance().tag("And he said, Amos, what seest thou? And I said, A basket of summer fruit. Then said the LORD unto me, The end is come upon my people of Israel; I will not again pass by them any more. "); + System.out.println(document); + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-11-21 12:51:05 UTC (rev 4160) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-11-21 12:57:10 UTC (rev 4161) @@ -12,11 +12,21 @@ private String rawForm; private String stemmedForm; private String posTag; + private boolean isPunctuation; + private boolean isStopWord; public Token(String rawForm) { - posTag = rawForm; + this.rawForm = rawForm; } + public Token(String rawForm, String stemmedForm, String posTag, boolean isPunctuation, boolean isStopWord) { + this.rawForm = rawForm; + this.stemmedForm = stemmedForm; + this.posTag = posTag; + this.isPunctuation = isPunctuation; + this.isStopWord = isStopWord; + } + /** * @return the rawForm */ @@ -39,6 +49,20 @@ } /** + * @return the isPunctuation + */ + public boolean isPunctuation() { + return isPunctuation; + } + + /** + * @return the isStopWord + */ + public boolean isStopWord() { + return isStopWord; + } + + /** * @param stemmedForm the stemmedForm to set */ public void setStemmedForm(String stemmedForm) { @@ -51,14 +75,28 @@ public void setPOSTag(String posTag) { this.posTag = posTag; } + + /** + * @param isPunctuation the isPunctuation to set + */ + public void setIsPunctuation(boolean isPunctuation) { + this.isPunctuation = isPunctuation; + } + + /** + * @param isStopWord the isStopWord to set + */ + public void setIsStopWord(boolean isStopWord) { + this.isStopWord = isStopWord; + } /* (non-Javadoc) * @see java.lang.Object#toString() */ @Override public String toString() { - return "Word: " + rawForm + "\n" + return "\n[Word: " + rawForm + "\n" + "Stemmed word: " + stemmedForm + "\n" - + "POS tag: " + posTag; + + "POS tag: " + posTag + "]"; } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |