From: <dfl...@us...> - 2013-09-06 13:31:47
|
Revision: 4095 http://sourceforge.net/p/dl-learner/code/4095 Author: dfleischhacker Date: 2013-09-06 13:31:43 +0000 (Fri, 06 Sep 2013) Log Message: ----------- Make LinguisticUtil singleton Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-09-06 12:48:08 UTC (rev 4094) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-09-06 13:31:43 UTC (rev 4095) @@ -6,23 +6,25 @@ import org.dllearner.algorithms.isle.WordNet; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; /** * Provides shortcuts to commonly used linguistic operations * @author Daniel Fleischhacker */ public class LinguisticUtil { + private static LinguisticUtil instance; + private static final WordNet wn = new WordNet(); private static POS[] RELEVANT_POS = new POS[]{POS.NOUN, POS.VERB}; private static Lemmatizer lemmatizer; - static { - try { - lemmatizer = new DefaultLemmatizer(); + public static LinguisticUtil getInstance() { + if (instance == null) { + instance = new LinguisticUtil(); } - catch (Exception e) { - e.printStackTrace(); - } + return instance; } /** @@ -30,7 +32,7 @@ * @param camelCase the word containing camelcase to split * @return all words as camelcase contained in the given word */ - public static String[] getWordsFromCamelCase(String camelCase) { + public String[] getWordsFromCamelCase(String camelCase) { ArrayList<String> resultingWords = new ArrayList<String>(); StringBuilder sb = new StringBuilder(); for (int i = 0; i < camelCase.length(); i++) { @@ -66,7 +68,7 @@ * @param underScored word to split at underscores * @return words contained in given word */ - public static String[] getWordsFromUnderscored(String underScored) { + public String[] getWordsFromUnderscored(String underScored) { return underScored.split("_"); } @@ -77,7 +79,7 @@ * @param word the word to retrieve synonyms for * @return synonyms for the given word */ - public static String[] getSynonymsForWord(String word) { + public String[] getSynonymsForWord(String word) { ArrayList<String> synonyms = new ArrayList<String>(); for (POS pos : RELEVANT_POS) { @@ -94,7 +96,7 @@ * @param n the number of senses to get lemmas for * @return synonyms for the given word */ - public static String[] getTopSynonymsForWord(String word, int n) { + public String[] getTopSynonymsForWord(String word, int n) { ArrayList<String> synonyms = new ArrayList<String>(); for (POS pos : RELEVANT_POS) { @@ -104,30 +106,48 @@ } /** - * Returns the normalized form of the given word. This method is only able to work with single words! If there is an - * error normalizing the given word, the word itself is returned. + * Returns the normalized form of the given word. If the word contains spaces, each part separated by spaces is + * normalized independently and joined afterwards. If there is an error normalizing the given word, the word itself + * is returned. * * @param word the word to get normalized form for * @return normalized form of the word or the word itself on an error */ - public static String getNormalizedForm(String word) { - try { - if (lemmatizer == null) { - return word; + public String getNormalizedForm(String word) { + StringBuilder res = new StringBuilder(); + + boolean first = true; + + ArrayList<String> singleWords = new ArrayList<String>(); + Collections.addAll(singleWords, word.split(" ")); + + for (String w : singleWords) { + try { + if (first) { + first = false; + } + else { + res.append(" "); + } + if (lemmatizer == null) { + res.append(w); + } + else { + res.append(lemmatizer.lemmatize(w)); + } } - return lemmatizer.lemmatize(word); + catch (Exception e) { + e.printStackTrace(); + } } - catch (Exception e) { - e.printStackTrace(); - } - return word; + return res.toString(); } public static void main(String[] args) { - System.out.println(getNormalizedForm("going")); - for (String s : getWordsFromCamelCase("thisIsAClassWith1Name123")) { + System.out.println(LinguisticUtil.getInstance().getNormalizedForm("going")); + for (String s : LinguisticUtil.getInstance().getWordsFromCamelCase("thisIsAClassWith1Name123")) { System.out.println(s); - for (String w : getSynonymsForWord(s)) { + for (String w : LinguisticUtil.getInstance().getSynonymsForWord(s)) { System.out.println(" --> " + w); } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-09-06 12:48:08 UTC (rev 4094) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-09-06 13:31:43 UTC (rev 4095) @@ -42,7 +42,10 @@ Set<SemanticAnnotation> semanticAnnotations = new HashSet<SemanticAnnotation>(); for (Annotation annotation : annotations) { Set<Entity> candidateEntities = entityCandidateGenerator.getCandidates(annotation); - SemanticAnnotation semanticAnnotation = wordSenseDisambiguation.disambiguate(annotation, candidateEntities); + if (candidateEntities == null || candidateEntities.size() == 0) { + continue; + } + SemanticAnnotation semanticAnnotation = wordSenseDisambiguation.disambiguate(annotation, candidateEntities); if(semanticAnnotation != null){ semanticAnnotations.add(semanticAnnotation); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-09-06 12:48:08 UTC (rev 4094) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-09-06 13:31:43 UTC (rev 4095) @@ -45,14 +45,14 @@ for (Entity entity : relevantText.keySet()) { for (String text : relevantText.get(entity)) { - text = StringUtils.join(LinguisticUtil.getWordsFromCamelCase(text), " "); - text = StringUtils.join(LinguisticUtil.getWordsFromUnderscored(text), " "); + text = StringUtils.join(LinguisticUtil.getInstance().getWordsFromCamelCase(text), " "); + text = StringUtils.join(LinguisticUtil.getInstance().getWordsFromUnderscored(text), " "); if (text.trim().isEmpty()) { continue; } addEntry(text, entity); for (String alternativeText : nameGenerator.getAlternativeText(text)) { -// System.out.println("New alternative text for " + text + " --> " + alternativeText); + System.out.println("New alternative text for " + text + " --> " + alternativeText); addEntry(alternativeText, entity); } // Adds also composing words, e.g. for "has child", "has" and "child" are also added @@ -60,7 +60,7 @@ for (String subtext : text.split(" ")) { addEntry(subtext, entity); for (String alternativeText : nameGenerator.getAlternativeText(subtext)) { -// System.out.println("New alternative text for " + subtext + " --> " + alternativeText); + System.out.println("New alternative text for " + subtext + " --> " + alternativeText); addEntry(alternativeText, entity); } //System.out.println("trie.add("+subtext+","++")"); @@ -146,7 +146,7 @@ @Override public List<String> getAlternativeText(String word) { - return Arrays.asList(LinguisticUtil.getTopSynonymsForWord(word, maxNumberOfSenses)); + return Arrays.asList(LinguisticUtil.getInstance().getTopSynonymsForWord(word, maxNumberOfSenses)); } } @@ -167,10 +167,10 @@ @Override public List<String> getAlternativeText(String word) { ArrayList<String> res = new ArrayList<String>(); - res.add(LinguisticUtil.getNormalizedForm(word)); + res.add(LinguisticUtil.getInstance().getNormalizedForm(word)); - for (String w : LinguisticUtil - .getTopSynonymsForWord(LinguisticUtil.getNormalizedForm(word), maxNumberOfSenses)) { + for (String w : LinguisticUtil.getInstance() + .getTopSynonymsForWord(LinguisticUtil.getInstance().getNormalizedForm(word), maxNumberOfSenses)) { res.add(w.replaceAll("_", " ")); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-09-06 12:48:08 UTC (rev 4094) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-09-06 13:31:43 UTC (rev 4095) @@ -27,7 +27,7 @@ Set<Annotation> annotations = new HashSet<Annotation>(); for (int i = 0; i < content.length(); i++) { String unparsed = content.substring(i); - String match = candidatesTrie.getLongestMatch(unparsed); + String match = candidatesTrie.getLongestMatch(LinguisticUtil.getInstance().getNormalizedForm(unparsed)); if (match != null && !match.isEmpty()) { Annotation annotation = new Annotation(document, i, match.length()); annotations.add(annotation); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-09-06 12:48:08 UTC (rev 4094) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-09-06 13:31:43 UTC (rev 4095) @@ -91,8 +91,8 @@ if(textWithWeight.isEmpty() && useShortFormFallback){ String shortForm = sfp.getShortForm(IRI.create(entity.getURI())); - shortForm = Joiner.on(" ").join(LinguisticUtil.getWordsFromCamelCase(shortForm)); - shortForm = Joiner.on(" ").join(LinguisticUtil.getWordsFromUnderscored(shortForm)).trim(); + shortForm = Joiner.on(" ").join(LinguisticUtil.getInstance().getWordsFromCamelCase(shortForm)); + shortForm = Joiner.on(" ").join(LinguisticUtil.getInstance().getWordsFromUnderscored(shortForm)).trim(); textWithWeight.put(shortForm, weight); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |