From: <dfl...@us...> - 2013-09-04 15:04:42
|
Revision: 4065 http://sourceforge.net/p/dl-learner/code/4065 Author: dfleischhacker Date: 2013-09-04 15:04:37 +0000 (Wed, 04 Sep 2013) Log Message: ----------- Add lemmatizing to linguistic utils Modified Paths: -------------- trunk/components-core/pom.xml trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java Modified: trunk/components-core/pom.xml =================================================================== --- trunk/components-core/pom.xml 2013-09-04 14:39:59 UTC (rev 4064) +++ trunk/components-core/pom.xml 2013-09-04 15:04:37 UTC (rev 4065) @@ -195,7 +195,13 @@ <version>1.0</version> </dependency> + <dependency> + <groupId>edu.northwestern.at</groupId> + <artifactId>morphadorner</artifactId> + <version>2009-04-30</version> + </dependency> + <!-- This module is a library module, so it needs only to have the slf api dependency to enable logging --> <dependency> <groupId>org.slf4j</groupId> Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-09-04 14:39:59 UTC (rev 4064) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-09-04 15:04:37 UTC (rev 4065) @@ -1,9 +1,16 @@ package org.dllearner.algorithms.isle.index; +import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.DefaultLemmatizer; +import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.Lemmatizer; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.pipeline.*; +import edu.stanford.nlp.util.CoreMap; import net.didion.jwnl.data.POS; import org.dllearner.algorithms.isle.WordNet; import java.util.ArrayList; +import java.util.Properties; /** * Provides shortcuts to commonly used linguistic operations @@ -12,7 +19,17 @@ public class LinguisticUtil { private static final WordNet wn = new WordNet(); private static POS[] RELEVANT_POS = new POS[]{POS.NOUN, POS.VERB}; + private static Lemmatizer lemmatizer; + static { + try { + lemmatizer = new DefaultLemmatizer(); + } + catch (Exception e) { + e.printStackTrace(); + } + } + /** * Processes the given string and puts camelCased words into single words. * @param camelCase the word containing camelcase to split @@ -54,7 +71,13 @@ return underScored.split("_"); } - // get synonyms + /** + * Returns an array of all synonyms for the given word. Only synonyms for the POS in {@link #RELEVANT_POS} are + * returned. + * + * @param word the word to retrieve synonyms for + * @return synonyms for the given word + */ public static String[] getSynonymsForWord(String word) { ArrayList<String> synonyms = new ArrayList<String>(); @@ -64,7 +87,28 @@ return synonyms.toArray(new String[synonyms.size()]); } + /** + * Returns the normalized form of the given word. This method is only able to work with single words! If there is an + * error normalizing the given word, the word itself is returned. + * + * @param word the word to get normalized form for + * @return normalized form of the word or the word itself on an error + */ + public static String getNormalizedForm(String word) { + try { + if (lemmatizer == null) { + return word; + } + return lemmatizer.lemmatize(word); + } + catch (Exception e) { + e.printStackTrace(); + } + return word; + } + public static void main(String[] args) { + System.out.println(getNormalizedForm("going")); for (String s : getWordsFromCamelCase("thisIsAClassWith1Name123")) { System.out.println(s); for (String w : getSynonymsForWord(s)) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |