From: <chr...@us...> - 2011-09-14 10:05:36
|
Revision: 3258
          http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3258&view=rev
Author:   christinaunger
Date:     2011-09-14 10:05:25 +0000 (Wed, 14 Sep 2011)
Log Message:
-----------
[tbsl] updates wordnet lookup for slots

Modified Paths:
--------------
    trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/LingPipeLemmatizer.java
    trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/sparql/Slot.java
    trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/Templator.java
    trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/WordNet.java
    trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/LemmatizationTest.java
    trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/WordNetTest.java

Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/LingPipeLemmatizer.java
===================================================================
--- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/LingPipeLemmatizer.java 2011-09-14 08:23:09 UTC (rev 3257)
+++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/LingPipeLemmatizer.java 2011-09-14 10:05:25 UTC (rev 3258)
@@ -7,6 +7,9 @@
 public class LingPipeLemmatizer implements Lemmatizer {
+    public LingPipeLemmatizer() {
+    }
+
     @Override
     public String stem(String word) {
         return PorterStemmerTokenizerFactory.stem(word);

Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/sparql/Slot.java
===================================================================
--- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/sparql/Slot.java 2011-09-14 08:23:09 UTC (rev 3257)
+++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/sparql/Slot.java 2011-09-14 10:05:25 UTC (rev 3258)
@@ -10,6 +10,7 @@
     String token;
     SlotType type;
     List<String> words;
+    String postag;
     public Slot(String a,List<String> ws) {
         anchor = a;
@@ -17,15 +18,21 @@
         type = SlotType.UNSPEC;
         words = ws;
         replaceUnderscores();
+        postag = "";
     }
     public Slot(String a,SlotType t,List<String> ws) {
         anchor = a;
         token = "";
         type = t;
         words = ws;
-        replaceUnderscores();
+        replaceUnderscores();
+        postag = "";
     }
+    public void setPOStag(String s) {
+        postag = s;
+    }
+
     public void setSlotType(SlotType st) {
         type = st;
     }

Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/Templator.java
===================================================================
--- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/Templator.java 2011-09-14 08:23:09 UTC (rev 3257)
+++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/Templator.java 2011-09-14 10:05:25 UTC (rev 3258)
@@ -3,6 +3,7 @@
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.HashSet;
+import java.util.Hashtable;
 import java.util.List;
 import java.util.Set;
@@ -13,6 +14,7 @@
 import org.dllearner.algorithm.tbsl.ltag.parser.Parser;
 import org.dllearner.algorithm.tbsl.ltag.parser.Preprocessor;
 import org.dllearner.algorithm.tbsl.nlp.ApachePartOfSpeechTagger;
+import org.dllearner.algorithm.tbsl.nlp.LingPipeLemmatizer;
 import org.dllearner.algorithm.tbsl.nlp.PartOfSpeechTagger;
 import org.dllearner.algorithm.tbsl.sem.drs.DRS;
 import org.dllearner.algorithm.tbsl.sem.drs.UDRS;
@@ -31,10 +33,17 @@
     Parser p;
     Preprocessor pp;
+    WordNet wordnet;
+    LingPipeLemmatizer lem = new LingPipeLemmatizer();
+
+    DUDE2UDRS_Converter d2u = new DUDE2UDRS_Converter();
+    DRS2SPARQL_Converter d2s = new DRS2SPARQL_Converter();
+
     boolean ONE_SCOPE_ONLY = true;
     boolean UNTAGGED_INPUT = true;
     public Templator() {
+
         List<InputStream> grammarFiles = new ArrayList<InputStream>();
         for(int i = 0; i < GRAMMAR_FILES.length; i++){
             grammarFiles.add(this.getClass().getClassLoader().getResourceAsStream(GRAMMAR_FILES[i]));
@@ -52,6 +61,8 @@
         p.MODE = "LEIPZIG";
         pp = new Preprocessor(true);
+
+        wordnet = new WordNet();
     }
     public void setUNTAGGED_INPUT(boolean b) {
@@ -60,8 +71,6 @@
     public Set<Template> buildTemplates(String s) {
-        DUDE2UDRS_Converter d2u = new DUDE2UDRS_Converter();
-        DRS2SPARQL_Converter d2s = new DRS2SPARQL_Converter();
         boolean clearAgain = true;
         String tagged;
@@ -93,6 +102,13 @@
             }
         }
+        // build pairs <String,POStag> from tagged
+        Hashtable<String,String> postable = new Hashtable<String,String>();
+        for (String st : newtagged.split(" ")) {
+            postable.put(st.substring(0,st.indexOf("/")),st.substring(st.indexOf("/")+1));;
+        }
+        //
+
         Set<DRS> drses = new HashSet<DRS>();
         Set<Template> templates = new HashSet<Template>();
@@ -119,6 +135,37 @@
                 try {
                     Template temp = d2s.convert(drs,slots);
+
+                    // find WordNet synonyms
+                    List<String> newwords;
+                    String word;
+                    String pos;
+                    for (Slot slot : temp.getSlots()) {
+                        if (!slot.getWords().isEmpty()) {
+
+                            word = slot.getWords().get(0);
+                            pos = postable.get(word.replace(" ","_"));
+                            List<String> strings = wordnet.getAttributes(word);
+
+                            newwords = new ArrayList<String>();
+                            newwords.add(word);
+                            newwords.addAll(strings);
+                            if (strings.isEmpty()) {
+                                newwords.addAll(wordnet.getBestSynonyms(word,pos));
+                            } else {
+                                for (String att : strings) {
+                                    newwords.addAll(wordnet.getBestSynonyms(att,pos));
+                                }
+                            }
+                            if (newwords.isEmpty()) {
+                                newwords.add(slot.getWords().get(0));
+                            }
+                            // stem = lem.stem(slot.getWords().get(0)); newwords.add(stem);
+                            slot.setWords(newwords);
+                        }
+                    }
+                    //
+
                     templates.add(temp);
                 } catch (java.lang.ClassCastException e) {
                     continue;

Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/WordNet.java
===================================================================
--- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/WordNet.java 2011-09-14 08:23:09 UTC (rev 3257)
+++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/WordNet.java 2011-09-14 10:05:25 UTC (rev 3258)
@@ -2,7 +2,10 @@
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Hashtable;
 import java.util.List;
+import java.util.Set;
 import edu.smu.tspell.wordnet.*;
@@ -10,23 +13,57 @@
     private WordNetDatabase database;
+    private String[] noun = {"NN","NNS","NNP","NNPS","NPREP","JJNN","JJNPREP"};
+    private String[] adjective = {"JJ","JJR","JJS","JJH"};
+    private String[] verb = {"VB","VBD","VBG","VBN","VBP","VBZ","PASSIVE","PASSPART","VPASS","VPASSIN","GERUNDIN","VPREP","WHEN","WHERE"};
+    private String[] preps = {"IN","TO"};
+
     public WordNet() {
+        System.setProperty("wordnet.database.dir", System.getProperty("user.dir") + "/src/main/resources/tbsl/dict/");
         database = WordNetDatabase.getFileInstance();
     }
-    public List<String> getBestSynonyms(String s) {
+    public Set<String> getBestSynonyms(String s,String pos) {
-        List<String> synonyms = new ArrayList<String>();
+        Set<String> synonyms = new HashSet<String>();
-        Synset[] synsets = database.getSynsets(s);
-        if (synsets.length != 0) {
-            String[] candidates = synsets[0].getWordForms();
-            for (String c : candidates) {
-                if (!c.equals(s) && !c.contains(" ") && synonyms.size() < 4) {
-                    synonyms.add(c);
+        SynsetType type = null;
+        if (equalsOneOf(pos,noun)) {
+            type = SynsetType.NOUN;
+        }
+        else if (equalsOneOf(pos,adjective)) {
+            type = SynsetType.ADJECTIVE;
+        }
+        else if (equalsOneOf(pos,verb)) {
+            type = SynsetType.VERB;
+        }
+
+        String[] basecandidates;
+        if (type != null) {
+            String[] bfc = database.getBaseFormCandidates(s,type);
+            basecandidates = new String[bfc.length + 1];
+            basecandidates[0] = s;
+            for (int i = 0; i < bfc.length; i++) {
+                basecandidates[i+1] = bfc[i];
+            }
+        }
+        else {
+            basecandidates = new String[1];
+            basecandidates[0] = s;
+        }
+
+        for (String b : basecandidates) {
+            Synset[] synsets = database.getSynsets(b);
+            if (synsets.length != 0) {
+                String[] candidates = synsets[0].getWordForms();
+                for (String c : candidates) {
+                    if (!c.equals(b) && !c.contains(" ") && synonyms.size() < 4) {
+                        synonyms.add(c);
+                    }
                 }
             }
         }
     }
+
         return synonyms;
     }
@@ -67,4 +104,13 @@
         return result;
     }
+    private boolean equalsOneOf(String string,String[] strings) {
+        for (String s : strings) {
+            if (string.equals(s)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
 }

Modified: trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/LemmatizationTest.java
===================================================================
--- trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/LemmatizationTest.java 2011-09-14 08:23:09 UTC (rev 3257)
+++ trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/LemmatizationTest.java 2011-09-14 10:05:25 UTC (rev 3258)
@@ -1,13 +1,21 @@
 package org.dllearner.algorithm.tbsl;
+import org.dllearner.algorithm.tbsl.nlp.LingPipeLemmatizer;
+
 public class LemmatizationTest {
     /**
      * @param args
      */
     public static void main(String[] args) {
-        // TODO Auto-generated method stub
+
+        LingPipeLemmatizer lem = new LingPipeLemmatizer();
+        System.out.println(lem.stem("soccer clubs"));
+        System.out.println(lem.stem("Permier League","NNP"));
+        System.out.println(lem.stem("cities","NNS"));
+        System.out.println(lem.stem("killed"));
+        System.out.println(lem.stem("bigger"));
     }
 }

Modified: trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/WordNetTest.java
===================================================================
--- trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/WordNetTest.java 2011-09-14 08:23:09 UTC (rev 3257)
+++ trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/WordNetTest.java 2011-09-14 10:05:25 UTC (rev 3258)
@@ -2,14 +2,25 @@
 import org.dllearner.algorithm.tbsl.templator.WordNet;
+import edu.smu.tspell.wordnet.SynsetType;
+import edu.smu.tspell.wordnet.WordNetDatabase;
+
 public class WordNetTest {
     public static void main(String[] args) {
+        System.setProperty("wordnet.database.dir", System.getProperty("user.dir") + "/src/main/resources/tbsl/dict/");
+        WordNetDatabase database = WordNetDatabase.getFileInstance();
+
+        System.out.println(database.getBaseFormCandidates("cities",SynsetType.NOUN)[1]);
+
         WordNet wordnet = new WordNet();
-
-        System.out.println(wordnet.getBestSynonyms("city"));
+
+        System.out.println(wordnet.getAttributes("bigger"));
+        System.out.println(wordnet.getBestSynonyms("city","NN"));
         System.out.println(wordnet.getAttributes("biggest"));
+        System.out.println(wordnet.getBestSynonyms("biggest","JJR"));
+        System.out.println(wordnet.getAttributes("city"));
     }
 }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|