From: <chr...@us...> - 2011-11-14 10:56:48
|
Revision: 3400 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3400&view=rev Author: christinaunger Date: 2011-11-14 10:56:41 +0000 (Mon, 14 Nov 2011) Log Message: ----------- [tbsl] enabled parsing of NERs enclosed in "..." and of genitives (...'s) Modified Paths: -------------- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/ltag/parser/GrammarFilter.java trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/ltag/parser/Parser.java trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/ltag/parser/Preprocessor.java trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/SlotBuilder.java trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/Templator.java Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/ltag/parser/GrammarFilter.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/ltag/parser/GrammarFilter.java 2011-11-12 23:10:28 UTC (rev 3399) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/ltag/parser/GrammarFilter.java 2011-11-14 10:56:41 UTC (rev 3400) @@ -33,6 +33,8 @@ static List<Integer> usedInts = new ArrayList<Integer>(); static ArrayList<String> doubles = new ArrayList<String>(); + public static boolean VERBOSE = true; + static ParseGrammar filter(String taggedinput,LTAGLexicon grammar,List<Integer> temps, String mode) { // DISAM: CLEAR @@ -191,7 +193,7 @@ start++; } - logger.trace("\ncovered tokens: " + coveredTokens); + if (VERBOSE) logger.trace("\ncovered tokens: " + coveredTokens); /* construct slots for all unknown tokens */ @@ -211,7 +213,7 @@ } } } - logger.trace("unknown words: " + unknownWords); + if (VERBOSE) logger.trace("unknown words: " + unknownWords); List<Pair<String,String>> buildSlotFor = new ArrayList<Pair<String,String>>(); @@ -238,7 +240,7 @@ System.out.println("Oh no, " + s + " has no POS tag!"); } } - logger.trace("build 
slot for: " + buildSlotFor + "\n"); + if (VERBOSE) logger.trace("build slot for: " + buildSlotFor + "\n"); List<String[]> entries; if (mode.equals("LEIPZIG")) { Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/ltag/parser/Parser.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/ltag/parser/Parser.java 2011-11-12 23:10:28 UTC (rev 3399) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/ltag/parser/Parser.java 2011-11-14 10:56:41 UTC (rev 3400) @@ -18,6 +18,7 @@ public boolean USE_LESS_MEMORY = false; public boolean SHOW_GRAMMAR = false; public boolean SHOW_LEXICAL_COVERAGE = false; + public boolean VERBOSE = true; public String MODE = "BASIC"; // MODE ::= BASIC | LEIPZIG (set by Templator and BasicTemplator) private String[] input; @@ -50,6 +51,8 @@ derivedTrees.clear(); dudes.clear(); temporaryEntries.clear(); + + if (!VERBOSE) GrammarFilter.VERBOSE = false; /* * create a local copy of the grammar with own treeIDs. 
This is @@ -64,7 +67,7 @@ inputNoTags += s.substring(0,s.indexOf("/")) + " "; } - this.input = ("# ".concat(inputNoTags.trim())).split(" "); + this.input = ("# ".concat(inputNoTags.replaceAll("'","").trim())).split(" "); int n = this.input.length; @@ -84,7 +87,7 @@ internalParse(parseGrammar.getDPInitTrees(), n); } - logger.trace("Constructed " + derivationTrees.size() + " derivation trees.\n"); + if (VERBOSE) logger.trace("Constructed " + derivationTrees.size() + " derivation trees.\n"); return derivationTrees; } Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/ltag/parser/Preprocessor.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/ltag/parser/Preprocessor.java 2011-11-12 23:10:28 UTC (rev 3399) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/ltag/parser/Preprocessor.java 2011-11-14 10:56:41 UTC (rev 3400) @@ -16,19 +16,25 @@ private static final Logger logger = Logger.getLogger(Preprocessor.class); - static final String[] genericReplacements = { "\"", "", "'", "", "[!?.,;]", "" }; + static final String[] genericReplacements = { "[!?.,;]", "" }; static final String[] englishReplacements = { "don't", "do not", "doesn't", "does not" }; static boolean USE_NER; + static boolean VERBOSE; static NER ner; public Preprocessor(boolean n) { USE_NER = n; + VERBOSE = true; if (USE_NER) { // ner = new LingPipeNER(true); //not case sensitive best solution? ner = new DBpediaSpotlightNER(); } } + public void setVERBOSE(boolean b) { + VERBOSE = b; + } + public String normalize(String s) { return normalize(s, new String[0]); } @@ -58,7 +64,7 @@ * nn/RBR of/IN > nn/NPREP * usw. 
* */ - String condensedstring = taggedstring; + String condensedstring = taggedstring.replaceAll("``/``","").replaceAll("''/''","").replaceAll(" "," "); Matcher m; Pattern compAdjPattern = Pattern.compile("(\\w+/RBR.(\\w+)/JJ)"); @@ -89,7 +95,7 @@ m = compAdjPattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/JJR"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/JJR"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(2)+"/JJR"); } // m = superAdjPattern.matcher(condensedstring); @@ -99,57 +105,57 @@ // } m = howManyPattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by how/WLEX many/WLEX"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by how/WLEX many/WLEX"); condensedstring = condensedstring.replaceFirst(m.group(1),"how/WLEX many/WLEX"); } m = howAdjPattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/JJH"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/JJH"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(2)+"/JJH"); } m = thesameasPattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/NNSAME"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/NNSAME"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(2)+"/NNSAME"); } m = nprepPattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/NPREP"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/NPREP"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(2)+"/NPREP"); } m = didPattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by \"\""); + if (VERBOSE) logger.trace("Replacing " + 
m.group(1) + " by \"\""); condensedstring = condensedstring.replaceFirst(m.group(1),""); } m = prepfrontPattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by \"\""); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by \"\""); condensedstring = condensedstring.replaceFirst(m.group(1),""); } m = passivePattern1a.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(6)+"/PASSIVE"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(6)+"/PASSIVE"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(6)+"/PASSIVE"); } m = passivePattern1b.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(6)+m.group(7)+"/PASSIVE"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(6)+m.group(7)+"/PASSIVE"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(6) + m.group(7)+"/PASSIVE"); } m = passivePattern2a.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(7)+"/PASSIVE"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(7)+"/PASSIVE"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(7)+"/PASSIVE"); } m = pseudopassPattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(7)+"/VPREP"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(7)+"/VPREP"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(7)+"/VPREP"); } m = pseudopwhPattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(7)+m.group(8)+"/VPREP"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(7)+m.group(8)+"/VPREP"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(7)+" "+m.group(8)+"/VPREP"); } m = 
saveIsThere.matcher(condensedstring); @@ -158,57 +164,57 @@ } m = passivePattern2b.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(7)+"/PASSIVE"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(7)+"/PASSIVE"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(7)+"/PASSIVE"); } m = passpartPattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/PASSPART"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/PASSPART"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(2)+"/PASSPART"); } m = vpassPattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/VPASS"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/VPASS"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(2)+"/VPASS"); } m = vpassinPattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/VPASSIN"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/VPASSIN"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(2)+"/VPASSIN"); } m = gerundinPattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/GERUNDIN"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/GERUNDIN"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(2)+"/GERUNDIN"); } m = vprepPattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/VPREP"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"/VPREP"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(2)+"/VPREP"); } m = whenPattern.matcher(condensedstring); while (m.find()) { - 
logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+m.group(3)+"/WHEN"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+m.group(3)+"/WHEN"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(2) + m.group(3)+"/WHEN"); } m = wherePattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+m.group(3)+"/WHERE"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+m.group(3)+"/WHERE"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(2) + m.group(3)+"/WHERE"); } m = adjsPattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"_"+m.group(3)+"/JJ"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"_"+m.group(3)+"/JJ"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(2)+"_"+m.group(3)+"/JJ"); } m = adjnounPattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"_"+m.group(3)+"/JJNN"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"_"+m.group(3)+"/JJNN"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(2)+"_"+m.group(3)+"/JJNN"); } m = adjnprepPattern.matcher(condensedstring); while (m.find()) { - logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"_"+m.group(3)+"/JJNPREP"); + if (VERBOSE) logger.trace("Replacing " + m.group(1) + " by " + m.group(2)+"_"+m.group(3)+"/JJNPREP"); condensedstring = condensedstring.replaceFirst(m.group(1),m.group(2)+"_"+m.group(3)+"/JJNPREP"); } @@ -259,7 +265,7 @@ List<String> namedentities = ner.getNamedEntitites(untagged); List<String> usefulnamedentities = new ArrayList<String>(); - logger.trace("Proposed NEs: " + namedentities); + if (VERBOSE) logger.trace("Proposed NEs: " + namedentities); // keep only longest matches (e.g. 
keep 'World of Warcraft' and forget about 'Warcraft') // containing at least one upper case letter (in order to filter out errors like 'software') @@ -277,7 +283,7 @@ } } - logger.trace("Accepted NEs: " + usefulnamedentities); + if (VERBOSE) logger.trace("Accepted NEs: " + usefulnamedentities); // replace POS tags accordingly for (String ne : usefulnamedentities) { Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/SlotBuilder.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/SlotBuilder.java 2011-11-12 23:10:28 UTC (rev 3399) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/SlotBuilder.java 2011-11-14 10:56:41 UTC (rev 3400) @@ -34,8 +34,23 @@ String type = "UNSPEC"; + /* 's */ + if (token.equals("'s")) { + String slot = "SLOT_of/SYMPROPERTY/of"; + String[] npAdjunct = {token, + "(NP NP* PART:'s' NP[obj]))", + "<x,l1,<e,t>,[ l1:[ y | SLOT_of(x,y) ] ],[(l2,y,obj,<e,t>)],[l2=l1],["+slot+"]>" + + " ;; <x,l1,<e,t>,[ l1:[ y | empty(x,y) ] ],[(l2,y,obj,<e,t>)],[l2=l1],[]>"}; + String[] dpAdjunct = {token, + "(DP DP* PART:'s' NP[obj]))", + "<x,l1,<<e,t>,t>,[ l1:[ y | SLOT_of(x,y) ] ],[(l2,y,obj,<e,t>)],[l2=l1],["+slot+"]>" + + " ;; <x,l1,<<e,t>,t>,[ l1:[ y | empty(x,y) ] ],[(l2,y,obj,<e,t>)],[l2=l1],[]>"}; + result.add(npAdjunct); + result.add(dpAdjunct); + } + /* NOUNS */ - if (equalsOneOf(pos,noun)) { + else if (equalsOneOf(pos,noun)) { if (pos.equals("NN") || pos.equals("NNS")) { type = "CLASS"; Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/Templator.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/Templator.java 2011-11-12 23:10:28 UTC (rev 3399) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/Templator.java 2011-11-14 10:56:41 UTC (rev 
3400) @@ -53,8 +53,9 @@ boolean ONE_SCOPE_ONLY = true; boolean UNTAGGED_INPUT = true; - boolean USE_NER = false; + boolean USE_WORDNET = true; + boolean VERBOSE = true; public Templator() { this(new StanfordPartOfSpeechTagger(), new WordNet()); @@ -74,7 +75,6 @@ } g = LTAG_Constructor.construct(grammarFiles); - p = new Parser(); p.SHOW_GRAMMAR = true; @@ -85,12 +85,41 @@ pp = new Preprocessor(USE_NER); } + public Templator(boolean b) { + this.tagger = new StanfordPartOfSpeechTagger(); + this.USE_WORDNET = false; + VERBOSE = b; + + List<InputStream> grammarFiles = new ArrayList<InputStream>(); + for(int i = 0; i < GRAMMAR_FILES.length; i++){ + grammarFiles.add(this.getClass().getClassLoader().getResourceAsStream(GRAMMAR_FILES[i])); + } + + g = LTAG_Constructor.construct(grammarFiles); + + p = new Parser(); + p.SHOW_GRAMMAR = false; + p.VERBOSE = b; + p.USE_DPS_AS_INITTREES = true; + p.CONSTRUCT_SEMANTICS = true; + p.MODE = "LEIPZIG"; + + pp = new Preprocessor(USE_NER); + pp.setVERBOSE(b); + } + public void setUNTAGGED_INPUT(boolean b) { UNTAGGED_INPUT = b; } public void setUSE_NER(boolean b) { USE_NER = b; } + public void setVERBOSE(boolean b) { + VERBOSE = b; + } + public void setGrammarFiles(String[] gf) { + GRAMMAR_FILES = gf; + } public Set<Template> buildTemplates(String s) { @@ -100,7 +129,7 @@ if (UNTAGGED_INPUT) { s = pp.normalize(s); tagged = tagger.tag(s); - logger.trace("Tagged input: " + tagged); + if (VERBOSE) logger.trace("Tagged input: " + tagged); } else { tagged = s; @@ -114,20 +143,20 @@ else newtagged = pp.condenseNominals(tagged); newtagged = pp.condense(newtagged); - logger.trace("Preprocessed: " + newtagged); + if (VERBOSE) logger.trace("Preprocessed: " + newtagged); p.parse(newtagged,g); if (p.getDerivationTrees().isEmpty()) { p.clear(g,p.getTemps()); clearAgain = false; - logger.error("[Templator.java] '" + s + "' could not be parsed."); + if (VERBOSE) logger.error("[Templator.java] '" + s + "' could not be parsed."); } else { try { 
p.buildDerivedTrees(g); } catch (ParseException e) { - logger.error("[Templator.java] ParseException at '" + e.getMessage() + "'", e); + if (VERBOSE) logger.error("[Templator.java] ParseException at '" + e.getMessage() + "'", e); } } @@ -154,10 +183,12 @@ if (!containsModuloRenaming(drses,drs)) { // // DEBUG - System.out.println(dude); - System.out.println(drs); - for (Slot sl : slots) { - System.out.println(sl.toString()); + if (VERBOSE) { + System.out.println(dude); + System.out.println(drs); + for (Slot sl : slots) { + System.out.println(sl.toString()); + } } // // drses.add(drs); @@ -168,54 +199,55 @@ continue; } - // find WordNet synonyms - List<String> newwords; - String word; - String pos; - for (Slot slot : temp.getSlots()) { - if (!slot.getWords().isEmpty()) { - - word = slot.getWords().get(0); - pos = postable.get(word.toLowerCase().replace(" ","_")); - - POS wordnetpos = null; - if (pos != null) { - if (equalsOneOf(pos,noun)) { - wordnetpos = POS.NOUN; + if (USE_WORDNET) { // find WordNet synonyms + List<String> newwords; + String word; + String pos; + for (Slot slot : temp.getSlots()) { + if (!slot.getWords().isEmpty()) { + + word = slot.getWords().get(0); + pos = postable.get(word.toLowerCase().replace(" ","_")); + + POS wordnetpos = null; + if (pos != null) { + if (equalsOneOf(pos,noun)) { + wordnetpos = POS.NOUN; + } + else if (equalsOneOf(pos,adjective)) { + wordnetpos = POS.ADJECTIVE; + } + else if (equalsOneOf(pos,verb)) { + wordnetpos = POS.VERB; + } + } + + List<String> strings = new ArrayList<String>(); + if (wordnetpos != null && wordnetpos.equals(POS.ADJECTIVE)) { + strings = wordnet.getAttributes(word); + } + + newwords = new ArrayList<String>(); + newwords.addAll(slot.getWords()); + newwords.addAll(strings); + + if (wordnetpos != null && !slot.getSlotType().equals(SlotType.RESOURCE)) { + newwords.addAll(wordnet.getBestSynonyms(wordnetpos,getLemmatizedWord(word))); + for (String att : getLemmatizedWords(strings)) { + 
newwords.addAll(wordnet.getBestSynonyms(wordnetpos,att)); + } } - else if (equalsOneOf(pos,adjective)) { - wordnetpos = POS.ADJECTIVE; + if(newwords.isEmpty()){ + } - else if (equalsOneOf(pos,verb)) { - wordnetpos = POS.VERB; + if (newwords.isEmpty()) { + newwords.add(slot.getWords().get(0)); } + List<String> newwordslist = new ArrayList<String>(); + newwordslist.addAll(newwords); + slot.setWords(newwordslist); } - - List<String> strings = new ArrayList<String>(); - if (wordnetpos != null && wordnetpos.equals(POS.ADJECTIVE)) { - strings = wordnet.getAttributes(word); - } - - newwords = new ArrayList<String>(); - newwords.addAll(slot.getWords()); - newwords.addAll(strings); - - if (wordnetpos != null && !slot.getSlotType().equals(SlotType.RESOURCE)) { - newwords.addAll(wordnet.getBestSynonyms(wordnetpos,getLemmatizedWord(word))); - for (String att : getLemmatizedWords(strings)) { - newwords.addAll(wordnet.getBestSynonyms(wordnetpos,att)); - } - } - if(newwords.isEmpty()){ - - } - if (newwords.isEmpty()) { - newwords.add(slot.getWords().get(0)); - } - List<String> newwordslist = new ArrayList<String>(); - newwordslist.addAll(newwords); - slot.setWords(newwordslist); - } + } } // This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |