From: <lor...@us...> - 2011-12-12 14:38:00
Revision: 3499
          http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3499&view=rev
Author:   lorenz_b
Date:     2011-12-12 14:37:51 +0000 (Mon, 12 Dec 2011)

Log Message:
-----------
Removed 2 unused classes.

Removed Paths:
-------------
    trunk/components-core/src/main/java/org/dllearner/utilities/ICFinder.java
    trunk/components-core/src/main/java/org/dllearner/utilities/WordnetSimilarity.java

Deleted: trunk/components-core/src/main/java/org/dllearner/utilities/ICFinder.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/utilities/ICFinder.java	2011-12-12 14:36:59 UTC (rev 3498)
+++ trunk/components-core/src/main/java/org/dllearner/utilities/ICFinder.java	2011-12-12 14:37:51 UTC (rev 3499)
@@ -1,189 +0,0 @@
-package org.dllearner.utilities;
-
-import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Hashtable;
-import java.util.Vector;
-
-/**
- *
- * @author Uthaya
- *
- */
-// n + v only
-// David Hope, 2008, University Of Sussex
-
-public class ICFinder
-{
-    private String[] editor = null;
-    private String icfilename = "";
-    private BufferedReader in = null;
-    private String line = "";
-// look up
-    private Hashtable<String, Double> lookup = null; // quick look up for synset counts (we require Double as Resnik counts are doubles)
-// counts for nouns and verbs
-    private double nouns_sum = 0.0;
-    private double verbs_sum = 0.0;
-    private double nounsandverbs_sum = 0.0; // ** the ??? normaliser ??? ** for the 'getProbability' method
-// <ROOTS> for nouns and verbs
-    private double nounroot_sum = 0.0;
-    private double verbroot_sum = 0.0;
-    private ArrayList<String> nounroots = null;
-    private ArrayList<String> verbroots = null;
-
-    public ICFinder(String icfilename)
-    {
-        System.out.println("... calculating IC <roots> ...");
-        System.out.println("... ICFinder");
-
-// your IC file
-        this.icfilename = icfilename;
-// quick look up table
-        lookup = new Hashtable<String, Double>();
-// get some useful 'constants'
-        nounroots = new ArrayList<String>();
-        verbroots = new ArrayList<String>();
-        Vector<Double> constants = setup();
-        nouns_sum = constants.get(0);
-        verbs_sum = constants.get(1);
-        nounsandverbs_sum = ( nouns_sum + verbs_sum );
-        nounroot_sum = constants.get(2);
-        verbroot_sum = constants.get(3);
-    }
-
-    public double getRootSum(String pos)
-    {
-        if(pos.equalsIgnoreCase("v"))
-            return (verbroot_sum);
-        return (nounroot_sum);
-    }
-
-
-// 'getFrequency': get the count for the {synset} from the IC file
-    private double getFrequency(String synset, String pos)
-    {
-        if(lookup.containsKey(synset + pos))
-            return ( lookup.get(synset + pos) );
-        return ( 0.0 );
-    }
-
-// 'getProbability': get the probability of the {synset}
-    private double getProbability(String synset, String pos)
-    {
-        double freq = getFrequency(synset, pos);
-        if(freq == 0.0)
-            return ( 0.0 );
-
-        double probability = 0.0;
-
-        if(pos.equalsIgnoreCase("n"))
-            probability = ( freq / nounroot_sum ); // Ted Pedersen et al. use the sum of the noun<root> counts *not* the sum of the noun counts
-
-        if(pos.equalsIgnoreCase("v"))
-            probability = ( freq / verbroot_sum ); // Ted Pedersen et al. use the sum of the verb<root> counts *not* the sum of the verb counts
-
-        return ( probability );
-    }
-
-
-// does all / any type of synset i.e. standard synset | <lcs> synset
-// !!! we are using the notion of a 'fake'<root> as per the Perl implementation !!!
-// !!! there is no option to turn the 'fake'<root> off in this implementation - it all gets a bit silly (hard to justify) if we do this !!!
-    public double getIC(String synset, String pos)
-    {
-        double ic = 0.0;
-// Case 1. There is *no* <lcs> ......................................................................................
-// If the 'synset' is empty (null Object or an empty String), - this implies that no <lcs>|synset was found for a (pair of synsets) and thus,
-// they must join at an 'imaginary' <root> point in the WordNet space (tree). We call this the 'fake'<root>.
-// Further, *if* we are assuming a 'fake'<root> (which we do; we default to it as per the Perl implementation), - this implies
-// that it subsumes all other <roots>. This being the case, the 'fake'<root> must then have an Information Content (ic) value of 0
-// as it provides us with zero information
-        if(synset == null || synset.length() == 0)
-        {
-            return ( ic );
-        }
-// ..................................................................................................................
-// Case 2. There is an <lcs> but it has a frequency of zero and thus it has a probability of zero and thus is just not valid as input
-// to the Information Content equation ( we will get 'Infinity') - so, we simply return 0
-        double p = getProbability(synset, pos);
-        if(p == 0.0)
-        {
-            return ( ic );
-        }
-        else
-        {
-            ic = -Math.log(p);
-        }
-// ..................................................................................................................
-// Case 3. There is an <lcs>, -- it may be a <root> or it may be a boring old synset but - it does have a frequency, thus it does have
-// a probability and thus we may calculate the Information Content for this synset. If the synset is a <root> and there is only 1 such
-// <root> for the POS, then, effectively the Information Content will be zero, otherwise we should get a value that is greater than zero
-        return ( ic );
-    }
-
-// utility: get counts for {synsets} | just nouns | just verbs | noun'fake'<root> | verb'fake'<root>
-// these are used to calculate probabilities of {synsets} and to 'back-off' to a <root> value if no LCS exists for 2 words
-    private Vector<Double> setup()
-    {
-        String unit = "";
-        double uc = 0.0;
-        double nc = 0.0;
-        double vc = 0.0;
-        double nrc = 0.0;
-        double vrc = 0.0;
-        Vector<Double> counts = new Vector<Double>();
-        try
-        {
-            in = new BufferedReader(new FileReader(icfilename));
-            while ((line = in.readLine()) != null)
-            {
-                editor = line.split("\\s"); // IC files are space delimited
-                for(int i = 0; i < editor.length; i++)
-                {
-                    unit = editor[i];
-// nouns
-                    if(unit.endsWith("n"))
-                    {
-                        lookup.put(editor[0], Double.parseDouble(editor[1]));
-                        uc = Double.parseDouble(editor[1]); // get the value: the 'count' for the {synset}
-                        nc += uc; // add to noun total
-                        if(editor.length == 3) // if ROOT
-                        {
-                            nrc += uc; // add to noun<root> total
-                            // store noun<root>
-                            nounroots.add(editor[0].substring(0, editor[0].length()-1));
-                        }
-                    }else if(unit.endsWith("v")) // verbs
-                    {
-                        lookup.put(editor[0], Double.parseDouble(editor[1]));
-                        uc = Double.parseDouble(editor[1]); // get the value: the 'count' for the {synset}
-                        vc += uc; // add to verb total
-                        if(editor.length == 3) // if ROOT
-                        {
-                            vrc += uc; // add to verb<root> total
-                            // store verb<root>
-                            verbroots.add(editor[0].substring(0, editor[0].length()-1));
-                        }
-                    }/*else{
-                        System.err.println("Adj? "+ unit);
-                    }*/
-                }
-            }
-            in.close();
-        }
-        catch (IOException e){e.printStackTrace();}
-        counts.add(nc); counts.add(vc); counts.add(nrc); counts.add(vrc);
-        return ( counts );
-    }
-
-    public ArrayList<String> getNounRoots()
-    {
-        return ( nounroots );
-    }
-    public ArrayList<String> getVerbRoots()
-    {
-        return ( verbroots );
-    }
-}

Deleted: trunk/components-core/src/main/java/org/dllearner/utilities/WordnetSimilarity.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/utilities/WordnetSimilarity.java	2011-12-12 14:36:59 UTC (rev 3498)
+++ trunk/components-core/src/main/java/org/dllearner/utilities/WordnetSimilarity.java	2011-12-12 14:37:51 UTC (rev 3499)
@@ -1,287 +0,0 @@
-package org.dllearner.utilities;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import net.didion.jwnl.JWNL;
-import net.didion.jwnl.JWNLException;
-import net.didion.jwnl.data.IndexWord;
-import net.didion.jwnl.data.POS;
-import net.didion.jwnl.data.PointerTarget;
-import net.didion.jwnl.data.PointerType;
-import net.didion.jwnl.data.Synset;
-import net.didion.jwnl.data.Word;
-import net.didion.jwnl.dictionary.Dictionary;
-
-public class WordnetSimilarity {
-
-    public Dictionary dict;
-
-    public WordnetSimilarity(){
-        try {
-            JWNL.initialize(this.getClass().getClassLoader().getResourceAsStream("wordnet_properties.xml"));
-            dict = Dictionary.getInstance();
-        } catch (JWNLException e) {
-            e.printStackTrace();
-        }
-    }
-
-    public double computeSimilarity(String s1, String s2, POS pos){
-        List<String> synonyms = new ArrayList<String>();
-
-        try {
-            IndexWord iw1 = dict.getIndexWord(pos, s1);
-            IndexWord iw2 = dict.getIndexWord(pos, s2); //dict.getMorphologicalProcessor().lookupBaseForm(pos, s)
-//            IndexWord iw = dict.getMorphologicalProcessor().lookupBaseForm(pos, s);
-            getUpwardHierachy(s1, pos);
-            getUpwardHierachy(s2, pos);
-
-            ICFinder icFinder = new ICFinder("src/main/resources/ic-semcor.dat");
-            Synset synset1 = iw1.getSenses()[0];
-            Synset synset2 = iw2.getSenses()[0];
-            Synset lcs = getLCS(synset1, synset2, "NN", icFinder);
-            System.out.println(lcs);
-
-            for(Synset synset : iw1.getSenses()){
-                for(List<PointerTarget> tree : getHypernymTrees(synset, new HashSet<PointerTarget>())){
-                    for(PointerTarget t : tree){
-                        System.out.print(((Synset)t).getWords()[0].getLemma() + "-->");
-                    }
-                    System.out.println();
-                }
-            }
-
-        } catch (JWNLException e) {
-            e.printStackTrace();
-        }
-
-        return -1;
-    }
-
-    private List<PointerTarget> getUpwardHierachy(PointerTarget target){
-        List<PointerTarget> hierarchy = new ArrayList<PointerTarget>();
-        try {
-            PointerTarget[] targets = target.getTargets(PointerType.HYPERNYM);
-            for (PointerTarget t : targets) {
-                hierarchy.add(t);
-                hierarchy.addAll(getUpwardHierachy(t));
-            }
-        } catch (JWNLException e) {
-            e.printStackTrace();
-        }
-        return hierarchy;
-    }
-
-//    private List<List<PointerTarget>> getUpwardHierachies(List<List<PointerTarget>> targets){
-//        List<List<PointerTarget>> hierarchies = new ArrayList<List<PointerTarget>>();
-//        try {
-//            PointerTarget[] targets = target.getTargets(PointerType.HYPERNYM);
-//            for (PointerTarget t : targets) {
-//                hierarchy.add(t);
-//                hierarchy.addAll(getUpwardHierachy(t));
-//            }
-//        } catch (JWNLException e) {
-//            e.printStackTrace();
-//        }
-//        return hierarchy;
-//
-//    }
-
-    private void getUpwardHierachy(String word, POS pos){
-        try {
-            IndexWord iw = dict.getIndexWord(pos, word);
-            for(Synset synset : iw.getSenses()){
-                for(PointerTarget t : getUpwardHierachy(synset)){
-                    System.out.print(((Synset)t).getWord(0).getLemma() + "-->");
-                }
-                System.out.println();
-            }
-        } catch (JWNLException e) {
-            e.printStackTrace();
-        }
-    }
-
-    private void getHypernyms(IndexWord iw){
-        try {
-            if(iw != null){
-                Synset[] synsets = iw.getSenses();
-                for(Synset s : synsets){
-                    System.out.println(s);
-                    PointerTarget[] targets = s.getTargets(PointerType.HYPERNYM);
-                    for (PointerTarget target : targets) {
-                        Word[] words = ((Synset) target).getWords();
-                        for (Word word : words) {
-                            System.out.println(word);
-                        }
-                    }
-                }
-            }
-        } catch (JWNLException e) {
-            e.printStackTrace();
-        }
-    }
-
-    public Synset getLCS(Synset synset1, Synset synset2, String pos, ICFinder icFinder) throws JWNLException
-    {
-        // synset1
-        HashSet<Synset> s1 = new HashSet<Synset>(); s1.add(synset1);
-        HashSet<Synset> h1 = new HashSet<Synset>();
-        h1 = getHypernyms(s1, h1);
-        // !!! important !!! we must add the original {synset} back in, as the 2 {synsets} (senses) we are comparing may be equivalent i.e. the same {synset}!
-        h1.add(synset1);
-        //System.out.println(">>>>>>>>>>>>>>>>>>>>>");
-        // synset2
-        HashSet<Synset> s2 = new HashSet<Synset>(); s2.add(synset2);
-        HashSet<Synset> h2 = new HashSet<Synset>();
-        h2 = getHypernyms(s2, h2);
-        h2.add(synset2); // ??? don't really need this ???
-        //System.out.println("JWNL,h1, "+toStr(synset1.getWords())+", :h2, "+toStr(synset2.getWords())+" ,=, "+h1.size()+", "+h2.size());
-        // get the candidate <lcs>s i.e. the intersection of all <hypernyms> | {synsets} which subsume the 2 {synsets}
-        /*System.out.println("========================");
-        System.out.println(h1);
-        System.out.println(h2);
-        System.out.println("========================");*/
-        h1.retainAll(h2);
-        if(h1.isEmpty())
-        {
-            return (null); // i.e. there is *no* <LCS> for the 2 synsets
-        }
-
-        // get *a* <lcs> with the highest Information Content
-        double max = -Double.MAX_VALUE;
-        Synset maxlcs = null;
-        for (Synset h : h1)
-        {
-            double ic = icFinder.getIC("" + h.getOffset(), pos); // use ICFinder to get the Information Content value
-            if(ic > max)
-            {
-                max = ic;
-                maxlcs = h;
-            }
-        }
-        return maxlcs; // return the {synset} with *a* highest IC value
-    }
-
-
-    // 1.1 GET <HYPERNYMS>
-    private HashSet<Synset> getHypernyms(HashSet<Synset> synsets, HashSet<Synset> allhypernms) throws JWNLException
-    {
-        if(allhypernms.size() >= 100){
-            return allhypernms;
-        }
-
-        //System.out.println("IP: " + synsets);
-        HashSet<Synset> hypernyms = new HashSet<Synset>();
-        for(Synset s : synsets)
-        {
-            PointerTarget[] hyp = s.getTargets(PointerType.HYPERNYM); // get the <hypernyms> if there are any
-            for (PointerTarget pointerTarget : hyp) {
-                if (pointerTarget instanceof Synset) {
-                    Synset poiSyn = (Synset) pointerTarget;
-                    hypernyms.add(poiSyn);
-                }/*else{
-                    //System.out.println("PointerTarget is not instanceof Synset: "+pointerTarget);
-                }*/
-            }
-            //System.out.println("\t"+hypernyms);
-        }
-        if(!hypernyms.isEmpty())
-        {
-            if(allhypernms.size()+hypernyms.size() >= 100){
-                return allhypernms;
-            }
-            try {
-                allhypernms.addAll(hypernyms);
-            } catch (StackOverflowError e) {
-                //System.out.println(allhypernms.size());
-                //System.out.println(hypernyms.size());
-                //e.printStackTrace();
-                System.gc();
-                System.gc();
-                System.err.println(e.getMessage());
-                return allhypernms;
-            }
-            allhypernms = getHypernyms(hypernyms, allhypernms);
-        }
-        //System.out.println(allhypernms);
-        return allhypernms;
-    }
-
-    /**
-     * since this method is heavily used, inner cache would help for e.g.
-     * calculating similarity matrix
-     *
-     * Subroutine that returns an array of hypernym trees, given the offset of
-     * the synset. Each hypernym tree is an array of offsets.
-     *
-     * @param synset
-     * @param mode
-     */
-    public List<List<PointerTarget>> getHypernymTrees(PointerTarget synset, Set<PointerTarget> history) {
-        PointerTarget key = synset;
-
-        // check if the input synset is one of the imaginary root nodes
-        if (synset.equals(new Synset(POS.NOUN, 0, new Word[]{new Word("ROOT", "ROOT", 0)}, null, null, null))) {
-            List<PointerTarget> tree = new ArrayList<PointerTarget>();
-            tree.add(new Synset(POS.NOUN, 0, new Word[]{new Word("ROOT", "ROOT", 0)}, null, null, null));
-            List<List<PointerTarget>> trees = new ArrayList<List<PointerTarget>>();
-            trees.add(tree);
-            return trees;
-        }
-
-        List<PointerTarget> synlinks = null;
-        try {
-            synlinks = Arrays.asList(synset.getTargets(PointerType.HYPERNYM));
-        } catch (JWNLException e) {
-            // TODO Auto-generated catch block
-            e.printStackTrace();
-        }
-
-        List<List<PointerTarget>> returnList = new ArrayList<List<PointerTarget>>();
-        if (synlinks.size() == 0) {
-            List<PointerTarget> tree = new ArrayList<PointerTarget>();
-            tree.add(synset);
-            tree.add(0, new Synset(POS.NOUN, 0, new Word[]{new Word("ROOT", "ROOT", 0)}, null, null, null));
-            returnList.add(tree);
-        } else {
-            for (PointerTarget hypernym : synlinks) {
-                if ( history.contains(hypernym) ) continue;
-                history.add(hypernym);
-
-                List<List<PointerTarget>> hypernymTrees = getHypernymTrees(hypernym, history);
-                if ( hypernymTrees != null ) {
-                    for (List<PointerTarget> hypernymTree : hypernymTrees) {
-                        hypernymTree.add(synset);
-                        returnList.add(hypernymTree);
-                    }
-                }
-                if (returnList.size() == 0) {
-                    List<PointerTarget> newList = new ArrayList<PointerTarget>();
-                    newList.add(synset);
-                    newList.add(0, new Synset(POS.NOUN, 0, new Word[]{new Word("ROOT", "ROOT", 0)}, null, null, null));
-                    returnList.add(newList);
-                }
-            }
-        }
-
-        return returnList;
-    }
-
-
-    public static void main(String[] args) {
-        System.out.println(new WordnetSimilarity().computeSimilarity("writer", "teacher", POS.NOUN));
-
-//        ILexicalDatabase db = new NictWordNet();
-//        System.out.println(new Lin(db).calcRelatednessOfWords("writer", "teacher"));
-    }
-
-}