From: <ge...@us...> - 2012-07-06 08:24:29
|
Revision: 3773 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3773&view=rev Author: gerbsen Date: 2012-07-06 08:24:21 +0000 (Fri, 06 Jul 2012) Log Message: ----------- code for extracting properties from diadem text and made a change to newer lucene version 3.6 Modified Paths: -------------- trunk/components-ext/pom.xml Added Paths: ----------- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/diadem/ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/diadem/DiademPropertyFinder.java trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/diadem/Word.java trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/diadem/WordFrequencyCounter.java Modified: trunk/components-ext/pom.xml =================================================================== --- trunk/components-ext/pom.xml 2012-07-05 07:24:27 UTC (rev 3772) +++ trunk/components-ext/pom.xml 2012-07-06 08:24:21 UTC (rev 3773) @@ -43,6 +43,18 @@ <groupId>org.apache.solr</groupId> <artifactId>solr-core</artifactId> </dependency> + + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers</artifactId> + <version>3.5.0</version> + </dependency> + + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-core</artifactId> + <version>3.5.0</version> + </dependency> <!--BEGIN Logging Dependencies--> Added: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/diadem/DiademPropertyFinder.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/diadem/DiademPropertyFinder.java (rev 0) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/diadem/DiademPropertyFinder.java 2012-07-06 08:24:21 UTC (rev 3773) @@ -0,0 +1,31 @@ +package org.dllearner.algorithm.tbsl.diadem; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.apache.commons.io.FileUtils; + +import edu.stanford.nlp.util.StringUtils; + +/** + * + */ +public class DiademPropertyFinder { + + /** + * @param args + * @throws IOException + */ + public static void main(String[] args) throws IOException { + + List<String> lines = FileUtils.readLines(new File("/Users/gerb/Development/workspaces/experimental/diadem/descriptions.txt")); + String allDEscriptions = StringUtils.join(lines, " "); + + WordFrequencyCounter wfc = new WordFrequencyCounter(); + for ( Word word : wfc.getKeywordsSortedByFrequency(allDEscriptions)) { + + System.out.println(word.getWord() + ":\t" + word.getFrequency()); + } + } +} Added: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/diadem/Word.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/diadem/Word.java (rev 0) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/diadem/Word.java 2012-07-06 08:24:21 UTC (rev 3773) @@ -0,0 +1,112 @@ +/** + * + */ +package org.dllearner.algorithm.tbsl.diadem; + +/** + * @author Daniel Gerber <dg...@in...> + * + */ +public class Word implements Comparable<Word> { + + public boolean isFromWikipedia() { + + return isFromWikipedia; + } + + private String word; + private int frequency; + private boolean isFromWikipedia; // Is that term extracted from a Wikipedia + // article + + public Word(String word, int frequency, boolean fromWikipedia) { + + isFromWikipedia = fromWikipedia; + this.word = word; + this.frequency = frequency; + } + + public Word(String word, int count) { + + this(word, count, false); + } + + /** + * Increases the total frequency with 1 + * + * @return The new frequency + */ + public int incrementFrequency() { + + return ++frequency; + } + + public int compareTo(Word otherWord) { + + if (this.frequency == otherWord.frequency) { + return this.word.compareTo(otherWord.word); + } + return otherWord.frequency - this.frequency; + } + + public String getWord() { + + return word; + } + + public int getFrequency() { + + return frequency; + } + + @Override + public String toString() { + + return word; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#hashCode() + */ + @Override + public int hashCode() { + + final int prime = 31; + int result = 1; + result = prime * result + ((word == null) ? 0 : word.hashCode()); + return result; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#equals(java.lang.Object) + */ + @Override + public boolean equals(Object obj) { + + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + Word other = (Word) obj; + if (word == null) { + if (other.word != null) + return false; + } + else + if (!word.equals(other.word)) + return false; + return true; + } + + public Word setFrequency(int i) { + + this.frequency = i; + return this; + } +} Added: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/diadem/WordFrequencyCounter.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/diadem/WordFrequencyCounter.java (rev 0) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/diadem/WordFrequencyCounter.java 2012-07-06 08:24:21 UTC (rev 3773) @@ -0,0 +1,82 @@ +package org.dllearner.algorithm.tbsl.diadem; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer; +import org.apache.lucene.analysis.ngram.NGramTokenFilter; +import org.apache.lucene.analysis.shingle.ShingleFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +/** + * + * @author Daniel Gerber <dg...@in...> + * + */ +public class WordFrequencyCounter { + + private List<String> stopwords = new ArrayList<String>(); + public WordFrequencyCounter(){ + +// stopwords.addAll(Arrays.asList()); + } + + /** + * + * @param inputWords + * @return + */ + public ArrayList<Word> getKeywordsSortedByFrequency(String inputWords){ + + PatternAnalyzer keywordAnalyzer = PatternAnalyzer.EXTENDED_ANALYZER; + TokenStream pageTokens = keywordAnalyzer.tokenStream("", inputWords); + CharTermAttribute charTermAttribute = pageTokens.getAttribute(CharTermAttribute.class); + ArrayList<String> tokens = new ArrayList<String>(1000); + + ShingleFilter filter = new ShingleFilter(pageTokens, 2, 3); + + try{ + + while (filter.incrementToken()) { + + // we need to filter these stop words, mostly references in wikipedia + String token = charTermAttribute.toString(); + if ( token.length() > 2 && !stopwords.contains(token) ) tokens.add(token.trim()); + } + } + catch (IOException exp){ + + exp.printStackTrace(); + } + + HashMap<String,Word> map = new HashMap<String,Word>(); + for(String token : tokens){ + + Word word = map.get(token); + if ( word == null ) { + + word = new Word(token,1); + map.put(token, word); + } + else word.incrementFrequency(); + } + // sort the values by there frequency and return them + ArrayList<Word> sortedKeywordList = new ArrayList<Word>(map.values()); + Collections.sort(sortedKeywordList); + + Iterator<Word> wordsIterator = sortedKeywordList.iterator(); + while ( wordsIterator.hasNext() ) { + + Word word = wordsIterator.next(); + if ( word.getFrequency() <= 10 ) wordsIterator.remove(); + } + + return sortedKeywordList; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |