From: <lor...@us...> - 2013-12-03 12:41:40
|
Revision: 4192 http://sourceforge.net/p/dl-learner/code/4192 Author: lorenz_b Date: 2013-12-03 12:41:34 +0000 (Tue, 03 Dec 2013) Log Message: ----------- Made semantic index serializable. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/WordTypeComparator.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-12-03 09:40:14 UTC (rev 4191) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-12-03 12:41:34 UTC (rev 4192) @@ -4,6 +4,7 @@ package org.dllearner.algorithms.isle.index; +import java.io.Serializable; import java.util.ArrayList; import java.util.List; @@ -12,7 +13,7 @@ * @author Lorenz Buehmann * */ -public class Annotation { +public class Annotation implements Serializable{ private Document referencedDocument; private ArrayList<Token> tokens; Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-03 09:40:14 UTC (rev 4191) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-03 12:41:34 UTC (rev 4192) @@ -4,6 +4,9 @@ package org.dllearner.algorithms.isle.index; import java.io.Serializable; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; import com.google.common.collect.ComparisonChain; @@ -159,9 +162,9 @@ Token token = (Token) o; -// if (!posTag.equals(token.posTag)) { -// return false; -// } + if (!WordTypeComparator.sameWordType(posTag, token.posTag)) { + return false; + } if (!stemmedForm.equals(token.stemmedForm)) { return false; } @@ -172,7 +175,7 @@ @Override public int hashCode() { int result = stemmedForm.hashCode(); -// result = 31 * result + posTag.hashCode(); + result = 31 * result + WordTypeComparator.hashCode(posTag); return result; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/WordTypeComparator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/WordTypeComparator.java 2013-12-03 09:40:14 UTC (rev 4191) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/WordTypeComparator.java 2013-12-03 12:41:34 UTC (rev 4192) @@ -18,10 +18,28 @@ * @return */ public static boolean sameWordType(String posTag1, String posTag2){ - if(posTag1.startsWith("NN") && posTag2.startsWith("NN") || - posTag1.startsWith("V") && posTag2.startsWith("V")){ + if(posTag1.startsWith("NN") && posTag2.startsWith("NN") || //nouns + posTag1.startsWith("V") && posTag2.startsWith("V") || //verbs + posTag1.startsWith("JJ") && posTag2.startsWith("JJ") || //adjectives + posTag1.startsWith("RB") && posTag2.startsWith("RB")) //adverbs + { return true; + } else { + return posTag1.equals(posTag2); } - return false; } + + public static int hashCode(String posTag){ + if(posTag.startsWith("NN")){//nouns + return "NN".hashCode(); + } else if(posTag.startsWith("V")){//verbs + return "V".hashCode(); + } else if(posTag.startsWith("JJ")){//adjectives + return "JJ".hashCode(); + } else if(posTag.startsWith("RB")){//adverbs + return "RB".hashCode(); + } else { + return posTag.hashCode(); + } + } } Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java 2013-12-03 09:40:14 UTC (rev 4191) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java 2013-12-03 12:41:34 UTC (rev 4192) @@ -4,20 +4,15 @@ package org.dllearner.algorithms.isle; import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.ObjectOutputStream; import java.util.Collections; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.Properties; import java.util.Set; import java.util.SortedSet; import org.dllearner.algorithms.celoe.CELOE; -import org.dllearner.algorithms.isle.index.TextDocument; import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; import org.dllearner.algorithms.isle.index.semantic.SemanticIndexGenerator; import org.dllearner.algorithms.isle.metrics.PMIRelevanceMetric; @@ -47,15 +42,6 @@ import com.google.common.collect.Sets; -import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; -import edu.stanford.nlp.ling.CoreLabel; -import edu.stanford.nlp.pipeline.Annotation; -import edu.stanford.nlp.pipeline.StanfordCoreNLP; -import edu.stanford.nlp.util.CoreMap; - /** * Experimental setup: * @@ -86,54 +72,21 @@ private String testFolder = "experiments/logs/"; private OWLOntology ontology; - private Set<TextDocument> documents; + private Set<String> documents; private boolean initialized = false; private RhoDRDown operator; - protected StanfordCoreNLP pipeline; protected abstract OWLOntology getOntology(); - protected abstract Set<TextDocument> getDocuments(); + protected abstract Set<String> getDocuments(); /** * */ public Experiment() { - // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution - Properties props = new Properties(); - props.put("annotators", "tokenize, ssplit, pos"); - pipeline = new StanfordCoreNLP(props); } - protected String getPOSTaggedText(String text){ - // create an empty Annotation just with the given text - Annotation document = new Annotation(text); - - // run all Annotators on this text - pipeline.annotate(document); - - // these are all the sentences in this document - // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types - List<CoreMap> sentences = document.get(SentencesAnnotation.class); - - StringBuilder sb = new StringBuilder(); - for(CoreMap sentence: sentences) { - // traversing the words in the current sentence - // a CoreLabel is a CoreMap with additional token-specific methods - for (CoreLabel token: sentence.get(TokensAnnotation.class)) { - // this is the text of the token - String word = token.get(TextAnnotation.class); - // this is the POS tag of the token - String pos = token.get(PartOfSpeechAnnotation.class); - - sb.append(word).append("/").append(pos).append(" "); - } - - } - return sb.toString(); - } - private void initIfNecessary() { if(!initialized){ ontology = getOntology(); @@ -141,13 +94,6 @@ // build semantic index SemanticIndex semanticIndex = SemanticIndexGenerator.generateIndex(documents, ontology, false); - try { - ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream("semantic-index.ser")); - oos.writeObject(semanticIndex); - oos.close(); - } catch (IOException e1) { - e1.printStackTrace(); - } // set the relevance metric relevance = new PMIRelevanceMetric(semanticIndex); Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-12-03 09:40:14 UTC (rev 4191) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-12-03 12:41:34 UTC (rev 4192) @@ -90,14 +90,14 @@ new URL("http://gold.linkeddata.org/data/bible/chapter_index.zip")); } - private Set<TextDocument> createDocuments(){ - Set<TextDocument> documents = new HashSet<TextDocument>(); + private Set<String> createDocuments(){ + Set<String> documents = new HashSet<String>(); File folder = new File(testFolder+"corpus/"); for (File file : folder.listFiles()) { if(!file.isDirectory() && !file.isHidden()){ try { String text = Files.toString(file, Charsets.UTF_8); - documents.add(TextDocumentGenerator.getInstance().generateDocument(text)); + documents.add(text); } catch (IOException e) { e.printStackTrace(); } @@ -106,8 +106,8 @@ return documents; } - private Set<TextDocument> createBibleDocuments() throws IOException { - Set<TextDocument> documents = new HashSet<TextDocument>(); + private Set<String> createBibleDocuments() throws IOException { + Set<String> documents = new HashSet<String>(); RemoteDataProvider bibleByChapter = new RemoteDataProvider( new URL("http://gold.linkeddata.org/data/bible/split_by_chapter.zip")); File folder = bibleByChapter.getLocalDirectory(); @@ -115,7 +115,7 @@ if(!file.isDirectory() && !file.isHidden()){ try { String text = Files.toString(file, Charsets.UTF_8); - documents.add(TextDocumentGenerator.getInstance().generateDocument(text)); + documents.add(text); } catch (IOException e) { e.printStackTrace(); } @@ -191,9 +191,9 @@ EntityCandidateGenerator ecg = new TrieEntityCandidateGenerator(ontology, ect); SemanticAnnotator semanticAnnotator = new SemanticAnnotator(wsd, ecg, linguisticAnnotator); - Set<TextDocument> docs = createDocuments(); - for (TextDocument doc : docs) { - AnnotatedDocument annotated = semanticAnnotator.processDocument(doc); + Set<String> docs = createDocuments(); + for (String doc : docs) { + AnnotatedDocument annotated = semanticAnnotator.processDocument(TextDocumentGenerator.getInstance().generateDocument(doc)); System.out.println(annotated); } } @@ -208,9 +208,9 @@ EntityCandidateGenerator ecg = new TrieEntityCandidateGenerator(ontology, ect); SemanticAnnotator semanticAnnotator = new SemanticAnnotator(wsd, ecg, linguisticAnnotator); - Set<TextDocument> docs = createDocuments(); - for (TextDocument doc : docs) { - AnnotatedDocument annotated = semanticAnnotator.processDocument(doc); + Set<String> docs = createDocuments(); + for (String doc : docs) { + AnnotatedDocument annotated = semanticAnnotator.processDocument(TextDocumentGenerator.getInstance().generateDocument(doc)); System.out.println(annotated); } } Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java 2013-12-03 09:40:14 UTC (rev 4191) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java 2013-12-03 12:41:34 UTC (rev 4192) @@ -8,12 +8,9 @@ import java.net.MalformedURLException; import java.net.URL; import java.util.HashSet; -import java.util.List; -import java.util.Properties; import java.util.Set; import org.dllearner.algorithms.isle.index.RemoteDataProvider; -import org.dllearner.algorithms.isle.index.TextDocument; import org.dllearner.core.owl.NamedClass; import org.semanticweb.owlapi.apibinding.OWLManager; import org.semanticweb.owlapi.model.IRI; @@ -22,23 +19,8 @@ import org.semanticweb.owlapi.model.OWLOntologyManager; import com.google.common.base.Charsets; -import com.google.common.collect.Sets; import com.google.common.io.Files; -import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; -import edu.stanford.nlp.ling.CoreLabel; -import edu.stanford.nlp.pipeline.Annotation; -import edu.stanford.nlp.pipeline.StanfordCoreNLP; -import edu.stanford.nlp.trees.Tree; -import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation; -import edu.stanford.nlp.trees.semgraph.SemanticGraph; -import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation; -import edu.stanford.nlp.util.CoreMap; - /** * @author Lorenz Buehmann * @@ -73,10 +55,8 @@ * @see org.dllearner.algorithms.isle.Experiment#getDocuments() */ @Override - protected Set<TextDocument> getDocuments() { - Set<TextDocument> documents = new HashSet<TextDocument>(); - File taggedFolder = new File("tmp/tagged"); - taggedFolder.mkdirs(); + protected Set<String> getDocuments() { + Set<String> documents = new HashSet<String>(); try { RemoteDataProvider bibleByChapter = new RemoteDataProvider( new URL("http://gold.linkeddata.org/data/bible/split_by_chapter.zip")); @@ -85,9 +65,10 @@ if(!file.isDirectory() && !file.isHidden()){ try { String text = Files.toString(file, Charsets.UTF_8); -// String posTagged = getPOSTaggedText(text); -// Files.write(posTagged, new File(taggedFolder, file.getName() + ".tagged"), Charsets.UTF_8); -// documents.add(TextDocumentGenerator.getInstance().generateDocument(text)); + text = text.trim(); + if(!text.isEmpty()){ + documents.add(text); + } } catch (IOException e) { e.printStackTrace(); } @@ -98,9 +79,8 @@ } catch (IOException e) { e.printStackTrace(); } - documents.clear(); - TextDocument doc = TextDocumentGenerator.getInstance().generateDocument("and in that day seven women shall take hold of one man saying we will eat our own bread and wear our own apparel only let us be called by thy name to take away our reproach in that day shall the branch of the lord be beautiful and glorious and the fruit of the earth excellent and comely for them that are escaped of israel and it shall come to pass left in zion and remaineth in jerusalem shall be called holy every one that is written among the living in jerusalem when the lord shall have washed away the filth of the daughters of zion and shall have purged the blood of jerusalem from the midst thereof by the spirit of judgment and by the spirit of burning and the lord will create upon every dwelling place of mount zion and upon her assemblies a cloud and smoke by day and the shining of a flaming fire by night for upon all the glory a defence and there shall be a tabernacle for a shadow in the daytime from the heat and for a place of refuge and for a covert from storm and from rain"); - documents.add(doc); +// documents.clear(); +// documents.add("and in that day seven women shall take hold of one man saying we will eat our own bread and wear our own apparel only let us be called by thy name to take away our reproach in that day shall the branch of the lord be beautiful and glorious and the fruit of the earth excellent and comely for them that are escaped of israel and it shall come to pass left in zion and remaineth in jerusalem shall be called holy every one that is written among the living in jerusalem when the lord shall have washed away the filth of the daughters of zion and shall have purged the blood of jerusalem from the midst thereof by the spirit of judgment and by the spirit of burning and the lord will create upon every dwelling place of mount zion and upon her assemblies a cloud and smoke by day and the shining of a flaming fire by night for upon all the glory a defence and there shall be a tabernacle for a shadow in the daytime from the heat and for a place of refuge and for a covert from storm and from rain"); return documents; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |