From: <lor...@us...> - 2013-11-21 13:38:25
|
Revision: 4164 http://sourceforge.net/p/dl-learner/code/4164 Author: lorenz_b Date: 2013-11-21 13:38:03 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Refactored context extractors. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-11-21 13:16:13 UTC (rev 4163) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-11-21 13:38:03 UTC (rev 4164) @@ -37,7 +37,7 @@ return instance; } - public TextDocument tag(String text) { + public TextDocument generateDocument(String text) { TextDocument document = new TextDocument(); // create an empty Annotation just with the given text Annotation annotatedDocument = new Annotation(text); @@ -72,7 +72,7 @@ } public static void main(String[] args) throws Exception { - TextDocument document = TextDocumentGenerator.getInstance().tag("And he said, Amos, what seest thou? And I said, A basket of summer fruit. Then said the LORD unto me, The end is come upon my people of Israel; I will not again pass by them any more. "); + TextDocument document = TextDocumentGenerator.getInstance().generateDocument("And he said, Amos, what seest thou? And I said, A basket of summer fruit. Then said the LORD unto me, The end is come upon my people of Israel; I will not again pass by them any more. "); System.out.println(document); } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-11-21 13:16:13 UTC (rev 4163) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-11-21 13:38:03 UTC (rev 4164) @@ -34,6 +34,13 @@ public Document getReferencedDocument() { return referencedDocument; } + + /** + * @return the tokens + */ + public ArrayList<Token> getTokens() { + return tokens; + } public String getString(){ StringBuilder sb = new StringBuilder(); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-11-21 13:16:13 UTC (rev 4163) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-11-21 13:38:03 UTC (rev 4164) @@ -16,6 +16,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; +import org.dllearner.algorithms.isle.TextDocumentGenerator; import org.dllearner.algorithms.isle.index.TextDocument; import java.io.File; @@ -61,7 +62,7 @@ ScoreDoc[] result = searcher.search(query, getSize()).scoreDocs; for (int i = 0; i < result.length; i++) { Document doc = searcher.doc(result[i].doc); - documents.add(new TextDocument(doc.get(searchField))); + documents.add(TextDocumentGenerator.getInstance().generateDocument(doc.get(searchField))); } } catch (ParseException e) { e.printStackTrace(); @@ -85,7 +86,7 @@ try { Document doc = indexReader.document(i); String content = doc.get(searchField); - documents.add(new TextDocument(content)); + documents.add(TextDocumentGenerator.getInstance().generateDocument(content)); } catch (IOException e) { e.printStackTrace(); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java 2013-11-21 13:16:13 UTC (rev 4163) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java 2013-11-21 13:38:03 UTC (rev 4164) @@ -3,6 +3,14 @@ */ package org.dllearner.algorithms.isle.wsd; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +import org.dllearner.algorithms.isle.TextDocumentGenerator; +import org.dllearner.algorithms.isle.index.Token; + import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; @@ -10,12 +18,7 @@ import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; import edu.stanford.nlp.util.CoreMap; -import org.dllearner.algorithms.isle.index.TextDocument; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; - /** * @author Lorenz Buehmann * @@ -36,26 +39,29 @@ @Override public List<String> extractContext(org.dllearner.algorithms.isle.index.Annotation annotation) { //split text into sentences - List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getContent()); + List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getRawContent()); //find the sentence containing the token of the annotation - int tokenStart = annotation.getOffset(); - int index = 0; + Token firstToken = annotation.getTokens().get(0); for (CoreMap sentence : sentences) { - String s = sentence.toString(); - if (index <= tokenStart && s.length() > tokenStart) { + boolean found = false; + for (CoreLabel label : sentence.get(TokensAnnotation.class)) { + // this is the text of the token + String word = label.get(TextAnnotation.class); + if(word.equals(firstToken.getRawForm())){ + found = true; + break; + } + } + if(found){ List<String> context = new ArrayList<String>(); for (CoreLabel label : sentence.get(TokensAnnotation.class)) { // this is the text of the token String word = label.get(TextAnnotation.class); - - if(!word.isEmpty() && !word.matches("\\p{Punct}")){ - context.add(word); - } + context.add(word); } return context; } - index += s.length(); } throw new RuntimeException("Token " + annotation.getString() + " not found in text " + annotation.getReferencedDocument().getRawContent()); } @@ -79,9 +85,8 @@ String s = "International Business Machines Corporation, or IBM, is an American multinational services technology and consulting corporation, with headquarters in Armonk, New York, United States. IBM manufactures and markets computer hardware and software," + " and offers infrastructure, hosting and consulting services in areas ranging from mainframe computers to nanotechnology."; - String token = "services"; SentenceBasedContextExtractor extractor = new SentenceBasedContextExtractor(); - List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(new TextDocument(s), s.indexOf(token), token.length())); + List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(TextDocumentGenerator.getInstance().generateDocument(s), Arrays.asList(new Token("American")))); System.out.println(context); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java 2013-11-21 13:16:13 UTC (rev 4163) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java 2013-11-21 13:38:03 UTC (rev 4164) @@ -3,6 +3,14 @@ */ package org.dllearner.algorithms.isle.wsd; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +import org.dllearner.algorithms.isle.TextDocumentGenerator; +import org.dllearner.algorithms.isle.index.Token; + import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; @@ -10,18 +18,13 @@ import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; import edu.stanford.nlp.util.CoreMap; -import org.dllearner.algorithms.isle.index.TextDocument; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; - /** * @author Lorenz Buehmann - * + * */ -public class WindowBasedContextExtractor implements ContextExtractor{ - +public class WindowBasedContextExtractor implements ContextExtractor { + private StanfordCoreNLP pipeline; private int tokensLeft = 10; private int tokensRight = 10; @@ -29,57 +32,66 @@ public WindowBasedContextExtractor(int tokensLeft, int tokensRight) { this.tokensLeft = tokensLeft; this.tokensRight = tokensRight; - + Properties props = new Properties(); props.put("annotators", "tokenize, ssplit"); pipeline = new StanfordCoreNLP(props); } - + public WindowBasedContextExtractor(int tokensLeftRight) { tokensLeft = tokensLeftRight; tokensRight = tokensLeftRight; - + Properties props = new Properties(); props.put("annotators", "tokenize, ssplit"); pipeline = new StanfordCoreNLP(props); } - + public WindowBasedContextExtractor() { Properties props = new Properties(); props.put("annotators", "tokenize, ssplit"); pipeline = new StanfordCoreNLP(props); } - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.wsd.ContextExtractor#extractContext(java.lang.String, java.lang.String) + /* + * (non-Javadoc) + * + * @see + * org.dllearner.algorithms.isle.wsd.ContextExtractor#extractContext(java + * .lang.String, java.lang.String) */ @Override public List<String> extractContext(org.dllearner.algorithms.isle.index.Annotation annotation) { // split text into sentences - List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getContent()); + List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getRawContent()); // find the sentence containing the token of the annotation - int tokenStart = annotation.getOffset(); - int index = 0; + Token firstToken = annotation.getTokens().get(0); for (CoreMap sentence : sentences) { - String s = sentence.toString(); - if (index <= tokenStart && s.length() > tokenStart) { + boolean found = false; + for (CoreLabel label : sentence.get(TokensAnnotation.class)) { + // this is the text of the token + String word = label.get(TextAnnotation.class); + if (word.equals(firstToken.getRawForm())) { + found = true; + break; + } + } + if (found) { List<String> context = new ArrayList<String>(); for (CoreLabel label : sentence.get(TokensAnnotation.class)) { // this is the text of the token String word = label.get(TextAnnotation.class); - context.add(word); } return context; } - index += s.length(); } - throw new RuntimeException("Token " + annotation + " not found in text " - + annotation.getReferencedDocument().getContent()); + throw new RuntimeException("Token " + annotation.getString() + " not found in text " + + annotation.getReferencedDocument().getRawContent()); } - + private List<CoreMap> getSentences(String document) { // create an empty Annotation just with the given text Annotation annotation = new Annotation(document); @@ -94,14 +106,14 @@ return sentences; } - + public static void main(String[] args) throws Exception { String s = "International Business Machines Corporation, or IBM, is an American multinational services technology and consulting corporation, with headquarters in Armonk, New York, United States. IBM manufactures and markets computer hardware and software," + " and offers infrastructure, hosting and consulting services in areas ranging from mainframe computers to nanotechnology."; - + String token = "services"; WindowBasedContextExtractor extractor = new WindowBasedContextExtractor(); - List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(new TextDocument(s), s.indexOf(token), token.length())); + List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(TextDocumentGenerator.getInstance().generateDocument(s), Arrays.asList(new Token("American")))); System.out.println(context); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |