From: <lor...@us...> - 2014-02-17 15:51:45
|
Revision: 4229 http://sourceforge.net/p/dl-learner/code/4229 Author: lorenz_b Date: 2014-02-17 15:51:37 +0000 (Mon, 17 Feb 2014) Log Message: ----------- Simplified index creation. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/EntityTextRetriever.java trunk/components-core/src/main/java/org/dllearner/core/owl/DatatypeProperty.java trunk/components-core/src/main/java/org/dllearner/core/owl/ObjectProperty.java trunk/components-core/src/main/java/org/dllearner/core/owl/ObjectPropertyExpression.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2014-02-17 13:55:46 UTC (rev 4228) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2014-02-17 15:51:37 UTC (rev 4229) @@ -1,5 +1,6 @@ package org.dllearner.algorithms.isle; +import java.util.ArrayList; import java.util.List; import java.util.Properties; @@ -26,11 +27,16 @@ private static TextDocumentGenerator instance; private StanfordCoreNLP pipeline; + private StanfordCoreNLP pipelineSimple; private final String punctuationPattern = "\\p{Punct}"; private final StopWordFilter stopWordFilter = new StopWordFilter(); private TextDocumentGenerator(){ Properties props = new Properties(); + props.put("annotators", "tokenize, ssplit");//, pos, lemma, parse"); + pipelineSimple = new StanfordCoreNLP(props); + + props = new Properties(); props.put("annotators", "tokenize, ssplit, pos, lemma, parse"); pipeline = new StanfordCoreNLP(props); } @@ -95,8 +101,7 @@ String lemma = label.get(LemmaAnnotation.class); //check if token is punctuation boolean isPunctuation = word.matches(punctuationPattern) - || pos.equalsIgnoreCase("-lrb-") - || pos.equalsIgnoreCase("-rrb-") + || (pos != null && (pos.equalsIgnoreCase("-lrb-") || pos.equalsIgnoreCase("-rrb-"))) || word.startsWith("'") ; //check if it is a stop word @@ -115,6 +120,33 @@ return document; } + public List<String> generateDocumentSimple(String text) { + List<String> tokens = new ArrayList<>(); + + // create an empty Annotation just with the given text + Annotation annotatedDocument = new Annotation(text); + + // run all Annotators on this text + pipelineSimple.annotate(annotatedDocument); + + // these are all the sentences in this document + // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types + List<CoreMap> sentences = annotatedDocument.get(SentencesAnnotation.class); + + for(CoreMap sentence: sentences) { + + + for (CoreLabel label: sentence.get(TokensAnnotation.class)) { + // this is the text of the token + String word = label.get(TextAnnotation.class); + + tokens.add(word); + } + } + + return tokens; + } + public static void main(String[] args) throws Exception { TextDocument document = TextDocumentGenerator.getInstance().generateDocument("And he said, Amos, what seest thou? And I said, A basket of summer fruit. Then said the LORD unto me, The end is come upon my people of Israel; I will not again pass by them any more. "); System.out.println(document); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java 2014-02-17 13:55:46 UTC (rev 4228) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java 2014-02-17 15:51:37 UTC (rev 4229) @@ -64,7 +64,7 @@ long totalNumberOfDocuments = -1; - Map<Set<Entity>, Long> cache = new HashMap<>(); + Map<Set<Entity>, Long> cache = Collections.synchronizedMap(new HashMap<Set<Entity>, Long>()); private OWLOntology ontology; public SolrSyntacticIndex(OWLOntology ontology, String solrServerURL, String searchField) { @@ -84,6 +84,15 @@ } } logger.info("...done."); + Entity e = new NamedClass("http://dbpedia.org/ontology/Comics"); + int i = 0; + for (Set<Entity> entities : cache.keySet()) { + if(entities.contains(e)){ + System.out.println(entities); + i++; + } + } + System.out.println(i); } public void buildIndex(Collection<NamedClass> classes){ @@ -103,7 +112,6 @@ final Set<Entity> otherEntities = OWLAPIConverter.getEntities(owlEntities); otherEntities.addAll(classes); for (final Entity entity : otherEntities) { - logger.info(entity); executor.submit(new Runnable() { @Override @@ -214,17 +222,13 @@ if(cache.containsKey(entitySet)){ return cache.get(entitySet); } - Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity); + Map<String, Double> relevantText = textRetriever.getRelevantTextSimple(entity); String queryString = "("; Set<String> terms = new HashSet<>(); - for (Entry<List<Token>, Double> entry : relevantText.entrySet()) { - List<Token> tokens = entry.getKey(); - String phrase = ""; - for (Token token : tokens) { -// terms.add(token.getRawForm()); - phrase += token.getRawForm() + " "; - } + for (Entry<String, Double> entry : relevantText.entrySet()) { + String tokens = entry.getKey(); + String phrase = tokens; phrase.trim(); terms.add(quotedString(phrase)); } @@ -256,17 +260,13 @@ Set<String> queryStringParts = new HashSet<>(); for (Entity entity : entities) { - Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity); + Map<String, Double> relevantText = textRetriever.getRelevantTextSimple(entity); String queryString = "("; Set<String> terms = new HashSet<>(); - for (Entry<List<Token>, Double> entry : relevantText.entrySet()) { - List<Token> tokens = entry.getKey(); - String phrase = ""; - for (Token token : tokens) { -// terms.add(token.getRawForm()); - phrase += token.getRawForm() + " "; - } + for (Entry<String, Double> entry : relevantText.entrySet()) { + String tokens = entry.getKey(); + String phrase = tokens; phrase.trim(); terms.add(quotedString(phrase)); } @@ -330,10 +330,10 @@ String searchField = "comment"; OWLOntology ontology = OWLManager.createOWLOntologyManager().loadOntologyFromOntologyDocument(new File("src/test/resources/org/dllearner/algorithms/isle/dbpedia_3.9.owl")); SolrSyntacticIndex index = new SolrSyntacticIndex(ontology, solrServerURL, searchField); - index.loadCache(new File("frequencies.obj")); - long n = index.getNumberOfDocumentsFor(new NamedClass("http://dbpedia.org/ontology/Abbey")); + index.loadCache(new File("entity_frequencies.obj")); + long n = index.getNumberOfDocumentsFor(new NamedClass("http://dbpedia.org/ontology/Comics")); System.out.println(n); - n = index.getNumberOfDocumentsFor(new NamedClass("http://dbpedia.org/ontology/Abbey"), new ObjectProperty("http://dbpedia.org/ontology/largestCity")); + n = index.getNumberOfDocumentsFor(new NamedClass("http://dbpedia.org/ontology/Comics"), new ObjectProperty("http://dbpedia.org/ontology/largestCity")); System.out.println(n); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2014-02-17 13:55:46 UTC (rev 4228) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2014-02-17 15:51:37 UTC (rev 4229) @@ -98,7 +98,11 @@ } //remove content in brackets like (...) label = label.replaceAll("\\s?\\((.*?)\\)", ""); - textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(label, determineHeadNoun), weight); + try { + textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(label, determineHeadNoun), weight); + } catch (Exception e1) { + e1.printStackTrace(); + } } } } @@ -114,6 +118,45 @@ return textWithWeight; } + @Override + public Map<String, Double> getRelevantTextSimple(Entity entity) { + Map<String, Double> textWithWeight = new HashMap<String, Double>(); + + OWLEntity e = OWLAPIConverter.getOWLAPIEntity(entity); + + for (OWLAnnotationProperty property : properties) { + Set<OWLAnnotation> annotations = e.getAnnotations(ontology, property); + for (OWLAnnotation annotation : annotations) { + if (annotation.getValue() instanceof OWLLiteral) { + OWLLiteral val = (OWLLiteral) annotation.getValue(); + if (val.hasLang(language)) { + //trim + String label = val.getLiteral().trim(); + if(entity instanceof NamedClass){ + label = label.toLowerCase(); + } + //remove content in brackets like (...) + label = label.replaceAll("\\s?\\((.*?)\\)", ""); + try { + textWithWeight.put(label, weight); + } catch (Exception e1) { + e1.printStackTrace(); + } + } + } + } + } + + if(textWithWeight.isEmpty() && useShortFormFallback){ + String shortForm = sfp.getShortForm(IRI.create(entity.getURI())); + shortForm = Joiner.on(" ").join(LinguisticUtil.getInstance().getWordsFromCamelCase(shortForm)); + shortForm = Joiner.on(" ").join(LinguisticUtil.getInstance().getWordsFromUnderscored(shortForm)).trim(); + textWithWeight.put(shortForm, weight); + } + + return textWithWeight; + } + /** * Returns for each entity in the ontology all relevant text, i.e. either the annotations or the short form of the IRI as fallback. * @return Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/EntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/EntityTextRetriever.java 2014-02-17 13:55:46 UTC (rev 4228) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/EntityTextRetriever.java 2014-02-17 15:51:37 UTC (rev 4229) @@ -50,5 +50,11 @@ public Map<List<Token>, Double> getRelevantText(Entity entity); public Map<Entity, Set<List<Token>>> getRelevantText(OWLOntology ontology); + + /** + * @param entity + * @return + */ + Map<String, Double> getRelevantTextSimple(Entity entity); } Modified: trunk/components-core/src/main/java/org/dllearner/core/owl/DatatypeProperty.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/core/owl/DatatypeProperty.java 2014-02-17 13:55:46 UTC (rev 4228) +++ trunk/components-core/src/main/java/org/dllearner/core/owl/DatatypeProperty.java 2014-02-17 15:51:37 UTC (rev 4229) @@ -80,29 +80,36 @@ return name.compareTo(o.name); } + /* (non-Javadoc) + * @see org.dllearner.core.owl.KBElement#toManchesterSyntaxString(java.lang.String, java.util.Map) + */ @Override - public boolean equals(Object nc) { - // standard equals code - always return true for object identity and - // false if classes differ - if(nc == this) { - return true; - } else if(getClass() != nc.getClass()) { - return false; - } - // compare on URIs - return ((DatatypeProperty)nc).name.equals(name); + public String toManchesterSyntaxString(String baseURI, Map<String, String> prefixes) { + return Helper.getAbbreviatedString(name, baseURI, prefixes); } - + @Override public int hashCode() { - return name.hashCode(); + final int prime = 31; + int result = 1; + result = prime * result + ((name == null) ? 0 : name.hashCode()); + return result; } - /* (non-Javadoc) - * @see org.dllearner.core.owl.KBElement#toManchesterSyntaxString(java.lang.String, java.util.Map) - */ @Override - public String toManchesterSyntaxString(String baseURI, Map<String, String> prefixes) { - return Helper.getAbbreviatedString(name, baseURI, prefixes); + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + DatatypeProperty other = (DatatypeProperty) obj; + if (name == null) { + if (other.name != null) + return false; + } else if (!name.equals(other.name)) + return false; + return true; } } Modified: trunk/components-core/src/main/java/org/dllearner/core/owl/ObjectProperty.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/core/owl/ObjectProperty.java 2014-02-17 13:55:46 UTC (rev 4228) +++ trunk/components-core/src/main/java/org/dllearner/core/owl/ObjectProperty.java 2014-02-17 15:51:37 UTC (rev 4229) @@ -71,24 +71,6 @@ return name.compareTo(o.name); } - @Override - public boolean equals(Object nc) { - // standard equals code - always return true for object identity and - // false if classes differ - if(nc == this) { - return true; - } else if(getClass() != nc.getClass()) { - return false; - } - // compare on URIs - return ((ObjectProperty)nc).name.equals(name); - } - - @Override - public int hashCode() { - return name.hashCode(); - } - /* (non-Javadoc) * @see org.dllearner.core.owl.KBElement#toManchesterSyntaxString(java.lang.String, java.util.Map) */ Modified: trunk/components-core/src/main/java/org/dllearner/core/owl/ObjectPropertyExpression.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/core/owl/ObjectPropertyExpression.java 2014-02-17 13:55:46 UTC (rev 4228) +++ trunk/components-core/src/main/java/org/dllearner/core/owl/ObjectPropertyExpression.java 2014-02-17 15:51:37 UTC (rev 4229) @@ -44,5 +44,30 @@ public String getName() { return name; } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((name == null) ? 0 : name.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + ObjectPropertyExpression other = (ObjectPropertyExpression) obj; + if (name == null) { + if (other.name != null) + return false; + } else if (!name.equals(other.name)) + return false; + return true; + } } Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java 2014-02-17 13:55:46 UTC (rev 4228) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java 2014-02-17 15:51:37 UTC (rev 4229) @@ -95,6 +95,7 @@ import org.semanticweb.owlapi.util.OWLEntityRemover; import com.google.common.base.Charsets; +import com.google.common.collect.Lists; import com.google.common.collect.Sets; import com.google.common.hash.HashCode; import com.google.common.hash.HashFunction; @@ -260,10 +261,13 @@ List<NamedClass> classList = new ArrayList<>(classes); // Collections.reverse(classList); // classList = classList.subList(0, 2); +// classList = Lists.newArrayList( +// new NamedClass("http://dbpedia.org/ontology/Comics"), +// new NamedClass("http://dbpedia.org/ontology/Actor"), +// new NamedClass("http://dbpedia.org/ontology/Book")); + new SolrSyntacticIndex(schema, solrServerURL, searchField).buildIndex(classList); + System.exit(0); -// new SolrSyntacticIndex(schema, solrServerURL, searchField).buildIndex(classList); -// System.exit(0); - ExecutorService executor = Executors.newFixedThreadPool(6); for (final NamedClass cls : classList) { @@ -316,7 +320,6 @@ // e.printStackTrace(); // } - /** //set up the learning try { // set KB @@ -436,7 +439,6 @@ } catch (ComponentInitException e) { e.printStackTrace(); } - */ } /** @@ -697,7 +699,8 @@ private Index getSyntacticIndex(){ SolrSyntacticIndex index = new SolrSyntacticIndex(schema, solrServerURL, searchField); // try { -// index.loadCache(new File("src/test/resources/org/dllearner/algorithms/isle/dbpedia_entity_frequencies.obj")); +//// index.loadCache(new File("src/test/resources/org/dllearner/algorithms/isle/dbpedia_entity_frequencies.obj")); +// index.loadCache(new File("frequencies.obj")); // } catch (IOException e) { // e.printStackTrace(); // } @@ -767,8 +770,8 @@ // la.start(); long start = System.currentTimeMillis(); -// new DBpediaExperiment().run(); - new DBpediaExperiment().run(new NamedClass("http://dbpedia.org/ontology/Book")); + new DBpediaExperiment().run(); +// new DBpediaExperiment().run(new NamedClass("http://dbpedia.org/ontology/Comics")); long end = System.currentTimeMillis(); logger.info("Operation took " + (end - start) + "ms"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |