From: <lor...@us...> - 2013-09-04 08:04:37
|
Revision: 4043 http://sourceforge.net/p/dl-learner/code/4043 Author: lorenz_b Date: 2013-09-04 08:04:33 +0000 (Wed, 04 Sep 2013) Log Message: ----------- Fixed bug in linguistic annotator. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java 2013-09-03 16:53:18 UTC (rev 4042) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java 2013-09-04 08:04:33 UTC (rev 4043) @@ -87,5 +87,13 @@ } return cnt; } + + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + return "Document:\n" + document.getContent() + "\nAnnotations:" + annotations; + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-09-03 16:53:18 UTC (rev 4042) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-09-04 08:04:33 UTC (rev 4043) @@ -63,6 +63,11 @@ return true; } - - + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + return "\"" + getReferencedDocument.getContent().substring(offset, offset+length) + "\" at position " + offset; + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotation.java 2013-09-03 16:53:18 UTC (rev 4042) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotation.java 2013-09-04 08:04:33 UTC (rev 4043) @@ -52,7 +52,13 @@ return true; } - + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Annotation#toString() + */ + @Override + public String toString() { + return super.toString() + "->" + entity; + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java 2013-09-03 16:53:18 UTC (rev 4042) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java 2013-09-04 08:04:33 UTC (rev 4043) @@ -14,15 +14,20 @@ @Override public Set<Annotation> annotate(Document document) { - String s = document.getRawContent(); + String s = document.getRawContent().trim(); Set<Annotation> annotations = new HashSet<Annotation>(); - Pattern pattern = Pattern.compile(" "); + Pattern pattern = Pattern.compile("\\u0020+"); Matcher matcher = pattern.matcher(s); // Check all occurrences + int start = 0; while (matcher.find()) { - annotations.add(new Annotation(document, matcher.start(), - matcher.end() - matcher.start())); + int end = matcher.start(); + annotations.add(new Annotation(document, start, end - start)); + start = matcher.end(); } + if(start < s.length()-1){ + annotations.add(new Annotation(document, start, s.length() - start)); + } return annotations; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-09-03 16:53:18 UTC (rev 4042) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-09-04 08:04:33 UTC (rev 4043) @@ -1,9 +1,11 @@ package org.dllearner.algorithms.isle.index.semantic; +import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; +import org.apache.lucene.document.Field; import org.dllearner.algorithms.isle.EntityCandidateGenerator; import org.dllearner.algorithms.isle.WordSenseDisambiguation; import org.dllearner.algorithms.isle.index.AnnotatedDocument; @@ -12,6 +14,10 @@ import org.dllearner.algorithms.isle.index.TextDocument; import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex; import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLAnnotation; +import org.semanticweb.owlapi.model.OWLAnnotationProperty; +import org.semanticweb.owlapi.model.OWLEntity; +import org.semanticweb.owlapi.model.OWLLiteral; import org.semanticweb.owlapi.model.OWLOntology; /** @@ -43,6 +49,7 @@ * Precompute the whole index, i.e. iterate over all entities and compute all annotated documents. */ public void buildIndex(Set<TextDocument> documents){ + index = new HashMap<Entity, Set<AnnotatedDocument>>(); for (TextDocument document : documents) { AnnotatedDocument annotatedDocument = semanticAnnotator.processDocument(document); for (Entity entity : annotatedDocument.getContainedEntities()) { @@ -56,6 +63,35 @@ } } + public void buildIndex(OWLAnnotationProperty annotationProperty, String language){ + Set<OWLEntity> schemaEntities = new HashSet<OWLEntity>(); + schemaEntities.addAll(ontology.getClassesInSignature()); + schemaEntities.addAll(ontology.getObjectPropertiesInSignature()); + schemaEntities.addAll(ontology.getDataPropertiesInSignature()); + Set<TextDocument> documents = new HashSet<TextDocument>(); + for (OWLEntity entity : schemaEntities) { + String label = null; + Set<OWLAnnotation> annotations = entity.getAnnotations(ontology, annotationProperty); + for (OWLAnnotation annotation : annotations) { + if (annotation.getValue() instanceof OWLLiteral) { + OWLLiteral val = (OWLLiteral) annotation.getValue(); + if (language != null) { + if(val.hasLang(language)){ + label = val.getLiteral(); + } + + } else { + label = val.getLiteral(); + } + } + } + if(label != null){ + documents.add(new TextDocument(label)); + } + } + buildIndex(documents); + } + /** * Returns the set of annotated documents which reference the given entity using one of its surface forms. * @@ -63,6 +99,11 @@ * @return documents referencing given entity */ public Set<AnnotatedDocument> getDocuments(Entity entity){ + if(index == null){ + System.err.println("You have to prebuild the index before you can use this method."); + System.exit(1); + } + Set<AnnotatedDocument> annotatedDocuments = index.get(entity); return annotatedDocuments; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |