From: <lor...@us...> - 2011-09-29 13:03:05
|
Revision: 3292 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3292&view=rev Author: lorenz_b Date: 2011-09-29 13:02:55 +0000 (Thu, 29 Sep 2011) Log Message: ----------- Started test skript to evaluate indexes. Modified Paths: -------------- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/search/ThresholdSlidingSolrSearch.java Added Paths: ----------- trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/IndexEvaluation.java Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/search/ThresholdSlidingSolrSearch.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/search/ThresholdSlidingSolrSearch.java 2011-09-29 08:47:15 UTC (rev 3291) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/search/ThresholdSlidingSolrSearch.java 2011-09-29 13:02:55 UTC (rev 3292) @@ -33,7 +33,7 @@ if(threshold < 1){ queryWithThreshold = queryString + "~" + threshold; } - System.out.println(queryWithThreshold); + resources.addAll(findResources(queryWithThreshold, limit - resources.size(), 0)); threshold -= step; } Added: trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/IndexEvaluation.java =================================================================== --- trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/IndexEvaluation.java (rev 0) +++ trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/IndexEvaluation.java 2011-09-29 13:02:55 UTC (rev 3292) @@ -0,0 +1,352 @@ +package org.dllearner.algorithm.tbsl; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; + +import org.apache.log4j.FileAppender; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.SimpleLayout; +import org.dllearner.algorithm.tbsl.learning.SPARQLTemplateBasedLearner; +import org.dllearner.algorithm.tbsl.search.HierarchicalSolrSearch; +import org.dllearner.algorithm.tbsl.search.SolrSearch; +import org.dllearner.algorithm.tbsl.search.ThresholdSlidingSolrSearch; +import org.dllearner.algorithm.tbsl.sparql.Slot; +import org.dllearner.algorithm.tbsl.sparql.SlotType; +import org.dllearner.algorithm.tbsl.sparql.Template; +import org.dllearner.algorithm.tbsl.templator.Templator; +import org.dllearner.algorithm.tbsl.util.Prefixes; +import org.dllearner.algorithm.tbsl.util.StringSimilarityComparator; +import org.ini4j.InvalidFileFormatException; +import org.ini4j.Options; +import org.w3c.dom.DOMException; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + +public class IndexEvaluation { + + private static Logger logger = Logger.getLogger(IndexEvaluation.class); + + private SortedMap<Integer, String> id2Question = new TreeMap<Integer, String>(); + private SortedMap<Integer, String> id2Query = new TreeMap<Integer, String>(); + + private Map<String, String> prefixMap; + + private Templator templateGenerator; + + private SolrSearch resource_index; + private SolrSearch class_index; + private SolrSearch property_index; + + public IndexEvaluation(File ... evaluationFiles) { + for(File file : evaluationFiles){ + readQueries(file); + } + init(); + } + + private void init(){ + try { + Options options = new Options(new FileInputStream(this.getClass().getClassLoader().getResource("tbsl/tbsl.properties").getPath())); + + templateGenerator = new Templator(); + prefixMap = Prefixes.getPrefixes(); + + String resourcesIndexUrl = options.fetch("solr.resources.url"); + String resourcesIndexSearchField = options.fetch("solr.resources.searchfield"); + resource_index = new ThresholdSlidingSolrSearch(resourcesIndexUrl, resourcesIndexSearchField, 1.0, 0.1); + + String classesIndexUrl = options.fetch("solr.classes.url"); + String classesIndexSearchField = options.fetch("solr.classes.searchfield"); + class_index = new ThresholdSlidingSolrSearch(classesIndexUrl, classesIndexSearchField, 1.0, 0.1); + + String propertiesIndexUrl = options.fetch("solr.properties.url"); + String propertiesIndexSearchField = options.fetch("solr.properties.searchfield"); + SolrSearch labelBasedPropertyIndex = new SolrSearch(propertiesIndexUrl, propertiesIndexSearchField); + + String boaPatternIndexUrl = options.fetch("solr.boa.properties.url"); + String boaPatternIndexSearchField = options.fetch("solr.boa.properties.searchfield"); + SolrSearch patternBasedPropertyIndex = new SolrSearch(boaPatternIndexUrl, boaPatternIndexSearchField); + + property_index = new HierarchicalSolrSearch(patternBasedPropertyIndex, labelBasedPropertyIndex); + } catch (InvalidFileFormatException e) { + e.printStackTrace(); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + private void readQueries(File file){ + logger.info("Reading file containing queries and answers..."); + try { + DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); + DocumentBuilder db = dbf.newDocumentBuilder(); + Document doc = db.parse(file); + doc.getDocumentElement().normalize(); + NodeList questionNodes = doc.getElementsByTagName("question"); + int id; + String question; + String query; + for(int i = 0; i < questionNodes.getLength(); i++){ + Element questionNode = (Element) questionNodes.item(i); + //read question ID + id = Integer.valueOf(questionNode.getAttribute("id")); + //Read question + question = ((Element)questionNode.getElementsByTagName("string").item(0)).getChildNodes().item(0).getNodeValue().trim(); + //Read SPARQL query + query = ((Element)questionNode.getElementsByTagName("query").item(0)).getChildNodes().item(0).getNodeValue().trim(); + + id2Question.put(id, question); + id2Query.put(id, query); + + } + } catch (DOMException e) { + e.printStackTrace(); + } catch (ParserConfigurationException e) { + e.printStackTrace(); + } catch (SAXException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + logger.info("Done."); + } + + private Set<String> extractEntities(String query){ + List<String> exclusions = Arrays.asList(new String[]{"rdf", "rdfs"}); + Set<String> entities = new HashSet<String>(); + //pattern to detect resources + Pattern pattern = Pattern.compile("(\\w+):(\\w+)"); + Matcher matcher = pattern.matcher(query); + String group; + while(matcher.find()){ + group = matcher.group(); + boolean add = true; + for(String ex : exclusions){ + if(group.contains(ex)){ + add = false; + break; + } + } + if(add){ + entities.add(getFullURI(group)); + } + } + //pattern to detect string literals + pattern = Pattern.compile("'(\\w+)'@en"); + matcher = pattern.matcher(query); + while(matcher.find()){ + group = matcher.group(); + entities.add(getFullURI(buildEntityFromLabel(group))); + } + + return entities; + } + + private String getFullURI(String prefixedURI){ + String fullURI = prefixedURI; + String prefix; + String uri; + for(Entry<String, String> uri2Prefix : prefixMap.entrySet()){ + uri = uri2Prefix.getKey(); + prefix = uri2Prefix.getValue(); + if(prefixedURI.startsWith(prefix)){ + fullURI = prefixedURI.replace(prefix + ":", uri); + break; + } + } + return fullURI; + } + + private String getPrefixedURI(String fullURI){ + String prefixedURI = fullURI; + String prefix; + String uri; + for(Entry<String, String> prefix2URI : prefixMap.entrySet()){ + prefix = prefix2URI.getKey(); + uri = prefix2URI.getValue(); + if(fullURI.startsWith(uri)){ + prefixedURI = fullURI.replace(uri, prefix + ":" ); + break; + } + } + return prefixedURI; + } + + private String buildEntityFromLabel(String label){ + String base = "res:"; + String entity = label.substring(1).substring(0, label.lastIndexOf("'")-1).replace(" ", "_"); + return base + entity; + } + + private List<String> getCandidateURIsSortedBySimilarity(Slot slot){ + logger.info("Generating URI candidates for " + slot.getWords() + "..."); + List<String> sortedURIs = new ArrayList<String>(); + //get the appropriate index based on slot type + SolrSearch index = getIndexBySlotType(slot); + + SortedSet<String> tmp; + List<String> uris; + + //prune the word list only when slot type is not RESOURCE + List<String> words; + if(slot.getSlotType() == SlotType.RESOURCE){ + words = slot.getWords(); + } else { +// words = pruneList(slot.getWords());//getLemmatizedWords(slot.getWords()); + words = pruneList(slot.getWords()); + } + + for(String word : words){ + tmp = new TreeSet<String>(new StringSimilarityComparator(word)); + uris = index.getResources(word, 5); + + tmp.addAll(uris); + sortedURIs.addAll(tmp); + tmp.clear(); + } + + logger.info("URIs: " + sortedURIs); + return sortedURIs; + } + + private List<String> pruneList(List<String> words){ + List<String> prunedList = new ArrayList<String>(); + for(String w1 : words){ + boolean smallest = true; + for(String w2 : words){ + if(!w1.equals(w2)){ + if(w1.contains(w2)){ + smallest = false; + break; + } + } + } + if(smallest){ + prunedList.add(w1); + } + } + logger.info("Pruned list: " + prunedList); +// return getLemmatizedWords(words); + return prunedList; + } + + private SolrSearch getIndexBySlotType(Slot slot){ + SolrSearch index = null; + SlotType type = slot.getSlotType(); + if(type == SlotType.CLASS){ + index = class_index; + } else if(type == SlotType.PROPERTY || type == SlotType.SYMPROPERTY){ + index = property_index; + } else if(type == SlotType.RESOURCE || type == SlotType.UNSPEC){ + index = resource_index; + } + return index; + } + + public void run(){ + String question; + String targetQuery; + Set<String> targetEntities; + Set<Template> templates; + for(Entry<Integer, String> entry : id2Question.entrySet()){ + try { + question = entry.getValue(); + targetQuery = id2Query.get(entry.getKey()); + targetQuery = targetQuery.replace("onto:", "dbo:").replace("res:", "dbr:").replace("prop:", "dbp:"); + + logger.info(question); + logger.info(targetQuery); + + + templates = templateGenerator.buildTemplates(question); + if(!templates.isEmpty()){ + targetEntities = extractEntities(targetQuery); + logger.info("Target entities:\n" + targetEntities); + + + SortedSet<Slot> slots = new TreeSet<Slot>(new Comparator<Slot>() { + + @Override + public int compare(Slot o1, Slot o2) { + if(o1.getToken().equals(o2.getToken()) && o1.getSlotType()==o2.getSlotType()){ + return 0; + } else { + return o1.getToken().compareTo(o2.getToken()); + } + } + }); + for(Template t : templates){ + slots.addAll(t.getSlots()); + } + + Set<List<String>> uriLists = new HashSet<List<String>>(); + for(Slot slot : slots){ + uriLists.add(getCandidateURIsSortedBySimilarity(slot)); + } + + int pos = -1; + for(String entity : targetEntities){ + for(List<String> uris : uriLists){ + pos = uris.indexOf(entity); + if(pos >= 0){ + break; + } + } + if(pos == -1){ + logger.info(entity + " not covered."); + } else { + logger.info(entity + " covered at position " + pos); + } + } + } else { + logger.info("No template generated."); + } + + + + } catch (Exception e) { + e.printStackTrace(); + } + + } + } + + public static void main(String[] args) throws IOException { + Logger.getRootLogger().removeAllAppenders(); + Logger.getRootLogger().addAppender(new FileAppender(new SimpleLayout(), "log/index_eval.txt")); + Logger.getLogger(IndexEvaluation.class).setLevel(Level.INFO); + if(args.length == 0){ + System.out.println("Usage: IndexEvaluation <file>"); + System.exit(0); + } + + File file = new File(IndexEvaluation.class.getClassLoader().getResource(args[0]).getPath()); + new IndexEvaluation(file).run(); + } + +} Property changes on: trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/IndexEvaluation.java ___________________________________________________________________ Added: svn:mime-type + text/plain This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |