[DL-Learner SVN] SF.net SVN: dl-learner:[3851] branches/hmm/components-ext/src/main/java/org /dlle

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 3851
          http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3851&view=rev
Author:   kirdie
Date:     2012-09-27 13:16:49 +0000 (Thu, 27 Sep 2012)
Log Message:
-----------


Added Paths:
-----------
    branches/hmm/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner2HMM.java

Removed Paths:
-------------
    branches/hmm/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner2.java

Deleted: branches/hmm/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner2.java
===================================================================

--- branches/hmm/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner2.java	2012-09-27 09:39:55 UTC (rev 3850)
+++ branches/hmm/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner2.java	2012-09-27 13:16:49 UTC (rev 3851)
@@ -1,972 +0,0 @@
-package org.dllearner.algorithm.tbsl.learning;
-
-import hmm.HiddenMarkovModel;
-import hmm.ResourceInfo;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.SortedMap;
-import java.util.SortedSet;
-import java.util.TreeMap;
-import java.util.TreeSet;
-import org.apache.commons.collections15.MultiMap;
-import org.apache.log4j.Logger;
-import org.dllearner.algorithm.tbsl.nlp.Lemmatizer;
-import org.dllearner.algorithm.tbsl.nlp.LingPipeLemmatizer;
-import org.dllearner.algorithm.tbsl.nlp.PartOfSpeechTagger;
-import org.dllearner.algorithm.tbsl.nlp.PlingStemmer;
-import org.dllearner.algorithm.tbsl.nlp.StanfordPartOfSpeechTagger;
-import org.dllearner.algorithm.tbsl.nlp.WordNet;
-import org.dllearner.algorithm.tbsl.sparql.Allocation;
-import org.dllearner.algorithm.tbsl.sparql.Query;
-import org.dllearner.algorithm.tbsl.sparql.SPARQL_QueryType;
-import org.dllearner.algorithm.tbsl.sparql.Slot;
-import org.dllearner.algorithm.tbsl.sparql.SlotType;
-import org.dllearner.algorithm.tbsl.sparql.Template;
-import org.dllearner.algorithm.tbsl.sparql.WeightedQuery;
-import org.dllearner.algorithm.tbsl.templator.Templator;
-import org.dllearner.algorithm.tbsl.util.Knowledgebase;
-import org.dllearner.algorithm.tbsl.util.PopularityMap;
-import org.dllearner.algorithm.tbsl.util.PopularityMap.EntityType;
-import org.dllearner.algorithm.tbsl.util.Similarity;
-import org.dllearner.common.index.Index;
-import org.dllearner.common.index.IndexResultItem;
-import org.dllearner.common.index.IndexResultSet;
-import org.dllearner.common.index.MappingBasedIndex;
-import org.dllearner.common.index.SPARQLDatatypePropertiesIndex;
-import org.dllearner.common.index.SPARQLIndex;
-import org.dllearner.common.index.SPARQLObjectPropertiesIndex;
-import org.dllearner.common.index.SPARQLPropertiesIndex;
-import org.dllearner.common.index.VirtuosoDatatypePropertiesIndex;
-import org.dllearner.common.index.VirtuosoObjectPropertiesIndex;
-import org.dllearner.common.index.VirtuosoPropertiesIndex;
-import org.dllearner.core.ComponentInitException;
-import org.dllearner.core.LearningProblem;
-import org.dllearner.core.SparqlQueryLearningAlgorithm;
-import org.dllearner.kb.LocalModelBasedSparqlEndpointKS;
-import org.dllearner.kb.SparqlEndpointKS;
-import org.dllearner.kb.sparql.ExtractionDBCache;
-import org.dllearner.kb.sparql.SparqlEndpoint;
-import org.dllearner.kb.sparql.SparqlQuery;
-import org.dllearner.reasoning.SPARQLReasoner;
-import org.ini4j.Options;
-import org.semanticweb.owlapi.model.IRI;
-import org.semanticweb.owlapi.util.SimpleIRIShortFormProvider;
-import com.hp.hpl.jena.ontology.OntModelSpec;
-import com.hp.hpl.jena.query.QueryExecutionFactory;
-import com.hp.hpl.jena.query.QueryFactory;
-import com.hp.hpl.jena.query.QuerySolution;
-import com.hp.hpl.jena.query.ResultSet;
-import com.hp.hpl.jena.query.Syntax;
-import com.hp.hpl.jena.rdf.model.Model;
-import com.hp.hpl.jena.rdf.model.ModelFactory;
-import com.hp.hpl.jena.sparql.engine.http.QueryEngineHTTP;
-import com.jamonapi.Monitor;
-import com.jamonapi.MonitorFactory;
-
-/** The old learner, taken over by Konrad Höffner for experiments with the Hidden Markov Model algorithm by Saeedeh Shekarpour.
- * 
- * */
-public class SPARQLTemplateBasedLearner2 implements SparqlQueryLearningAlgorithm
-{
-	enum Mode {BEST_QUERY, BEST_NON_EMPTY_QUERY}
-	private Mode mode = Mode.BEST_QUERY;
-	
-	/** used to create a label out of the URI when there is no label available in the SPARQL endpoint.*/
-	private static SimpleIRIShortFormProvider sfp = new SimpleIRIShortFormProvider();
-
-	private static final Logger logger = Logger.getLogger(SPARQLTemplateBasedLearner2.class);
-	/** synonyms are great but are not used yet by the HMM algorithm. **/
-	private static final boolean	CREATE_SYNONYMS	= false;
-	/** The minimum score of items that are accepted from the Sindice search BOA index. **/
-	private static final Double	BOA_THRESHOLD	=  0.9;
-	private Monitor templateMon = MonitorFactory.getTimeMonitor("template");
-	private Monitor sparqlMon = MonitorFactory.getTimeMonitor("sparql");
-
-	private boolean useRemoteEndpointValidation;
-	private boolean stopIfQueryResultNotEmpty;
-	private int maxTestedQueriesPerTemplate = 50;
-	private int maxQueryExecutionTimeInSeconds;
-	private int maxTestedQueries = 200;
-	private int maxIndexResults;
-
-	private SparqlEndpoint endpoint = null;
-	private Model model = null;
-
-	private ExtractionDBCache cache = new ExtractionDBCache("cache");
-
-	private Index resourcesIndex;
-	private Index classesIndex;
-	private Index propertiesIndex;
-
-	private Index datatypePropertiesIndex;
-	private Index objectPropertiesIndex;
-
-	private MappingBasedIndex mappingIndex;
-
-	private Templator templateGenerator = null;
-	private Lemmatizer lemmatizer;
-	private PartOfSpeechTagger posTagger;
-	private WordNet wordNet;
-
-	private String question;
-	private int learnedPos = -1;
-
-	private Set<Template> templates;
-	private Map<Template, Collection<? extends Query>> template2Queries;
-	private Map<Slot, List<String>> slot2URI;
-
-	private Collection<WeightedQuery> sparqlQueryCandidates;
-	private SortedSet<WeightedQuery> learnedSPARQLQueries;
-	private SortedSet<WeightedQuery> generatedQueries;
-
-	private SPARQLReasoner reasoner;
-
-	private String currentlyExecutedQuery;
-
-	private boolean dropZeroScoredQueries = true;
-	private boolean useManualMappingsIfExistOnly = true;
-
-	private boolean multiThreaded = true;
-
-	private String [] grammarFiles = new String[]{"tbsl/lexicon/english.lex"};
-
-	private PopularityMap popularityMap;
-
-	private Set<String> relevantKeywords;
-
-	private boolean useDomainRangeRestriction = true;
-
-	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index resourcesIndex, Index classesIndex, Index propertiesIndex){
-		this(endpoint, resourcesIndex, classesIndex, propertiesIndex, new StanfordPartOfSpeechTagger());
-	}
-
-	public SPARQLTemplateBasedLearner2(Knowledgebase knowledgebase, PartOfSpeechTagger posTagger, WordNet wordNet, Options options){
-		this(knowledgebase.getEndpoint(), knowledgebase.getResourceIndex(), knowledgebase.getClassIndex(),knowledgebase.getPropertyIndex(), posTagger, wordNet, options);
-	}
-
-	public SPARQLTemplateBasedLearner2(Knowledgebase knowledgebase){
-		this(knowledgebase.getEndpoint(), knowledgebase.getResourceIndex(), knowledgebase.getClassIndex(),knowledgebase.getPropertyIndex(), new StanfordPartOfSpeechTagger(), new WordNet(), new Options());
-	}
-
-	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index index){
-		this(endpoint, index, new StanfordPartOfSpeechTagger());
-	}
-
-	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index resourcesIndex, Index classesIndex, Index propertiesIndex, PartOfSpeechTagger posTagger){
-		this(endpoint, resourcesIndex, classesIndex, propertiesIndex, posTagger, new WordNet(), new Options());
-	}
-
-	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index index, PartOfSpeechTagger posTagger){
-		this(endpoint, index, posTagger, new WordNet(), new Options());
-	}
-
-	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index resourcesIndex, Index classesIndex, Index propertiesIndex, WordNet wordNet){
-		this(endpoint, resourcesIndex, classesIndex, propertiesIndex, new StanfordPartOfSpeechTagger(), wordNet, new Options());
-	}
-
-	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index index, WordNet wordNet){
-		this(endpoint, index, new StanfordPartOfSpeechTagger(), wordNet, new Options());
-	}
-
-	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index resourcesIndex, Index classesIndex, Index propertiesIndex, PartOfSpeechTagger posTagger, WordNet wordNet){
-		this(endpoint, resourcesIndex, classesIndex, propertiesIndex, posTagger, wordNet, new Options(), new ExtractionDBCache("cache"));
-	}
-
-	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index index, PartOfSpeechTagger posTagger, WordNet wordNet){
-		this(endpoint, index, index, index, posTagger, wordNet, new Options(), new ExtractionDBCache("cache"));
-	}
-
-	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index resourcesIndex, Index classesIndex, Index propertiesIndex, PartOfSpeechTagger posTagger, WordNet wordNet, Options options){
-		this(endpoint, resourcesIndex, classesIndex, propertiesIndex, posTagger, wordNet, options, new ExtractionDBCache("cache"));
-	}
-
-	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index index, PartOfSpeechTagger posTagger, WordNet wordNet, Options options){
-		this(endpoint, index, index, index, posTagger, wordNet, options, new ExtractionDBCache("cache"));
-	}
-
-	/**
-	 * Main constructor for learning against a remote SPARQL endpoint; the shorter endpoint based constructors delegate to it.
-	 * Stores the given collaborators, applies the options and creates the reasoner on top of the endpoint.
-	 * @param endpoint the SPARQL endpoint the queries are executed on
-	 * @param resourcesIndex index used to look up resources
-	 * @param classesIndex index used to look up classes
-	 * @param propertiesIndex index used to look up properties; also the basis for the datatype and object property indexes
-	 * @param posTagger part of speech tagger handed to the template generator in init()
-	 * @param wordNet WordNet instance handed to the template generator in init()
-	 * @param options configuration values, read in setOptions(Options)
-	 * @param cache cache used for SELECT queries and by the reasoner
-	 */
-	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index resourcesIndex, Index classesIndex, Index propertiesIndex, PartOfSpeechTagger posTagger, WordNet wordNet, Options options, ExtractionDBCache cache){
-		this.endpoint = endpoint;
-		this.resourcesIndex = resourcesIndex;
-		this.classesIndex = classesIndex;
-		this.propertiesIndex = propertiesIndex;
-		this.posTagger = posTagger;
-		this.wordNet = wordNet;
-		this.cache = cache;
-
-		// has to run after the cache is assigned because setOptions configures the cache's maximum execution time
-		setOptions(options);
-
-		// a SPARQL based properties index is split into a datatype and an object property index (Virtuoso specific variants if applicable),
-		// any other index type is used as is for both kinds of properties
-		if(propertiesIndex instanceof SPARQLPropertiesIndex){
-			if(propertiesIndex instanceof VirtuosoPropertiesIndex){
-				datatypePropertiesIndex = new VirtuosoDatatypePropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
-				objectPropertiesIndex = new VirtuosoObjectPropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
-			} else {
-				datatypePropertiesIndex = new SPARQLDatatypePropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
-				objectPropertiesIndex = new SPARQLObjectPropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
-			}
-		} else {
-			datatypePropertiesIndex = propertiesIndex;
-			objectPropertiesIndex = propertiesIndex;
-		}
-		reasoner = new SPARQLReasoner(new SparqlEndpointKS(endpoint), cache);
-	}
-
-	public SPARQLTemplateBasedLearner2(Model model, Index resourcesIndex, Index classesIndex, Index propertiesIndex){
-		this(model, resourcesIndex, classesIndex, propertiesIndex, new StanfordPartOfSpeechTagger());
-	}
-
-	public SPARQLTemplateBasedLearner2(Model model, Index resourcesIndex, Index classesIndex, Index propertiesIndex, PartOfSpeechTagger posTagger){
-		this(model, resourcesIndex, classesIndex, propertiesIndex, posTagger, new WordNet(), new Options());
-	}
-
-	public SPARQLTemplateBasedLearner2(Model model, Index resourcesIndex, Index classesIndex, Index propertiesIndex, WordNet wordNet){
-		this(model, resourcesIndex, classesIndex, propertiesIndex, new StanfordPartOfSpeechTagger(), wordNet, new Options());
-	}
-
-	public SPARQLTemplateBasedLearner2(Model model, Index resourcesIndex, Index classesIndex, Index propertiesIndex, PartOfSpeechTagger posTagger, WordNet wordNet, Options options){
-		this(model, resourcesIndex, classesIndex, propertiesIndex, posTagger, wordNet, options, new ExtractionDBCache("cache"));
-	}
-
-	public SPARQLTemplateBasedLearner2(Model model, MappingBasedIndex mappingBasedIndex, PartOfSpeechTagger posTagger)
-	{
-		this(model, new SPARQLIndex(model),new SPARQLIndex(model),new SPARQLIndex(model),posTagger);
-		setMappingIndex(mappingBasedIndex);
-	}
-
-	/**
-	 * Main constructor for learning against a local Jena model; the shorter model based constructors delegate to it.
-	 * Stores the given collaborators, applies the options and creates the reasoner on top of an RDFS ontology model wrapping the given model.
-	 * @param model the local model the queries are executed on
-	 * @param resourcesIndex index used to look up resources
-	 * @param classesIndex index used to look up classes
-	 * @param propertiesIndex index used to look up properties; also the basis for the datatype and object property indexes
-	 * @param posTagger part of speech tagger handed to the template generator in init()
-	 * @param wordNet WordNet instance handed to the template generator in init()
-	 * @param options configuration values, read in setOptions(Options)
-	 * @param cache cache handed to the reasoner
-	 */
-	public SPARQLTemplateBasedLearner2(Model model, Index resourcesIndex, Index classesIndex, Index propertiesIndex, PartOfSpeechTagger posTagger, WordNet wordNet, Options options, ExtractionDBCache cache){
-		this.model = model;
-		this.resourcesIndex = resourcesIndex;
-		this.classesIndex = classesIndex;
-		this.propertiesIndex = propertiesIndex;
-		this.posTagger = posTagger;
-		this.wordNet = wordNet;
-		this.cache = cache;
-
-		// has to run after the cache is assigned because setOptions configures the cache's maximum execution time
-		setOptions(options);
-
-		// a SPARQL based properties index is split into a datatype and an object property index (Virtuoso specific variants if applicable),
-		// any other index type is used as is for both kinds of properties
-		if(propertiesIndex instanceof SPARQLPropertiesIndex){
-			if(propertiesIndex instanceof VirtuosoPropertiesIndex){
-				datatypePropertiesIndex = new VirtuosoDatatypePropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
-				objectPropertiesIndex = new VirtuosoObjectPropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
-			} else {
-				datatypePropertiesIndex = new SPARQLDatatypePropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
-				objectPropertiesIndex = new SPARQLObjectPropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
-			}
-		} else {
-			datatypePropertiesIndex = propertiesIndex;
-			objectPropertiesIndex = propertiesIndex;
-		}
-		reasoner = new SPARQLReasoner(new LocalModelBasedSparqlEndpointKS(ModelFactory.createOntologyModel(OntModelSpec.RDFS_MEM, model)), cache);
-	}
-
-	/**
-	 * Forwards the grammar files to the template generator.
-	 * @param grammarFiles the grammar files the template generator should use
-	 * @throws AssertionError if the template generator does not exist yet, i.e. init() was not called
-	 */
-	public void setGrammarFiles(String[] grammarFiles)
-	{
-		if(templateGenerator!=null)
-		{
-			templateGenerator.setGrammarFiles(grammarFiles);
-			return;
-		}
-		throw new AssertionError("Learner not initialized. Please call init();");
-	}
-
-	@Override
-	public void init() throws ComponentInitException {
-		templateGenerator = new Templator(posTagger, wordNet, grammarFiles);
-		lemmatizer = new LingPipeLemmatizer();
-	}
-
-	public void setMappingIndex(MappingBasedIndex mappingIndex) {
-		this.mappingIndex = mappingIndex;
-	}
-
-	public void setCache(ExtractionDBCache cache) {
-		this.cache = cache;
-	}
-
-	/**
-	 * Switches the learner to another knowledge base: takes over its endpoint and indexes, derives the
-	 * datatype and object property indexes and creates a new reasoner for the endpoint.
-	 * @param knowledgebase the knowledge base providing the endpoint and the resource, class, property and mapping index
-	 */
-	public void setKnowledgebase(Knowledgebase knowledgebase){
-		this.endpoint = knowledgebase.getEndpoint();
-		this.resourcesIndex = knowledgebase.getResourceIndex();
-		this.classesIndex = knowledgebase.getClassIndex();
-		this.propertiesIndex = knowledgebase.getPropertyIndex();
-		this.mappingIndex = knowledgebase.getMappingIndex();
-		// a SPARQL based properties index is split into a datatype and an object property index (Virtuoso specific variants if applicable),
-		// any other index type is used as is for both kinds of properties
-		if(propertiesIndex instanceof SPARQLPropertiesIndex){
-			if(propertiesIndex instanceof VirtuosoPropertiesIndex){
-				datatypePropertiesIndex = new VirtuosoDatatypePropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
-				objectPropertiesIndex = new VirtuosoObjectPropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
-			} else {
-				datatypePropertiesIndex = new SPARQLDatatypePropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
-				objectPropertiesIndex = new SPARQLObjectPropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
-			}
-		} else {
-			datatypePropertiesIndex = propertiesIndex;
-			objectPropertiesIndex = propertiesIndex;
-		}
-		reasoner = new SPARQLReasoner(new SparqlEndpointKS(endpoint));
-		// keep the reasoner on the shared cache, as the constructors and setEndpoint do
-		reasoner.setCache(cache);
-	}
-
-	public void setUseDomainRangeRestriction(boolean useDomainRangeRestriction) {
-		this.useDomainRangeRestriction = useDomainRangeRestriction;
-	}
-
-	/**
-	 * Only useful for evaluation. Passes the negated value to the template generator's setUNTAGGED_INPUT.
-	 * @param value the flag whose negation is handed to the template generator
-	 * @throws AssertionError if the template generator does not exist yet, i.e. init() was not called
-	 */
-	public void setUseIdealTagger(boolean value){
-		// same initialisation check as in setGrammarFiles instead of a bare NullPointerException
-		if(templateGenerator==null) {throw new AssertionError("Learner not initialized. Please call init();");}
-		templateGenerator.setUNTAGGED_INPUT(!value);
-	}
-
-	/**
-	 * Reads the learner configuration from the given options, falling back to the defaults given here,
-	 * and publishes the WordNet dictionary location as system property "wordnet.database.dir".
-	 * @param options the options to read
-	 * @throws IllegalArgumentException if the WordNet dictionary path cannot be found on the class path
-	 */
-	private void setOptions(Options options){
-		maxIndexResults = Integer.parseInt(options.get("solr.query.limit", "10"));
-
-		maxQueryExecutionTimeInSeconds = Integer.parseInt(options.get("sparql.query.maxExecutionTimeInSeconds", "20"));
-		// executeSelect tolerates a missing cache, so do the same here
-		if(cache != null){
-			cache.setMaxExecutionTimeInSeconds(maxQueryExecutionTimeInSeconds);
-		}
-
-		useRemoteEndpointValidation = options.get("learning.validationType", "remote").equals("remote");
-		stopIfQueryResultNotEmpty = Boolean.parseBoolean(options.get("learning.stopAfterFirstNonEmptyQueryResult", "true"));
-		maxTestedQueriesPerTemplate = Integer.parseInt(options.get("learning.maxTestedQueriesPerTemplate", "20"));
-
-		String wordnetPath = options.get("wordnet.dictionary", "tbsl/dict");
-		// getResource returns null for an unknown resource, report that with the offending path instead of a NullPointerException
-		java.net.URL wordnetResource = this.getClass().getClassLoader().getResource(wordnetPath);
-		if(wordnetResource == null){
-			throw new IllegalArgumentException("WordNet dictionary \"" + wordnetPath + "\" not found on the class path");
-		}
-		System.setProperty("wordnet.database.dir", wordnetResource.getPath());
-	}
-
-	public void setEndpoint(SparqlEndpoint endpoint){
-		this.endpoint = endpoint;
-
-		reasoner = new SPARQLReasoner(new SparqlEndpointKS(endpoint));
-		reasoner.setCache(cache);
-		reasoner.prepareSubsumptionHierarchy();
-	}
-
-	public void setQuestion(String question){
-		this.question = question;
-	}
-
-	public void setUseRemoteEndpointValidation(boolean useRemoteEndpointValidation){
-		this.useRemoteEndpointValidation = useRemoteEndpointValidation;
-	}
-
-	public int getMaxQueryExecutionTimeInSeconds() {
-		return maxQueryExecutionTimeInSeconds;
-	}
-
-	/**
-	 * Sets the maximum query execution time and, as setOptions does, forwards it to the cache if there is one.
-	 * @param maxQueryExecutionTimeInSeconds the maximum execution time of a query in seconds
-	 */
-	public void setMaxQueryExecutionTimeInSeconds(int maxQueryExecutionTimeInSeconds) {
-		this.maxQueryExecutionTimeInSeconds = maxQueryExecutionTimeInSeconds;
-		// without this the new value would only be stored but never applied to query execution
-		if(cache != null){
-			cache.setMaxExecutionTimeInSeconds(maxQueryExecutionTimeInSeconds);
-		}
-	}
-
-	public int getMaxTestedQueriesPerTemplate() {
-		return maxTestedQueriesPerTemplate;
-	}
-
-	public void setMaxTestedQueriesPerTemplate(int maxTestedQueriesPerTemplate) {
-		this.maxTestedQueriesPerTemplate = maxTestedQueriesPerTemplate;
-	}
-
-	/** Clears the state of the previous question so that learnSPARQLQueries() starts from scratch. */
-	private void reset(){
-		learnedSPARQLQueries = new TreeSet<WeightedQuery>();
-		template2Queries = new HashMap<Template, Collection<? extends Query>>();
-		slot2URI = new HashMap<Slot, List<String>>();
-		relevantKeywords = new HashSet<String>();
-		currentlyExecutedQuery = null;
-		// validate() increments learnedPos for each tested query, so without a reset the
-		// learned position would accumulate over several questions
-		learnedPos = -1;
-
-		//		templateMon.reset();
-		//		sparqlMon.reset();
-	}
-
-	/**
-	 * Learns SPARQL queries for the current question: generates the templates, creates the weighted query
-	 * candidates and selects the learned queries depending on the mode. In mode BEST_QUERY all candidates
-	 * sharing the score of the first candidate are taken, in mode BEST_NON_EMPTY_QUERY the candidates are
-	 * tested against the remote endpoint if remote validation is enabled.
-	 * @throws NoTemplateFoundException if no template could be generated for the question
-	 */
-	public void learnSPARQLQueries() throws NoTemplateFoundException{
-		reset();
-		//generate SPARQL query templates
-		logger.debug("Generating SPARQL query templates...");
-		templateMon.start();
-		if(multiThreaded){
-			templates = templateGenerator.buildTemplatesMultiThreaded(question,CREATE_SYNONYMS);
-		} else {
-			templates = templateGenerator.buildTemplates(question);
-		}
-		templateMon.stop();
-		logger.debug("Done in " + templateMon.getLastValue() + "ms.");
-		relevantKeywords.addAll(templateGenerator.getUnknownWords());
-		if(templates.isEmpty()){
-			throw new NoTemplateFoundException();
-		}
-		logger.debug("Templates:");
-		for(Template t : templates){
-			logger.debug(t);
-		}
-
-		//get the weighted query candidates
-		generatedQueries = getWeightedSPARQLQueries(templates);
-		// getWeightedSPARQLQueries returns null when it skips every template, treat that as "no candidates"
-		if(generatedQueries == null){
-			generatedQueries = new TreeSet<WeightedQuery>();
-		}
-		sparqlQueryCandidates = new ArrayList<WeightedQuery>();
-		for(WeightedQuery wQ : generatedQueries){
-			// check the limit before adding, checking afterwards collected maxTestedQueries+1 candidates
-			if(sparqlQueryCandidates.size() >= maxTestedQueries){
-				break;
-			}
-			logger.debug(wQ.explain());
-			sparqlQueryCandidates.add(wQ);
-		}
-
-		if(mode == Mode.BEST_QUERY){
-			double bestScore = -1;
-			for(WeightedQuery candidate : generatedQueries){
-				double score = candidate.getScore();
-				if(score >= bestScore){
-					bestScore = score;
-					learnedSPARQLQueries.add(candidate);
-				} else {
-					break;
-				}
-			}
-		} else if(mode == Mode.BEST_NON_EMPTY_QUERY){
-			//test candidates
-			if(useRemoteEndpointValidation){ //on remote endpoint
-				// validation reads the query type from the first candidate, so there has to be one
-				if(!sparqlQueryCandidates.isEmpty()){
-					validateAgainstRemoteEndpoint(sparqlQueryCandidates);
-				}
-			} else {//on local model
-
-			}
-		}
-	}
-
-	public SortedSet<WeightedQuery> getGeneratedQueries() {
-		return generatedQueries;
-	}
-
-	/**
-	 * Returns the first topN generated queries in the order of the generated query set.
-	 * @param topN the maximum number of queries to return; values of 0 or less yield an empty set
-	 * @return a new set with at most topN queries, empty if no queries have been generated yet
-	 */
-	public SortedSet<WeightedQuery> getGeneratedQueries(int topN) {
-		SortedSet<WeightedQuery> topNQueries = new TreeSet<WeightedQuery>();
-		if(generatedQueries == null){
-			return topNQueries;
-		}
-		for(WeightedQuery wQ : generatedQueries){
-			// check before adding: a size check after the add never matches for topN <= 0 and returned all queries
-			if(topNQueries.size() >= topN){
-				break;
-			}
-			topNQueries.add(wQ);
-		}
-		return topNQueries;
-	}
-
-	public Set<Template> getTemplates(){
-		return templates;
-	}
-
-	public List<String> getGeneratedSPARQLQueries(){
-		List<String> queries = new ArrayList<String>();
-		for(WeightedQuery wQ : sparqlQueryCandidates){
-			queries.add(wQ.getQuery().toString());
-		}
-
-		return queries;
-	}
-
-	public Map<Template, Collection<? extends Query>> getTemplates2SPARQLQueries(){
-		return template2Queries;
-	}
-
-	public Map<Slot, List<String>> getSlot2URIs(){
-		return slot2URI;
-	}
-
-	/**
-	 * Divides the prominence of every allocation by the spread (highest minus lowest prominence).
-	 * Both bounds start at 0, so the lowest bound is never above 0 and the highest never below 0.
-	 * Nothing is changed if the spread is 0.
-	 */
-	private void normProminenceValues(Set<Allocation> allocations){
-		double lowest = 0;
-		double highest = 0;
-		for(Allocation allocation : allocations){
-			double value = allocation.getProminence();
-			if(value < lowest){
-				lowest = value;
-			}
-			if(value > highest){
-				highest = value;
-			}
-		}
-		if(lowest!=highest) {
-			for(Allocation allocation : allocations){
-				allocation.setProminence(allocation.getProminence()/(highest-lowest));
-			}
-		}
-	}
-
-	private void computeScore(Set<Allocation> allocations){
-		double alpha = 0.8;
-		double beta = 1 - alpha;
-
-		for(Allocation a : allocations){
-			double score = alpha * a.getSimilarity() + beta * a.getProminence();
-			a.setScore(score);
-		}
-
-	}
-
-	public Set<String> getRelevantKeywords(){
-		return relevantKeywords;
-	}
-
-	// just for testing the HMM integration, getWeightedSPARQLQueriesOld is the original one
-	/**
-	 * Experimental query generation using the hidden markov model.
-	 * Only the first template with exactly three slots is processed: for each of its slots the index result items
-	 * are converted to resource infos, the HMM is run on them and each resulting path is turned into a query by
-	 * replacing the query variables with the URIs of the path. All created queries get the score 0.
-	 * Prints debugging information to System.out and System.err.
-	 * @param templates the templates to create queries for
-	 * @return the queries of the first template with three slots or null if there is no such template
-	 */
-	private SortedSet<WeightedQuery> getWeightedSPARQLQueries(Set<Template> templates)
-	{
-		// for testing 
-		for(Template template: templates)
-		{
-			{
-				ArrayList<String> keywords = new ArrayList<String>();
-				for(Slot slot: template.getSlots())
-				{
-					keywords.add(slot.getWords().get(0));
-				}
-				// templates with a slot count other than three are skipped
-				if(template.getSlots().size()!=3) {continue;}
-//				if(!keywords.contains("Mean Hamster Software")) {continue;}
-//				if(!keywords.contains("published")) {continue;}
-				System.out.println("\"keywords\": "+keywords);
-			}
-			System.out.println(template);
-			SortedSet<WeightedQuery> queries = new TreeSet<WeightedQuery>();
-			Query query = template.getQuery();
-			// never changed below, so every weighted query is created with score 0
-			double score = 0;
-
-			// maps the words of the first slot entry (split at whitespace) to the candidate resources of that slot
-			Map<List<String>,List<ResourceInfo>> segmentToURIs = new HashMap<List<String>,List<ResourceInfo>>();
-			// NOTE(review): not used anywhere in this method
-			Map<String,IndexResultItem> uriUniqueToResultItem = new HashMap<String,IndexResultItem>(); 
-			for(Slot slot: template.getSlots())
-			{
-				List<String> segment = new LinkedList<String>();
-				segment.addAll(Arrays.asList(slot.getWords().get(0).split("\\s")));			
-				List<ResourceInfo> resourceInfos = new LinkedList<ResourceInfo>();
-
-				for(IndexResultItem item : getIndexResultItems(slot))
-				{
-					// if this gets used at another place, create a function IndexResultItemToResourceInfo()
-					ResourceInfo info = new ResourceInfo();
-					info.setUri(item.getUri());
-					String label = item.getLabel();					
-					// in dbpedia, the last part of the uri is transformed from the english label, reverse the transformation (should almost always work for dbpedia article resources)
-					info.setLabel(label!=null?label:sfp.getShortForm(IRI.create(item.getUri())));
-					// in saedeehs algorithm, the emission probabilty is formed by the string similarity
-					// but we use the lucene index score
-					// NOTE(review): the code below takes the highest string similarity between a slot word and the label, not an index score - confirm which one is intended
-					double max = 0;
-					for(String word: slot.getWords()) {max = Math.max(max, Similarity.getSimilarity(word, info.getLabel()));}					
-					if(max<0||max>1) throw new AssertionError("max is not in [0,1], max="+max);
-					info.setStringSimilarityScore(max);
-					if(!info.setTypeFromDBpediaURI()) throw new AssertionError("could not set type for info "+info);
-					System.err.println("info with type: "+info);
-					resourceInfos.add(info);
-				}
-				segmentToURIs.put(segment,resourceInfos);
-			}
-			HiddenMarkovModel hmm = new HiddenMarkovModel();
-			hmm.initialization();
-			hmm.startMarkovModel(segmentToURIs,true);
-			MultiMap<Double,List<String>> paths = hmm.getPaths();
-
-			//			System.out.println(hmm.getPaths());
-			// now feed the keywords into Saedeeh's algorithm
-			// this yields paths with different probabilities
-			//			HiddenMarkovModel HMM = new HiddenMarkovModel();
-			//			HMM.StartMarkovModel();
-			// now replace the variables of the query with the candidates
-			// ranked list of the paths that generate the observation sequence
-
-			for(Double d : paths.keySet())
-			{
-				for(List<String> path : paths.get(d))
-				{
-					Query q = new Query(query);
-					// TODO: which variable stands for which resource? do it randomly now to check if the replacement works and then correct the order later 
-					System.out.println(q.getVariablesAsStringList());
-					System.out.println();
-					// the i-th variable is replaced with the i-th URI of the path
-					// NOTE(review): presumably fails if the path has fewer entries than the query has variables - confirm
-					int i = 0;
-					for(String var : q.getVariablesAsStringList())
-					{						
-						q.replaceVarWithURI(var, path.get(i));
-						i++;
-					}
-					System.out.println(q);
-
-
-					WeightedQuery wQuery = new WeightedQuery(q, score);
-					queries.add(wQuery);
-				}
-			}
-			//System.exit(0);
-			// returns inside the loop, so the remaining templates are not processed
-			return queries;
-			//			>> SLOTS:
-			//				y0: RESOURCE {Mean Hamster Software}
-			//				p0: OBJECTPROPERTY {published,print}
-			//				p1: CLASS {video games}
-
-
-			//			System.out.println(template);			
-		}
-		// 		
-		// only reached if no template has exactly three slots
-		return null;
-	}
-
-	/**
-	 * Determines the prominence of a URI. The popularity map is asked first if there is one; if it has no value
-	 * the prominence is counted with a SPARQL query that depends on the slot type.
-	 * @param uri the URI whose prominence is determined
-	 * @param type the slot type that decides which popularity lookup and which count query is used
-	 * @return the popularity or 0 if it could not be determined, e.g. for a slot type without lookup and query
-	 */
-	private double getProminenceValue(String uri, SlotType type){
-		boolean isProperty = type == SlotType.PROPERTY || type == SlotType.SYMPROPERTY
-				|| type == SlotType.DATATYPEPROPERTY || type == SlotType.OBJECTPROPERTY;
-		boolean isResource = type == SlotType.RESOURCE || type == SlotType.UNSPEC;
-
-		Integer popularity = null;
-		if(popularityMap != null){
-			if(type == SlotType.CLASS){
-				popularity = popularityMap.getPopularity(uri, EntityType.CLASS);
-			} else if(isProperty){
-				popularity = popularityMap.getPopularity(uri, EntityType.PROPERTY);
-			} else if(isResource){
-				popularity = popularityMap.getPopularity(uri, EntityType.RESOURCE);
-			}
-		}
-		if(popularity == null){
-			String query = null;
-			if(type == SlotType.CLASS){
-				query = "SELECT COUNT(?s) WHERE {?s a <%s>}";
-			} else if(isProperty){
-				query = "SELECT COUNT(*) WHERE {?s <%s> ?o}";
-			} else if(isResource){
-				query = "SELECT COUNT(*) WHERE {?s ?p <%s>}";
-			}
-			// a slot type without count query used to end in a NullPointerException inside String.format
-			if(query != null){
-				ResultSet rs = executeSelect(String.format(query, uri));
-				while(rs.hasNext()){
-					QuerySolution qs = rs.next();
-					String projectionVar = qs.varNames().next();
-					popularity = qs.get(projectionVar).asLiteral().getInt();
-				}
-			}
-		}
-		if(popularity == null){
-			popularity = Integer.valueOf(0);
-		}
-		// the former NaN check compared an Integer with itself and could never fail, so it is gone
-		logger.debug("Popularity of " + uri + ": " + popularity);
-		return popularity;
-	}
-
-	public void setPopularityMap(PopularityMap popularityMap) {
-		this.popularityMap = popularityMap;
-	}
-
-
-	/**
-	 * Keeps only the words that do not contain another, different word of the list as a substring.
-	 * @param words the words to prune
-	 * @return a new list with the remaining words in their original order
-	 */
-	private List<String> pruneList(List<String> words){
-		List<String> kept = new ArrayList<String>();
-		candidates:
-		for(String candidate : words){
-			for(String other : words){
-				if(!candidate.equals(other) && candidate.contains(other)){
-					// the candidate contains a different word, so it is dropped
-					continue candidates;
-				}
-			}
-			kept.add(candidate);
-		}
-		logger.info("Pruned list: " + kept);
-		return kept;
-	}
-
-	/**
-	 * Stems all single words with the lemmatizer and leaves out stems that are already in the result.
-	 * Words containing a space are taken over unchanged and without duplicate check.
-	 * @param words the words to lemmatize
-	 * @return a new list with the resulting words
-	 */
-	private List<String> getLemmatizedWords(List<String> words){
-		logger.info("Pruning word list " + words + "...");
-		List<String> result = new ArrayList<String>();
-		for(String word : words){
-			//currently only stem single words
-			boolean multiWord = word.contains(" ");
-			String entry = multiWord ? word : lemmatizer.stem(word);
-			if(multiWord || !result.contains(entry)){
-				result.add(entry);
-			}
-		}
-		logger.info("Pruned list: " + result);
-		return result;
-	}
-
-
-	/**
-	 * Selects the index that matches the type of the slot.
-	 * @param slot the slot whose type decides which index is used
-	 * @return the matching index or null if the slot type matches none of the handled types
-	 */
-	private Index getIndexBySlotType(Slot slot){
-		SlotType slotType = slot.getSlotType();
-		if(slotType == SlotType.CLASS){
-			return classesIndex;
-		}
-		if(slotType == SlotType.PROPERTY || slotType == SlotType.SYMPROPERTY){
-			return propertiesIndex;
-		}
-		if(slotType == SlotType.DATATYPEPROPERTY){
-			return datatypePropertiesIndex;
-		}
-		if(slotType == SlotType.OBJECTPROPERTY){
-			return objectPropertiesIndex;
-		}
-		if(slotType == SlotType.RESOURCE || slotType == SlotType.UNSPEC){
-			return resourcesIndex;
-		}
-		return null;
-	}
-
-	/**
-	 * Validates the queries using the query type of the first one. Does nothing if there are no queries.
-	 * @param queries the candidate queries to test
-	 */
-	private void validateAgainstRemoteEndpoint(Collection<WeightedQuery> queries){
-		// without candidates there is no first query to take the type from (was a NoSuchElementException)
-		if(queries.isEmpty()){
-			return;
-		}
-		SPARQL_QueryType queryType = queries.iterator().next().getQuery().getQt();
-		validate(queries, queryType);
-	}
-
-	/**
-	 * Tests the candidate queries and adds the accepted ones to the learned queries.
-	 * A SELECT query is executed with limit 1 and accepted if it has a result, unless the first result is the number 0 or less.
-	 * An ASK query is always accepted, its result is only logged.
-	 * Stops after the first accepted query if stopIfQueryResultNotEmpty is set.
-	 * @param queries the candidate queries to test
-	 * @param queryType the type all candidate queries are treated as
-	 */
-	private void validate(Collection<WeightedQuery> queries, SPARQL_QueryType queryType){
-		logger.debug("Testing candidate SPARQL queries on remote endpoint...");
-		sparqlMon.start();
-		// the finally block stops the monitor on the early returns as well, it used to keep running there
-		try {
-			if(queryType == SPARQL_QueryType.SELECT){
-				for(WeightedQuery query : queries){
-					learnedPos++;
-					try {
-						logger.debug("Testing query:\n" + query);
-						com.hp.hpl.jena.query.Query q = QueryFactory.create(query.getQuery().toString(), Syntax.syntaxARQ);
-						q.setLimit(1);
-						ResultSet rs = executeSelect(q.toString());
-
-						List<String> results = new ArrayList<String>();
-						while(rs.hasNext()){
-							QuerySolution qs = rs.next();
-							String projectionVar = qs.varNames().next();
-							if(qs.get(projectionVar).isLiteral()){
-								results.add(qs.get(projectionVar).asLiteral().getLexicalForm());
-							} else if(qs.get(projectionVar).isURIResource()){
-								results.add(qs.get(projectionVar).asResource().getURI());
-							}
-						}
-						if(!results.isEmpty()){
-							// a numeric first result is treated as a count and has to be positive, anything else counts as non empty
-							boolean accepted;
-							try{
-								accepted = Integer.parseInt(results.get(0)) > 0;
-							} catch (NumberFormatException e){
-								accepted = true;
-							}
-							if(accepted){
-								learnedSPARQLQueries.add(query);
-								if(stopIfQueryResultNotEmpty){
-									return;
-								}
-							}
-							logger.debug("Result: " + results);
-						}
-					} catch (Exception e) {
-						// a failing candidate must not abort the validation of the remaining ones
-						logger.error("Error while testing query:\n" + query, e);
-					}
-				}
-			} else if(queryType == SPARQL_QueryType.ASK){
-				for(WeightedQuery query : queries){
-					learnedPos++;
-					logger.debug("Testing query:\n" + query);
-					boolean result = executeAskQuery(query.getQuery().toString());
-					learnedSPARQLQueries.add(query);
-					//				if(stopIfQueryResultNotEmpty && result){
-					//					return;
-					//				}
-					if(stopIfQueryResultNotEmpty){
-						return;
-					}
-					logger.debug("Result: " + result);
-				}
-			}
-		} finally {
-			sparqlMon.stop();
-			logger.debug("Done in " + sparqlMon.getLastValue() + "ms.");
-		}
-	}
-
-	/**
-	 * Executes an ASK query, on the remote endpoint if no local model is set and on the local model otherwise.
-	 * The query is remembered as the currently executed query.
-	 * @param query the ASK query string, must not be null
-	 * @return the result of the ASK query
-	 * @throws NullPointerException if query is null
-	 */
-	private boolean executeAskQuery(String query)
-	{
-		if(query==null) throw new NullPointerException("Parameter query == null");
-		currentlyExecutedQuery = query;
-
-		// the result is a plain boolean, so the execution can be closed right after it is obtained
-		if (model == null)
-		{
-			QueryEngineHTTP qe = new QueryEngineHTTP(endpoint.getURL().toString(), query);
-			try
-			{
-				qe.setDefaultGraphURIs(endpoint.getDefaultGraphURIs());
-				return qe.execAsk();
-			}
-			finally {qe.close();}
-		}
-		com.hp.hpl.jena.query.QueryExecution localExecution = QueryExecutionFactory.create(QueryFactory.create(query, Syntax.syntaxARQ), model);
-		try {return localExecution.execAsk();}
-		finally {localExecution.close();}
-	}
-
-	/**
-	 * Executes a SELECT query and remembers it as the currently executed query.
-	 * With a local model the query runs on the model. Otherwise it runs on the endpoint,
-	 * through the cache if there is one and directly over HTTP if not.
-	 * @param query the SELECT query string, must not be null
-	 * @return the result set of the query
-	 * @throws NullPointerException if query is null
-	 */
-	private ResultSet executeSelect(String query)
-	{
-		if(query==null) throw new NullPointerException("Parameter query == null");
-		currentlyExecutedQuery = query;
-
-		if (model != null) {
-			return QueryExecutionFactory.create(QueryFactory.create(query, Syntax.syntaxARQ), model).execSelect();
-		}
-		if (cache != null) {
-			return SparqlQuery.convertJSONtoResultSet(cache.executeSelectQuery(endpoint, query));
-		}
-		QueryEngineHTTP remoteExecution = new QueryEngineHTTP(endpoint.getURL().toString(), query);
-		remoteExecution.setDefaultGraphURIs(endpoint.getDefaultGraphURIs());
-		return remoteExecution.execSelect();
-	}
-
-	public String getCurrentlyExecutedQuery() {
-		return currentlyExecutedQuery;
-	}
-
-	public int getLearnedPosition() {
-		if(learnedPos >= 0){
-			return learnedPos+1;
-		}
-		return learnedPos;
-	}
-
-	@Override
-	public void start() {
-	}
-
-	@Override
-	public List<String> getCurrentlyBestSPARQLQueries(int nrOfSPARQLQueries) {
-		List<String> bestQueries = new ArrayList<String>();
-		for(WeightedQuery wQ : learnedSPARQLQueries){
-			bestQueries.add(wQ.getQuery().toString());
-		}
-		return bestQueries;
-	}
-
-	@Override
-	public String getBestSPARQLQuery() {
-		if(!learnedSPARQLQueries.isEmpty()){
-			return learnedSPARQLQueries.iterator().next().getQuery().toString();
-		} else {
-			return null;
-		}
-	}
-
-	public SortedSet<WeightedQuery> getLearnedSPARQLQueries() {
-		return learnedSPARQLQueries;
-	}
-
-	@Override
-	public LearningProblem getLearningProblem() {
-		// TODO Auto-generated method stub
-		return null;
-	}
-
-	@Override
-	public void setLearningProblem(LearningProblem learningProblem) {
-		// TODO Auto-generated method stub
-
-	}
-
-	private Set<IndexResultItem> getIndexResultItems(Slot slot)
-	{
-		//		List<String> uris = new LinkedList<String>();
-		Set<IndexResultItem> indexResultItems = new HashSet<IndexResultItem>();
-
-		Index index = getIndexBySlotType(slot);
-
-		for(String word : slot.getWords())
-		{
-			IndexResultSet rs = new IndexResultSet();
-			if(mappingIndex != null){
-				SlotType type = slot.getSlotType();
-				if(type == SlotType.CLASS){
-					rs.add(mappingIndex.getClassesWithScores(word));
-				} else if(type == SlotType.PROPERTY || type == SlotType.SYMPROPERTY){
-					rs.add(mappingIndex.getPropertiesWithScores(word));
-				} else if(type == SlotType.DATATYPEPROPERTY){
-					rs.add(mappingIndex.getDatatypePropertiesWithScores(word));
-				} else if(type == SlotType.OBJECTPROPERTY){
-					rs.add(mappingIndex.getObjectPropertiesWithScores(word));
-				} else if(type == SlotType.RESOURCE || type == SlotType.UNSPEC){
-					rs.add(mappingIndex.getResourcesWithScores(word));
-				}
-			}
-			//use the non manual indexes only if mapping based resultset is not empty and option is set
-			if(!useManualMappingsIfExistOnly || rs.isEmpty()){
-				if(slot.getSlotType() == SlotType.RESOURCE){
-					rs.add(index.getResourcesWithScores(word, 20,0));
-				} else {
-					if(slot.getSlotType() == SlotType.CLASS){
-						word = PlingStemmer.stem(word); 
-					}
-					IndexResultSet tmp = index.getResourcesWithScores(word, 20,0,Collections.singleton("boa-score"));
-					for(IndexResultItem item : tmp.getItems())
-					{System.out.println(item);
-						Double boaScore = (Double) item.getFields().get("boa-score");
-						if(boaScore==null||boaScore>BOA_THRESHOLD) rs.addItem(item);
-					}
-				}
-			}
-			//			for(IndexResultItem item: rs.getItems())
-			//			{
-			//				uris.add(item.getUri());
-			//			}
-			indexResultItems.addAll(rs.getItems());
-		}
-		return indexResultItems;
-	}
-
-
-	public String getTaggedInput()
-	{
-		if(templateGenerator==null) {throw new AssertionError("Learner not initialized. Please call init();");}
-		return templateGenerator.getTaggedInput();
-	}
-
-	private boolean isDatatypeProperty(String uri){
-		Boolean isDatatypeProperty = null;
-		if(mappingIndex != null){
-			isDatatypeProperty = mappingIndex.isDataProperty(uri);
-		}
-		if(isDatatypeProperty == null){
-			String query = String.format("ASK {<%s> a <http://www.w3.org/2002/07/owl#DatatypeProperty> .}", uri);
-			isDatatypeProperty = executeAskQuery(query);
-		}
-		return isDatatypeProperty;
-	}
-
-	//	/**
-	//	 * @param args
-	//	 * @throws NoTemplateFoundException 
-	//	 * @throws IOException 
-	//	 * @throws FileNotFoundException 
-	//	 * @throws InvalidFileFormatException 
-	//	 */
-	//	public static void main(String[] args) throws Exception {
-	//		SparqlEndpoint endpoint = new SparqlEndpoint(new URL("http://greententacle.techfak.uni-bielefeld.de:5171/sparql"), 
-	//				Collections.<String>singletonList(""), Collections.<String>emptyList());
-	//		Index resourcesIndex = new SOLRIndex("http://139.18.2.173:8080/solr/dbpedia_resources");
-	//		Index classesIndex = new SOLRIndex("http://139.18.2.173:8080/solr/dbpedia_classes");
-	//		Index propertiesIndex = new SOLRIndex("http://139.18.2.173:8080/solr/dbpedia_properties");
-	//
-	//		SPARQLTemplateBasedLearner2 learner = new SPARQLTemplateBasedLearner2(endpoint, resourcesIndex, classesIndex, propertiesIndex);
-	//		learner.init();
-	//
-	//		String question = "What is the highest mountain?";
-	//
-	//		learner.setQuestion(question);
-	//		learner.learnSPARQLQueries();
-	//		System.out.println("Learned query:\n" + learner.getBestSPARQLQuery());
-	//		System.out.println("Lexical answer type is: " + learner.getTemplates().iterator().next().getLexicalAnswerType());
-	//		System.out.println(learner.getLearnedPosition());
-	//
-	//	}
-
-
-
-}

Copied: branches/hmm/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner2HMM.java (from rev 3849, branches/hmm/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner2.java)
===================================================================
--- branches/hmm/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner2HMM.java	                        (rev 0)
+++ branches/hmm/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner2HMM.java	2012-09-27 13:16:49 UTC (rev 3851)
@@ -0,0 +1,1440 @@
+package org.dllearner.algorithm.tbsl.learning;
+
+import hmm.HiddenMarkovModel;
+import hmm.ResourceInfo;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import org.apache.commons.collections15.MultiMap;
+import org.apache.log4j.Logger;
+import org.dllearner.algorithm.tbsl.nlp.Lemmatizer;
+import org.dllearner.algorithm.tbsl.nlp.LingPipeLemmatizer;
+import org.dllearner.algorithm.tbsl.nlp.PartOfSpeechTagger;
+import org.dllearner.algorithm.tbsl.nlp.PlingStemmer;
+import org.dllearner.algorithm.tbsl.nlp.StanfordPartOfSpeechTagger;
+import org.dllearner.algorithm.tbsl.nlp.WordNet;
+import org.dllearner.algorithm.tbsl.sparql.Allocation;
+import org.dllearner.algorithm.tbsl.sparql.Query;
+import org.dllearner.algorithm.tbsl.sparql.SPARQL_Filter;
+import org.dllearner.algorithm.tbsl.sparql.SPARQL_Pair;
+import org.dllearner.algorithm.tbsl.sparql.SPARQL_PairType;
+import org.dllearner.algorithm.tbsl.sparql.SPARQL_Property;
+import org.dllearner.algorithm.tbsl.sparql.SPARQL_QueryType;
+import org.dllearner.algorithm.tbsl.sparql.SPARQL_Triple;
+import org.dllearner.algorithm.tbsl.sparql.SPARQL_Value;
+import org.dllearner.algorithm.tbsl.sparql.Slot;
+import org.dllearner.algorithm.tbsl.sparql.SlotType;
+import org.dllearner.algorithm.tbsl.sparql.Template;
+import org.dllearner.algorithm.tbsl.sparql.WeightedQuery;
+import org.dllearner.algorithm.tbsl.templator.Templator;
+import org.dllearner.algorithm.tbsl.util.Knowledgebase;
+import org.dllearner.algorithm.tbsl.util.PopularityMap;
+import org.dllearner.algorithm.tbsl.util.PopularityMap.EntityType;
+import org.dllearner.algorithm.tbsl.util.Similarity;
+import org.dllearner.algorithm.tbsl.util.UnknownPropertyHelper.SymPropertyDirection;
+import org.dllearner.common.index.Index;
+import org.dllearner.common.index.IndexResultItem;
+import org.dllearner.common.index.IndexResultSet;
+import org.dllearner.common.index.MappingBasedIndex;
+import org.dllearner.common.index.SOLRIndex;
+import org.dllearner.common.index.SPARQLDatatypePropertiesIndex;
+import org.dllearner.common.index.SPARQLIndex;
+import org.dllearner.common.index.SPARQLObjectPropertiesIndex;
+import org.dllearner.common.index.SPARQLPropertiesIndex;
+import org.dllearner.common.index.VirtuosoDatatypePropertiesIndex;
+import org.dllearner.common.index.VirtuosoObjectPropertiesIndex;
+import org.dllearner.common.index.VirtuosoPropertiesIndex;
+import org.dllearner.core.ComponentInitException;
+import org.dllearner.core.LearningProblem;
+import org.dllearner.core.SparqlQueryLearningAlgorithm;
+import org.dllearner.core.owl.Description;
+import org.dllearner.core.owl.NamedClass;
+import org.dllearner.core.owl.ObjectProperty;
+import org.dllearner.core.owl.Thing;
+import org.dllearner.kb.LocalModelBasedSparqlEndpointKS;
+import org.dllearner.kb.SparqlEndpointKS;
+import org.dllearner.kb.sparql.ExtractionDBCache;
+import org.dllearner.kb.sparql.SparqlEndpoint;
+import org.dllearner.kb.sparql.SparqlQuery;
+import org.dllearner.reasoning.SPARQLReasoner;
+import org.ini4j.InvalidFileFormatException;
+import org.ini4j.Options;
+import org.semanticweb.owlapi.model.IRI;
+import org.semanticweb.owlapi.util.SimpleIRIShortFormProvider;
+import com.hp.hpl.jena.ontology.OntModelSpec;
+import com.hp.hpl.jena.query.QueryExecutionFactory;
+import com.hp.hpl.jena.query.QueryFactory;
+import com.hp.hpl.jena.query.QuerySolution;
+import com.hp.hpl.jena.query.ResultSet;
+import com.hp.hpl.jena.query.Syntax;
+import com.hp.hpl.jena.rdf.model.Model;
+import com.hp.hpl.jena.rdf.model.ModelFactory;
+import com.hp.hpl.jena.sparql.engine.http.QueryEngineHTTP;
+import com.hp.hpl.jena.sparql.expr.ExprAggregator;
+import com.hp.hpl.jena.sparql.expr.ExprVar;
+import com.hp.hpl.jena.sparql.expr.aggregate.AggCount;
+import com.hp.hpl.jena.sparql.expr.aggregate.Aggregator;
+import com.jamonapi.Monitor;
+import com.jamonapi.MonitorFactory;
+
+/** The old learner taken over by Konrad Höffner for experiments with the Hidden Markov Algorithm by Saedeeh Shekarpur.
+ * 
+ * */
+public class SPARQLTemplateBasedLearner2 implements SparqlQueryLearningAlgorithm
+{
+	public static boolean useHMM = true;
+	
+	enum Mode {BEST_QUERY, BEST_NON_EMPTY_QUERY}
+	private Mode mode = Mode.BEST_QUERY;
+	
+	/** used to create a label out of the URI when there is no label available in the SPARQL endpoint.*/
+	private static SimpleIRIShortFormProvider sfp = new SimpleIRIShortFormProvider();
+
+	private static final Logger logger = Logger.getLogger(SPARQLTemplateBasedLearner2.class);
+	/** synonyms are great but are not used yet by the HMM algorithm. **/
+	private static final boolean	CREATE_SYNONYMS	= false;
+	/** The minimum score of items that are accepted from the Sindice search BOA index. **/
+	private static final Double	BOA_THRESHOLD	=  0.9;
+	private Monitor templateMon = MonitorFactory.getTimeMonitor("template");
+	private Monitor sparqlMon = MonitorFactory.getTimeMonitor("sparql");
+
+	private boolean useRemoteEndpointValidation;
+	private boolean stopIfQueryResultNotEmpty;
+	private int maxTestedQueriesPerTemplate = 50;
+	private int maxQueryExecutionTimeInSeconds;
+	private int maxTestedQueries = 200;
+	private int maxIndexResults;
+
+	private SparqlEndpoint endpoint = null;
+	private Model model = null;
+
+	private ExtractionDBCache cache = new ExtractionDBCache("cache");
+
+	private Index resourcesIndex;
+	private Index classesIndex;
+	private Index propertiesIndex;
+
+	private Index datatypePropertiesIndex;
+	private Index objectPropertiesIndex;
+
+	private MappingBasedIndex mappingIndex;
+
+	private Templator templateGenerator = null;
+	private Lemmatizer lemmatizer;
+	private PartOfSpeechTagger posTagger;
+	private WordNet wordNet;
+
+	private String question;
+	private int learnedPos = -1;
+
+	private Set<Template> templates;
+	private Map<Template, Collection<? extends Query>> template2Queries;
+	private Map<Slot, List<String>> slot2URI;
+
+	private Collection<WeightedQuery> sparqlQueryCandidates;
+	private SortedSet<WeightedQuery> learnedSPARQLQueries;
+	private SortedSet<WeightedQuery> generatedQueries;
+
+	private SPARQLReasoner reasoner;
+
+	private String currentlyExecutedQuery;
+
+	private boolean dropZeroScoredQueries = true;
+	private boolean useManualMappingsIfExistOnly = true;
+
+	private boolean multiThreaded = true;
+
+	private String [] grammarFiles = new String[]{"tbsl/lexicon/english.lex"};
+
+	private PopularityMap popularityMap;
+
+	private Set<String> relevantKeywords;
+
+	private boolean useDomainRangeRestriction = true;
+
+	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index resourcesIndex, Index classesIndex, Index propertiesIndex){
+		this(endpoint, resourcesIndex, classesIndex, propertiesIndex, new StanfordPartOfSpeechTagger());
+	}
+
+	public SPARQLTemplateBasedLearner2(Knowledgebase knowledgebase, PartOfSpeechTagger posTagger, WordNet wordNet, Options options){
+		this(knowledgebase.getEndpoint(), knowledgebase.getResourceIndex(), knowledgebase.getClassIndex(),knowledgebase.getPropertyIndex(), posTagger, wordNet, options);
+	}
+
+	public SPARQLTemplateBasedLearner2(Knowledgebase knowledgebase){
+		this(knowledgebase.getEndpoint(), knowledgebase.getResourceIndex(), knowledgebase.getClassIndex(),knowledgebase.getPropertyIndex(), new StanfordPartOfSpeechTagger(), new WordNet(), new Options());
+	}
+
+	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index index){
+		this(endpoint, index, new StanfordPartOfSpeechTagger());
+	}
+
+	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index resourcesIndex, Index classesIndex, Index propertiesIndex, PartOfSpeechTagger posTagger){
+		this(endpoint, resourcesIndex, classesIndex, propertiesIndex, posTagger, new WordNet(), new Options());
+	}
+
+	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index index, PartOfSpeechTagger posTagger){
+		this(endpoint, index, posTagger, new WordNet(), new Options());
+	}
+
+	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index resourcesIndex, Index classesIndex, Index propertiesIndex, WordNet wordNet){
+		this(endpoint, resourcesIndex, classesIndex, propertiesIndex, new StanfordPartOfSpeechTagger(), wordNet, new Options());
+	}
+
+	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index index, WordNet wordNet){
+		this(endpoint, index, new StanfordPartOfSpeechTagger(), wordNet, new Options());
+	}
+
+	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index resourcesIndex, Index classesIndex, Index propertiesIndex, PartOfSpeechTagger posTagger, WordNet wordNet){
+		this(endpoint, resourcesIndex, classesIndex, propertiesIndex, posTagger, wordNet, new Options(), new ExtractionDBCache("cache"));
+	}
+
+	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index index, PartOfSpeechTagger posTagger, WordNet wordNet){
+		this(endpoint, index, index, index, posTagger, wordNet, new Options(), new ExtractionDBCache("cache"));
+	}
+
+	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index resourcesIndex, Index classesIndex, Index propertiesIndex, PartOfSpeechTagger posTagger, WordNet wordNet, Options options){
+		this(endpoint, resourcesIndex, classesIndex, propertiesIndex, posTagger, wordNet, options, new ExtractionDBCache("cache"));
+	}
+
+	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index index, PartOfSpeechTagger posTagger, WordNet wordNet, Options options){
+		this(endpoint, index, index, index, posTagger, wordNet, options, new ExtractionDBCache("cache"));
+	}
+
+	public SPARQLTemplateBasedLearner2(SparqlEndpoint endpoint, Index resourcesIndex, Index classesIndex, Index propertiesIndex, PartOfSpeechTagger posTagger, WordNet wordNet, Options options, ExtractionDBCache cache){
+		this.endpoint = endpoint;
+		this.resourcesIndex = resourcesIndex;
+		this.classesIndex = classesIndex;
+		this.propertiesIndex = propertiesIndex;
+		this.posTagger = posTagger;
+		this.wordNet = wordNet;
+		this.cache = cache;
+
+		setOptions(options);
+
+		if(propertiesIndex instanceof SPARQLPropertiesIndex){
+			if(propertiesIndex instanceof VirtuosoPropertiesIndex){
+				datatypePropertiesIndex = new VirtuosoDatatypePropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
+				objectPropertiesIndex = new VirtuosoObjectPropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
+			} else {
+				datatypePropertiesIndex = new SPARQLDatatypePropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
+				objectPropertiesIndex = new SPARQLObjectPropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
+			}
+		} else {
+			datatypePropertiesIndex = propertiesIndex;
+			objectPropertiesIndex = propertiesIndex;
+		}
+		reasoner = new SPARQLReasoner(new SparqlEndpointKS(endpoint), cache);
+	}
+
+	public SPARQLTemplateBasedLearner2(Model model, Index resourcesIndex, Index classesIndex, Index propertiesIndex){
+		this(model, resourcesIndex, classesIndex, propertiesIndex, new StanfordPartOfSpeechTagger());
+	}
+
+	public SPARQLTemplateBasedLearner2(Model model, Index resourcesIndex, Index classesIndex, Index propertiesIndex, PartOfSpeechTagger posTagger){
+		this(model, resourcesIndex, classesIndex, propertiesIndex, posTagger, new WordNet(), new Options());
+	}
+
+	public SPARQLTemplateBasedLearner2(Model model, Index resourcesIndex, Index classesIndex, Index propertiesIndex, WordNet wordNet){
+		this(model, resourcesIndex, classesIndex, propertiesIndex, new StanfordPartOfSpeechTagger(), wordNet, new Options());
+	}
+
+	public SPARQLTemplateBasedLearner2(Model model, Index resourcesIndex, Index classesIndex, Index propertiesIndex, PartOfSpeechTagger posTagger, WordNet wordNet, Options options){
+		this(model, resourcesIndex, classesIndex, propertiesIndex, posTagger, wordNet, options, new ExtractionDBCache("cache"));
+	}
+
+	public SPARQLTemplateBasedLearner2(Model model, MappingBasedIndex mappingBasedIndex, PartOfSpeechTagger posTagger)
+	{
+		this(model, new SPARQLIndex(model),new SPARQLIndex(model),new SPARQLIndex(model),posTagger);
+		setMappingIndex(mappingBasedIndex);
+	}
+
+	public SPARQLTemplateBasedLearner2(Model model, Index resourcesIndex, Index classesIndex, Index propertiesIndex, PartOfSpeechTagger posTagger, WordNet wordNet, Options options, ExtractionDBCache cache){
+		this.model = model;
+		this.resourcesIndex = resourcesIndex;
+		this.classesIndex = classesIndex;
+		this.propertiesIndex = propertiesIndex;
+		this.posTagger = posTagger;
+		this.wordNet = wordNet;
+		this.cache = cache;
+
+		setOptions(options);
+
+		if(propertiesIndex instanceof SPARQLPropertiesIndex){
+			if(propertiesIndex instanceof VirtuosoPropertiesIndex){
+				datatypePropertiesIndex = new VirtuosoDatatypePropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
+				objectPropertiesIndex = new VirtuosoObjectPropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
+			} else {
+				datatypePropertiesIndex = new SPARQLDatatypePropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
+				objectPropertiesIndex = new SPARQLObjectPropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
+			}
+		} else {
+			datatypePropertiesIndex = propertiesIndex;
+			objectPropertiesIndex = propertiesIndex;
+		}
+		reasoner = new SPARQLReasoner(new LocalModelBasedSparqlEndpointKS(ModelFactory.createOntologyModel(OntModelSpec.RDFS_MEM, model)), cache);
+	}
+
+	public void setGrammarFiles(String[] grammarFiles)
+	{
+		if(templateGenerator==null) {throw new AssertionError("Learner not initialized. Please call init();");}
+		templateGenerator.setGrammarFiles(grammarFiles);
+	}
+
+	@Override
+	public void init() throws ComponentInitException {
+		templateGenerator = new Templator(posTagger, wordNet, grammarFiles);
+		lemmatizer = new LingPipeLemmatizer();
+	}
+
+	public void setMappingIndex(MappingBasedIndex mappingIndex) {
+		this.mappingIndex = mappingIndex;
+	}
+
+	public void setCache(ExtractionDBCache cache) {
+		this.cache = cache;
+	}
+
+	public void setKnowledgebase(Knowledgebase knowledgebase){
+		this.endpoint = knowledgebase.getEndpoint();
+		this.resourcesIndex = knowledgebase.getResourceIndex();
+		this.classesIndex = knowledgebase.getClassIndex();
+		this.propertiesIndex = knowledgebase.getPropertyIndex();
+		this.mappingIndex = knowledgebase.getMappingIndex();
+		if(propertiesIndex instanceof SPARQLPropertiesIndex){
+			if(propertiesIndex instanceof VirtuosoPropertiesIndex){
+				datatypePropertiesIndex = new VirtuosoDatatypePropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
+				objectPropertiesIndex = new VirtuosoObjectPropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
+			} else {
+				datatypePropertiesIndex = new SPARQLDatatypePropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
+				objectPropertiesIndex = new SPARQLObjectPropertiesIndex((SPARQLPropertiesIndex)propertiesIndex);
+			}
+		} else {
+			datatypePropertiesIndex = propertiesIndex;
+			objectPropertiesIndex = propertiesIndex;
+		}
+		reasoner = new SPARQLReasoner(new SparqlEndpointKS(endpoint));
+	}
+
+	public void setUseDomainRangeRestriction(boolean useDomainRangeRestriction) {
+		this.useDomainRangeRestriction = useDomainRangeRestriction;
+	}
+
+	/*
+	 * Only useful for evaluation.
+	 */
+	public void setUseIdealTagger(boolean value){
+		templateGenerator.setUNTAGGED_INPUT(!value);
+	}
+
+	private void setOptions(Options options){
+		maxIndexResults = Integer.parseInt(options.get("solr.query.limit", "10"));
+
+		maxQueryExecutionTimeInSeconds = Integer.parseInt(options.get("sparql.query.maxExecutionTimeInSeconds", "20"));
+		cache.setMaxExecutionTimeInSeconds(maxQueryExecutionTimeInSeconds);
+
+		useRemoteEndpointValidation = options.get("learning.validationType", "remote").equals("remote") ? true : false;
+		stopIfQueryResultNotEmpty = Boolean.parseBoolean(options.get("learning.stopAfterFirstNonEmptyQueryResult", "true"));
+		maxTestedQueriesPerTemplate = Integer.parseInt(options.get("learning.maxTestedQueriesPerTemplate", "20"));
+
+		String wordnetPath = options.get("wordnet.dictionary", "tbsl/dict");
+		wordnetPath = this.getClass().getClassLoader().getResource(wordnetPath).getPath();
+		System.setProperty("wordnet.database.dir", wordnetPath);
+	}
+
+	public void setEndpoint(SparqlEndpoint endpoint){
+		this.endpoint = endpoint;
+
+		reasoner = new SPARQLReasoner(new SparqlEndpointKS(endpoint));
+		reasoner.setCache(cache);
+		reasoner.prepareSubsumptionHierarchy();
+	}
+
+	public void setQuestion(String question){
+		this.question = question;
+	}
+
+	public void setUseRemoteEndpointValidation(boolean useRemoteEndpointValidation){
+		this.useRemoteEndpointValidation = useRemoteEndpointValidation;
+	}
+
+	public int getMaxQueryExecutionTimeInSeconds() {
+		return maxQueryExecutionTimeInSeconds;
+	}
+
+	public void setMaxQueryExecutionTimeInSeconds(int maxQueryExecutionTimeInSeconds) {
+		this.maxQueryExecutionTimeInSeconds = maxQueryExecutionTimeInSeconds;
+	}
+
+	public int getMaxTestedQueriesPerTemplate() {
+		return maxTestedQueriesPerTemplate;
+	}
+
+	public void setMaxTestedQueriesPerTemplate(int maxTestedQueriesPerTemplate) {
+		this.maxTestedQueriesPerTemplate = maxTestedQueriesPerTemplate;
+	}
+
+	private void reset(){
+		learnedSPARQLQueries = new TreeSet<WeightedQuery>();
+		template2Queries = new HashMap<Template, Collection<? extends Query>>();
+		slot2URI = new HashMap<Slot, List<String>>();
+		relevantKeywords = new HashSet<String>();
+		currentlyExecutedQuery = null;
+
+		//		templateMon.reset();
+		//		sparqlMon.reset();
+	}
+
+	public void learnSPARQLQueries() throws NoTemplateFoundException{
+		reset();
+		//generate SPARQL query templates
+		logger.debug("Generating SPARQL query templates...");
+		templateMon.start();
+		if(multiThreaded){
+			templates = templateGenerator.buildTemplatesMultiThreaded(question,CREATE_SYNONYMS);
+		} else {
+			templates = templateGenerator.buildTemplates(question);
+		}
+		templateMon.stop();
+		logger.debug("Done in " + templateMon.getLastValue() + "ms.");
+		relevantKeywords.addAll(templateGenerator.getUnknownWords());
+		if(templates.isEmpty()){
+			throw new NoTemplateFoundException();
+
+		}
+		logger.debug("Templates:");
+		for(Template t : templates){
+			logger.debug(t);
+		}
+
+		//get the weighted query candidates
+		generatedQueries = getWeightedSPARQLQueries(templates);
+		sparqlQueryCandidates = new ArrayList<WeightedQuery>();
+		int i = 0;
+		for(WeightedQuery wQ : generatedQueries){
+			logger.debug(wQ.explain());
+			sparqlQueryCandidates.add(wQ);
+			if(i == maxTestedQueries){
+				break;
+			}
+			i++;
+		}
+
+		if(mode == Mode.BEST_QUERY){
+			double bestScore = -1;
+			for(WeightedQuery candidate : generatedQueries){
+				double score = candidate.getScore();
+				if(score >= bestScore){
+					bestScore = score;
+					learnedSPARQLQueries.add(candidate);
+				} else {
+					break;
+				}
+			}
+		} else if(mode == Mode.BEST_NON_EMPTY_QUERY){
+			//test candidates
+			if(useRemoteEndpointValidation){ //on remote endpoint
+				validateAgainstRemoteEndpoint(sparqlQueryCandidates);
+			} else {//on local model
+
+			}
+		}
+	}
+
+	public SortedSet<WeightedQuery> getGeneratedQueries() {
+		return generatedQueries;
+	}
+
+	public SortedSet<WeightedQuery> getGeneratedQueries(int topN) {
+		SortedSet<WeightedQuery> topNQueries = new TreeSet<WeightedQuery>();
+		int max = Math.min(topN, generatedQueries.size());
+		for(WeightedQuery wQ : generatedQueries){
+			topNQueries.add(wQ);
+			if(topNQueries.size() == max){
+				break;
+			}
+		}
+		return topNQueries;
+	}
+
+	public Set<Template> getTemplates(){
+		return templates;
+	}
+
+	public List<String> getGeneratedSPARQLQueries(){
+		List<String> queries = new ArrayList<String>();
+		for(WeightedQuery wQ : sparqlQueryCandidates){
+			queries.add(wQ.getQuery().toString());
+		}
+
+		return queries;
+	}
+
+	public Map<Template, Collection<? extends Query>> getTemplates2SPARQLQueries(){
+		return template2Queries;
+	}
+
+	public Map<Slot, List<String>> getSlot2URIs(){
+		return slot2URI;
+	}
+
+	private void normProminenceValues(Set<Allocation> allocations){
+		double min = 0;
+		double max = 0;
+		for(Allocation a : allocations){
+			if(a.getProminence() < min){
+				min = a.getProminence();
+			}
+			if(a.getProminence() > max){
+				max = a.getProminence();
+			}
+		}
+		if(min==max) {return;}
+		for(Allocation a : allocations){
+			double prominence = a.getProminence()/(max-min);
+			a.setProminence(prominence);
+		}
+	}
+
+	private void computeScore(Set<Allocation> allocations){
+		double alpha = 0.8;
+		double beta = 1 - alpha;
+
+		for(Allocation a : allocations){
+			double score = alpha * a.getSimilarity() + beta * a.getProminence();
+			a.setScore(score);
+		}
+
+	}
+
+	public Set<String> getRelevantKeywords(){
+		return relevantKeywords;
+	}
+
+	// just for testing the HMM integration, getWeightedSPARQLQueriesOld is the original one
+	private SortedSet<WeightedQuery> getWeightedSPARQLQueries(Set<Template> templates)
+	{
+		// for testing 
+		for(Template template: templates)
+		{
+			{
+				ArrayList<String> keywords = new ArrayList<String>();
+				for(Slot slot: template.getSlots())
+				{
+					keywords.add(slot.getWords().get(0));
+				}
+				if(template.getSlots().size()!=3) {continue;}
+//				if(!keywords.contains("Mean Hamster Software")) {continue;}
+//				if(!keywords.contains("published")) {continue;}
+				System.out.println("\"keywords\": "+keywords);
+			}
+			System.out.println(template);
+			SortedSet<WeightedQuery> queries = new TreeSet<WeightedQuery>();
+			Query query = template.getQuery();
+			double score = 0;
+
+			Map<List<String>,List<ResourceInfo>> segmentToURIs = new HashMap<List<String>,List<ResourceInfo>>();
+			Map<String,IndexResultItem> uriUniqueToResultItem = new HashMap<String,IndexResultItem>(); 
+			for(Slot slot: template.getSlots())
+			{
+				List<String> segment = new LinkedList<String>();
+				segment.addAll(Arrays.asList(slot.getWords().get(0).split("\\s")));			
+				List<ResourceInfo> resourceInfos = new LinkedList<ResourceInfo>();
+
+				for(IndexResultItem item : getIndexResultItems(slot))
+				{
+					// if this gets used at another place, create a function IndexResultItemToResourceInfo()
+					ResourceInfo info = new ResourceInfo();
+					info.setUri(item.getUri());
+					String label = item.getLabel();					
+					// in dbpedia, the last part of the uri is transformed from the english label, reverse the transformation (should almost always work for dbpedia article resources)
+					info.setLabel(label!=null?label:sfp.getShortForm(IRI.create(item.getUri())));
+					// in Saedeeh's algorithm, the emission probability is formed by the string similarity
+					// but we use the lucene index score
+					double max = 0;
+					for(String word: slot.getWords()) {max = Math.max(max, Similarity.getSimilarity(word, info.getLabel()));}					
+					if(max<0||max>1) throw new AssertionEr...
 
[truncated message content]

[DL-Learner SVN] SF.net SVN: dl-learner:[3851] branches/hmm/components-ext/src/main/java/org /dlle

[DL-Learner SVN] SF.net SVN: dl-learner:[3851] branches/hmm/components-ext/src/main/java/org /dllearner/algorithm/tbsl/learning