[DL-Learner SVN] SF.net SVN: dl-learner:[3862] branches/hmm/components-ext/src

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 3862
          http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3862&view=rev
Author:   kirdie
Date:     2012-10-30 16:32:40 +0000 (Tue, 30 Oct 2012)
Log Message:
-----------
fixed oxford testing.

Modified Paths:
--------------
    branches/hmm/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner2.java
    branches/hmm/components-ext/src/main/resources/tbsl/evaluation/oxford_working_questions.xml
    branches/hmm/components-ext/src/test/java/org/dllearner/algorithm/tbsl/learning/QueryTestData.java
    branches/hmm/components-ext/src/test/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner3Test.java

Added Paths:
-----------
    branches/hmm/components-ext/src/main/resources/tbsl/evaluation/oxford_working_questions_justquestions.xml

Modified: branches/hmm/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner2.java
===================================================================

--- branches/hmm/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner2.java	2012-10-30 16:10:36 UTC (rev 3861)
+++ branches/hmm/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner2.java	2012-10-30 16:32:40 UTC (rev 3862)
@@ -90,15 +90,13 @@
 import com.jamonapi.MonitorFactory;
 
 /** The old learner taken over by Konrad Höffner for experiments with the Hidden Markov Algorithm by Saedeeh Shekarpur.
- * 
  * */
 public class SPARQLTemplateBasedLearner2 implements SparqlQueryLearningAlgorithm
 {
-	private static final boolean USE_HMM = false;
 	/** synonyms are great but are not used yet by the HMM algorithm. **/
-	private static final boolean	HMM_USE_SYNONYMS	= false;
+	private static final boolean	HMM_USE_SYNONYMS	= true;
 		/** The minimum score of items that are accepted from the Sindice search BOA index. **/
-	private static final Double	BOA_THRESHOLD	=  0.9;
+	private static final Double	BOA_THRESHOLD	=  0.5;
 	enum Mode {BEST_QUERY, BEST_NON_EMPTY_QUERY}
 	private Mode mode = Mode.BEST_QUERY;
 	
@@ -394,35 +392,37 @@
 		//		templateMon.reset();
 		//		sparqlMon.reset();
 	}
-
-	public void learnSPARQLQueries() throws NoTemplateFoundException{
+	public void learnSPARQLQueries() throws NoTemplateFoundException
+	{
+		learnSPARQLQueries(false);
+	}
+	
+	public void learnSPARQLQueries(boolean useHMM) throws NoTemplateFoundException
+	{
 		reset();
 		//generate SPARQL query templates
 		logger.debug("Generating SPARQL query templates...");
 		templateMon.start();
 		if(multiThreaded){
-			templates = templateGenerator.buildTemplatesMultiThreaded(question,!USE_HMM||HMM_USE_SYNONYMS);
+			templates = templateGenerator.buildTemplatesMultiThreaded(question,!useHMM||HMM_USE_SYNONYMS);
 		} else {
 			templates = templateGenerator.buildTemplates(question);
 		}
 		templateMon.stop();
 		logger.debug("Done in " + templateMon.getLastValue() + "ms.");
 		relevantKeywords.addAll(templateGenerator.getUnknownWords());
-		if(templates.isEmpty()){
-			throw new NoTemplateFoundException();
+		if(templates.isEmpty()){throw new NoTemplateFoundException();}
+//		logger.debug("Templates:");
+//		for(Template t : templates){
+//			logger.debug(t);
+//		}
 
-		}
-		logger.debug("Templates:");
-		for(Template t : templates){
-			logger.debug(t);
-		}
-
 		//get the weighted query candidates
-		generatedQueries = getWeightedSPARQLQueries(templates,USE_HMM);
+		generatedQueries = getWeightedSPARQLQueries(templates,useHMM);
 		sparqlQueryCandidates = new ArrayList<WeightedQuery>();
 		int i = 0;
 		for(WeightedQuery wQ : generatedQueries){
-			logger.debug(wQ.explain());
+			logger.trace(wQ.explain());
 			sparqlQueryCandidates.add(wQ);
 			if(i == maxTestedQueries){
 				break;
@@ -526,52 +526,61 @@
 	
 	private SortedSet<WeightedQuery> getWeightedSPARQLQueriesWithHMM(Set<Template> templates)
 	{
-		// for testing 
+		List<String> vars = new LinkedList<String>();
+		if(templates.isEmpty()) throw new AssertionError("no templates");
+		SortedSet<WeightedQuery> queries = new TreeSet<WeightedQuery>();
 		for(Template template: templates)
 		{
 			{
 				ArrayList<String> keywords = new ArrayList<String>();
 				for(Slot slot: template.getSlots())
 				{
-					keywords.add(slot.getWords().get(0));
+					if(!slot.getWords().isEmpty())
+					{
+						// we don't have synonyms for hmm at the moment, so there should be just one word 
+						if(slot.getWords().size()!=1) throw new AssertionError("more than one word with hmm for slot: "+slot.getWords());
+						keywords.add(slot.getWords().get(0));
+						vars.add(slot.getAnchor());
+					}									
 				}
-				if(template.getSlots().size()!=3) {continue;}
 //				if(!keywords.contains("Mean Hamster Software")) {continue;}
 //				if(!keywords.contains("published")) {continue;}
-				System.out.println("\"keywords\": "+keywords);
+				logger.debug("\"keywords\": "+keywords);
 			}
-			System.out.println(template);
-			SortedSet<WeightedQuery> queries = new TreeSet<WeightedQuery>();
+			System.out.println(template);			
 			Query query = template.getQuery();
 			double score = 0;
 
 			Map<List<String>,List<ResourceInfo>> segmentToURIs = new HashMap<List<String>,List<ResourceInfo>>();
-			Map<String,IndexResultItem> uriUniqueToResultItem = new HashMap<String,IndexResultItem>(); 
+//			Map<String,IndexResultItem> uriUniqueToResultItem = new HashMap<String,IndexResultItem>(); 
 			for(Slot slot: template.getSlots())
 			{
-				List<String> segment = new LinkedList<String>();
-				segment.addAll(Arrays.asList(slot.getWords().get(0).split("\\s")));			
-				List<ResourceInfo> resourceInfos = new LinkedList<ResourceInfo>();
+				if(!slot.getWords().isEmpty()){
+					List<String> segment = new LinkedList<String>();
+					segment.addAll(Arrays.asList(slot.getWords().get(0).split("\\s")));			
+					List<ResourceInfo> resourceInfos = new LinkedList<ResourceInfo>();
 
-				for(IndexResultItem item : getIndexResultItems(slot))
-				{
-					// if this gets used at another place, create a function IndexResultItemToResourceInfo()
-					ResourceInfo info = new ResourceInfo();
-					info.setUri(item.getUri());
-					String label = item.getLabel();					
-					// in dbpedia, the last part of the uri is transformed from the english label, reverse the transformation (should almost always work for dbpedia article resources)
-					info.setLabel(label!=null?label:sfp.getShortForm(IRI.create(item.getUri())));
-					// in saedeehs algorithm, the emission probabilty is formed by the string similarity
-					// but we use the lucene index score
-					double max = 0;
-					for(String word: slot.getWords()) {max = Math.max(max, Similarity.getSimilarity(word, info.getLabel()));}					
-					if(max<0||max>1) throw new AssertionError("max is not in [0,1], max="+max);
-					info.setStringSimilarityScore(max);
-					if(!info.setTypeFromDBpediaURI()) throw new AssertionError("could not set type for info "+info);
-					System.err.println("info with type: "+info);
-					resourceInfos.add(info);
+					for(IndexResultItem item : getIndexResultItems(slot))
+					{
+						// if this gets used at another place, create a function IndexResultItemToResourceInfo()
+						ResourceInfo info = new ResourceInfo();
+						info.setUri(item.getUri());
+						String label = item.getLabel();					
+						// in dbpedia, the last part of the uri is transformed from the english label, reverse the transformation (should almost always work for dbpedia article resources)
+						info.setLabel(label!=null?label:sfp.getShortForm(IRI.create(item.getUri())));
+						// in Saedeeh's algorithm, the emission probability is formed by the string similarity
+						// but we use the lucene index score
+						double max = 0;
+						for(String word: slot.getWords()) {max = Math.max(max, Similarity.getSimilarity(word, info.getLabel()));}					
+						if(max<0||max>1) throw new AssertionError("max is not in [0,1], max="+max);
+						info.setStringSimilarityScore(max);
+						if(!info.setTypeFromDBpediaURI()) throw new AssertionError("could not set type for info "+info);
+						System.err.println("info with type: "+info);
+						resourceInfos.add(info);
+					}
+					segmentToURIs.put(segment,resourceInfos);
 				}
-				segmentToURIs.put(segment,resourceInfos);
+				
 			}
 			HiddenMarkovModel hmm = new HiddenMarkovModel();
 			hmm.initialization();
@@ -595,10 +604,10 @@
 					System.out.println(q.getVariablesAsStringList());
 					System.out.println();
 					int i = 0;
-					for(String var : q.getVariablesAsStringList())
-					{						
-						q.replaceVarWithURI(var, path.get(i));
-						i++;
+					for(String uri : path){
+						uri = uri.trim();
+						String var = vars.get(path.indexOf(uri));
+						q.replaceVarWithURI(var, uri);
 					}
 					System.out.println(q);
 
@@ -607,8 +616,7 @@
 					queries.add(wQuery);
 				}
 			}
-			//System.exit(0);
-			return queries;
+			//System.exit(0);			
 			//			>> SLOTS:
 			//				y0: RESOURCE {Mean Hamster Software}
 			//				p0: OBJECTPROPERTY {published,print}
@@ -618,10 +626,10 @@
 			//			System.out.println(template);			
 		}
 		// 		
-		return null;
+		return queries;		
 	}
 
-	private SortedSet<WeightedQuery> getWeightedSPARQLQueriesWithoutHMM(Set<Template> templates){
+	@SuppressWarnings("unused") private SortedSet<WeightedQuery> getWeightedSPARQLQueriesWithoutHMM(Set<Template> templates){
 		logger.debug("Generating SPARQL query candidates...");
 
 		Map<Slot, Set<Allocation>> slot2Allocations = new TreeMap<Slot, Set<Allocation>>(new Comparator<Slot>() {
@@ -640,11 +648,10 @@
 
 		SortedSet<WeightedQuery> allQueries = new TreeSet<WeightedQuery>();
 
-		Set<Allocation> allocations;
-
-		for(Template t : templates){
-			logger.info("Processing template:\n" + t.toString());			
-			allocations = new TreeSet<Allocation>();
+		for(Template t : templates)
+		{
+			logger.info("Processing template:\n" + t.toString());
+//			Set<Allocation> allocations = new TreeSet<Allocation>();
 			boolean containsRegex = t.getQuery().toString().toLowerCase().contains("(regex(");
 
 			ExecutorService executor = Executors.newFixedThreadPool(t.getSlots().size());
@@ -653,7 +660,7 @@
 			long startTime = System.currentTimeMillis();
 
 			for (Slot slot : t.getSlots()) {
-				if(!slot2Allocations.containsKey(slot)){//System.out.println(slot + ": " + slot.hashCode());System.out.println(slot2Allocations);
+				if(!slot2Allocations.containsKey(slot)){
 					Callable<Map<Slot, SortedSet<Allocation>>> worker = new SlotProcessor(slot);
 					Future<Map<Slot, SortedSet<Allocation>>> submit = executor.submit(worker);
 					list.add(submit);
@@ -668,7 +675,8 @@
 				} catch (InterruptedException e) {
 					e.printStackTrace();
 				} catch (ExecutionException e) {
-					e.printStackTrace();
+//					e.printStackTrace();
+					throw new RuntimeException(e);
 				}
 			}
 
@@ -734,7 +742,7 @@
 				queries.clear();
 				queries.addAll(tmp);
 				tmp.clear();
-			}
+			}			
 
 			for(Slot slot : sortedSlots){
 				if(!slot2Allocations.get(slot).isEmpty()){
@@ -743,104 +751,104 @@
 							Query q = new Query(query.getQuery());
 
 							boolean drop = false;
-							if(useDomainRangeRestriction){
-								if(slot.getSlotType() == SlotType.PROPERTY || slot.getSlotType() == SlotType.SYMPROPERTY){
-									for(SPARQL_Triple triple : q.getTriplesWithVar(slot.getAnchor())){
-										String objectVar = triple.getValue().getName();
-										String subjectVar = triple.getVariable().getName();
-										//											System.out.println(triple);
-										for(SPARQL_Triple typeTriple : q.getRDFTypeTriples(objectVar)){
-											//												System.out.println(typeTriple);
-											if(true){//reasoner.isObjectProperty(a.getUri())){
-												Description range = reasoner.getRange(new ObjectProperty(a.getUri()));
-												//													System.out.println(a);
-												if(range != null){
-													Set<Description> allRanges = new HashSet<Description>();
-													SortedSet<Description> superClasses;
-													if(range instanceof NamedClass){
-														superClasses = reasoner.getSuperClasses(range);
-														allRanges.addAll(superClasses);
-													} else {
-														for(Description nc : range.getChildren()){
-															superClasses = reasoner.getSuperClasses(nc);
-															allRanges.addAll(superClasses);
-														}
-													}
-													allRanges.add(range);
-													allRanges.remove(new NamedClass(Thing.instance.getURI()));
+//							if(useDomainRangeRestriction){
+//								if(slot.getSlotType() == SlotType.PROPERTY || slot.getSlotType() == SlotType.SYMPROPERTY){
+//									for(SPARQL_Triple triple : q.getTriplesWithVar(slot.getAnchor())){
+//										String objectVar = triple.getValue().getName();
+//										String subjectVar = triple.getVariable().getName();
+//
+//										for(SPARQL_Triple typeTriple : q.getRDFTypeTriples(objectVar)){
+//
+//											if(true){//reasoner.isObjectProperty(a.getUri())){
+//												Description range = reasoner.getRange(new ObjectProperty(a.getUri()));
+//
+//												if(range != null){
+//													Set<Description> allRanges = new HashSet<Description>();
+//													SortedSet<Description> superClasses;
+//													if(range instanceof NamedClass){
+//														superClasses = reasoner.getSuperClasses(range);
+//														allRanges.addAll(superClasses);
+//													} else {
+//														for(Description nc : range.getChildren()){
+//															superClasses = reasoner.getSuperClasses(nc);
+//															allRanges.addAll(superClasses);
+//														}
+//													}
+//													allRanges.add(range);
+//													allRanges.remove(new NamedClass(Thing.instance.getURI()));
+//
+//													Set<Description> allTypes = new HashSet<Description>();
+//													String typeURI = typeTriple.getValue().getName().substring(1,typeTriple.getValue().getName().length()-1);
+//													Description type = new NamedClass(typeURI);
+//													superClasses = reasoner.getSuperClasses(type);
+//													allTypes.addAll(superClasses);
+//													allTypes.add(type);
+//
+//													if(!org.mindswap.pellet.utils.SetUtils.intersects(allRanges, allTypes)){
+//														drop = true;
+//													} 
+//												}
+//											} else {
+//												drop = true;
+//											}
+//
+//										}
+//										for(SPARQL_Triple typeTriple : q.getRDFTypeTriples(subjectVar)){
+//											Description domain = reasoner.getDomain(new ObjectProperty(a.getUri()));
+//
+//											if(domain != null){
+//												Set<Description> allDomains = new HashSet<Description>();
+//												SortedSet<Description> superClasses;
+//												if(domain instanceof NamedClass){
+//													superClasses = reasoner.getSuperClasses(domain);
+//													allDomains.addAll(superClasses);
+//												} else {
+//													for(Description nc : domain.getChildren()){
+//														superClasses = reasoner.getSuperClasses(nc);
+//														allDomains.addAll(superClasses);
+//													}
+//												}
+//												allDomains.add(domain);
+//												allDomains.remove(new NamedClass(Thing.instance.getURI()));
+//
+//												Set<Description> allTypes = new HashSet<Description>();
+//												String typeURI = typeTriple.getValue().getName().substring(1,typeTriple.getValue().getName().length()-1);
+//												Description type = new NamedClass(typeURI);
+//												superClasses = reasoner.getSuperClasses(type);
+//												allTypes.addAll(superClasses);
+//												allTypes.add(type);
+//
+//												if(!org.mindswap.pellet.utils.SetUtils.intersects(allDomains, allTypes)){
+//													drop = true;												
+//												} else {
+//
+//												}
+//											}
+//										}
+//									}
+//								}
+//							}
 
-													Set<Description> allTypes = new HashSet<Description>();
-													String typeURI = typeTriple.getValue().getName().substring(1,typeTriple.getValue().getName().length()-1);
-													Description type = new NamedClass(typeURI);
-													superClasses = reasoner.getSuperClasses(type);
-													allTypes.addAll(superClasses);
-													allTypes.add(type);
-
-													if(!org.mindswap.pellet.utils.SetUtils.intersects(allRanges, allTypes)){
-														drop = true;
-													} 
-												}
-											} else {
-												drop = true;
-											}
-
-										}
-										for(SPARQL_Triple typeTriple : q.getRDFTypeTriples(subjectVar)){
-											Description domain = reasoner.getDomain(new ObjectProperty(a.getUri()));
-											//												System.out.println(a);
-											if(domain != null){
-												Set<Description> allDomains = new HashSet<Description>();
-												SortedSet<Description> superClasses;
-												if(domain instanceof NamedClass){
-													superClasses = reasoner.getSuperClasses(domain);
-													allDomains.addAll(superClasses);
-												} else {
-													for(Description nc : domain.getChildren()){
-														superClasses = reasoner.getSuperClasses(nc);
-														allDomains.addAll(superClasses);
-													}
-												}
-												allDomains.add(domain);
-												allDomains.remove(new NamedClass(Thing.instance.getURI()));
-
-												Set<Description> allTypes = new HashSet<Description>();
-												String typeURI = typeTriple.getValue().getName().substring(1,typeTriple.getValue().getName().length()-1);
-												Description type = new NamedClass(typeURI);
-												superClasses = reasoner.getSuperClasses(type);
-												allTypes.addAll(superClasses);
-												allTypes.add(type);
-
-												if(!org.mindswap.pellet.utils.SetUtils.intersects(allDomains, allTypes)){
-													drop = true;												
-												} else {
-
-												}
-											}
-										}
-									}
-								}
-							}
-
 							if(!drop){
 								if(slot.getSlotType() == SlotType.RESOURCE){//avoid queries where predicate is data property and object resource->add REGEX filter in this case
 									for(SPARQL_Triple triple : q.getTriplesWithVar(slot.getAnchor())){
 										SPARQL_Value object = triple.getValue();
-										if(object.isVariable() && object.getName().equals(slot.getAnchor())){//only consider triple where SLOT is in object position
-											SPARQL_Property predicate = triple.getProperty();
-											if(!predicate.isVariable()){//only consider triple where predicate is URI
-												String predicateURI = predicate.getName().replace("<", "").replace(">", "");
-												if(isDatatypeProperty(predicateURI)){//if data property
-													q.addFilter(new SPARQL_Filter(new SPARQL_Pair(
-															object, "'" + slot.getWords().get(0) + "'", SPARQL_PairType.REGEX)));
-												} else {
-													q.replaceVarWithURI(slot.getAnchor(), a.getUri());
-												}
-											} else {
-												q.replaceVarWithURI(slot.getAnchor(), a.getUri());
-											}
-										} else {
-											q.replaceVarWithURI(slot.getAnchor(), a.getUri());
-										}
+//										if(object.isVariable() && object.getName().equals(slot.getAnchor())){//only consider triple where SLOT is in object position
+//											SPARQL_Property predicate = triple.getProperty();
+//											if(!predicate.isVariable()){//only consider triple where predicate is URI
+//												String predicateURI = predicate.getName().replace("<", "").replace(">", "");
+//												if(isDatatypeProperty(predicateURI)){//if data property
+//													q.addFilter(new SPARQL_Filter(new SPARQL_Pair(
+//															object, "'" + slot.getWords().get(0) + "'", SPARQL_PairType.REGEX)));
+//												} else {
+//													q.replaceVarWithURI(slot.getAnchor(), a.getUri());
+//												}
+//											} else {
+//												q.replaceVarWithURI(slot.getAnchor(), a.getUri());
+//											}
+//										} else {
+//										
+//										}
 									}
 								} else {
 									q.replaceVarWithURI(slot.getAnchor(), a.getUri());
@@ -864,7 +872,7 @@
 					}
 
 					queries.clear();
-					queries.addAll(tmp);//System.out.println(tmp);
+					queries.addAll(tmp);
 					tmp.clear();
 				} else {//Add REGEX FILTER if resource slot is empty and predicate is datatype property
 					if(slot.getSlotType() == SlotType.RESOURCE){
@@ -952,6 +960,7 @@
 				}
 
 			}
+			
 			for (Iterator<WeightedQuery> iterator = queries.iterator(); iterator.hasNext();) {
 				WeightedQuery wQ = iterator.next();
 				if(dropZeroScoredQueries){
@@ -966,11 +975,12 @@
 			}
 			allQueries.addAll(queries);
 			List<Query> qList = new ArrayList<Query>();
-			for(WeightedQuery wQ : queries){//System.err.println(wQ.getQuery());
+			for(WeightedQuery wQ : queries){
 				qList.add(wQ.getQuery());
 			}
 			template2Queries.put(t, qList);
 		}
+		logger.debug(allQueries);
 		logger.debug("...done in ");
 		return allQueries;
 	}
@@ -1011,14 +1021,13 @@
 		if(popularity == null){
 			popularity = Integer.valueOf(0);
 		}
-		System.out.println(popularity);
+		logger.trace("popularity: "+popularity);
 
-
 		//		if(cnt == 0){
 		//			return 0;
 		//		} 
 		//		return Math.log(cnt);
-		if(popularity!=popularity) {throw new AssertionError("prominence NaN for uri "+uri+", slot type "+type);}
+		if(Double.isNaN(popularity)) {throw new AssertionError("prominence NaN for uri "+uri+", slot type "+type);}
 		return popularity;
 	}
 

Modified: branches/hmm/components-ext/src/main/resources/tbsl/evaluation/oxford_working_questions.xml
===================================================================
(Binary files differ)

Added: branches/hmm/components-ext/src/main/resources/tbsl/evaluation/oxford_working_questions_justquestions.xml
===================================================================
(Binary files differ)


Property changes on: branches/hmm/components-ext/src/main/resources/tbsl/evaluation/oxford_working_questions_justquestions.xml
___________________________________________________________________
Added: svn:mime-type
   + application/xml

Modified: branches/hmm/components-ext/src/test/java/org/dllearner/algorithm/tbsl/learning/QueryTestData.java
===================================================================
--- branches/hmm/components-ext/src/test/java/org/dllearner/algorithm/tbsl/learning/QueryTestData.java	2012-10-30 16:10:36 UTC (rev 3861)
+++ branches/hmm/components-ext/src/test/java/org/dllearner/algorithm/tbsl/learning/QueryTestData.java	2012-10-30 16:32:40 UTC (rev 3862)
@@ -12,6 +12,7 @@
 import java.io.PrintWriter;
 import java.io.Serializable;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
 import java.util.SortedMap;
@@ -36,7 +37,8 @@
 
 public class QueryTestData implements Serializable
 {
-	private static final long	serialVersionUID	= 1L;
+	private static final long	serialVersionUID	= 2L;
+	public boolean hmm = false;
 	public SortedMap<Integer, String> id2Question = new ConcurrentSkipListMap<Integer, String>();
 	public SortedMap<Integer, String> id2Query = new ConcurrentSkipListMap<Integer, String>();
 	public SortedMap<Integer, Set<String>> id2Answers = new ConcurrentSkipListMap<Integer, Set<String>>();
@@ -73,138 +75,153 @@
 		if(!id2Answers.isEmpty()) {throw new AssertionError("Answers already existing.");}
 		for(int i:id2Query.keySet())
 		{
-			Set<String> uris = SPARQLTemplateBasedLearner3Test.getUris(endpoint, id2Query.get(i),cache,model); 
-			id2Answers.put(i, uris); // empty answer set better transfers intended meaning and doesn't cause NPEs in html generation :-) 
-			if(!uris.isEmpty())	{/*id2Answers.put(i, uris);*/}
-			else				{id2LearnStatus.put(i, LearnStatus.QUERY_RESULT_EMPTY);}
-		}
-		return this;
+			try
+			{
+				Set<String> uris = SPARQLTemplateBasedLearner3Test.getUris(endpoint, id2Query.get(i),cache,model);
+				// empty answer set better transfers intended meaning and doesn't cause NPEs in html generation :-)
+				id2Answers.put(i, uris);			
+				if(!uris.isEmpty())	{/*id2Answers.put(i, uris);*/}
+				else				{id2LearnStatus.put(i, LearnStatus.QUERY_RESULT_EMPTY);}
+
+			}
+			catch(Exception e)
+			{
+				id2Answers.put(i, Collections.<String>emptySet());
+				id2LearnStatus.put(i, LearnStatus.exceptionStatus(e));
+			}
 	}
+	return this;
+}
 
-	/** reads test data from a QALD2 benchmark XML file, including questions, queries and answers.
-	 * each question needs to have a query but not necessarily an answer.
-	 * @param file a QALD benchmark XML file 
-	 * @param MAX_NUMBER_OF_QUESTIONS the maximum number of questions read from the file. 
-	 * @return the test data read from the XML file */
-	public static QueryTestData readQaldXml(final File file, int MAX_NUMBER_OF_QUESTIONS)
-	{
-		QueryTestData testData = new QueryTestData();
-		try {
-			DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
-			DocumentBuilder db = dbf.newDocumentBuilder();
-			Document doc = db.parse(file);
-			doc.getDocumentElement().normalize();
-			NodeList questionNodes = doc.getElementsByTagName("question");
-			int id;
+/** reads test data from a QALD2 benchmark XML file, including questions, queries and answers.
+ * each question needs to have a query but not necessarily an answer.
+ * @param file a QALD benchmark XML file 
+ * @param MAX_NUMBER_OF_QUESTIONS the maximum number of questions read from the file. 
+ * @return the test data read from the XML file */	
+public static QueryTestData readQaldXml(final File file, final int MAX_NUMBER_OF_QUESTIONS, boolean whitelistOnly,Set<Integer> whitelist)
+{
+	QueryTestData testData = new QueryTestData();
+	try {
+		DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+		DocumentBuilder db = dbf.newDocumentBuilder();
+		Document doc = db.parse(file);
+		doc.getDocumentElement().normalize();
+		NodeList questionNodes = doc.getElementsByTagName("question");
+		int id;
 
-			for(int i = 0; i < questionNodes.getLength(); i++)
+		for(int i = 0; i < questionNodes.getLength(); i++)
+		{			
+			if(i>MAX_NUMBER_OF_QUESTIONS) break;
+			String question;
+			String query;
+			Set<String> answers = new HashSet<String>();
+			Element questionNode = (Element) questionNodes.item(i);
+			//read question ID
+			id = Integer.valueOf(questionNode.getAttribute("id"));
+			if(whitelistOnly&&!whitelist.contains(id)) {continue;}
+			
+			//Read question
+			question = ((Element)questionNode.getElementsByTagName("string").item(0)).getChildNodes().item(0).getNodeValue().trim();
+			//Read SPARQL query
+			query = ((Element)questionNode.getElementsByTagName("query").item(0)).getChildNodes().item(0).getNodeValue().trim();
+			//				//Read answers
+			//				answers = new HashSet<String>();
+			//				NodeList aswersNodes = questionNode.getElementsByTagName("answer");
+			//				for(int j = 0; j < aswersNodes.getLength(); j++){
+			//					Element answerNode = (Element) aswersNodes.item(j);
+			//					answers.add(((Element)answerNode.getElementsByTagName("uri").item(0)).getChildNodes().item(0).getNodeValue().trim());
+			//				}
+
+			if(!query.equals("OUT OF SCOPE")) // marker in qald benchmark file, will create holes interval of ids (e.g. 1,2,5,7)   
 			{
-				if(i>MAX_NUMBER_OF_QUESTIONS) break;
-				String question;
-				String query;
-				Set<String> answers = new HashSet<String>();
-				Element questionNode = (Element) questionNodes.item(i);
-				//read question ID
-				id = Integer.valueOf(questionNode.getAttribute("id"));				
-				//Read question
-				question = ((Element)questionNode.getElementsByTagName("string").item(0)).getChildNodes().item(0).getNodeValue().trim();
-				//Read SPARQL query
-				query = ((Element)questionNode.getElementsByTagName("query").item(0)).getChildNodes().item(0).getNodeValue().trim();
-				//				//Read answers
-				//				answers = new HashSet<String>();
-				//				NodeList aswersNodes = questionNode.getElementsByTagName("answer");
-				//				for(int j = 0; j < aswersNodes.getLength(); j++){
-				//					Element answerNode = (Element) aswersNodes.item(j);
-				//					answers.add(((Element)answerNode.getElementsByTagName("uri").item(0)).getChildNodes().item(0).getNodeValue().trim());
-				//				}
-
-				if(!query.equals("OUT OF SCOPE")) // marker in qald benchmark file, will create holes interval of ids (e.g. 1,2,5,7)   
+				testData.id2Question.put(id, question);
+				testData.id2Query.put(id, query);					
+				Element answersElement = (Element) questionNode.getElementsByTagName("answers").item(0);
+				// some of our qald files were mistakenly created so that they have the "answer" elements directly under the question node 
+				// with no answers element
+				if(answersElement==null) answersElement = (Element)questionNode;
+//				if(answersElement!=null)
 				{
-					testData.id2Question.put(id, question);
-					testData.id2Query.put(id, query);					
-					Element answersElement = (Element) questionNode.getElementsByTagName("answers").item(0);
-					if(answersElement!=null)
+					NodeList answerElements = answersElement.getElementsByTagName("answer");						
+					for(int j=0; j<answerElements.getLength();j++)
 					{
-						NodeList answerElements = answersElement.getElementsByTagName("answer");						
-						for(int j=0; j<answerElements.getLength();j++)
-						{
-							String answer = ((Element)answerElements.item(j)).getTextContent();
-							answers.add(answer);
-						}
-						testData.id2Answers.put(id, answers);
+						String answer = ((Element)answerElements.item(j)).getTextContent();
+						answers.add(answer);
 					}
-				}				
-				//				question2Answers.put(question, answers);
+					testData.id2Answers.put(id, answers);
+				}
+			}				
+			//				question2Answers.put(question, answers);
 
-			}
-		} catch (DOMException e) {
-			e.printStackTrace();
-		} catch (ParserConfigurationException e) {
-			e.printStackTrace();
-		} catch (SAXException e) {
-			e.printStackTrace();
-		} catch (IOException e) {
-			e.printStackTrace();
-		}		
-		return testData;
-	}
+		}
+	} catch (DOMException e) {
+		e.printStackTrace();
+	} catch (ParserConfigurationException e) {
+		e.printStackTrace();
+	} catch (SAXException e) {
+		e.printStackTrace();
+	} catch (IOException e) {
+		e.printStackTrace();
+	}		
+	return testData;
+}
 
-	/** write the test data to a QALD2 benchmark XML file, including questions, queries and answers.
-	 * each question needs to have a query but not necessarily an answer.
-	 * @param file a QALD benchmark XML file **/ 
-	public void writeQaldXml(final File file)
-	{		
-		// see http://www.genedavis.com/library/xml/java_dom_xml_creation.jsp
-		try
+/** write the test data to a QALD2 benchmark XML file, including questions, queries and answers.
+ * each question needs to have a query but not necessarily an answer.
+ * @param file a QALD benchmark XML file **/ 
+public void writeQaldXml(final File file)
+{		
+	// see http://www.genedavis.com/library/xml/java_dom_xml_creation.jsp
+	try
+	{
+		DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+		DocumentBuilder db = dbf.newDocumentBuilder();
+		Document doc = db.newDocument();
+		Element root = doc.createElement("dataset");
+		doc.appendChild(root);
+
+		for(Integer i:id2Question.keySet())
 		{
-			DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
-			DocumentBuilder db = dbf.newDocumentBuilder();
-			Document doc = db.newDocument();
-			Element root = doc.createElement("dataset");
-			doc.appendChild(root);
-
-			for(Integer i:id2Question.keySet())
+			Element questionElement = doc.createElement("question");
+			questionElement.setAttribute("id", i.toString());
+			questionElement.setAttribute("answertype", "resource");
+			root.appendChild(questionElement);
+			Element stringElement = doc.createElement("string");
+			stringElement.setTextContent(id2Question.get(i));
+			questionElement.appendChild(stringElement);
+			String query = id2Query.get(i);
+			if(query!=null)
+			{					
+				Element queryElement = doc.createElement("query");
+				//					queryElement.setTextContent(query);
+				queryElement.appendChild(doc.createCDATASection(query));
+				questionElement.appendChild(queryElement);
+			}
+			Collection<String> answers = id2Answers.get(i);
+			if(answers!=null)
 			{
-				Element questionElement = doc.createElement("question");
-				questionElement.setAttribute("id", i.toString());
-				questionElement.setAttribute("answertype", "resource");
-				root.appendChild(questionElement);
-				Element stringElement = doc.createElement("string");
-				stringElement.setTextContent(id2Question.get(i));
-				questionElement.appendChild(stringElement);
-				String query = id2Query.get(i);
-				if(query!=null)
-				{					
-					Element queryElement = doc.createElement("query");
-//					queryElement.setTextContent(query);
-					queryElement.appendChild(doc.createCDATASection(query));
-					questionElement.appendChild(queryElement);
-				}
-				Collection<String> answers = id2Answers.get(i);
-				if(answers!=null)
+				for(String answer: answers)
 				{
-					for(String answer: answers)
-					{
-						Element answerElement = doc.createElement("answer");
-						answerElement.setTextContent(answer);
-						questionElement.appendChild(answerElement);
-					}
+					Element answerElement = doc.createElement("answer");
+					answerElement.setTextContent(answer);
+					questionElement.appendChild(answerElement);
 				}
-			}		
-			   //set up a transformer
-            TransformerFactory transfac = TransformerFactory.newInstance();
-            Transformer trans = transfac.newTransformer();
-            trans.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
-            trans.setOutputProperty(OutputKeys.INDENT, "yes");
+			}
+		}		
+		//set up a transformer
+		TransformerFactory transfac = TransformerFactory.newInstance();
+		Transformer trans = transfac.newTransformer();
+		trans.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
+		trans.setOutputProperty(OutputKeys.INDENT, "yes");
 
-            
-            //create string from xml tree
-            PrintWriter sw = new PrintWriter(file);
-            StreamResult result = new StreamResult(sw);
-            DOMSource source = new DOMSource(doc);
-            trans.transform(source, result);            
-		}
-		catch (Exception e) {throw new RuntimeException(e);}				
+
+		//create string from xml tree
+		PrintWriter sw = new PrintWriter(file);
+		StreamResult result = new StreamResult(sw);
+		DOMSource source = new DOMSource(doc);
+		trans.transform(source, result);            
 	}
+	catch (Exception e) {throw new RuntimeException(e);}				
+}
 
 }
\ No newline at end of file

Modified: branches/hmm/components-ext/src/test/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner3Test.java
===================================================================
--- branches/hmm/components-ext/src/test/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner3Test.java	2012-10-30 16:10:36 UTC (rev 3861)
+++ branches/hmm/components-ext/src/test/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner3Test.java	2012-10-30 16:32:40 UTC (rev 3862)
@@ -4,6 +4,7 @@
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
@@ -12,9 +13,12 @@
 import java.io.PrintWriter;
 import java.io.Serializable;
 import java.io.UnsupportedEncodingException;
+import java.net.HttpURLConnection;
 import java.net.MalformedURLException;
+import java.net.URL;
 import java.net.URLDecoder;
 import java.text.DateFormat;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Date;
@@ -36,6 +40,7 @@
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
+import java.util.regex.Pattern;
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
@@ -44,6 +49,9 @@
 import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;
+import org.apache.commons.collections15.BidiMap;
+import org.apache.commons.collections15.bidimap.DualHashBidiMap;
+import org.apache.commons.lang.StringEscapeUtils;
 import org.apache.log4j.FileAppender;
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
@@ -69,7 +77,6 @@
 import org.w3c.dom.Element;
 import org.w3c.dom.NodeList;
 import org.xml.sax.SAXException;
-import cern.colt.Arrays;
 import com.hp.hpl.jena.query.QueryExecutionFactory;
 import com.hp.hpl.jena.query.QueryFactory;
 import com.hp.hpl.jena.query.QuerySolution;
@@ -99,25 +106,150 @@
 
 // problem mit "In/IN which/WDT films/NNS did/VBD Julia/NNP Roberts/NNP as/RB well/RB as/IN Richard/NNP Gere/NNP play/NN"
 public class SPARQLTemplateBasedLearner3Test
-{			
+{
+	private static final boolean	USE_HMM	= false;
 	private static final File evaluationFolder = new File("cache/evaluation");
 	private static final boolean	DBPEDIA_PRETAGGED	= true;
 	private static final boolean	OXFORD_PRETAGGED	= false;
-	private static final int MAX_NUMBER_OF_QUESTIONS = 10;
+	private static final int MAX_NUMBER_OF_QUESTIONS = 20;	
+	private static final boolean WHITELIST_ONLY = false;
+	private static final Set<Integer> WHITELIST = Collections.unmodifiableSet(new HashSet<Integer>(Arrays.asList(new Integer[] {4})));
 
-	@Test public void testDBpedia() throws Exception
+	/*@Test*/ public void testDBpedia() throws Exception
 	{
 		File file = generateTestDataIfNecessary(
 				new File(getClass().getClassLoader().getResource("tbsl/evaluation/qald2-dbpedia-train-tagged(ideal).xml").getFile()),
 				SparqlEndpoint.getEndpointDBpedia(),
 				dbpediaLiveCache);
-		test("QALD 2 Benchmark ideally tagged", file,SparqlEndpoint.getEndpointDBpedia(),dbpediaLiveCache,dbpediaLiveKnowledgebase,null,null);
+		test("QALD 2 Benchmark ideally tagged", file,SparqlEndpoint.getEndpointDBpedia(),dbpediaLiveCache,dbpediaLiveKnowledgebase,null,null,DBPEDIA_PRETAGGED);
 	}
 
-	/*@Test*/ public void testOxford() throws Exception
+	//	private char[] hmmHtmlRow(String question, String string, String string2, String string3, Set<String> set, Set<String> set2,
+	//			Set<String> set3, LearnStatus learnStatus, LearnStatus learnStatus2)
+	//	{
+	//		return null;
+	//	}
+
+	private static boolean probablySparqlSelectQuery(String s)
 	{
+		final String lowerCased = s.toLowerCase(); // heuristic only: keyword and both braces may appear anywhere, in any case
+		return lowerCased.contains("select")&&lowerCased.contains("{")&&lowerCased.contains("}");
+	}
+
+	/** Returns an html table row &lt;tr&gt;&lt;td&gt;t(o_1)&lt;/td&gt;...&lt;td&gt;t(o_n)&lt;/td&gt;&lt;/tr&gt; built from the string representations of the given objects.
+	 * Each cell is HTML-escaped; a cell that looks like a SPARQL select query is wrapped in &lt;pre&gt;, in any other cell dbpedia resource URIs are shortened and linked.
+	 * @param tds the objects to render, one table cell each; a null object yields an empty cell
+	 * @return the row markup including the enclosing &lt;tr&gt; element */
+	// unescaped form from the top: <tr><td>t(o_1)</td>...<td>t(o_n)</td></tr>
+	private static String htmlTableTr(Object... tds)
+	{
+		StringBuilder sb = new StringBuilder("<tr>"); // open the row element; it is closed on return		
+		// shorten and link dbpedia resource uris
+		Pattern p = Pattern.compile("http://dbpedia\\.org/resource/([\\w]*)");		
+
+		for(Object td: tds)
+		{
+			if(td==null) {sb.append("<td></td>");continue;}
+			sb.append("<td>");
+
+			// probably a SPARQL query? use fixed font width.
+			String s =  StringEscapeUtils.escapeHtml(td.toString());			 
+			if(probablySparqlSelectQuery(s)) {s="<pre>"+s+"</pre>";}
+			else {s =(p.matcher(s).replaceAll("<a href=\"$0\">dbpedia:$1</a>"));}
+
+			sb.append(s);
+			sb.append("</td>");
+		}
+		return sb.append("</tr>").toString();
+	}	
+
+	/** Compares the newest stored evaluation made with the HMM against the newest one made without it and writes the
+	 * questions common to both that only one of the two answered correctly as an HTML table to log/evaluatehmm.html.
+	 * Logs a warning and returns without writing the file if either evaluation is missing or they share no question.
+	 * @throws FileNotFoundException if log/evaluatehmm.html cannot be opened for writing */
+	@SuppressWarnings("unchecked") /*@Test*/ public void evaluateHMMAgainstNormalAndBenchmark() throws FileNotFoundException
+	{		
+		// get the newest evaluations from both with and without hmm (the reversed order iterates the newest key first)
+		SortedMap<Long,Evaluation> evaluations = new TreeMap<Long,Evaluation>(Collections.reverseOrder());
+		evaluations.putAll(Evaluation.read());
+		Evaluation newestWithHmm = null;
+		Evaluation newestWithoutHmm = null;
+
+		for(Iterator<Long> it = evaluations.keySet().iterator();it.hasNext()&&(newestWithHmm==null||newestWithoutHmm==null);)
+		{
+			Evaluation e = evaluations.get(it.next());
+			if(e.testData.hmm)
+			{if(newestWithHmm==null) {newestWithHmm=e;}}			
+			else if(newestWithoutHmm==null) {newestWithoutHmm=e;}
+		}
+		if(newestWithHmm==null||newestWithoutHmm==null) {logger.warn("No pair of evaluations with and without HMM found. Aborting.");return;}
+
+		Set<String> intersectionOfQuestions = new HashSet<String>(newestWithHmm.testData.id2Question.values());
+		intersectionOfQuestions.retainAll(newestWithoutHmm.testData.id2Question.values());
+		if(intersectionOfQuestions.isEmpty()) {logger.warn("No common questions. Aborting.");return;}
+
+		Set<String> questionsOnlyCorrectWithHMM = new HashSet<String> (intersectionOfQuestions);
+		questionsOnlyCorrectWithHMM.retainAll(newestWithHmm.correctlyAnsweredQuestions);
+		questionsOnlyCorrectWithHMM.removeAll(newestWithoutHmm.correctlyAnsweredQuestions);		
+
+		Set<String> questionsOnlyCorrectWithoutHMM = new HashSet<String> (intersectionOfQuestions);
+		questionsOnlyCorrectWithoutHMM.retainAll(newestWithoutHmm.correctlyAnsweredQuestions);
+		questionsOnlyCorrectWithoutHMM.removeAll(newestWithHmm.correctlyAnsweredQuestions);
+
+		PrintWriter out = new PrintWriter("log/evaluatehmm.html");
+		try
+		{
+			String title = "Evaluation of HMM vs the normal disambiguation.";
+			out.println("<!DOCTYPE html><html>\n<head><title>"+title+"</title></head>\n<body>\n<table border='1'>");							
+			out.println("<tr><th>Question</th><th>Query with HMM</th><th>Query without HMM</th><th>Reference Query</th>" +
+					"<th>Answers with HMM</th><th>Answers without HMM</th><th>Reference Answers</th><th>Status with HMM</th><th>Status without HMM</th></tr>");
+
+			// most of the time it should be enough to assume that the keys are equal, but this could introduce subtle bugs  
+			BidiMap<String,Integer> question2IdWithHmm = new DualHashBidiMap<Integer,String>(newestWithHmm.testData.id2Question).inverseBidiMap();
+			BidiMap<String,Integer> question2IdWithoutHmm = new DualHashBidiMap<Integer,String>(newestWithoutHmm.testData.id2Question).inverseBidiMap();
+			//		if(newestWithHmm.correctlyAnsweredQuestions.contains(question)!=newestWithoutHmm.correctlyAnsweredQuestions.contains(question)) {..}
+
+			for(Set<String> c : new Set[] {questionsOnlyCorrectWithHMM, questionsOnlyCorrectWithoutHMM})
+			{
+				for(String question: c)
+				{
+					int idWithHmm = question2IdWithHmm.get(question);
+					int idWithoutHmm = question2IdWithoutHmm.get(question);						
+					out.println(htmlTableTr(
+							question,
+							newestWithHmm.testData.id2Query.get(idWithHmm),
+							newestWithoutHmm.testData.id2Query.get(idWithoutHmm),
+							newestWithHmm.referenceData.id2Query.get(idWithHmm),
+							newestWithHmm.testData.id2Answers.get(idWithHmm),
+							newestWithoutHmm.testData.id2Answers.get(idWithoutHmm),
+							newestWithHmm.referenceData.id2Answers.get(idWithHmm),
+							newestWithHmm.testData.id2LearnStatus.get(idWithHmm),
+							newestWithoutHmm.testData.id2LearnStatus.get(idWithoutHmm)
+							));
+				}
+			}	
+			// terminate the elements opened by the first println so that the document is complete
+			out.println("</table>\n</body>\n</html>");
+		}
+		finally {out.close();} // release the file even if building a row fails
+
+		//		Integer id = question2Id.get(question);
+		//		if(evaluation.testData.id2Answers.get(id)==null) {System.err.println(question);continue;}
+		//		out.println(
+		//				"<tr><td>"+question+"</td>"+
+		//						"<td><code><pre>"+escapePre(evaluation.testData.id2Query.get(id))+"</pre></code></td>"+
+		//						"<td><code><pre>"+escapePre(evaluation.referenceData.id2Query.get(id))+"</pre></code></td>"+
+		//						"<td><ul>"+getAnswerHTMLList(evaluation.testData.id2Answers.get(id).toArray(new String[0]))+"</ul></td>"+
+		//						"<td><ul>"+getAnswerHTMLList(evaluation.referenceData.id2Answers.get(id).toArray(new String[0]))+"</ul></td>"+
+		//						"<td>"+evaluation.testData.id2LearnStatus.get(id)+"</td></tr>");					
+		logger.info(questionsOnlyCorrectWithHMM.size()+" questions only correct with hmm, "+
+				questionsOnlyCorrectWithoutHMM.size()+" questions only correct without hmm");
+	}
+
+	@Test public void testOxford() throws Exception
+	{
 		File file = new File(getClass().getClassLoader().getResource("tbsl/evaluation/oxford_working_questions.xml").getFile());
-		test("Oxford 19 working questions", file,null,null,null,loadOxfordModel(),getOxfordMappingIndex());
+		test("Oxford 19 working questions", file,null,null,null,loadOxfordModel(),getOxfordMappingIndex(),OXFORD_PRETAGGED);
 	}
 
 	//	/*@Test*/ public void testOxford() throws Exception
@@ -180,7 +312,7 @@
 		logger.info("learned query: "+testData.id2Query.get(0));
 	}
 
-	/*@Test*/  public void generateXMLOxford() throws IOException
+	/*@Test*/  @SuppressWarnings("null") public void generateXMLOxford() throws IOException
 	{
 		boolean ADD_POS_TAGS = true;
 		PartOfSpeechTagger posTagger = null;
@@ -289,10 +421,10 @@
 		}
 	}
 
-	public void test(String title, final File referenceXML,final  SparqlEndpoint endpoint,ExtractionDBCache cache,Knowledgebase kb, Model model, MappingBasedIndex index)
+	public void test(String title, final File referenceXML,final  SparqlEndpoint endpoint,ExtractionDBCache cache,Knowledgebase kb, Model model, MappingBasedIndex index,boolean pretagged)
 			throws ParserConfigurationException, SAXException, IOException, TransformerException, ComponentInitException, NoTemplateFoundException
 			{		
-		evaluateAndWrite(title,referenceXML,endpoint,cache,kb,model,index);
+		evaluateAndWrite(title,referenceXML,endpoint,cache,kb,model,index,pretagged);
 		generateHTML(title); 
 
 		//				if(evaluation.numberOfCorrectAnswers<3) {fail("only " + evaluation.numberOfCorrectAnswers+" correct answers.");}
@@ -335,14 +467,14 @@
 	}
 
 	private void evaluateAndWrite(String title,final File updatedReferenceXML, final  SparqlEndpoint endpoint,ExtractionDBCache cache,
-			Knowledgebase kb, Model model, MappingBasedIndex index)
+			Knowledgebase kb, Model model, MappingBasedIndex index,boolean pretagged)
 	{
 
-		QueryTestData referenceTestData = QueryTestData.readQaldXml(updatedReferenceXML,MAX_NUMBER_OF_QUESTIONS);
+		QueryTestData referenceTestData = QueryTestData.readQaldXml(updatedReferenceXML,MAX_NUMBER_OF_QUESTIONS,WHITELIST_ONLY,WHITELIST);
 		logger.info(title+" subset loaded with "+referenceTestData.id2Question.size()+" questions.");
 
 		long startLearning = System.currentTimeMillis();
-		QueryTestData learnedTestData = generateTestDataMultiThreaded(referenceTestData.id2Question, kb,model,index,DBPEDIA_PRETAGGED);
+		QueryTestData learnedTestData = generateTestDataMultiThreaded(referenceTestData.id2Question, kb,model,index,pretagged);
 		long endLearning = System.currentTimeMillis();
 		logger.info("finished learning after "+(endLearning-startLearning)/1000.0+"s");
 		learnedTestData.generateAnswers(endpoint,cache,model);
@@ -376,6 +508,17 @@
 			String referenceQuery = reference.id2Query.get(i);
 			String suspectQuery = suspect.id2Query.get(i);
 			// reference is required to contain answers for every key so we shouldn't get NPEs here (even though it could be the empty set but that shouldn't happen because only questions with nonempty answers are included in the updated reference)
+			if(reference.id2Answers.get(i)==null)
+			{
+				logger.warn("no reference answers for question "+i+" ("+question+")");
+				continue;
+			}
+			if(suspect.id2Answers.get(i)==null)
+			{
+				logger.warn("no suspect answers for question "+i+" ("+question+")");
+				continue;
+			}			
+			
 			if(referenceQuery.equals(suspectQuery)||reference.id2Answers.get(i).equals(suspect.id2Answers.get(i)))
 			{
 				evaluation.correctlyAnsweredQuestions.add(question);
@@ -456,9 +599,12 @@
 			} catch(IOException e) {throw new RuntimeException(e);}
 		}
 
+		/**
+		 * @return the evaluations by timestamp, sorted ascending (from oldest to newest)
+		 */
 		public static SortedMap<Long,Evaluation> read()
 		{
-			SortedMap<Long,Evaluation> evaluations = new ConcurrentSkipListMap<Long,Evaluation>();
+			SortedMap<Long,Evaluation> evaluations = new ConcurrentSkipListMap<Long,Evaluation>();			
 			evaluationFolder.mkdirs();
 			File[] files = evaluationFolder.listFiles();		
 			for(int i=0;i<files.length;i++) {evaluations.put(Long.valueOf(files[i].getName()),read(files[i]));}
@@ -603,6 +749,7 @@
 	private QueryTestData generateTestDataMultiThreaded(SortedMap<Integer, String> id2Question,Knowledgebase kb,Model model, MappingBasedIndex index,boolean pretagged)
 	{
 		QueryTestData testData = new QueryTestData();
+		testData.hmm = USE_HMM;
 		// -- only create the learner parameters once to save time -- 
 		//		PartOfSpeechTagger posTagger = new StanfordPartOfSpeechTagger();		
 		//		WordNet wordnet = new WordNet();
@@ -792,15 +939,32 @@
 
 	//	private ResultSet executeDBpediaLiveSelect(String query){return SparqlQuery.convertJSONtoResultSet(dbpediaLiveCache.executeSelectQuery(dbpediaLiveEndpoint, query));}
 
+	/** Sends a HEAD request to the given url and returns whether the response code is 200; the connection is disconnected afterwards. */
+	private static boolean httpResponseOK(String url) throws MalformedURLException, IOException
+	{		
+		HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
+		try {connection.setRequestMethod("HEAD");return connection.getResponseCode() == 200;}
+		finally {connection.disconnect();} // always give the connection back, also when the request throws
+	}
 
 	private static Knowledgebase createDBpediaLiveKnowledgebase(ExtractionDBCache cache)
 	{		
-		SOLRIndex resourcesIndex = new SOLRIndex("http://dbpedia.aksw.org:8080/solr/dbpedia_resources");
+		String resourcesURL = "http://dbpedia.aksw.org:8080/solr/dbpedia_resources123";
+		String classesURL = "http://dbpedia.aksw.org:8080/solr/dbpedia_classes";
+		String propertiesURL = "http://dbpedia.aksw.org:8080/solr/dbpedia_properties";
+		String boaPropertiesURL = "http://139.18.2.173:8080/solr/boa_fact_detail";
+		
+//		for(String url : new String[] {resourcesURL,classesURL,propertiesURL,boaPropertiesURL})
+//		{
+//			try{if(!httpResponseOK(url)) throw new RuntimeException("Http response not 200 for url "+url);} catch(Exception e) {throw new RuntimeException(e);}
+//		}
+		
+		SOLRIndex resourcesIndex = new SOLRIndex(resourcesURL);
 		resourcesIndex.setPrimarySearchField("label");
 		//			resourcesIndex.setSortField("pagerank");
-		Index classesIndex = new SOLRIndex("http://dbpedia.aksw.org:8080/solr/dbpedia_classes");
-		Index propertiesIndex = new SOLRIndex("http://dbpedia.aksw.org:8080/solr/dbpedia_properties");
-		SOLRIndex boa_propertiesIndex = new SOLRIndex("http://139.18.2.173:8080/solr/boa_fact_detail");
+		Index classesIndex = new SOLRIndex(classesURL);
+		Index propertiesIndex = new SOLRIndex(propertiesURL);
+		SOLRIndex boa_propertiesIndex = new SOLRIndex(boaPropertiesURL);
 		boa_propertiesIndex.setSortField("boa-score");
 		//		propertiesIndex = new HierarchicalIndex(boa_propertiesIndex, propertiesIndex);
 		MappingBasedIndex mappingIndex= new MappingBasedIndex(
@@ -820,11 +984,13 @@
 		Logger.getRootLogger().setLevel(Level.WARN);
 		Logger.getLogger(Templator.class).setLevel(Level.WARN);
 		Logger.getLogger(Parser.class).setLevel(Level.WARN);
-		Logger.getLogger(SPARQLTemplateBasedLearner2.class).setLevel(Level.WARN);
+		Logger.getLogger(SPARQLTemplateBasedLearner2.class).setLevel(Level.DEBUG);
 		//		Logger.getLogger(SPARQLTemplateBasedLearner2.class).setLevel(Level.INFO);
 		logger.setLevel(Level.INFO); // TODO: remove when finishing implementation of this class
 		logger.addAppender(new FileAppender(new SimpleLayout(), "log/"+this.getClass().getSimpleName()+".log", false));
 
+		//		Logger.getRootLogger().removeAllAppenders();
+
 		//		oxfordEndpoint = new SparqlEndpoint(new URL("http://lgd.aksw.org:8900/sparql"), Collections.singletonList("http://diadem.cs.ox.ac.uk"), Collections.<String>emptyList());		
 		//		oxfordLearner = new SPARQLTemplateBasedLearner2(createOxfordKnowledgebase(oxfordCache));
 	}
@@ -846,8 +1012,9 @@
 		}
 		catch(QueryExceptionHTTP e)
 		{
-			logger.error("Error getting uris for query "+query+" at endpoint "+endpoint,e);
-			return Collections.<String>emptySet();
+			throw new QueryExceptionHTTP("Error getting uris for query "+query+" at endpoint "+endpoint,e);
+			//			logger.error("Error getting uris for query "+query+" at endpoint "+endpoint,e);
+			//			return Collections.<String>emptySet();
 		}
 		String variable = "?uri";
 		resultsetloop:
@@ -976,14 +1143,13 @@
 
 		@Override public LearnStatus call()
 		{
-
 			logger.trace("learning question: "+question);					
 			try
 			{			
 				// learn query
 
 				learner.setQuestion(question);						
-				learner.learnSPARQLQueries();						
+				learner.learnSPARQLQueries(USE_HMM);						
 				String learnedQuery = learner.getBestSPARQLQuery();
 				testData.id2Question.put(id, question);
 				if(learnedQuery!=null&&!learnedQuery.isEmpty())
@@ -995,6 +1161,11 @@
 
 				//						Set<String> learnedURIs = getUris(DBPEDIA_LIVE_ENDPOINT_URL_STRING,learnedQuery);
 			}
+			catch(AssertionError e )
+			{
+				// this is the only exception that we want to halt on
+				throw new RuntimeException(e);
+			}
 			catch(NoTemplateFoundException e)
 			{		
 				logger.warn(String.format("no template found for question \"%s\"",question));
@@ -1070,7 +1241,7 @@
 			out.println("<!DOCTYPE html><html>\n<head><title>"+title+"</title></head>\n<body>\n<table border='1'>");
 			if(queriesAvailable)
 			{				
-				out.println("<tr><th>Question</th><th>Learned Query</th><th>Reference Query</th><th>Learned Answers</th><th>Reference Answers</th></tr>");
+				out.println("<tr><th>Question</th><th>Learned Query</th><th>Reference Query</th><th>Learned Answers</th><th>Reference Answers</th><th>Error Type</th></tr>");
 				for(String question: questions)
 				{
 					Integer id = question2Id.get(question);
@@ -1080,8 +1251,9 @@
 									"<td><code><pre>"+escapePre(evaluation.testData.id2Query.get(id))+"</pre></code></td>"+
 									"<td><code><pre>"+escapePre(evaluation.referenceData.id2Query.get(id))+"</pre></code></td>"+
 									"<td><ul>"+getAnswerHTMLList(evaluation.testData.id2Answers.get(id).toArray(new String[0]))+"</ul></td>"+
-									"<td><ul>"+getAnswerHTMLList(evaluation.referenceData.id2Answers.get(id).toArray(new String[0]))+"</ul></td></tr>");					
-				}								
+									"<td><ul>"+getAnswerHTMLList(evaluation.referenceData.id2Answers.get(id).toArray(new String[0]))+"</ul></td>"+
+									"<td>"+evaluation.testData.id2LearnStatus.get(id)+"</td></tr>");					
+				}
 			} else
 			{				
 				out.println("<tr><th>Question</th><th>Error Type</th></tr>");

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.