From: <lor...@us...> - 2013-09-09 10:43:49
Revision: 4102
          http://sourceforge.net/p/dl-learner/code/4102
Author:   lorenz_b
Date:     2013-09-09 10:43:46 +0000 (Mon, 09 Sep 2013)

Log Message:
-----------
Added class to compute the cosine similarity for 2 documents using the Lucene API.

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java

Added Paths:
-----------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/VSMCosineDocumentSimilarity.java

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java	2013-09-09 10:18:57 UTC (rev 4101)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java	2013-09-09 10:43:46 UTC (rev 4102)
@@ -57,7 +57,7 @@
 	@ConfigOption(name = "startNodeBonus", defaultValue="0.1")
 	private double startNodeBonus = 0.1;
 	
-	private double nlpBonusFactor = 0.0001;
+	private double nlpBonusFactor = 1;
 	
 	private Map<Entity, Double> entityRelevance;

Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/VSMCosineDocumentSimilarity.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/VSMCosineDocumentSimilarity.java	                        (rev 0)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/VSMCosineDocumentSimilarity.java	2013-09-09 10:43:46 UTC (rev 4102)
@@ -0,0 +1,238 @@
+/**
+ * 
+ */
+package org.dllearner.algorithms.isle;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.math3.linear.ArrayRealVector;
+import org.apache.commons.math3.linear.RealVector;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.SimpleAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Version;
+
+/**
+ * Imagine an N-dimensional space where N is the number of unique words in a pair of texts. Each of the two texts
+ * can be treated like a vector in this N-dimensional space. The distance between the two vectors is an indication
+ * of the similarity of the two texts. The cosine of the angle between the two vectors is the most common similarity measure.
+ * @author Lorenz Buehmann
+ *
+ */
+public class VSMCosineDocumentSimilarity {
+	
+	enum TermWeighting {
+		TF, TF_IDF
+	}
+	
+	public static final String CONTENT = "Content";
+	public static final FieldType TYPE_STORED = new FieldType();
+	
+	private final Set<String> terms = new HashSet<String>();
+	private final RealVector v1;
+	private final RealVector v2;
+	
+	static {
+		TYPE_STORED.setIndexed(true);
+		TYPE_STORED.setTokenized(true);
+		TYPE_STORED.setStored(true);
+		TYPE_STORED.setStoreTermVectors(true);
+		TYPE_STORED.setStoreTermVectorPositions(true);
+		TYPE_STORED.freeze();
+	}
+	
+	public VSMCosineDocumentSimilarity(String s1, String s2, TermWeighting termWeighting) throws IOException {
+		//create the index
+		Directory directory = createIndex(s1, s2);
+		IndexReader reader = DirectoryReader.open(directory);
+		//generate the document vectors
+		if(termWeighting == TermWeighting.TF){//based on term frequency only
+			//compute the term frequencies for document 1
+			Map<String, Integer> f1 = getTermFrequencies(reader, 0);
+			//compute the term frequencies for document 2
+			Map<String, Integer> f2 = getTermFrequencies(reader, 1);
+			reader.close();
+			//map both documents to vector objects
+			v1 = getTermVectorInteger(f1);
+			v2 = getTermVectorInteger(f2);
+		} else if(termWeighting == TermWeighting.TF_IDF){//based on tf*idf weighting
+			//compute the term weights for document 1
+			Map<String, Double> f1 = getTermWeights(reader, 0);
+			//compute the term weights for document 2
+			Map<String, Double> f2 = getTermWeights(reader, 1);
+			reader.close();
+			//map both documents to vector objects
+			v1 = getTermVectorDouble(f1);
+			v2 = getTermVectorDouble(f2);
+		} else {
+			v1 = null;
+			v2 = null;
+		}
+	}
+	
+	public VSMCosineDocumentSimilarity(String s1, String s2) throws IOException {
+		this(s1, s2, TermWeighting.TF_IDF);
+	}
+	
+	/**
+	 * Returns the cosine document similarity between document {@code doc1} and {@code doc2} using TF-IDF as weighting for each term.
+	 * The resulting similarity ranges from -1 meaning exactly opposite, to 1 meaning exactly the same,
+	 * with 0 usually indicating independence, and in-between values indicating intermediate similarity or dissimilarity.
+	 * @param doc1 the first document
+	 * @param doc2 the second document
+	 * @return the cosine similarity
+	 * @throws IOException
+	 */
+	public static double getCosineSimilarity(String doc1, String doc2)
+			throws IOException {
+		return new VSMCosineDocumentSimilarity(doc1, doc2).getCosineSimilarity();
+	}
+	
+	/**
+	 * Returns the cosine document similarity between document {@code doc1} and {@code doc2} based on {@code termWeighting} to compute the weight
+	 * for each term in the documents.
+	 * The resulting similarity ranges from -1 meaning exactly opposite, to 1 meaning exactly the same,
+	 * with 0 usually indicating independence, and in-between values indicating intermediate similarity or dissimilarity.
+	 * @param doc1 the first document
+	 * @param doc2 the second document
+	 * @param termWeighting the term weighting strategy
+	 * @return the cosine similarity
+	 * @throws IOException
+	 */
+	public static double getCosineSimilarity(String doc1, String doc2, TermWeighting termWeighting)
+			throws IOException {
+		return new VSMCosineDocumentSimilarity(doc1, doc2, termWeighting).getCosineSimilarity();
+	}
+	
+	/**
+	 * Create an in-memory Lucene index for both documents.
+	 * @param s1 the first document
+	 * @param s2 the second document
+	 * @return the directory containing the index
+	 * @throws IOException
+	 */
+	private Directory createIndex(String s1, String s2) throws IOException {
+		Directory directory = new RAMDirectory();
+		Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_43);
+		IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);
+		IndexWriter writer = new IndexWriter(directory, iwc);
+		addDocument(writer, s1);
+		addDocument(writer, s2);
+		writer.close();
+		return directory;
+	}
+	
+	/**
+	 * Add the document to the Lucene index.
+	 * @param writer the index writer
+	 * @param content the document content
+	 * @throws IOException
+	 */
+	private void addDocument(IndexWriter writer, String content) throws IOException {
+		Document doc = new Document();
+		Field field = new Field(CONTENT, content, TYPE_STORED);
+		doc.add(field);
+		writer.addDocument(doc);
+	}
+	
+	/**
+	 * Get the frequency of each term contained in the document.
+	 * @param reader the index reader
+	 * @param docId the document ID
+	 * @return a map from term to its frequency in the document
+	 * @throws IOException
+	 */
+	private Map<String, Integer> getTermFrequencies(IndexReader reader, int docId)
+			throws IOException {
+		Terms vector = reader.getTermVector(docId, CONTENT);
+		TermsEnum termsEnum = vector.iterator(null);
+		Map<String, Integer> frequencies = new HashMap<String, Integer>();
+		BytesRef text = null;
+		while ((text = termsEnum.next()) != null) {
+			String term = text.utf8ToString();
+			int freq = (int) termsEnum.totalTermFreq();
+			frequencies.put(term, freq);
+			terms.add(term);
+		}
+		return frequencies;
+	}
+	
+	/**
+	 * Get the weight (tf*idf) of each term contained in the document.
+	 * @param reader the index reader
+	 * @param docId the document ID
+	 * @return a map from term to its tf*idf weight in the document
+	 * @throws IOException
+	 */
+	private Map<String, Double> getTermWeights(IndexReader reader, int docId)
+			throws IOException {
+		Terms vector = reader.getTermVector(docId, CONTENT);
+		TermsEnum termsEnum = vector.iterator(null);
+		Map<String, Double> weights = new HashMap<String, Double>();
+		BytesRef text = null;
+		while ((text = termsEnum.next()) != null) {
+			String term = text.utf8ToString();
+			//get the term frequency
+			int tf = (int) termsEnum.totalTermFreq();
+			//get the document frequency
+			int df = reader.docFreq(new Term(CONTENT, text));
+			//compute the inverse document frequency
+			double idf = getIDF(reader.numDocs(), df);
+			//compute tf*idf
+			double weight = tf * idf;
+			
+			weights.put(term, weight);
+			terms.add(term);
+		}
+		return weights;
+	}
+	
+	private double getIDF(int totalNumberOfDocuments, int documentFrequency){
+		//cast to double to avoid integer division truncating the ratio
+		return 1 + Math.log((double) totalNumberOfDocuments / documentFrequency);
+	}
+	
+	private double getCosineSimilarity() {
+		return (v1.dotProduct(v2)) / (v1.getNorm() * v2.getNorm());
+	}
+	
+	private RealVector getTermVectorInteger(Map<String, Integer> map) {
+		RealVector vector = new ArrayRealVector(terms.size());
+		int i = 0;
+		for (String term : terms) {
+			int value = map.containsKey(term) ? map.get(term) : 0;
+			vector.setEntry(i++, value);
+		}
+		return vector.mapDivide(vector.getL1Norm());
+	}
+	
+	private RealVector getTermVectorDouble(Map<String, Double> map) {
+		RealVector vector = new ArrayRealVector(terms.size());
+		int i = 0;
+		for (String term : terms) {
+			double value = map.containsKey(term) ?
+					map.get(term) : 0d;
+			vector.setEntry(i++, value);
+		}
+		return vector.mapDivide(vector.getL1Norm());
+	}
+	
+	public static void main(String[] args) throws Exception {
+		double cosineSimilarity = VSMCosineDocumentSimilarity.getCosineSimilarity("The king is here", "The salad is cold");
+		System.out.println(cosineSimilarity);
+	}
+
+}
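
For readers who want to try the new class, here is a minimal usage sketch. The demo class name is hypothetical (not part of the commit), and it sits in the org.dllearner.algorithms.isle package because the TermWeighting enum is package-private; only getCosineSimilarity and TermWeighting themselves come from the code above.

    package org.dllearner.algorithms.isle;

    //hypothetical demo class, not part of the commit; same package as
    //VSMCosineDocumentSimilarity so the package-private TermWeighting enum is visible
    public class CosineSimilarityDemo {

        public static void main(String[] args) throws Exception {
            String doc1 = "The king is here";
            String doc2 = "The salad is cold";

            //the two-argument overload defaults to TF-IDF weighting
            double tfIdf = VSMCosineDocumentSimilarity.getCosineSimilarity(doc1, doc2);

            //explicit term-frequency weighting for comparison
            double tf = VSMCosineDocumentSimilarity.getCosineSimilarity(doc1, doc2,
                    VSMCosineDocumentSimilarity.TermWeighting.TF);

            //the sentences share "the" and "is", so both scores should lie
            //strictly between 0 (no shared terms) and 1 (identical documents)
            System.out.println("tf-idf: " + tfIdf + ", tf: " + tf);
        }
    }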
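
To make the arithmetic concrete without Lucene, the following self-contained sketch (hypothetical, not part of the commit) reproduces the same computation that getTermWeights and getCosineSimilarity perform above: each term gets the weight tf * (1 + ln(N/df)), and the similarity is dot(v1, v2) / (|v1| * |v2|).

    import java.util.Arrays;
    import java.util.LinkedHashSet;
    import java.util.List;
    import java.util.Set;

    //hypothetical, dependency-free illustration of the arithmetic in
    //VSMCosineDocumentSimilarity; not part of the commit
    public class CosineByHand {

        public static void main(String[] args) {
            List<String> d1 = Arrays.asList("the", "king", "is", "here");
            List<String> d2 = Arrays.asList("the", "salad", "is", "cold");

            //the union of terms fixes the vector dimensions, like the terms field above
            Set<String> vocab = new LinkedHashSet<String>(d1);
            vocab.addAll(d2);

            double[] v1 = new double[vocab.size()];
            double[] v2 = new double[vocab.size()];
            int i = 0;
            for (String term : vocab) {
                int tf1 = frequency(d1, term);
                int tf2 = frequency(d2, term);
                //document frequency over the 2-document collection; idf = 1 + ln(N/df)
                int df = (tf1 > 0 ? 1 : 0) + (tf2 > 0 ? 1 : 0);
                double idf = 1 + Math.log(2.0 / df);
                v1[i] = tf1 * idf;
                v2[i] = tf2 * idf;
                i++;
            }

            //cosine similarity: dot product over the product of Euclidean norms
            double dot = 0, n1 = 0, n2 = 0;
            for (int j = 0; j < v1.length; j++) {
                dot += v1[j] * v2[j];
                n1 += v1[j] * v1[j];
                n2 += v2[j] * v2[j];
            }
            System.out.println(dot / (Math.sqrt(n1) * Math.sqrt(n2)));
        }

        private static int frequency(List<String> doc, String term) {
            int n = 0;
            for (String t : doc) {
                if (t.equals(term)) n++;
            }
            return n;
        }
    }

One caveat worth noting: because TF and TF-IDF weights are non-negative, the dot product can never be negative, so despite the generic [-1, 1] range quoted in the Javadoc, scores from this class always land in [0, 1], with 0 for documents that share no terms.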