From: <lor...@us...> - 2013-09-09 10:43:49
Revision: 4102
          http://sourceforge.net/p/dl-learner/code/4102
Author:   lorenz_b
Date:     2013-09-09 10:43:46 +0000 (Mon, 09 Sep 2013)

Log Message:
-----------
Added class to compute the cosine similarity for 2 documents using the Lucene API.

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java

Added Paths:
-----------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/VSMCosineDocumentSimilarity.java

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java	2013-09-09 10:18:57 UTC (rev 4101)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java	2013-09-09 10:43:46 UTC (rev 4102)
@@ -57,7 +57,7 @@
 	@ConfigOption(name = "startNodeBonus", defaultValue="0.1")
 	private double startNodeBonus = 0.1;
 	
-	private double nlpBonusFactor = 0.0001;
+	private double nlpBonusFactor = 1;
 	
 	private Map<Entity, Double> entityRelevance;

Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/VSMCosineDocumentSimilarity.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/VSMCosineDocumentSimilarity.java	                        (rev 0)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/VSMCosineDocumentSimilarity.java	2013-09-09 10:43:46 UTC (rev 4102)
@@ -0,0 +1,238 @@
+/**
+ * 
+ */
+package org.dllearner.algorithms.isle;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.math3.linear.ArrayRealVector;
+import org.apache.commons.math3.linear.RealVector;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.SimpleAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Version;
+
+/**
+ * Imagine an N-dimensional space where N is the number of unique words in a pair of texts. Each of the two texts
+ * can be treated like a vector in this N-dimensional space. The distance between the two vectors is an indication
+ * of the similarity of the two texts. The cosine of the angle between the two vectors is the most common similarity measure.
+ * @author Lorenz Buehmann
+ *
+ */
+public class VSMCosineDocumentSimilarity {
+	
+	enum TermWeighting {
+		TF, TF_IDF
+	}
+	
+	public static final String CONTENT = "Content";
+	public static final FieldType TYPE_STORED = new FieldType();
+	
+	private final Set<String> terms = new HashSet<String>();
+	private final RealVector v1;
+	private final RealVector v2;
+	
+	static {
+		TYPE_STORED.setIndexed(true);
+		TYPE_STORED.setTokenized(true);
+		TYPE_STORED.setStored(true);
+		TYPE_STORED.setStoreTermVectors(true);
+		TYPE_STORED.setStoreTermVectorPositions(true);
+		TYPE_STORED.freeze();
+	}
+	
+	public VSMCosineDocumentSimilarity(String s1, String s2, TermWeighting termWeighting) throws IOException {
+		//create the index
+		Directory directory = createIndex(s1, s2);
+		IndexReader reader = DirectoryReader.open(directory);
+		//generate the document vectors
+		if(termWeighting == TermWeighting.TF){//based on term frequency only
+			//compute the term frequencies for document 1
+			Map<String, Integer> f1 = getTermFrequencies(reader, 0);
+			//compute the term frequencies for document 2
+			Map<String, Integer> f2 = getTermFrequencies(reader, 1);
+			reader.close();
+			//map both documents to vector objects
+			v1 = getTermVectorInteger(f1);
+			v2 = getTermVectorInteger(f2);
+		} else if(termWeighting == TermWeighting.TF_IDF){//based on tf*idf weighting
+			//compute the term weights for document 1
+			Map<String, Double> f1 = getTermWeights(reader, 0);
+			//compute the term weights for document 2
+			Map<String, Double> f2 = getTermWeights(reader, 1);
+			reader.close();
+			//map both documents to vector objects
+			v1 = getTermVectorDouble(f1);
+			v2 = getTermVectorDouble(f2);
+		} else {
+			v1 = null;
+			v2 = null;
+		}
+	}
+	
+	public VSMCosineDocumentSimilarity(String s1, String s2) throws IOException {
+		this(s1, s2, TermWeighting.TF_IDF);
+	}
+	
+	/**
+	 * Returns the cosine document similarity between document {@code doc1} and {@code doc2} using TF-IDF as weighting for each term.
+	 * The resulting similarity ranges from -1 meaning exactly opposite, to 1 meaning exactly the same,
+	 * with 0 usually indicating independence, and in-between values indicating intermediate similarity or dissimilarity.
+	 * @param doc1 the first document
+	 * @param doc2 the second document
+	 * @return the cosine similarity
+	 * @throws IOException
+	 */
+	public static double getCosineSimilarity(String doc1, String doc2)
+			throws IOException {
+		return new VSMCosineDocumentSimilarity(doc1, doc2).getCosineSimilarity();
+	}
+	
+	/**
+	 * Returns the cosine document similarity between document {@code doc1} and {@code doc2} based on {@code termWeighting} to compute the weight
+	 * for each term in the documents.
+	 * The resulting similarity ranges from -1 meaning exactly opposite, to 1 meaning exactly the same,
+	 * with 0 usually indicating independence, and in-between values indicating intermediate similarity or dissimilarity.
+	 * @param doc1 the first document
+	 * @param doc2 the second document
+	 * @param termWeighting the term weighting strategy
+	 * @return the cosine similarity
+	 * @throws IOException
+	 */
+	public static double getCosineSimilarity(String doc1, String doc2, TermWeighting termWeighting)
+			throws IOException {
+		return new VSMCosineDocumentSimilarity(doc1, doc2, termWeighting).getCosineSimilarity();
+	}
+	
+	/**
+	 * Create an in-memory Lucene index for both documents.
+	 * @param s1 the first document
+	 * @param s2 the second document
+	 * @return the directory containing the index
+	 * @throws IOException
+	 */
+	private Directory createIndex(String s1, String s2) throws IOException {
+		Directory directory = new RAMDirectory();
+		Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_43);
+		IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);
+		IndexWriter writer = new IndexWriter(directory, iwc);
+		addDocument(writer, s1);
+		addDocument(writer, s2);
+		writer.close();
+		return directory;
+	}
+	
+	/**
+	 * Add the document to the Lucene index.
+	 * @param writer the index writer
+	 * @param content the document content
+	 * @throws IOException
+	 */
+	private void addDocument(IndexWriter writer, String content) throws IOException {
+		Document doc = new Document();
+		Field field = new Field(CONTENT, content, TYPE_STORED);
+		doc.add(field);
+		writer.addDocument(doc);
+	}
+	
+	/**
+	 * Get the frequency of each term contained in the document.
+	 * @param reader the index reader
+	 * @param docId the document ID
+	 * @return a map from term to its frequency in the document
+	 * @throws IOException
+	 */
+	private Map<String, Integer> getTermFrequencies(IndexReader reader, int docId)
+			throws IOException {
+		Terms vector = reader.getTermVector(docId, CONTENT);
+		TermsEnum termsEnum = vector.iterator(null);
+		Map<String, Integer> frequencies = new HashMap<String, Integer>();
+		BytesRef text = null;
+		while ((text = termsEnum.next()) != null) {
+			String term = text.utf8ToString();
+			int freq = (int) termsEnum.totalTermFreq();
+			frequencies.put(term, freq);
+			terms.add(term);
+		}
+		return frequencies;
+	}
+	
+	/**
+	 * Get the weight (tf*idf) of each term contained in the document.
+	 * @param reader the index reader
+	 * @param docId the document ID
+	 * @return a map from term to its tf*idf weight in the document
+	 * @throws IOException
+	 */
+	private Map<String, Double> getTermWeights(IndexReader reader, int docId)
+			throws IOException {
+		Terms vector = reader.getTermVector(docId, CONTENT);
+		TermsEnum termsEnum = vector.iterator(null);
+		Map<String, Double> weights = new HashMap<String, Double>();
+		BytesRef text = null;
+		while ((text = termsEnum.next()) != null) {
+			String term = text.utf8ToString();
+			//get the term frequency
+			int tf = (int) termsEnum.totalTermFreq();
+			//get the document frequency
+			int df = reader.docFreq(new Term(CONTENT, text));
+			//compute the inverse document frequency
+			double idf = getIDF(reader.numDocs(), df);
+			//compute tf*idf
+			double weight = tf * idf;
+			
+			weights.put(term, weight);
+			terms.add(term);
+		}
+		return weights;
+	}
+	
+	private double getIDF(int totalNumberOfDocuments, int documentFrequency){
+		//cast to double to avoid integer division truncating the ratio
+		return 1 + Math.log((double) totalNumberOfDocuments / documentFrequency);
+	}
+	
+	private double getCosineSimilarity() {
+		return (v1.dotProduct(v2)) / (v1.getNorm() * v2.getNorm());
+	}
+	
+	private RealVector getTermVectorInteger(Map<String, Integer> map) {
+		RealVector vector = new ArrayRealVector(terms.size());
+		int i = 0;
+		for (String term : terms) {
+			int value = map.containsKey(term) ? map.get(term) : 0;
+			vector.setEntry(i++, value);
+		}
+		return vector.mapDivide(vector.getL1Norm());
+	}
+	
+	private RealVector getTermVectorDouble(Map<String, Double> map) {
+		RealVector vector = new ArrayRealVector(terms.size());
+		int i = 0;
+		for (String term : terms) {
+			double value = map.containsKey(term) ?
+					map.get(term) : 0d;
+			vector.setEntry(i++, value);
+		}
+		return vector.mapDivide(vector.getL1Norm());
+	}
+	
+	public static void main(String[] args) throws Exception {
+		double cosineSimilarity = VSMCosineDocumentSimilarity.getCosineSimilarity("The king is here", "The salad is cold");
+		System.out.println(cosineSimilarity);
+	}
+
+}
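
For readers who want to try the new class, here is a minimal usage sketch. The demo class name is hypothetical (not part of the commit), and it sits in the org.dllearner.algorithms.isle package because the TermWeighting enum is package-private; only getCosineSimilarity and TermWeighting themselves come from the code above.

    package org.dllearner.algorithms.isle;

    //hypothetical demo class, not part of the commit; same package as
    //VSMCosineDocumentSimilarity so the package-private TermWeighting enum is visible
    public class CosineSimilarityDemo {

        public static void main(String[] args) throws Exception {
            String doc1 = "The king is here";
            String doc2 = "The salad is cold";

            //the two-argument overload defaults to TF-IDF weighting
            double tfIdf = VSMCosineDocumentSimilarity.getCosineSimilarity(doc1, doc2);

            //explicit term-frequency weighting for comparison
            double tf = VSMCosineDocumentSimilarity.getCosineSimilarity(doc1, doc2,
                    VSMCosineDocumentSimilarity.TermWeighting.TF);

            //the sentences share "the" and "is", so both scores should lie
            //strictly between 0 (no shared terms) and 1 (identical documents)
            System.out.println("tf-idf: " + tfIdf + ", tf: " + tf);
        }
    }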
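
To make the arithmetic concrete without Lucene, the following self-contained sketch (hypothetical, not part of the commit) reproduces the same computation that getTermWeights and getCosineSimilarity perform above: each term gets the weight tf * (1 + ln(N/df)), and the similarity is dot(v1, v2) / (|v1| * |v2|).

    import java.util.Arrays;
    import java.util.LinkedHashSet;
    import java.util.List;
    import java.util.Set;

    //hypothetical, dependency-free illustration of the arithmetic in
    //VSMCosineDocumentSimilarity; not part of the commit
    public class CosineByHand {

        public static void main(String[] args) {
            List<String> d1 = Arrays.asList("the", "king", "is", "here");
            List<String> d2 = Arrays.asList("the", "salad", "is", "cold");

            //the union of terms fixes the vector dimensions, like the terms field above
            Set<String> vocab = new LinkedHashSet<String>(d1);
            vocab.addAll(d2);

            double[] v1 = new double[vocab.size()];
            double[] v2 = new double[vocab.size()];
            int i = 0;
            for (String term : vocab) {
                int tf1 = frequency(d1, term);
                int tf2 = frequency(d2, term);
                //document frequency over the 2-document collection; idf = 1 + ln(N/df)
                int df = (tf1 > 0 ? 1 : 0) + (tf2 > 0 ? 1 : 0);
                double idf = 1 + Math.log(2.0 / df);
                v1[i] = tf1 * idf;
                v2[i] = tf2 * idf;
                i++;
            }

            //cosine similarity: dot product over the product of Euclidean norms
            double dot = 0, n1 = 0, n2 = 0;
            for (int j = 0; j < v1.length; j++) {
                dot += v1[j] * v2[j];
                n1 += v1[j] * v1[j];
                n2 += v2[j] * v2[j];
            }
            System.out.println(dot / (Math.sqrt(n1) * Math.sqrt(n2)));
        }

        private static int frequency(List<String> doc, String term) {
            int n = 0;
            for (String t : doc) {
                if (t.equals(term)) n++;
            }
            return n;
        }
    }

One caveat worth noting: because TF and TF-IDF weights are non-negative, the dot product can never be negative, so despite the generic [-1, 1] range quoted in the Javadoc, scores from this class always land in [0, 1], with 0 for documents that share no terms.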