From: <tho...@us...> - 2013-09-13 14:42:17
|
Revision: 7412 http://bigdata.svn.sourceforge.net/bigdata/?rev=7412&view=rev Author: thompsonbry Date: 2013-09-13 14:42:08 +0000 (Fri, 13 Sep 2013) Log Message: ----------- Changes to the sampling logic. The BD and SAIL implementations no longer include self-loops into the set from which the vertex sample is drawn. The sampling logic has been modified to look at whether the algorithm wants to visit in-edges, out-edges, or all edges for GATHER. If there is no gather (no-edges), then it looks at the SCATTER phase. The samples are then only drawn from a distribution for the kinds of edges that will be traversed in the phase phase of the algorithm (either scatter or gather). The sampling logic was modified to drop vertices that have zero-degree with respect to the type(s) of edges that are being sampled. The GASRunner has been modified to drop vertices that are not connected to at least one other vertex by the execution of the GASProgram (the cumulative frontier size is one) and to report the number of such unconnected vertices that were found in the sample. See #629 (graph mining) Modified Paths: -------------- branches/BIGDATA_RELEASE_1_3_0/bigdata-gas/src/java/com/bigdata/rdf/graph/impl/ram/RAMGASEngine.java branches/BIGDATA_RELEASE_1_3_0/bigdata-gas/src/java/com/bigdata/rdf/graph/impl/sail/SAILGASEngine.java branches/BIGDATA_RELEASE_1_3_0/bigdata-gas/src/java/com/bigdata/rdf/graph/impl/util/GASRunnerBase.java branches/BIGDATA_RELEASE_1_3_0/bigdata-gas/src/java/com/bigdata/rdf/graph/impl/util/VertexDistribution.java branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/graph/impl/bd/BigdataGASEngine.java Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-gas/src/java/com/bigdata/rdf/graph/impl/ram/RAMGASEngine.java =================================================================== --- branches/BIGDATA_RELEASE_1_3_0/bigdata-gas/src/java/com/bigdata/rdf/graph/impl/ram/RAMGASEngine.java 2013-09-13 12:41:49 UTC (rev 7411) +++ branches/BIGDATA_RELEASE_1_3_0/bigdata-gas/src/java/com/bigdata/rdf/graph/impl/ram/RAMGASEngine.java 2013-09-13 14:42:08 UTC (rev 7412) @@ -230,11 +230,11 @@ boolean modified = false; if (o instanceof URI) { // Edge - modified|=get(s, true/* create */).addOutEdge(st); - modified|=get(o, true/* create */).addInEdge(st); + modified |= get(s, true/* create */).addOutEdge(st); + modified |= get(o, true/* create */).addInEdge(st); } else { // Property value. - modified|=get(s, true/* create */).addAttrib(st); + modified |= get(s, true/* create */).addAttrib(st); } return modified; @@ -341,12 +341,27 @@ final VertexDistribution sample = new VertexDistribution(r); - for (Value v : g.vertices.keySet()) { + for (RAMGraph.Vertex vertex : g.vertices.values()) { + final Value v = vertex.v; + if (v instanceof Resource) { - sample.addSample((Resource) v); + /* + * FIXME This is not ignoring self-loops. Realistically, we + * want to include them in the data since they are part of + * the data, but we do not want to consider them in samples + * since they do not actually go anywhere. The SAIL and BD + * implementations of this method filter out self-loops, but + * this implementation does not. + */ + + if (vertex.getInEdgeCount() > 0) + sample.addInEdgeSample((Resource) v); + if (vertex.getOutEdgeCount() > 0) + sample.addOutEdgeSample((Resource) v); + } } Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-gas/src/java/com/bigdata/rdf/graph/impl/sail/SAILGASEngine.java =================================================================== --- branches/BIGDATA_RELEASE_1_3_0/bigdata-gas/src/java/com/bigdata/rdf/graph/impl/sail/SAILGASEngine.java 2013-09-13 12:41:49 UTC (rev 7411) +++ branches/BIGDATA_RELEASE_1_3_0/bigdata-gas/src/java/com/bigdata/rdf/graph/impl/sail/SAILGASEngine.java 2013-09-13 14:42:08 UTC (rev 7412) @@ -246,10 +246,17 @@ continue; } - sample.addSample(st.getSubject()); + if (st.getSubject().equals(st.getObject())) { - sample.addSample((Resource) st.getObject()); + // ignore self-loops. + continue; + } + + sample.addOutEdgeSample(st.getSubject()); + + sample.addInEdgeSample((Resource) st.getObject()); + } } finally { Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-gas/src/java/com/bigdata/rdf/graph/impl/util/GASRunnerBase.java =================================================================== --- branches/BIGDATA_RELEASE_1_3_0/bigdata-gas/src/java/com/bigdata/rdf/graph/impl/util/GASRunnerBase.java 2013-09-13 12:41:49 UTC (rev 7411) +++ branches/BIGDATA_RELEASE_1_3_0/bigdata-gas/src/java/com/bigdata/rdf/graph/impl/util/GASRunnerBase.java 2013-09-13 14:42:08 UTC (rev 7412) @@ -9,6 +9,7 @@ import org.apache.log4j.Logger; import org.openrdf.model.Value; +import com.bigdata.rdf.graph.EdgesEnum; import com.bigdata.rdf.graph.IGASContext; import com.bigdata.rdf.graph.IGASEngine; import com.bigdata.rdf.graph.IGASProgram; @@ -415,10 +416,24 @@ final VertexDistribution dist = graphAccessor.getDistribution(opt.r); - final Value[] sampled = dist.getWeightedSample(opt.nsamples); + // Assume that a GATHER will be done for each starting vertex. + EdgesEnum edges = gasProgram.getGatherEdges(); + if (edges == EdgesEnum.NoEdges) { + + // If no GATHER is performed, then use the SCATTER edges. + edges = gasProgram.getScatterEdges(); + + } + + final Value[] sampled = dist.getWeightedSample(opt.nsamples, + edges); + final IGASStats total = new GASStats(); + // #of vertices that were not connected for that analytic. + long nunconnected = 0; + for (int i = 0; i < sampled.length; i++) { final Value startingVertex = sampled[i]; @@ -427,6 +442,19 @@ final IGASStats stats = (IGASStats) gasContext.call(); + if (stats.getFrontierSize() == 1) { + /* + * The starting vertex was not actually connected to any + * other vertices by the traversal performed by the GAS + * program. + */ + if (log.isInfoEnabled()) + log.info("Ignoring unconnected startingVertex: " + + startingVertex + ", stats=" + stats); + nunconnected++; + continue; + } + total.add(stats); if (log.isInfoEnabled()) { @@ -445,6 +473,7 @@ sb.append(", nsamples=" + opt.nsamples); // #desired samples sb.append(", nsampled=" + sampled.length);// #actually sampled sb.append(", distSize=" + dist.size());// #available for sampling. + sb.append(", nunconnected=" + nunconnected);// #unconnected vertices. sb.append(", nthreads=" + opt.nthreads); sb.append(", scheduler=" + ((GASState<VS, ES, ST>)gasState).getScheduler().getClass().getSimpleName()); sb.append(", gasEngine=" + gasEngine.getClass().getSimpleName()); Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-gas/src/java/com/bigdata/rdf/graph/impl/util/VertexDistribution.java =================================================================== --- branches/BIGDATA_RELEASE_1_3_0/bigdata-gas/src/java/com/bigdata/rdf/graph/impl/util/VertexDistribution.java 2013-09-13 12:41:49 UTC (rev 7411) +++ branches/BIGDATA_RELEASE_1_3_0/bigdata-gas/src/java/com/bigdata/rdf/graph/impl/util/VertexDistribution.java 2013-09-13 14:42:08 UTC (rev 7412) @@ -23,6 +23,8 @@ import org.openrdf.model.Resource; +import com.bigdata.rdf.graph.EdgesEnum; + /** * Utility class for sampling vertices from a graph. * <p> @@ -47,16 +49,28 @@ * A sample. */ private static class VertexSample { - /** The frequence of the {@link Resource}. */ - public double f; /** The {@link Resource}. */ public final Resource v; + /** The #of the {@link Resource} occurring as the target of an in-edge. */ + public int in; + /** The #of the {@link Resource} occurring as the source of an out-edge. */ + public int out; - // /** The #of times the {@link Resource} has been selected. */ - // public int n; - public VertexSample(final double f, final Resource v) { - this.f = f; + /** + * + * @param v + * The resource. + * @param in + * The #of times this {@link Resource} has been observed as + * the target of an in-edge. + * @param out + * The #of times this {@link Resource} has been observed as + * the source of an out-edge. + */ + public VertexSample(final Resource v, final int in, final int out) { this.v = v; + this.in = in; + this.out = out; // this.n = 0; } } @@ -87,19 +101,19 @@ } /** - * Add a sample. + * Add a sample of a vertex having some out-edge. * * @param v * The vertex. */ - public void addSample(final Resource v) { + public void addOutEdgeSample(final Resource v) { VertexSample s = samples.get(v); if (s == null) { // new sample. - samples.put(v, s = new VertexSample(1d/* f */, v)); + samples.put(v, s = new VertexSample(v, 0/* in */, 1/* out */)); // indexOf that sample. indexOf.put(samples.size() - 1, s); @@ -109,6 +123,28 @@ } /** + * Add a sample of a vertex having some in-edge. + * + * @param v + * The vertex. + */ + public void addInEdgeSample(final Resource v) { + + VertexSample s = samples.get(v); + + if (s == null) { + + // new sample. + samples.put(v, s = new VertexSample(v, 1/* in */, 0/* out */)); + + // indexOf that sample. + indexOf.put(samples.size() - 1, s); + + } + + } + + /** * Return the #of samples in the distribution from which a called specified * number of samples may then drawn using a random sampling without * replacement technique. @@ -123,19 +159,35 @@ } /** - * Build a normalized vector over the sample frequences. The indices of the - * sample vector are correlated with the {@link #indexOf} map. The values in - * the normalized vector are in <code>[0:1]</code> and sum to - * <code>1.0</code>. + * Build a vector over the samples. The indices of the sample vector are + * correlated with the {@link #indexOf} map. + * + * @param edges + * Only vertice having the specified type(s) of edges will be + * included in the distribution. + * @param normalize + * When <code>true</code> the vector will be normalized such that + * the elements in the vector are in <code>[0:1]</code> and sum + * to <code>1.0</code>. When <code>false</code> the elements of + * the vector are the unnormalized sum of the #of edges of the + * specified type(s). + * + * @return The distribution vector over the samples. */ - double[] getNormVector() { + double[] getVector(final EdgesEnum edges, final boolean normalize) { - final double[] norm = new double[samples.size()]; + if (edges == null) + throw new IllegalArgumentException(); - if (norm.length == 0) { + if (edges == EdgesEnum.NoEdges) + throw new IllegalArgumentException(); + final double[] a = new double[samples.size()]; + + if (a.length == 0) { + // No samples. Avoid division by zero. - return norm; + return a; } @@ -145,18 +197,64 @@ for (VertexSample s : samples.values()) { - norm[i++] = sum += s.f; + final double d; + switch (edges) { + case InEdges: + d = s.in; + break; + case OutEdges: + d = s.out; + break; + case AllEdges: + d = (s.in + s.out); + break; + default: + throw new AssertionError(); + } + + if (d == 0) + continue; + if (normalize) { + a[i] = sum += d; + } else { + a[i] = d; + } + + i++; + } + final int nfound = i; - for (i = 0; i < norm.length; i++) { + if (nfound == 0) { + // Empty sample. + return new double[0]; + } + + if (normalize) { - norm[i] /= sum; + for (i = 0; i < a.length; i++) { + a[i] /= sum; + + } + } - return norm; + if (nfound < a.length) { + // Make the array dense. + + final double[] b = new double[nfound]; + + System.arraycopy(a/* src */, 0/* srcPos */, b/* dest */, + 0/* destPos */, nfound/* length */); + + return b; + } + + return a; + } /** @@ -165,10 +263,15 @@ * * @param desiredSampleSize * The desired sample size. + * @param edges + * The sample is taken from vertices having the specified type(s) + * of edges. Vertices with zero degree for the specified type(s) + * of edges will not be present in the returned sampled. * * @return The distinct samples that were found. */ - public Resource[] getWeightedSample(final int desiredSampleSize) { + public Resource[] getWeightedSample(final int desiredSampleSize, + final EdgesEnum edges) { if (samples.isEmpty()) { @@ -178,7 +281,7 @@ } // Build a normalized vector over the sample. - final double[] norm = getNormVector(); + final double[] norm = getVector(edges, true/* normalized */); // Maximum number of samples to attempt. final int limit = (int) Math.min(desiredSampleSize * 3L, @@ -218,10 +321,16 @@ * at random without regard to their frequency distribution. * * @param desiredSampleSize + * The desired sample size. + * @param edges + * The sample is taken from vertices having the specified type(s) + * of edges. Vertices with zero degree for the specified type(s) + * of edges will not be present in the returned sampled. * - * @return + * @return The distinct samples that were found. */ - public Resource[] getUnweightedSample(final int desiredSampleSize) { + public Resource[] getUnweightedSample(final int desiredSampleSize, + final EdgesEnum edges) { if (samples.isEmpty()) { @@ -230,6 +339,9 @@ } + // Build a vector over the sample. + final double[] vec = getVector(edges, true/* normalized */); + // Maximum number of samples to attempt. final int limit = (int) Math.min(desiredSampleSize * 3L, Integer.MAX_VALUE); @@ -239,11 +351,9 @@ // The selected samples. final Set<Resource> selected = new HashSet<Resource>(); - final int nsamples = this.samples.size(); - while (selected.size() < desiredSampleSize && round++ < limit) { - final int i = r.nextInt(nsamples); + final int i = r.nextInt(vec.length); final Resource v = indexOf.get(Integer.valueOf(i)).v; @@ -252,6 +362,23 @@ } return selected.toArray(new Resource[selected.size()]); + +// // The selected samples. +// final Set<Resource> selected = new HashSet<Resource>(); +// +// final int nsamples = this.samples.size(); +// +// while (selected.size() < desiredSampleSize && round++ < limit) { +// +// final int i = r.nextInt(nsamples); +// +// final Resource v = indexOf.get(Integer.valueOf(i)).v; +// +// selected.add(v); +// +// } +// +// return selected.toArray(new Resource[selected.size()]); } Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/graph/impl/bd/BigdataGASEngine.java =================================================================== --- branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/graph/impl/bd/BigdataGASEngine.java 2013-09-13 12:41:49 UTC (rev 7411) +++ branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/graph/impl/bd/BigdataGASEngine.java 2013-09-13 14:42:08 UTC (rev 7412) @@ -494,10 +494,17 @@ continue; } - sample.addSample((Resource) spo.s()); + if (spo.s().equals(spo.o())) { - sample.addSample((Resource) spo.o()); + // ignore self-loops. + continue; + } + + sample.addOutEdgeSample((Resource) spo.s()); + + sample.addInEdgeSample((Resource) spo.o()); + } return sample; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |