From: <tho...@us...> - 2014-01-04 20:53:58
Revision: 7722
          http://bigdata.svn.sourceforge.net/bigdata/?rev=7722&view=rev
Author:   thompsonbry
Date:     2014-01-04 20:53:51 +0000 (Sat, 04 Jan 2014)

Log Message:
-----------
Some additional log statements, inline documentation on the analytic mode hash join algorithm, and inline documentation on the root cause for #763 (stochastic results with the analytic query mode).

Modified Paths:
--------------
    branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/bop/join/HTreeHashJoinUtility.java

Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/bop/join/HTreeHashJoinUtility.java
===================================================================
--- branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/bop/join/HTreeHashJoinUtility.java	2014-01-04 20:03:56 UTC (rev 7721)
+++ branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/bop/join/HTreeHashJoinUtility.java	2014-01-04 20:53:51 UTC (rev 7722)
@@ -600,6 +600,7 @@
      * This exposes a view of the {@link HTree} which is safe for concurrent
      * readers.
      */
+    @Override
     public void saveSolutionSet() {
 
         if (!open.get())
@@ -642,6 +643,9 @@
         // Checkpoint the HTree.
         final Checkpoint checkpoint = tmp.writeCheckpoint2();
 
+        if (log.isInfoEnabled())
+            log.info(checkpoint.toString());
+
         final HTree readOnly = HTree.load(store,
                 checkpoint.getCheckpointAddr(), true/* readOnly */);
 
@@ -735,7 +739,7 @@
 
                     // Encode the solution.
                     final byte[] val = encoder.encodeSolution(tmp.bset);
-
+//log.warn("insert: key="+BytesUtil.toString(key));
                     // Insert binding set under hash code for that key.
                     htree.insert(key, val);
 
@@ -755,6 +759,10 @@
 
             }
 
+            if (log.isInfoEnabled())
+                log.info("naccepted=" + naccepted + ", nright="
+                        + htree.getEntryCount());
+
             return naccepted;
 
         } catch(Throwable t) {
@@ -813,13 +821,20 @@
 
                 /*
                  * Search the hash index for a match.
+                 *
+                 * TODO VECTOR: This does not take explicit advantage of the
+                 * fact that different source solutions will fall into the
+                 * same hash bucket in the HTree. The solutions are ordered
+                 * by hashCode by vector() above, but we are using one lookupAll()
+                 * invocation per source solution here rather than recognizing that
+                 * multiple source solutions will hit the same hash bucket.
                  */
                 boolean found = false;
-
+
                 final ITupleIterator<?> titr = htree.lookupAll(key);
 
-                while(titr.hasNext()) {
-
+                while (titr.hasNext()) {
+
                     final ITuple<?> t = titr.next();
 
                     final ByteArrayBuffer tb = t.getValueBuffer();
@@ -967,6 +982,14 @@
 
     }
 
+    /*
+     * The hash join is vectored. We compute the hashCode for each source
+     * solution from the leftItr and then sort those left solutions. This gives
+     * us an ordered progression through the hash buckets for the HTree.
+     * Further, since we know that any left solution having the same hash code
+     * will read on the same hash bucket, we probe that hash bucket once for all
+     * left solutions that hash into the same bucket.
+     */
     @Override
     public void hashJoin2(//
             final ICloseableIterator<IBindingSet[]> leftItr,//
@@ -976,7 +999,13 @@
             ) {
 
         // Note: We no longer rechunk in this method.
-        final Iterator<IBindingSet[]> it = leftItr;
+        final Iterator<IBindingSet[]> it;
+        it = leftItr;// incremental.
+        /*
+         * Note: This forces all source chunks into a single chunk. This could
+         * improve vectoring, but was really added for debugging.
+         */
+//      it = new SingletonIterator<IBindingSet[]>(BOpUtility.toArray(leftItr, null/*stats*/));
 
         try {
@@ -996,6 +1025,7 @@
             final boolean noJoinVars = joinVars.length == 0;
 
             final AtomicInteger vectorSize = new AtomicInteger();
+
             while (it.hasNext()) {
 
                 final BS[] a; // vectored solutions.
@@ -1017,11 +1047,6 @@
 
                     nleftConsidered.add(n);
 
-                    if (log.isTraceEnabled())
-                        log.trace("Vectoring chunk for HTree locality: " + n
-                                + " out of " + a.length
-                                + " solutions are preserved.");
-
                 }
 
                 int fromIndex = 0;
@@ -1079,9 +1104,37 @@
                     final byte[] key = keyBuilder.reset().append(hashCode)
                             .getKey();
 
-                    // visit all source solutions having the same hash code
-                    final ITupleIterator<?> titr = rightSolutions
-                            .lookupAll(key);
+                    /**
+                     * Visit all source solutions having the same hash code.
+                     *
+                     * @see <a
+                     *      href="https://sourceforge.net/apps/trac/bigdata/ticket/763#comment:19">
+                     *      Stochastic results with Analytic Query Mode
+                     *      </a>
+                     *
+                     * FIXME This appears to be the crux of the problem
+                     * for #763. If you replace lookupAll(key) with
+                     * rangeIterator() then the hash join is correct.
+                     * Of course, it is also scanning all tuples each
+                     * time so it is very inefficient. The root cause
+                     * is the FrontCodedRabaCoder. It is doing a binary
+                     * search on the BucketPage. However, the
+                     * FrontCodedRabaCoder was not developed to deal
+                     * with duplicates on the page. Therefore it is
+                     * returning an offset into the middle of a run of
+                     * duplicate keys when it does its binary search.
+                     * We will either need to modify this IRabaCoder to
+                     * handle this case (where duplicate keys are
+                     * allowed) or write a new IRabaCoder that is smart
+                     * about duplicates.
+                     */
+                    final ITupleIterator<?> titr;
+                    if (true) {// scan just the hash bucket for that key.
+//log.warn(" probe: key="+BytesUtil.toString(key));
+                        titr = rightSolutions.lookupAll(key);
+                    } else { // do a full scan on the HTree.
+                        titr = rightSolutions.rangeIterator();
+                    }
 
                     long sameHashCodeCount = 0;
@@ -1167,7 +1220,6 @@
                             // Join failed.
                             continue;
                         }
-
                         // Resolve against ivCache.
                         encoder.resolveCachedValues(outSolution);
@@ -1247,6 +1299,9 @@
 
             } // while(itr.hasNext()
 
+            if (log.isInfoEnabled())
+                log.info("done: " + toString());
+
         } catch(Throwable t) {
 
             throw launderThrowable(t);
@@ -1292,7 +1347,8 @@
         final BS[] a = new BS[leftSolutions.length];
 
         int n = 0; // The #of non-dropped source solutions.
-
+        int ndropped = 0; // The #of dropped solutions.
+
         for (int i = 0; i < a.length; i++) {
 
             /*
@@ -1317,6 +1373,8 @@
 
                 if (log.isTraceEnabled())
                     log.trace(ex);
+
+                ndropped++;
 
                 continue;
 
             }
@@ -1338,7 +1396,11 @@
 
         // Indicate the actual vector size to the caller via a side-effect.
        vectorSize.set(n);
-
+
+        if (log.isTraceEnabled())
+            log.trace("Vectoring chunk for HTree locality: naccepted=" + n
+                    + ", ndropped=" + ndropped);
+
         return a;
 
     }
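The new block comment on hashJoin2() above summarizes the vectoring strategy, and the TODO VECTOR note observes that the probe loop could exploit it further: once a chunk of left solutions is sorted by the hash code of its join variables, every run of equal hash codes maps to the same HTree bucket and needs only one lookup. Below is a minimal sketch of that grouping idea in plain Java; Solution, probeBucket(), and the other names are hypothetical placeholders for illustration, not the bigdata API.

import java.util.Arrays;
import java.util.Comparator;

public class VectoredProbeSketch {

    /** A stand-in for one left solution (hypothetical). */
    static final class Solution {
        final int hash;    // hash code over the join variables.
        final String data; // the bound values (elided here).
        Solution(final int hash, final String data) {
            this.hash = hash;
            this.data = data;
        }
    }

    static void probeVectored(final Solution[] chunk) {
        // Sort so that solutions with equal hash codes are adjacent. This
        // yields an ordered progression through the hash buckets.
        Arrays.sort(chunk, Comparator.comparingInt(s -> s.hash));
        int fromIndex = 0;
        while (fromIndex < chunk.length) {
            final int hashCode = chunk[fromIndex].hash;
            // Advance toIndex past the run of solutions sharing hashCode.
            int toIndex = fromIndex + 1;
            while (toIndex < chunk.length && chunk[toIndex].hash == hashCode)
                toIndex++;
            // One bucket probe serves every solution in [fromIndex, toIndex).
            probeBucket(hashCode, chunk, fromIndex, toIndex);
            fromIndex = toIndex;
        }
    }

    static void probeBucket(final int hashCode, final Solution[] chunk,
            final int fromIndex, final int toIndex) {
        // A real implementation would scan the hash bucket once and test
        // each of the (toIndex - fromIndex) left solutions against it.
        System.out.println("bucket " + hashCode + ": "
                + (toIndex - fromIndex) + " left solution(s)");
    }

    public static void main(final String[] args) {
        probeVectored(new Solution[] { new Solution(7, "a"),
                new Solution(3, "b"), new Solution(7, "c"),
                new Solution(3, "d") });
    }
}

Run against the sample chunk, this probes two buckets for four solutions rather than issuing four lookups, which is exactly the saving the TODO VECTOR comment is after.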
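The FIXME above pins the stochastic results on a binary search that was not written for pages holding duplicate keys: it can return an offset into the middle of a run of equal keys, so an iterator scanning forward from that offset silently skips the matches to its left. The standalone sketch below reproduces the hazard with java.util.Arrays.binarySearch and shows a duplicate-aware "lower bound" search; it is an illustration of the failure mode only, not the FrontCodedRabaCoder code.

import java.util.Arrays;

public class LowerBoundSearch {

    /**
     * Returns the index of the FIRST element equal to key, or
     * -(insertionPoint) - 1 if absent: the Arrays.binarySearch convention,
     * but deterministic when the array contains duplicates.
     */
    static int lowerBound(final int[] a, final int key) {
        int lo = 0, hi = a.length; // search the half-open range [lo, hi).
        while (lo < hi) {
            final int mid = (lo + hi) >>> 1;
            if (a[mid] < key)
                lo = mid + 1; // the first match, if any, is to the right.
            else
                hi = mid;     // a[mid] >= key: keep narrowing leftwards.
        }
        return (lo < a.length && a[lo] == key) ? lo : -(lo + 1);
    }

    public static void main(final String[] args) {
        // A sorted "page" with a run of duplicate keys, analogous to the
        // HTree bucket pages described above.
        final int[] page = { 1, 5, 5, 5, 5, 9 };
        // Arrays.binarySearch may return ANY of indices 1..4 for key 5, so
        // a forward scan from the hit can miss duplicates to its left.
        System.out.println("binarySearch: " + Arrays.binarySearch(page, 5));
        // lowerBound always returns 1, the start of the run, so a forward
        // scan visits every entry for the probe key.
        System.out.println("lowerBound  : " + lowerBound(page, 5));
    }
}

Making the coder's search behave like lowerBound (or writing a duplicate-aware IRabaCoder, as the FIXME suggests) would restore deterministic lookupAll() behavior without falling back on the rangeIterator() full scan.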