[Assorted-commits] SF.net SVN: assorted: [862] tcq-wavelets
Brought to you by:
yangzhang
From: <yan...@us...> - 2008-07-02 06:37:47
|
Revision: 862 http://assorted.svn.sourceforge.net/assorted/?rev=862&view=rev Author: yangzhang Date: 2008-07-01 23:37:56 -0700 (Tue, 01 Jul 2008) Log Message: ----------- added tcq wavelets Added Paths: ----------- tcq-wavelets/ tcq-wavelets/trunk/ tcq-wavelets/trunk/Test/ tcq-wavelets/trunk/Test/Backup/ tcq-wavelets/trunk/Test/Backup/histogram.c tcq-wavelets/trunk/Test/Backup/histogram.h tcq-wavelets/trunk/Test/Backup/pg_proc.h tcq-wavelets/trunk/Test/Backup/pg_type.h tcq-wavelets/trunk/Test/Backup/shedding.c tcq-wavelets/trunk/Test/Backup/shedding.h tcq-wavelets/trunk/Test/Backup/wavelet.c tcq-wavelets/trunk/Test/Backup/wavelet.h tcq-wavelets/trunk/Test/Backup/wrapch.c tcq-wavelets/trunk/Test/Copy (2) of Test.cpp tcq-wavelets/trunk/Test/Copy (2) of wavelet.c tcq-wavelets/trunk/Test/Copy (2) of wavelet.h tcq-wavelets/trunk/Test/Copy (3) of wavelet.c tcq-wavelets/trunk/Test/Copy (3) of wavelet.h tcq-wavelets/trunk/Test/Copy (4) of wavelet.c tcq-wavelets/trunk/Test/Copy (4) of wavelet.h tcq-wavelets/trunk/Test/Copy (5) of wavelet.c tcq-wavelets/trunk/Test/Copy (5) of wavelet.h tcq-wavelets/trunk/Test/Copy (6) of wavelet.c tcq-wavelets/trunk/Test/Copy (6) of wavelet.h tcq-wavelets/trunk/Test/Copy (7) of wavelet.c tcq-wavelets/trunk/Test/Copy (7) of wavelet.h tcq-wavelets/trunk/Test/Copy (8) of wavelet.c tcq-wavelets/trunk/Test/Copy (8) of wavelet.h tcq-wavelets/trunk/Test/Copy of Test.cpp tcq-wavelets/trunk/Test/Copy of wavelet.c tcq-wavelets/trunk/Test/Copy of wavelet.h tcq-wavelets/trunk/Test/Test.ncb tcq-wavelets/trunk/Test/Test.sln tcq-wavelets/trunk/Test/Test.suo tcq-wavelets/trunk/Test/Test.vcproj tcq-wavelets/trunk/Test/stdafx.cpp tcq-wavelets/trunk/Test/stdafx.h tcq-wavelets/trunk/Test/wavelet.c tcq-wavelets/trunk/Test/wavelet.h Added: tcq-wavelets/trunk/Test/Backup/histogram.c =================================================================== --- tcq-wavelets/trunk/Test/Backup/histogram.c (rev 0) +++ tcq-wavelets/trunk/Test/Backup/histogram.c 2008-07-02 
06:37:56 UTC (rev 862) @@ -0,0 +1,3741 @@ +/* + * PostgreSQL type definitions for built-in histogram data types. + * + * $Header: /project/eecs/db/cvsroot/postgres/src/backend/telegraphcq/histogram.c,v 1.23 2004/05/14 21:29:14 yangsta Exp $ + */ + +#include <ctype.h> +#include <sys/time.h> // yang + + /* For toupper(3) */ + +#include "postgres.h" + +#include "access/hash.h" +#include "executor/spi.h" +#include "funcapi.h" +#include "lib/stringinfo.h" +#include "utils/builtins.h" +#include "utils/elog.h" +#include "utils/relcache.h" + +#include "utils/lsyscache.h" + /* For get_typlenbyval() function. */ + + +#include "telegraphcq/shedding.h" +#include "telegraphcq/histogram.h" + + +#define ENABLE_STREAM_STATE_CHECKS + +/******************************************************************************* + * INTERNAL DATA STRUCTURES + ******************************************************************************/ + +/* Used for passing maxdiff information back. + * + * In particular, get_maxdiffs() returns a pointer to an array of these, one per + * bucket. */ +typedef struct bucket_maxdiff_info { + int abs; + /* The absolute difference between two adjacent tuples when sorted + * along a particular dimension. */ + + int dimid; + /* The dimension along which this difference occurs. */ + + double norm; + /* Normalized to the min. and max. values along this dimension. 
*/ + +} bucket_maxdiff_info; + + +/******************************************************************************* + * PROTOTYPES FOR INTERNAL FUNCTIONS + ******************************************************************************/ + +static mhist * real_mhist_in(char *relname, + char **cols, + int ncols, + int num_buckets); + +static void mhist_set_nameinfo(mhist *hist, char *relname, + char **cols, int ncols); + +static HeapTuple *get_table_as_array(char *relname, char **cols, int ncols, + int *ntuples, TupleDesc *descdest); + +static bool split_a_bucket(HeapTuple *giant_tup_array, int ntuples, + TupleDesc desc, + int *bucket_offsets, int *bucket_lengths, mhist *hist, + mhist_dim_info *ranges, int nbuckets_done); + +static void sort_tuples_along_dim(HeapTuple *tuples, int length, int dimid, + TupleDesc desc); + +static void load_mhist(char *relname, char **cols, mhist *hist); + +static mhist * create_mhist_copy(const mhist *orig); +static mhist * create_bigger_mhist(mhist *hist); + +mhist * create_proj_mhist(const mhist * original, int dim); + +void srf_firstcall_junk(FuncCallContext *funcctx, void *state, int max_calls, + char *typename); + +void srf_cleanup(FuncCallContext *funcctx); + +mhist_dim_info * get_ranges(HeapTuple *giant_tup_array, int ntuples, + TupleDesc desc, int ndims); + +bucket_maxdiff_info *get_maxdiffs( + HeapTuple *giant_tup_array, + TupleDesc desc, + const int *bucket_offsets, + const int *bucket_lengths, + int nbuckets_done, + const mhist_dim_info *ranges); + +int get_mhist_bucket_for_tuple(mhist *hist, HeapTuple tuple, TupleDesc desc); + +mhist * real_mhist_project(mhist *inhist, int32 proj_dim); + +void project_mhist_bucket(mhist_bucket *src, mhist_bucket *dest, + int ndims, int proj_dim); + +void remove_empty_mhist_buckets(mhist *hist); + +mhist *merge_overlapping_mhist_buckets(mhist *hist); + +mhist *really_merge_overlapping_mhist_buckets(mhist *hist, bool *found_overlap, + bool verbose); + +mhist *merge_two_overlapping_buckets(mhist 
*hist, + int left_bucketid, int right_bucketid, bool verbose); + +void cut_top_off_mhist_bucket(mhist_bucket *bucket_ptr, int dimid, int new_max); + +void cut_bottom_off_mhist_bucket(mhist_bucket *bucket_ptr, int dimid, + int new_min); + +bool potential_overlap(mhist_bucket *left_bucket, mhist_bucket *right_bucket, + int ndims); + +double mhist_total_count(mhist *hist); + +char *mhist_to_str(mhist *hist); + +mhist * mhist_cross_product(const mhist *hist1, const mhist *hist2); + +void mhist_equality_select_in_place(mhist *hist, int dim1, int dim2); + +void check_mhist_integrity(const mhist *hist, const char *id); + +void get_mhist_split_point(HeapTuple *giant_tup_array, TupleDesc desc, + const int *bucket_offsets, + const int *bucket_lengths, + int nbuckets_done, + const mhist_dim_info *ranges, + int *split_bucketid, int *split_dimid, int *split_offset); + +bool no_buckets_to_split(HeapTuple *giant_tup_array, + const int *bucket_offsets, + const int *bucket_lengths, + int nbuckets_done, + TupleDesc desc); + +void mhist_stream_load_buckets(mhist_stream_state *state); + +void mhist_create_buckets(mhist *hist, HeapTuple *tuparr, int ntups, + TupleDesc desc); + +void tdesc_to_mhist_diminfo(mhist *hist, TupleDesc desc); + +mhist * create_reg_mhist(HeapTuple *tups, int ntups, + TupleDesc desc, int bucket_width); + +int create_reg_buckets(mhist *hist, mhist_dim_info *ranges, int num_dims, + int cur_dim, int bucket_width, int bucket_num, int *itr); + +int remove_duplicate_names(mhist_dim_name *names, int num_names); + +int get_dimid_for_name(mhist_dim_name *names, int num_names, + const char *name); + +int get_random(HeapTuple*, int, int, int, int, double); + +void create_prob_array(double*, double, int); + +int skip_random(double*, int); + +void print_array(HeapTuple*, int); + +/******************************************************************************* + * PROFILING + ******************************************************************************/ + +int asdf(char *); 
+int zxcv(char *); + +int asdf(char *str) { +#ifdef ASDF + struct timeval tv; + gettimeofday(&tv, 0); + elog(LOG, "YANG START %s %d.%0.6d", str, tv.tv_sec, tv.tv_usec); +#endif + return 0; +} + +int zxcv(char *str) { +#ifdef ZXCV + struct timeval tv; + gettimeofday(&tv, 0); + elog(LOG, "YANG END %s %d.%0.6d", str, tv.tv_sec, tv.tv_usec); +#endif + return 0; +} + +/******************************************************************************* + * EXTERNAL FUNCTIONS + ******************************************************************************/ + +/* FUNCTION create_mhist + * ARGUMENTS: <num_buckets> is the number of buckets for a new histogram, + * and <num_dims> is the number of dimensions of the histogram. + * POSTCONDITIONS: Creates a new, empty histogram data structure and fills in + * the header fields. + */ +mhist * create_mhist(int num_buckets, int num_dims) { + mhist *result = NULL; + int32 hist_len = MHIST_SIZE(num_buckets, num_dims); + + if (hist_len < MHIST_HDRSZ) { + elog(ERROR, "Invalid MHIST length of %d.", hist_len); + } + + result = (mhist*)palloc(hist_len); + + /* + elog(LOG, "create_mhist(): Created a structure of size %d at %p.", + hist_len, result); + elog(LOG, "create_mhist(): Last byte at %p.", + ((char*)result + hist_len) ); + */ + + /* Zero out the struct so as not to screw up Postgres hash functions. */ + memset(result, 0x0, hist_len); + + result->hdr.length = hist_len; + result->hdr.numDims = num_dims; + result->hdr.numBucketsUsed = result->hdr.numBucketsAllocated = num_buckets; + + return result; +} + +/* FUNCTION mhist_load_buckets() + * ARGUMENTS: <giant_tup_array> is an array containing a bunch of tuples. + * <ntuples> is the number of tuples in the array. + * <hist> is an MHIST. + * PRECONDITIONS: <hist> has not been loaded yet, but its bucket boundaries + * have already been determined. + * POSTCONDITIONS: Fills up the buckets array of <hist> with information about + * the indicated buckets. 
+ */ +void mhist_load_buckets(HeapTuple *giant_tup_array, TupleDesc desc, + int ntuples, mhist *hist) +{ + asdf("mhist_load_buckets"); + int tupleid = -1; + + /* SPECIAL CASE: No tuples to load. */ + if (0 == ntuples) { + return; + } + /* END SPECIAL CASE */ + + for (tupleid = 0; tupleid < ntuples; tupleid++) { + + /* Figure out what bucket this tuple should go into. */ + int bucketid = get_mhist_bucket_for_tuple(hist, + giant_tup_array[tupleid], desc); + + mhist_bucket *target_bucket_ptr = MHIST_BUCKET_PTR(hist, bucketid); + + /* Increment the counters. */ + /* TODO: Calculate the number of unique values along each dimension! */ + target_bucket_ptr->count += 1.0; + } + + zxcv("mhist_load_buckets"); +} + + + +/* + * MHist constructor. Takes a single string argument, in the following format: + * + * (<Relation name>),(<col1>,col2>,...,<coln>),(<numbuckets>) + * + * Where: + * <Relation name> is the name of the relation to be histogrammed. + * <col1>,col2>,...,<coln> are the columns of the relation to be + * included in the histogram. + * <numbuckets> is the number of buckets in the histogram. + */ +Datum +mhist_in(PG_FUNCTION_ARGS) +{ + asdf("mhist_in"); + + char *str = PG_GETARG_CSTRING(0); + char *str_copy = NULL; + char *relname_str = NULL, *cols_str = NULL, *num_buckets_str = NULL; + char **cols_arr = NULL; + int ncols = -1, colno = -1; + char *ptr = NULL; + mhist *result = NULL; + + /* TODO: Sould the parsing code below be moved to a single function? */ + + /* Parse a copy of the string, strtok() style. */ + str_copy = palloc(1 + strlen(str)); + strcpy(str_copy, str); + + /* First the relation name, "(name)". */ + if (str_copy[0] != '(') goto parse_error; + relname_str = ptr = str_copy + 1; + while (*ptr != ')') { + if ('\0' == *ptr) goto parse_error; ++ptr; + } + *ptr = '\0'; ++ptr; + + /*elog(LOG, "mhist_in(): Relation string is '%s'.", relname_str);*/ + + /* Skip the comma between the parens. 
*/ + if (*ptr != ',') goto parse_error; ++ptr; + + /* Then parse the column names, "(col1,col2,...,coln)". */ + if (*ptr != '(') goto parse_error; ++ptr; + cols_str = ptr; + while (*ptr != ')') { + if ('\0' == *ptr) goto parse_error; ++ptr; + } + *ptr = '\0'; ++ptr; + + /*elog(LOG, "mhist_in(): Cols string is '%s'.", cols_str);*/ + + /* Skip another comma, and parse the number of buckets. */ + if (*ptr != ',') goto parse_error; ++ptr; + if (*ptr != '(') goto parse_error; ++ptr; + num_buckets_str = ptr; + while (*ptr != ')') { + if ('\0' == *ptr) goto parse_error; ++ptr; + } + *ptr = '\0'; ++ptr; + + /*elog(LOG, "mhist_in(): Num. buckets string is '%s'.", num_buckets_str);*/ + + /* TODO: We currently ignore any junk at the end of the string. Should we + * read through it and complain about non-whitespace characters? */ + + /* Generate an array of column names. */ + /* First we need to count the number of columns. */ + ncols = 1; + ptr = cols_str; + if (ptr == '\0') goto parse_error; + while (*ptr != '\0') { + if (',' == *ptr) ++ncols; + ++ptr; + } + + Assert(ncols <= MHIST_MAX_DIMS); + + + /* Then we generate the array, again by parsing strtok()-style. */ + cols_arr = palloc(ncols * sizeof(char*)); + ptr = cols_str; + for (colno = 0; colno < ncols - 1; colno++) { + cols_arr[colno] = ptr; + while (*ptr != ',') ++ptr; + *ptr = '\0'; ++ptr; + } + cols_arr[ncols-1] = ptr; + + /* Generate the histogram object. */ + result = real_mhist_in(relname_str, cols_arr, ncols, + atoi(num_buckets_str)); + + /* Clean up. */ + /*elog(LOG, "mhist_in(): Cleaning up.");*/ + pfree(str_copy); + pfree(cols_arr); + + zxcv("mhist_in"); + + /* Now we're done. */ + PG_RETURN_MHIST_P(result); + + /* Error handlers. */ +parse_error: + /* elog(ERROR) should clean up everything we palloc()'d. */ + elog(ERROR, "mhist_in(): Error parsing string '%s'.", str); + + zxcv("mhist_in"); + + /* UNREACHABLE, but the compiler might not know that. 
 */
    PG_RETURN_MHIST_P(NULL);
}

/*
 * Constructor that builds an MHIST with square buckets. Takes a single
 * string argument, in the following format:
 *
 * (<Relation name>),(<col1>,col2>,...,<coln>),(<bucketwidth>)
 *
 * Where:
 * <Relation name> is the name of the relation to be histogrammed.
 * <col1>,col2>,...,<coln> are the columns of the relation to be
 * included in the histogram.
 * <bucketwidth> is the width of a single bucket along each dimension.
 */
Datum
square_mhist_in(PG_FUNCTION_ARGS)
{
    asdf("square_mhist_in");

    char *str = PG_GETARG_CSTRING(0);
    char *str_copy = NULL;
    char *relname_str = NULL, *cols_str = NULL,
            *bucketwidth_str = NULL;
    char **cols_arr = NULL;
    int ncols = -1, colno = -1;
    char *ptr = NULL;
    mhist *result = NULL;
    HeapTuple *giant_tup_array = NULL;  /* all tuples of the source relation */
    TupleDesc desc = NULL;              /* descriptor for those tuples */
    int ntuples = -1;

    /* TODO: Should the parsing code below be moved to a single function? */

    /* Parse a copy of the string, strtok() style. */
    str_copy = palloc(1 + strlen(str));
    strcpy(str_copy, str);

    /* First the relation name, "(name)". */
    if (str_copy[0] != '(') goto parse_error;
    relname_str = ptr = str_copy + 1;
    while (*ptr != ')') {
        if ('\0' == *ptr) goto parse_error; ++ptr;
    }
    *ptr = '\0'; ++ptr;

    /*elog(LOG, "mhist_in(): Relation string is '%s'.", relname_str);*/

    /* Skip the comma between the parens. */
    if (*ptr != ',') goto parse_error; ++ptr;

    /* Then parse the column names, "(col1,col2,...,coln)". */
    if (*ptr != '(') goto parse_error; ++ptr;
    cols_str = ptr;
    while (*ptr != ')') {
        if ('\0' == *ptr) goto parse_error; ++ptr;
    }
    *ptr = '\0'; ++ptr;

    /*elog(LOG, "mhist_in(): Cols string is '%s'.", cols_str);*/

    /* Skip another comma, and parse the number of buckets.
*/ + if (*ptr != ',') goto parse_error; ++ptr; + if (*ptr != '(') goto parse_error; ++ptr; + bucketwidth_str = ptr; + while (*ptr != ')') { + if ('\0' == *ptr) goto parse_error; ++ptr; + } + *ptr = '\0'; ++ptr; + + /*elog(LOG, "mhist_in(): Num. buckets string is '%s'.", bucketwidth_str);*/ + + /* TODO: We currently ignore any junk at the end of the string. Should we + * read through it and complain about non-whitespace characters? */ + + /* Generate an array of column names. */ + /* First we need to count the number of columns. */ + ncols = 1; + ptr = cols_str; + if (ptr == '\0') goto parse_error; + while (*ptr != '\0') { + if (',' == *ptr) ++ncols; + ++ptr; + } + + Assert(ncols <= MHIST_MAX_DIMS); + + + /* Then we generate the array, again by parsing strtok()-style. */ + cols_arr = palloc(ncols * sizeof(char*)); + ptr = cols_str; + for (colno = 0; colno < ncols - 1; colno++) { + cols_arr[colno] = ptr; + while (*ptr != ',') ++ptr; + *ptr = '\0'; ++ptr; + } + cols_arr[ncols-1] = ptr; + + /* Read in the tuples for this histogram. */ + giant_tup_array = get_table_as_array(relname_str, cols_arr, ncols, + &ntuples, &desc); + + /* Generate the histogram object. */ + result = create_reg_mhist(giant_tup_array, ntuples, desc, + atoi(bucketwidth_str)); + + /* create_reg_mhist() doesn't know about the relation names, because + * they're not in the tuple descriptor. So we overwrite the dimension name + * information. */ + mhist_set_nameinfo(result, relname_str, cols_arr, ncols); + + /* Fill its buckets. */ + mhist_load_buckets(giant_tup_array, desc, ntuples, result); + + /* Clean up. */ + /*elog(LOG, "mhist_in(): Cleaning up.");*/ + pfree(str_copy); + pfree(cols_arr); + + zxcv("square_mhist_in"); + + /* Now we're done. */ + PG_RETURN_MHIST_P(result); + + /* Error handlers. */ +parse_error: + /* elog(ERROR) should clean up everything we palloc()'d. 
*/ + elog(ERROR, "square_mhist_in(): Error parsing string '%s'.", str); + + zxcv("square_mhist_in"); + + /* UNREACHABLE, but the compiler might not know that. */ + PG_RETURN_MHIST_P(NULL); +} +/* + * MHIST-to-string conversion function (for output). + * + * It's hard to convert a histogram to a one-line string, so this function + * just prints a note to the effect that this is an MHIST. + */ +Datum +mhist_out(PG_FUNCTION_ARGS) +{ + mhist *hist = PG_GETARG_MHIST_P(0); + char *result = mhist_to_str(hist); + + PG_RETURN_CSTRING(result); +} + +/* + * Function to convert an mhist into relational representation. This is a + * set-returning function (See section 9.5.6.2 of the Programmer's Reference), + * so it needs to save its state across multiple calls. Yuck. + * + * Based on hist_to_table(). + * + * In order to call this function, you need to define the following SQL + * datatypes: + * + * CREATE_TYPE __mhist1 AS (bucketid, min0, max0, count); + * CREATE_TYPE __mhist2 AS (bucketid, min0, max0, min1, max1, count); + * CREATE_TYPE __mhist3 AS (bucketid, min0, max0, min1, max1, min2, max2, + * count); + * CREATE_TYPE __mhist4 AS (bucketid, min0, max0, min1, max1, min2, max2, + * min3, max3, count); + * [Add additional types here if you change the value of HISTOGRAM_MAX_DIMS.] + */ +Datum +mhist_to_table(PG_FUNCTION_ARGS) +{ + asdf("mhist_to_table"); + + FuncCallContext *funcctx = NULL; + mhist_to_table_state *state = NULL; + mhist *hist = PG_GETARG_MHIST_P(0); +/* TupleDesc tupdesc;*/ + char typename[128]; + + check_mhist_integrity(hist, "mhist_to_table()"); + + if (SRF_IS_FIRSTCALL()) { + MemoryContext oldcontext; + + funcctx = SRF_FIRSTCALL_INIT(); + + /*elog(LOG, "mhist_to_table(): Starting first-call setup.");*/ + + + /* Create the data structure that will remember the status of this + * function. 
 */
        oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
        state = palloc(sizeof(*state));
        MemoryContextSwitchTo(oldcontext);

        state->current_bucket = 0;
        state->num_buckets = MHIST_NUM_BUCKETS_USED(hist);

        /* Generate the name of the type that corresponds to the histogram's
         * number of dimensions. */
        snprintf(typename, sizeof(typename), "__mhist%d", hist->hdr.numDims);

        /* Postgres SRF crap has been moved to this function: */
        srf_firstcall_junk(funcctx, state,
                MHIST_NUM_BUCKETS_USED(hist), typename);

    }

    /* Stuff that gets done every time the function is called: */
    funcctx = SRF_PERCALL_SETUP();

    state = (mhist_to_table_state*)(funcctx->user_fctx);

    /* Are we done returning tuples? */
    if (state->current_bucket < state->num_buckets) {
        char **values;
        int numvalues = -1;
        HeapTuple tuple;
        Datum result;
        int valid = -1;   /* index into values[], not a boolean */

        /* Not done returning tuples -- construct the next one. */
        /* For the sake of simplicity, we convert the field values into strings
         * and then back into values. */

        numvalues = 2 * hist->hdr.numDims + 2;
        /* Number of items in row == 2 + (2 x number of dimensions):
         * 1 column for the bucketID (not strictly necessary)
         * 1 column for the bucket's count
         * 2 columns per dimension to hold min and max values.
         */

        /* Build up the array of string values. */
        values = (char**)palloc(sizeof(char*) * (numvalues));

        for (valid = 0; valid < numvalues; valid++) {
            values[valid] = (char*)palloc(32 * sizeof(char));
        }

        /* Fill in the array. */
        for (valid = 0; valid < numvalues; valid++) {
            if (0 == valid) {
                /* First entry is the bucketid.
 */
                snprintf(values[valid], 32, "%d", state->current_bucket);

            } else if (numvalues - 1 == valid) {
                /* Last entry is the bucket count. */
                mhist_bucket *cur_bucket =
                        MHIST_BUCKET_PTR(hist, state->current_bucket);
                /* Round up if necessary */
                snprintf(values[valid], 32, "%f",
                        (cur_bucket->count));

            } else {
                /* valid > 0 and < (numvalues-1) */
                /* Entries between first and last are the bounds of the
                 * rectangle that comprises this bucket. These entries
                 * alternate between the lower and upper bound. */
                int dimid = (valid - 1) / 2;
                bool is_min = (valid % 2 == 1);
                /* Is this a lower or upper bound? */
                mhist_bucket *cur_bucket =
                        MHIST_BUCKET_PTR(hist, state->current_bucket);
                snprintf(values[valid], 32, "%d",
                        is_min ?
                        cur_bucket->diminfo[dimid].min_val
                        : cur_bucket->diminfo[dimid].max_val);
            }

        }

        /* Construct the tuple for the strings. */
        tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);

        /* Make the tuple into a datum. */
        result = TupleGetDatum(funcctx->slot, tuple);

        /* Clean up.
*/ + for (valid = 0; valid < hist->hdr.numDims; valid++) { + /*elog(LOG, "mhist_to_table(): Freeing field %d of a tuple.", + valid);*/ + pfree(values[valid]); + } + /*elog(LOG, "mhist_to_table(): Freeing holder for tuple %d.", + state->current_bucket);*/ + pfree(values); + + /*elog(LOG, "mhist_to_table(): Done generating tuple for bucket %d.", + state->current_bucket);*/ + + ++(state->current_bucket); + + zxcv("mhist_to_table"); + + SRF_RETURN_NEXT(funcctx, result); + } else { + /*elog(LOG, "mhist_to_table(): Cleaning up.");*/ + + /* here we are done returning items, and just need to clean up: */ + srf_cleanup(funcctx); + + zxcv("mhist_to_table"); + + SRF_RETURN_DONE(funcctx); + } +} + + +/* + * Function that removes a dimension of an MHIST, merging all bins along the + * dimension. + * + * Usage: mhist_project(hist, dim) + * + * Where: + * <hist> is a histogram created with mhist_in(). + * <dim> is the name of the dimension to eliminate. + * + * This function creates and returns a new MHIST. + */ +Datum +mhist_project(PG_FUNCTION_ARGS) { + asdf("mhist_project"); + + mhist *inhist = PG_GETARG_MHIST_P(0); + char *proj_dim_name = PG_GETARG_CSTRING(1); + + mhist *result = NULL; + + /* Figure out which index to project. */ + int proj_dim = get_dimid_for_name(inhist->hdr.dimNames, + inhist->hdr.numDimNames, + proj_dim_name); + + if (MHIST_NOT_A_DIMID == proj_dim) { + elog(ERROR, "mhist_project(): Dimension '%s' of '%s' not found.", + proj_dim_name, mhist_to_str(inhist)); + } + + result = real_mhist_project(inhist, proj_dim); + + zxcv("mhist_project"); + + PG_RETURN_MHIST_P(result); +} + +/* + * Function that removes all dimensions of an MHIST except for the dimensions + * corresponding to the specified columns. + * + * Usage: mhist_project_many(hist, dimlist) + * + * Where: + * <hist> is a histogram created with mhist_in(). + * <dimlist> is a string containing the names of the dimensions to + * keep, separated by spaces. 
+ * + * This function creates and returns a new MHIST. + */ +Datum +mhist_project_many(PG_FUNCTION_ARGS) { + asdf("mhist_project"); + + mhist *inhist = PG_GETARG_MHIST_P(0); + char *dimlist = PG_GETARG_CSTRING(1); + int i = -1; + int num_cols_kept = -1; + int dimid = -1; + char dimlist_copy[512]; + char *cols_kept[MHIST_MAX_DIMS]; + mhist *curhist = NULL; + bool dims_to_keep[MHIST_MAX_DIMS]; + bool first_copy = false; + int num_to_keep = -1; + + + /* Parse the list of column names. */ + Assert(dimlist != NULL); + + /* In one pass, we make an uppercase copy, count the number of columns, + * and make an array of pointers to their names. */ + num_cols_kept = 1; + cols_kept[0] = dimlist_copy; + for (i = 0; dimlist[i] != '\0' && i < sizeof(dimlist_copy); i++) { + if (' ' == dimlist[i]) { + /* Names are separated by a single space. */ + dimlist_copy[i] = '\0'; + cols_kept[num_cols_kept] = dimlist_copy + (i+1); + num_cols_kept++; + Assert(num_cols_kept < MHIST_MAX_DIMS); + } else { + dimlist_copy[i] = toupper(dimlist[i]); + } + } + dimlist_copy[i] = '\0'; + + elog(LOG, "mhist_project_many(): Keeping the following cols:"); + for (i = 0; i < num_cols_kept; i++) { + elog(LOG, " '%s'", cols_kept[i]); + } + + /* Now we go through each column and see if it is one of the columns we + * want to keep; if it is not, we will project it out. */ + curhist = inhist; + for (dimid = 0; dimid < MHIST_NUM_DIMS(curhist); dimid++) { + int colid = -1; + + dims_to_keep[dimid] = false; + /* Start out assuming we're going to throw away this column. */ + + for (colid = 0; colid < num_cols_kept; colid++) { + int nameid = -1; + for (nameid = 0; nameid < MHIST_NUM_DIM_NAMES(curhist); nameid++) + { + elog(LOG, "Comparing '%s' with '%s'", + cols_kept[colid], MHIST_DIM_NAME(curhist, nameid)); + if (MHIST_NAME_DIM(curhist, nameid) == dimid + && 0 == strncasecmp(cols_kept[colid], + MHIST_DIM_NAME(curhist, nameid), + MHIST_COL_NAME_LEN)) + { + dims_to_keep[dimid] = true; + /* Want to keep this column. 
 */
                    elog(LOG, " Found a match!");
                }
            }
        }
    }

    /* Make sure we got the right number of columns. */
    num_to_keep = 0;
    for (dimid = 0; dimid < MHIST_NUM_DIMS(curhist); dimid++) {
        if (dims_to_keep[dimid]) {
            num_to_keep++;
        }
    }

    if (num_to_keep != num_cols_kept) {
        elog(ERROR, "Only found %d of %d columns '%s'.",
                num_to_keep, num_cols_kept, dimlist);
    }

    /* For now, we project dimensions out one by one, since that code is
     * already written. We do the dimensions in reverse order since projection
     * causes the dimensions above the one projected to be renumbered. */
    /* TODO: Perform all the projections in one step. */
    first_copy = true;
    for (dimid = MHIST_NUM_DIMS(curhist) - 1; dimid >= 0; dimid--) {
        if (false == dims_to_keep[dimid]) {
            mhist *newhist = real_mhist_project(curhist, dimid);

            /* Don't want to pfree() the function argument, inhist. */
            if (first_copy) {
                first_copy = false;
            } else {
                pfree(curhist);
            }
            curhist = newhist;
        }
    }

    zxcv("mhist_project");

    PG_RETURN_MHIST_P(curhist);
}




/*
 * Function that adds the corresponding buckets of two mhists.
 *
 * Usage: mhist_union(hist1, hist2)
 *
 * Where:
 * <hist1> and <hist2> are MHISTs created with mhist_in().
 *
 * The dimension names for the output histogram will be taken from the
 * union of the two histograms.
 */
Datum
mhist_union(PG_FUNCTION_ARGS) {
    asdf("mhist_union");

    mhist *hist1 = PG_GETARG_MHIST_P(0);
    mhist *hist2 = PG_GETARG_MHIST_P(1);
    mhist *result = NULL;
    int num_buckets = -1;
    int bucketid = -1;
    int nameid = -1;
    int num_names = -1;
    /* Scratch space: big enough for both inputs' name lists before dedup. */
    mhist_dim_name new_names[2 * MHIST_MAX_COL_NAMES];


    /* Verify the integrity of the inputs.
 */
    check_mhist_integrity(hist1, "mhist_union() hist1");
    check_mhist_integrity(hist2, "mhist_union() hist2");
    if (MHIST_NUM_DIMS(hist1) != MHIST_NUM_DIMS(hist2)) {
        elog(ERROR,
                "mhist_union(): Histograms have different numbers of dimensions.");
    }

    elog(DEBUG1, "mhist_union(): Left arg is (%s).",
            mhist_to_str(hist1));
    elog(DEBUG1, "mhist_union(): Right arg is (%s).",
            mhist_to_str(hist2));

    /* Start out with the buckets of the original histograms. */
    num_buckets = MHIST_NUM_BUCKETS_USED(hist1)
            + MHIST_NUM_BUCKETS_USED(hist2);

    /* Allocate a new histogram containing all the buckets from the input
     * histograms. */
    result = create_mhist(num_buckets, MHIST_NUM_DIMS(hist1));

    /* Copy over the dimension names, and remove duplicates. */
    for (nameid = 0; nameid < MHIST_NUM_DIM_NAMES(hist1); nameid++) {
        new_names[nameid] = MHIST_DIM_NAME_INFO(hist1, nameid);
    }

    for (nameid = 0; nameid < MHIST_NUM_DIM_NAMES(hist2); nameid++) {
        new_names[nameid + MHIST_NUM_DIM_NAMES(hist1)] =
                MHIST_DIM_NAME_INFO(hist2, nameid);
    }

    num_names = remove_duplicate_names(new_names,
            MHIST_NUM_DIM_NAMES(hist1) + MHIST_NUM_DIM_NAMES(hist2));

    if (num_names > MHIST_MAX_COL_NAMES) {
        elog(ERROR, "mhist_union(): Too many dimension names.");
    }

    memcpy(result->hdr.dimNames, new_names, num_names * sizeof(*new_names));

    MHIST_SET_NUM_DIM_NAMES(result, num_names);

    /* Copy the raw buckets of both inputs into the result; overlaps between
     * the two sets are resolved by the merge step below. */
    for (bucketid = 0; bucketid < MHIST_NUM_BUCKETS_USED(hist1); bucketid++) {
        memcpy(MHIST_BUCKET_PTR(result, bucketid),
                MHIST_BUCKET_PTR(hist1, bucketid),
                MHIST_BUCKET_LEN(MHIST_NUM_DIMS(hist1)));
    }

    for (bucketid = 0; bucketid < MHIST_NUM_BUCKETS_USED(hist2); bucketid++) {
        memcpy(MHIST_BUCKET_PTR(result,
                bucketid + MHIST_NUM_BUCKETS_USED(hist1)),
                MHIST_BUCKET_PTR(hist2, bucketid),
                MHIST_BUCKET_LEN(MHIST_NUM_DIMS(hist1)));
    }

    /* Fix overlapping buckets.
 */
    result = merge_overlapping_mhist_buckets(result);

    zxcv("mhist_union");

    PG_RETURN_MHIST_P(result);
}

/*
 * Function that removes a dimension of an MHIST, keeping only those bins
 * for which the bin along the indicated dimension was a particular number.
 *
 * Usage: mhist_slice(hist, dim, val)
 *
 * Where:
 * <hist> is a histogram created with mhist_in().
 * <dim> is the name of the dimension to eliminate.
 * <val> is the value to keep along that dimension.
 *
 * This function creates and returns a new MHIST.
 */
Datum
mhist_slice(PG_FUNCTION_ARGS) {
    asdf("mhist_slice");

    mhist *hist = PG_GETARG_MHIST_P(0);
    char *dim_name = PG_GETARG_CSTRING(1);
    int32 val = PG_GETARG_INT32(2);
    mhist *result = NULL;
    int num_result_buckets = -1;
    int bucketid = -1;

    int32 dim = -1;

    /* Verify the integrity of the inputs. */
    check_mhist_integrity(hist, "mhist_slice() argument");

    dim = get_dimid_for_name(hist->hdr.dimNames,
            MHIST_NUM_DIM_NAMES(hist), dim_name);

    if (MHIST_NOT_A_DIMID == dim) {
        elog(ERROR, "mhist_slice(): Dimension '%s' of '%s' not found.",
                dim_name, mhist_to_str(hist));
    }

    /* TODO: Right now, we accept any value. Is that okay? */

    /* Create the empty output histogram. */
    result = create_proj_mhist(hist, dim);

    /* Project the buckets and adjust their counts. While we're at it, remove
     * buckets that don't overlap with the indicated slice. */
    num_result_buckets = 0;
    for (bucketid = 0; bucketid < MHIST_NUM_BUCKETS_USED(hist); bucketid++)
    {
        int min_val = MHIST_DIMINFO_PTR(hist, bucketid, dim)->min_val;
        int max_val = MHIST_DIMINFO_PTR(hist, bucketid, dim)->max_val;

        /* Should we copy this bucket over? */
        if (val >= min_val && val <= max_val) {
            mhist_bucket *bkt_ptr = NULL;

            project_mhist_bucket(
                    MHIST_BUCKET_PTR(hist, bucketid),
                    MHIST_BUCKET_PTR(result, num_result_buckets),
                    MHIST_NUM_DIMS(hist), dim);

            /* Adjust the count of the bucket.
*/ + bkt_ptr = MHIST_BUCKET_PTR(result, num_result_buckets); + + /* We're taking a slice that is one unit wide. */ + bkt_ptr->count /= (max_val - min_val + 1.0); + + num_result_buckets++; + } + } + + Assert(MHIST_NUM_BUCKETS_ALLOC(result) >= num_result_buckets); + + MHIST_SET_NUM_BUCKETS_USED(result, num_result_buckets); + + zxcv("mhist_slice"); + + PG_RETURN_MHIST_P(result); +} + +/* + * Function that performs the approximate equijoin of two relations using + * MHISTs. + * + * Usage: mhist_equijoin(hist1, col1, hist2, col2) + * + * Where: + * <hist1> and <hist2> are MHISTs created with mhist_in(). + * <col1> and <col2> are the names of columns for the MHISTs. + */ +Datum +mhist_equijoin(PG_FUNCTION_ARGS) +{ + asdf("mhist_equijoin"); + + /* Function arguments. */ + mhist *hist1 = PG_GETARG_MHIST_P(0); + char* colname1 = PG_GETARG_CSTRING(1); + mhist *hist2 = PG_GETARG_MHIST_P(2); + char* colname2 = PG_GETARG_CSTRING(3); + + /* Locals */ + mhist *xp = NULL; + mhist *result = NULL; + int xp_dim1 = -1; + int xp_dim2 = -1; + int col1 = -1; + int col2 = -1; + int nameid = -1; + + /* Check validity of arguments. */ + check_mhist_integrity(hist1, "mhist_equijoin() hist1"); + check_mhist_integrity(hist2, "mhist_equijoin() hist2"); + + col1 = get_dimid_for_name(hist1->hdr.dimNames, + MHIST_MAX_COL_NAMES, colname1); + col2 = get_dimid_for_name(hist2->hdr.dimNames, + MHIST_MAX_COL_NAMES, colname2); + + if (MHIST_NOT_A_DIMID == col1) { + elog(ERROR, "mhist_equijoin(): Dimension '%s' of '%s' not found.", + colname1, mhist_to_str(hist1)); + } + if (MHIST_NOT_A_DIMID == col2) { + elog(ERROR, "mhist_equijoin(): Dimension '%s' of '%s' not found.", + colname2, mhist_to_str(hist2)); + } + + + /* First, compute the cross-product of the two histograms. Using the + * cross-product isn't terribly space-efficient, but it makes + * correctness easier. + * + * TODO: Compute the join directly. 
+ */ + xp = mhist_cross_product(hist1, hist2); + /*check_mhist_integrity(xp, "mhist_equijoin() xp");*/ + + + elog(LOG, "mhist_equijoin(): Starting."); + elog(LOG, "mhist_equijoin(): Left arg is (%s).", + mhist_to_str(hist1)); + elog(LOG, "mhist_equijoin(): Right arg is (%s).", + mhist_to_str(hist2)); + elog(LOG, "mhist_equijoin(): Cross product is (%s).", + mhist_to_str(xp)); + + /* Map our dimensions into the dimensions of the cross-product histogram. + */ + xp_dim1 = col1; + xp_dim2 = MHIST_NUM_DIMS(hist1) + col2; + + /* Next, perform the selection. */ + mhist_equality_select_in_place(xp, xp_dim1, xp_dim2);; + + /*check_mhist_integrity(xp, "mhist_equijoin() done selection");*/ + + /* Eliminate zero-size buckets. */ + remove_empty_mhist_buckets(xp); + + /*check_mhist_integrity(xp, "mhist_equijoin() removed empty");*/ + + /* Finally, project the resulting histogram so we have only one copy of the + * join column. */ + result = real_mhist_project(xp, xp_dim2); + /*check_mhist_integrity(xp, "mhist_equijoin() result");*/ + + /* The projection removed the names of the second column in the equijoin, + * but we want the names transferred to the first column. Fix up the names + * array. */ + for (nameid = 0; nameid < MHIST_NUM_DIM_NAMES(hist2); nameid++) { + if (MHIST_NAME_DIM(hist2, nameid) == col2) { + int new_id = MHIST_NUM_DIM_NAMES(result); + if (new_id >= MHIST_MAX_COL_NAMES) { + elog(ERROR, + "Too many column names (%d >= %d) creating equijoin.", + new_id, MHIST_MAX_COL_NAMES); + } + result->hdr.dimNames[new_id] = hist2->hdr.dimNames[nameid]; + MHIST_SET_NAME_DIM(result, new_id, xp_dim1); + MHIST_SET_NUM_DIM_NAMES(result, MHIST_NUM_DIM_NAMES(result) + 1); + } + } + + + pfree(xp); + + zxcv("mhist_equijoin"); + + /* TODO: Should we free hist1 and hist2? */ + PG_RETURN_MHIST_P(result); +} + +/* + * Function that performs the approximate cross-product of two relations using + * MHISTs. 
+ * + * Usage: mhist_crossprod(hist1, hist2) + * + * Where: + * <hist1> and <hist2> are MHISTs created with mhist_in(). + */ +Datum +mhist_crossprod(PG_FUNCTION_ARGS) +{ + asdf("mhist_crossprod"); + + /* Function arguments. */ + mhist *hist1 = PG_GETARG_MHIST_P(0); + mhist *hist2 = PG_GETARG_MHIST_P(1); + + /* Locals */ + mhist *xp = NULL; + + /* Check validity of arguments. */ + check_mhist_integrity(hist1, "mhist_crossprod() hist1"); + check_mhist_integrity(hist2, "mhist_crossprod() hist2"); + + xp = mhist_cross_product(hist1, hist2); + + zxcv("mhist_crossprod"); + + /* TODO: Should we free hist1 and hist2? */ + PG_RETURN_MHIST_P(xp); +} + +/* + * Function that performs the approximate equality selection on an MHIST. + * + * Usage: mhist_eq_selection(hist, col1, col2) + * + * Where: + * <hist> is an MHIST created with mhist_in(). + * <col1> and <col2> are the names of columns stored inside <hist>. + */ +Datum +mhist_eq_selection(PG_FUNCTION_ARGS) +{ + asdf("mhist_eq_selection"); + + /* Function arguments. */ + mhist *hist = PG_GETARG_MHIST_P(0); + char* colname1 = PG_GETARG_CSTRING(1); + char* colname2 = PG_GETARG_CSTRING(2); + + /* Locals */ + mhist *result = NULL; + int dim1 = -1, dim2 = -1; + + /* Check validity of arguments. */ + check_mhist_integrity(hist, "mhist_eq_selection()"); + + dim1 = get_dimid_for_name(hist->hdr.dimNames, + MHIST_MAX_COL_NAMES, colname1); + dim2 = get_dimid_for_name(hist->hdr.dimNames, + MHIST_MAX_COL_NAMES, colname2); + + if (MHIST_NOT_A_DIMID == dim1) { + elog(ERROR, "mhist_eq_selection(): Dimension '%s' of '%s' not found.", + colname1, mhist_to_str(hist)); + } + if (MHIST_NOT_A_DIMID == dim2) { + elog(ERROR, "mhist_eq_selection(): Dimension '%s' of '%s' not found.", + colname2, mhist_to_str(hist)); + } + + result = create_mhist_copy(hist); + + mhist_equality_select_in_place(result, dim1, dim2); + + /* Eliminate zero-size buckets. 
*/ + remove_empty_mhist_buckets(result); + + zxcv("mhist_eq_selection"); + + PG_RETURN_MHIST_P(result); +} + +/* FUNCTION mhist_can_summarize_tuple + * ARGUMENTS: <desc> describes a type of tuple. + * POSTCONDITIONS: Returns TRUE if the current implementation of MHISTs can + * summarize the indicated type of tuple. + */ +bool mhist_can_summarize_tuple(TupleDesc desc) { + int i = -1; + int ncols = get_num_non_timestamp_attrs(desc); + + asdf("mhist_can_summarize_tuple"); + + for (i = 0; i < ncols; i++) { + + /* Skip the timestamp, if any. */ + int attrid = get_attr_id_skipping_ts(i + 1, desc); + + + /* Is this attribute an integer? */ + switch (desc->attrs[attrid - 1]->atttypid) { + case INT2OID: + case INT4OID: + case CHAROID: + case BYTEAOID: + /* We can insert values from this index into the MHIST. */ + break; + + default: + elog(LOG, "Can't build an MHIST on columns of type '%s'", + format_type_with_typemod(desc->attrs[i]->atttypid, + desc->attrs[i]->atttypmod)); + zxcv("mhist_can_summarize_tuple"); + return false; + } + } + + zxcv("mhist_can_summarize_tuple"); + + /* Haven't found any indication that we can't summarize this tuple... */ + return true; +} + +/* FUNCTION get_timestamp_col + * ARGUMENTS: <desc> is a tuple descriptor. + * PRECONDITIONS: <desc> represents a stream. + * POSTCONDITIONS: Returns the attrid of the timestamp attribute for the + * stream. + */ +int get_timestamp_col(TupleDesc desc) +{ + Assert(desc != NULL); + + if (NULL == desc->constr) { + /* No constraints, so no timestamp. */ + return TCQTSATTRNOP; + } + + /* Someone decided to make ts_attrnum 0-based, rather than 1-based. Rather + * than correct that mistake, we add 1 to convert to the actual attribute + * number. */ + /* TODO: Fix the original problem. */ + return 1 + (desc->constr->ts_attrnum); +} + +/* FUNCTION get_num_non_timestamp_attrs + * ARGUMENTS: <desc> points to a tuple descriptor. + * POSTCONDITIONS: Returns the number of attributes of <desc> that are not + * timestamps. 
+ */ +int get_num_non_timestamp_attrs(TupleDesc desc) +{ + int ts_col = -1; + Assert(desc != NULL); + + ts_col = get_timestamp_col(desc); + + if (ts_col != TCQTSATTRNOP) { + /* Have a timestamp column. */ + return desc->natts - 1; + } else { + /* No timestamp. */ + return desc->natts; + } + +} + +/* FUNCTION get_attr_id_skipping_ts + * ARGUMENTS: <id> is an attribute number and <desc> is a tuple descriptor. + * PRECONDITIONS: <id> is >= 1 and <= the number of non-TCQTIME attributes in + * <desc>. + * POSTCONDITIONS: Returns the attribute number of the <id>th attribute that is + * not a timestamp. + */ +int get_attr_id_skipping_ts(int id, TupleDesc desc) +{ + int ts_col = -1; + Assert(desc != NULL); + Assert(id >= 1 && id <= get_num_non_timestamp_attrs(desc)); + + ts_col = get_timestamp_col(desc); + + if (ts_col != TCQTSATTRNOP && id >= ts_col) { + return id + 1; + } else { + return id; + } +} + +/* FUNCTION get_int_field_val + * ARGUMENTS: <tup> is a tuple described by <desc>, and <fnum> is a field + * number for <tup>. <skip_ts> is true if you wish to skip the timestamp + * field of the tuple. + * PRECONDITIONS: <fnum> is 1-based, and the indicated field is not NULL. + * POSTCONDITIONS: Fetches the indicated field of the tuple, converts it into + * 32-bit integer, and returns it. + */ +int64 get_int_field_val(HeapTuple tup, TupleDesc desc, int fnum, bool skip_ts) { + Datum d; + int64 val = -1; + bool is_null = false; + Oid typoid; + int16 typlen = -1; + bool typbyval = false; + + Assert(tup != NULL); + Assert(desc != NULL); + + /* We skip over the timestamp column, if desired. */ + if (skip_ts) { + fnum = get_attr_id_skipping_ts(fnum, desc); + } + + /* Fetch the value of the tuple along this dimension. */ + /* TODO: Should we be using fastgetattr()? */ + d = heap_getattr(tup, fnum, desc, &is_null); + + Assert(!is_null); + + /* There are three kinds of integer, each of which has a different + * method of extracting from a Datum. 
Figure out which one to use + * here. */ + typoid = desc->attrs[fnum - 1]->atttypid; + + get_typlenbyval(typoid, &typlen, &typbyval); + + /*elog(LOG, "get_int_field_val(): Length of integer is %d.", typlen);*/ + + switch(typlen) { + case 1: + /* 8-bit integer... */ + val = DatumGetUInt8(d); + break; + + case 2: + val = DatumGetInt16(d); + break; + + case 4: + val = DatumGetInt32(d); + break; + + case 8: + val = DatumGetInt64(d); + break; + + default: + elog(ERROR, "get_int_field_val(): Got invalid field length %d.", + typlen); + break; + } + + return val; +} + + + +/******************************************************************************* + * INTERNAL FUNCTIONS + ******************************************************************************/ + +/* FUNCTION create_mhist_copy + * ARGUMENTS: <hist> is an MHIST. + * PRECONDITIONS: <hist> is properly initialized. + * POSTCONDITIONS: Creates a copy of <hist>. + */ +static mhist * create_mhist_copy(const mhist *orig) { + mhist *result = NULL; + int len = MHIST_SIZE(MHIST_NUM_BUCKETS_ALLOC(orig), MHIST_NUM_DIMS(orig)); + + result = palloc(len); + + /* Since there are no pointers stored in the MHIST currently, we can just + * do a byte-by-byte copy. */ + memcpy(result, orig, len); + + check_mhist_integrity(result, "create_mhist_copy()"); + + return result; +} + +/* FUNCTION create_bigger_mhist + * ARGUMENTS: <hist> is an MHIST. + * PRECONDITIONS: <hist>'s numBucketsUsed header field is set correctly. + * POSTCONDITIONS: Creates a copy of <hist> with some empty space for new + * buckets. + */ +static mhist *create_bigger_mhist(mhist *hist) { + /* Double the size of the buckets array. */ + mhist *ret = create_mhist(2 * MHIST_NUM_BUCKETS_ALLOC(hist), + MHIST_NUM_DIMS(hist)); + + /* Copy information about dimension names. */ + memcpy(ret->hdr.dimNames, hist->hdr.dimNames, MHIST_NAMESZ); + MHIST_SET_NUM_DIM_NAMES(ret, MHIST_NUM_DIM_NAMES(hist)); + + /* Set the correct number of buckets used. 
*/ + MHIST_SET_NUM_BUCKETS_USED(ret, MHIST_NUM_BUCKETS_USED(hist)); + + /* Copy the buckets en masse. */ + memcpy(ret->buckets, hist->buckets, + MHIST_NUM_BUCKETS_USED(hist) + * MHIST_BUCKET_LEN(MHIST_NUM_DIMS(hist))); + + return ret; +} + + +/* FUNCTION real_mhist_in + * ARGUMENTS: <relname> is the name of the relation on which to build a + * histogram. <cols> and <ncols> represent which columns of the relation + * to build on, and <num_buckets> is the number of buckets to use. + * PRECONDITIONS: All strings are non-NULL, and the number of buckets is + * greater than zero. + * POSTCONDITIONS: Attempts to create an mhist on the indicated relation. + * Calls elog() on failure. + */ +static mhist * real_mhist_in(char *relname, + char **cols, + int ncols, + int num_buckets) +{ + asdf("real_mhist_in"); + mhist *result = NULL; + + /* Create the histogram data structure. */ + result = create_mhist(num_buckets, ncols); + + /* Fill in information about the different dimensions. */ + mhist_set_nameinfo(result, relname, cols, ncols); + + + /* Load the histogram. */ + load_mhist(relname, cols, result); + + /* Pass our newly-minted histogram back up the stack. */ + zxcv("real_mhist_in"); + return result; +} + +/* FUNCTION mhist_set_nameinfo + * ARGUMENTS: <relname> is a relation name, and <cols> is an array of column + * names. <hist> is an MHIST. + * PRECONDITIONS: The histogram is to be built on the indicated columns of the + * indicated relation, with dimensions in the same order as <cols>. + * POSTCONDITIONS: Fills in the dimension name information of the histogram. 
+ */ +static void mhist_set_nameinfo(mhist *hist, char *relname, + char **cols, int ncols) { + int dimid = -1; + + for (dimid = 0; dimid < ncols; dimid++) { + MHIST_SET_NAME_DIM(hist, dimid, dimid); + snprintf(MHIST_DIM_NAME(hist, dimid), MHIST_COL_NAME_LEN, + "%s.%s", relname, cols[dimid]); + /* We're assuming that the entire histogram was zeroed out; + * otherwise, we would have to pad the rest of the name field with + * zeros to avoid screwing up postgres hashing. */ + } + + MHIST_SET_NUM_DIM_NAMES(hist, ncols); +} + + + +/* FUNCTION get_table_as_array + * ARGUMENTS: <relname>, <cols>, and <ncols> are as with real_mhist_in(). + * <ntuples> points to a location where we will put the number of tuples + * read. <descdest> points to where we will put a descriptor for the + * tuples in the array we return. + * POSTCONDITIONS: Creates an array of HeapTuples of the appropriate size + * and loads it with the contents of the indicated columns. Returns + * the array, and sticks a tuple descriptor into *descdest. + */ +static HeapTuple *get_table_as_array(char *relname, char **cols, int ncols, + int *ntuples, TupleDesc *descdest) { + asdf("get_table_as_array"); + + StringInfo sql = makeStringInfo(); + int dimid = -1, return_code = -1, proc = -1; + int tupnum = -1; + + HeapTuple *ret = NULL; + /* Will be returned. */ + + MemoryContext main_context; + MemoryContext oldcontext; + + Assert(ntuples != NULL); + + + /* First, we snarf all the data out of the table. At some point in the + * future, we will probably do some sampling here, but for now, we just + * snarf. + * + * In order to get at the tuples, we create a query, namely: + * + * SELECT col1, col2, ... 
, coln FROM relation; + */ + appendStringInfo(sql, "SELECT "); + + for (dimid = 0; dimid < ncols; dimid++) { + + appendStringInfo(sql, "%s.%s", relname, cols[dimid]); + if (dimid < ncols - 1) { + appendStringInfo(sql, ", "); + } + } + appendStringInfo(sql, " FROM %s" , relname); + + elog(LOG, "get_table_as_array(): Query is '%s'.", sql->data); + + /* The Server Programming Interface is going to switch memory contexts + * behind our back, so we need to remember what the original context was. + */ + main_context = CurrentMemoryContext; + + /* Connect to the Server Programming Interface (see Postgresql Programmer's + * Reference, Chapter 17). */ + if ((return_code = SPI_connect()) < 0) { + elog(ERROR, "get_table_as_array(): SPI_connect returned %d.", + return_code); + } + + /* Execute the query. */ + SetQuerySnapshot(); /* Don't know why we need to do this. */ + return_code = SPI_exec(sql->data, 0); + /* The 0 here for number of tuples to retrieve apparently means to get + * as many tuples as possible. */ + proc = SPI_processed; + + elog(LOG, "get_table_as_array(): Query generated %d tuples.", proc); + + /* Make a copy of the tuple descriptor. See note above about memory + * contexts... */ + oldcontext = MemoryContextSwitchTo(main_context); + *descdest = CreateTupleDescCopy(SPI_tuptable->tupdesc); + MemoryContextSwitchTo(oldcontext); + + + /* SPECIAL CASE: Empty result. */ + if (0 == proc) { + *ntuples = 0; + + /* The caller should free the returned pointer unconditionally, so we + * need to allocate _something_. */ + /* See note above about memory contexts. */ + oldcontext = MemoryContextSwitchTo(main_context); + ret = palloc(1); + MemoryContextSwitchTo(oldcontext); + + goto cleanup; + } + /* END SPECIAL CASE */ + + /* Fetch the result tuples and put them into a giant array. */ + *ntuples = proc; + + /* See note above about memory contexts. 
*/ + oldcontext = MemoryContextSwitchTo(main_context); + ret = palloc(proc * sizeof(HeapTuple)); + MemoryContextSwitchTo(oldcontext); + + for (tupnum = 0; tupnum < proc; tupnum++) { + HeapTuple current_tup; + + /* See note above about memory contexts. */ + oldcontext = MemoryContextSwitchTo(main_context); + current_tup = heap_copytuple(SPI_tuptable->vals[tupnum]); + MemoryContextSwitchTo(oldcontext); + + ret[tupnum] = current_tup; + } + + +cleanup: + + /* Disconnect from the Server Programming Interface. */ + if ((return_code = SPI_finish()) != SPI_OK_FINISH) { + elog(ERROR, "get_table_as_array(): SPI_finish() returned %d.", + return_code); + } + + zxcv("get_table_as_array"); + + return ret; +} + +/* FUNCTION split_a_bucket() + * ARGUMENTS: <giant_tup_array> is an array containing <ntuples> tuples, + * currently divided into (contiguous) buckets at locations given by + * <bucket_offsets> and <bucket_lengths>. + * PRECONDITIONS: <giant_tup_array> has been properly divided up. + * POSTCONDITIONS: If possible, splits one of the buckets and redistributes + * tuples accordingly. Updates the <bucket_offsets> and <bucket_lengths> + * arrays. Returns TRUE if it managed to split a bucket, FALSE + * otherwise. + */ +static bool split_a_bucket(HeapTuple *giant_tup_array, int ntuples, + TupleDesc desc, + int *bucket_offsets, int *bucket_lengths, mhist *hist, + mhist_dim_info *ranges, int nbuckets_done) { + + asdf("split_a_bucket"); + + int split_bucketid = -1; + int split_dimid = -1; + int split_offset = -1; + + int offset = -1, length = -1; + HeapTuple *bucketstart = NULL; + + mhist_bucket *maxdiff_bucket_ptr = NULL; + mhist_bucket *new_bucket_ptr = MHIST_BUCKET_PTR(hist, nbuckets_done); + /* The actual bucket (currently empty) */ + + mhist_dim_info *maxdiff_di = NULL; + mhist_dim_info *newbucket_di = NULL; + + int split_val = -1; + + int ndims = (NULL == desc) ? 
0 : get_num_non_timestamp_attrs(desc); + + Assert(0.0 == new_bucket_ptr->count); + + /*elog(LOG, "split_a_bucket(): Checking whether we can split.");*/ + + /* SPECIAL CASE: Can't perform any splits. */ + if (no_buckets_to_split(giant_tup_array, bucket_offsets, bucket_lengths, + nbuckets_done, desc)) { + return false; + } + /* END SPECIAL CASE */ + + /*elog(LOG, "split_a_bucket(): Figuring out where to split.");*/ + + /* Figure out where to split. */ + get_mhist_split_point(giant_tup_array, desc, bucket_offsets, + bucket_lengths, nbuckets_done, ranges, + &split_bucketid, &split_dimid, &split_offset); + + /*elog(LOG, "split_a_bucket(): Starting split.");*/ + + /* Now we split the chosen bucket along the chosen dimension. */ + maxdiff_bucket_ptr = MHIST_BUCKET_PTR(hist, split_bucketid); + offset = bucket_offsets[split_bucketid]; + length = bucket_lengths[split_bucketid]; + bucketstart = giant_tup_array + offset; + + /*elog(LOG, "split_a_bucket(): Sorting.");*/ + + /* First we sort along the dimension. */ + sort_tuples_along_dim(bucketstart, length, split_dimid, desc); + + /*elog(LOG, "split_a_bucket(): Performing bookkeeping.");*/ + + /* The sorting has already partitioned the tuples; we just need to + * make note of the new bucket boundary! */ + bucket_offsets[nbuckets_done] = + bucket_offsets[split_bucketid] + split_offset + 1; + + + /* The new bucket starts immediately after the current + * tuple. */ + bucket_lengths[nbuckets_done] = + bucket_lengths[split_bucketid] - split_offset - 1; + + bucket_lengths[split_bucketid] = split_offset; + + + /* Adjust the bucket boundaries in the histogram data structure. */ + + /* We start out by making a copy of the bucket we just split. */ + memcpy(new_bucket_ptr, maxdiff_bucket_ptr, MHIST_BUCKET_LEN(ndims)); + + /* The we adjust the boundaries of the copies so that they split + * the space along the maxdiff line. 
*/ + maxdiff_di = &(maxdiff_bucket_ptr->diminfo[split_dimid]); + newbucket_di = &(new_bucket_ptr->diminfo[split_dimid]); + + split_val = get_int_field_val(bucketstart[split_offset], desc, + split_dimid + 1, true); + /* + elog(LOG, "split_a_bucket(): Splitting at value %d on dimension %d.", + split_val, split_dimid); + */ + + /* By convention, the bucket contains values x, such that + * min_val <= x <= max_val. + */ + maxdiff_di->max_val = split_val - 1; + newbucket_di->min_val = split_val; + + + /* We've created a hard boundary, so these two buckets should no + * longer accept outliers in the direction of the boundary. */ + maxdiff_di->accept_above_max = false; + newbucket_di->accept_below_min = false; + + zxcv("split_a_bucket"); + + return true; +} + +/* Comparison function for passing to qsort. The globals before it are used to + * pass parameters to it on a "side channel". */ +static int global_cmpf_dimid = -1; +static TupleDesc global_cmpf_desc = NULL; +static int cmpf(const void * left, const void * right) { + HeapTuple *left_ptr = (HeapTuple*)left; + HeapTuple *right_ptr = (HeapTuple*)right; + int left_val = get_int_field_val(*left_ptr, global_cmpf_desc, + global_cmpf_dimid + 1, true); + int right_val = get_int_field_val(*right_ptr, global_cmpf_desc, + global_cmpf_dimid + 1, true); + + /* We return a number less than zero if left < right */ + if (left_val < right_val) { + return -1; + } else if (left_val == right_val) { + return 0; + } else { + return 1; + } +} + +/* FUNCTION sort_tuples_along_dim() + * ARGUMENTS: <tuples> points to an array of <length> tuples, each of <ndims> + * dimensions. <dimid> is a dimension < <ndims>. + * PRECONDITIONS: The specified range stays within the actual bounds of the + * array. + * POSTCONDITIONS: Sorts the range of tuples along dimension <dimid>. 
+ */ +static void sort_tuples_along_dim(HeapTuple *tuples, int length, int dimid, + TupleDesc desc) { + + asdf("sort_tuples_along_dim"); + + /* Tell our comparison function which dimension to pay attention to. */ + global_cmpf_dimid = dimid; + global_cmpf_desc = desc; + + /* Then we can let qsort do the heavy lifting... */ + qsort(tuples, length, sizeof(*tuples), &cmpf); + + zxcv("sort_tuples_along_dim"); +} + + + +/* FUNCTION load_mhist + * ARGUMENTS: <relname> and <cols> are as with real_mhist_in(), and + * <hist> is an appropriately-sized and initialized MHIST data + * structure. + * PRECONDITIONS: The histogram is initial... [truncated message content] |