Thread: [Beet-users] Patch MySQL support in beet-utils
Status: Beta
Brought to you by:
tormp
From: Pierre Q. <pie...@ze...> - 2010-06-17 20:19:32
|
From 59fb42994bd7339ae6c9bc138271ebf393e148cb Mon Sep 17 00:00:00 2001 From: Pierre Queinnec <pie...@ze...> Date: Thu, 17 Jun 2010 14:47:08 +0200 Subject: [PATCH 1/2] Add MySQL support to the beet-utils. Use with the '-tool csv'. --- utils/src/main/sh/load-events-mysql.sh | 59 ++++++++++++++++++++++ utils/src/main/sql/etl/mysql/create_etl.sql | 38 ++++++++++++++ utils/src/main/sql/etl/mysql/event_summarize.sql | 40 +++++++++++++++ utils/src/main/sql/etl/mysql/load_event_csv.ctl | 17 ++++++ 4 files changed, 154 insertions(+), 0 deletions(-) create mode 100755 utils/src/main/sh/load-events-mysql.sh create mode 100644 utils/src/main/sql/etl/mysql/create_etl.sql create mode 100644 utils/src/main/sql/etl/mysql/event_summarize.sql create mode 100644 utils/src/main/sql/etl/mysql/load_event_csv.ctl diff --git a/utils/src/main/sh/load-events-mysql.sh b/utils/src/main/sh/load-events-mysql.sh new file mode 100755 index 0000000..415b75e --- /dev/null +++ b/utils/src/main/sh/load-events-mysql.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +## function to echo a usage message and exit. +function usage { + cat <<EOF + +Usage: $0 [mysql client args] [event log] + + Example: + + > $0 '-h 127.0.0.1 -P 3306 -D analysis' log.bxml.gz.20080101000000 + +EOF + exit 1 +} + +## check arg count. +if [[ $# < 2 ]]; then + usage; +fi + +## verify that second arg is an actual file. +if [ ! -f $2 ]; then + echo + echo "\"$2\" does not exist or is not a regular file." + usage; +fi + +if [ -z "$JAVA_HOME" ]; then + JAVA=java +else + JAVA=$JAVA_HOME/bin/java +fi + +echo "Loading $2 into behavior tracking database at $1." +echo "Start: " $(date) + +case "`uname`" in +# FIXME Cygwin support +Darwin) + # FIXME use a randfile + mkfifo /tmp/import-mysql.fifo + gzcat "$2" | "$JAVA" -jar beet-utils.jar -tool csv > /tmp/import-mysql.fifo & + mysql $1 < sql/etl/mysql/load_event_csv.ctl + rm /tmp/import-mysql.fifo + ;; + +*) + mkfifo /tmp/import-mysql.fifo + zcat "$2" | "$JAVA" -jar beet-utils.jar -tool csv > /tmp/import-mysql.fifo & + mysql $1 < sql/etl/mysql/load_event_csv.ctl + rm /tmp/import-mysql.fifo + ;; +esac + +echo "Summarizing data..." +mysql $1 < sql/etl/mysql/event_summarize.sql + +echo "End: " $(date) diff --git a/utils/src/main/sql/etl/mysql/create_etl.sql b/utils/src/main/sql/etl/mysql/create_etl.sql new file mode 100644 index 0000000..8eff6f4 --- /dev/null +++ b/utils/src/main/sql/etl/mysql/create_etl.sql @@ -0,0 +1,38 @@ +-- this script showcases a strange MySQL bug (probably) on case-insensitive FS, +-- where the event table name ends up in lowercase and the summary one in upper +-- You'll need to use the classic 'set-variable=lower_case_table_names=0' + +-- event data table for basic analysis +create table BEHAVIOR_TRACKING_EVENT ( + EVENT_ID VARCHAR(100) NOT NULL, + PARENT_EVENT_ID VARCHAR(100), + EVENT_TYPE VARCHAR(20) NOT NULL, + EVENT_NAME VARCHAR(256) NOT NULL, + EVENT_START DATE NOT NULL, + USER_ID VARCHAR(64), + SESSION_ID VARCHAR(64), + APPLICATION VARCHAR(64) NOT NULL, + DURATION_NS BIGINT(16) NOT NULL, + EVENT_DATA BLOB, + ERROR VARCHAR(512), + SUMMARIZED CHAR(1) +); +create index EVENT_NAME_IDX on BEHAVIOR_TRACKING_EVENT ( + EVENT_NAME ASC +); + +-- event summary table for trending over long periods of time +create table BEHAVIOR_TRACKING_SUMMARY ( + SUMMARY_DATE VARCHAR(100) NOT NULL, + EVENT_TYPE VARCHAR(20) NOT NULL, + EVENT_NAME VARCHAR(256) NOT NULL, + PERIOD_START DATE NOT NULL, + PERIOD_END DATE NOT NULL, + `COUNT` BIGINT(16) NOT NULL, + AVERAGE_NS BIGINT(16) NOT NULL, + MINIMUM_NS BIGINT(16) NOT NULL, + MAXIMUM_NS BIGINT(16) NOT NULL, + MEDIAN_NS BIGINT(16) NOT NULL, + STANDARD_DEVIATION BIGINT(16) NOT NULL, + ERROR_COUNT BIGINT(16) NOT NULL +); diff --git a/utils/src/main/sql/etl/mysql/event_summarize.sql b/utils/src/main/sql/etl/mysql/event_summarize.sql new file mode 100644 index 0000000..44af0de --- /dev/null +++ b/utils/src/main/sql/etl/mysql/event_summarize.sql @@ -0,0 +1,40 @@ +-- FIXME benchmark this median impl against the median UDF and the Wikipedia impl +-- (the following one uses a self-join...) +INSERT INTO BEHAVIOR_TRACKING_SUMMARY + (SUMMARY_DATE, EVENT_TYPE, EVENT_NAME, PERIOD_START, PERIOD_END, COUNT, + AVERAGE_NS, MINIMUM_NS, MAXIMUM_NS, median_ns, STANDARD_DEVIATION, ERROR_COUNT) +SELECT NOW(), evt.* + FROM ( + SELECT event.event_type, event.event_name, + MIN(event_start) period_start, + MAX(event_start) period_end, + COUNT(*) event_count, + AVG(duration_ns) average_ns, + MIN(duration_ns) min_ns, + MAX(duration_ns) max_ns, + + (SELECT AVG(tmp.median) AS median_ns + FROM ( + SELECT x.duration_ns as median + FROM BEHAVIOR_TRACKING_EVENT x, BEHAVIOR_TRACKING_EVENT y + GROUP BY x.duration_ns + HAVING + ((SUM(SIGN(1 - SIGN(y.duration_ns - x.duration_ns)))) >= FLOOR((COUNT(*) + 1) / 2)) + AND + ((SUM(SIGN(1 + SIGN(y.duration_ns - x.duration_ns)))) >= FLOOR((COUNT(*) + 1) / 2)) + ) AS tmp + ), + + STDDEV(duration_ns) std_dev_ns, + (SELECT COUNT(*) + FROM BEHAVIOR_TRACKING_EVENT tmp + WHERE tmp.summarized is NULL + and event_name = event.event_name + and event_type = event.event_type + and error IS NOT NULL) error_cnt + FROM BEHAVIOR_TRACKING_EVENT event + WHERE event.summarized IS NULL + GROUP BY event_name, event_type + ) evt; + UPDATE BEHAVIOR_TRACKING_EVENT SET summarized='Y' WHERE summarized IS NULL; + COMMIT; diff --git a/utils/src/main/sql/etl/mysql/load_event_csv.ctl b/utils/src/main/sql/etl/mysql/load_event_csv.ctl new file mode 100644 index 0000000..35ffd6a --- /dev/null +++ b/utils/src/main/sql/etl/mysql/load_event_csv.ctl @@ -0,0 +1,17 @@ +LOAD DATA INFILE '/tmp/import-mysql.fifo' + INTO TABLE BEHAVIOR_TRACKING_EVENT + FIELDS TERMINATED BY "," OPTIONALLY ENCLOSED BY '"' + IGNORE 1 LINES + ( + EVENT_ID, + PARENT_EVENT_ID, + EVENT_TYPE, + EVENT_NAME, + APPLICATION, + EVENT_START, + DURATION_NS, + USER_ID, + SESSION_ID, + ERROR, + EVENT_DATA + ) -- 1.7.1 |