prepare.sh

#!/usr/bin/env bash

# Print the usage/help text of this script to stdout.
# The heredoc is unquoted on purpose: ${0##*/} (the script's file name) and
# $CONFIGFILE are expanded when the text is printed. CONFIGFILE is not set in
# this file; presumably it comes from common.sh.
show_help() {
    cat <<EOF
    Usage: bash ${0##*/} [-s scale] [-i pginstdir] [-d pgdatadir] [-t tpchtmp]
    [-p pgport] [-n tpchdbname] [-g dbgenpath] [-e] [-r] [-x] [-a] [-q] [-h]

    Prepare Postgres cluster for running TPC-H queries:
      * Remove everything inside <pgdatadir>
      * Create Postgres cluster at <pgdatadir> via initdb from <pginstdir>
      * Add configuration from postgresql.conf to default configuration at
        <pgdatadir>/postgresql.conf, if the former exists
      * Run the cluster on port <pgport>
      * Generate *.tbl files with TPC-H data, if needed
      * Create database with TPC-H tables named <tpchdbname>
      * Fill these tables with generated (or existing) data
      * Remove generated data, if needed
      * Create indexes, if needed
      * Reset Postgres state (vacuum-analyze-checkpoint)
      * Generate the TPC-H queries, if needed, and put them
        to <pgdatadir>/queries

    Options
    The first seven options are read from $CONFIGFILE file, but you can override
    them in command line args. See their meaning in that file. The rest are:

    -e don't generate *.tbl files, use the existing ones
    -r don't remove generated files after use, they are removed by default
    -x don't create indexes, they are created by default
    -a disable sanity checks, which refuse to proceed if Postgres is built
       with assertions or wal_level is not minimal
    -q don't generate queries, they are generated by default
    -h display this help and exit

    Example:
    ./prepare.sh -s 2 -d /mnt/ramdisk/mytpch-2
EOF
}

# Load the shared helpers. This script calls read_conf, die, timer,
# postgres_start, postgres_stop, wait_jobs and gen_queries, none of which is
# defined in this file, so they are expected to come from common.sh.
source common.sh
# Stop right away if the helpers are missing: otherwise none of the later
# "|| die" checks could abort, and the script would still go on to wipe the
# data directory.
if ! command -v read_conf >/dev/null 2>&1; then
    echo "helper functions from common.sh are not available, aborting" >&2
    exit 1
fi
# Read the configurable options from the configuration file; command line
# arguments parsed afterwards may override them.
read_conf "$CONFIGFILE"

# Boolean switches. Each one is on by default and is switched off by the
# corresponding command line flag handled below.
GENQUERIES=true
SANITYCHECKS=true
CREATEINDEXES=true
REMOVEGENDATA=true
GENDATA=true

# Parse the command line. Options taking a value overwrite the matching
# setting; an unknown option prints the help to stderr and fails.
OPTIND=1
while getopts "s:i:d:t:p:n:g:erxaqh" flag; do
    case $flag in
	# settings with a value
	s) SCALE="$OPTARG" ;;
	i) PGINSTDIR="$OPTARG" ;;
	d) PGDATADIR="$OPTARG" ;;
	t) TPCHTMP="$OPTARG" ;;
	p) PGPORT="$OPTARG" ;;
	n) TPCHDBNAME="$OPTARG" ;;
	g) DBGENPATH="$OPTARG" ;;
	# plain switches
	e) GENDATA=false ;;
	r) REMOVEGENDATA=false ;;
	x) CREATEINDEXES=false ;;
	a) SANITYCHECKS=false ;;
	q) GENQUERIES=false ;;
	# help and errors
	h) show_help; exit 0 ;;
	\?) show_help >&2; exit 1 ;;
    esac
done

# Every setting is mandatory: abort on the first one that is still empty
# after reading the configuration and the command line.
[ -n "$SCALE" ] || die "scale is empty"
[ -n "$PGINSTDIR" ] || die "pginstdir is empty"
[ -n "$PGDATADIR" ] || die "pgdatadir is empty"
[ -n "$TPCHTMP" ] || die "tpchtmp is empty"
[ -n "$PGPORT" ] || die "pgport is empty"
[ -n "$TPCHDBNAME" ] || die "tpchdbname is empty"
# dbgenpath is required even when no *.tbl files are generated, because the
# queries are generated from it as well.
[ -n "$DBGENPATH" ] || die "dbgenpath is empty"

# Directory containing this script (symlinks resolved).
BASEDIR=$(dirname "$(readlink -f "$0")")
# Binaries and shared libraries of the Postgres installation to use.
PGBINDIR="${PGINSTDIR}/bin"
PGLIBDIR="${PGINSTDIR}/lib"
# Enter the script directory first, so that a relative DBGENPATH is resolved
# against it rather than against the caller's working directory.
cd "$BASEDIR" || die "cannot change to script directory '$BASEDIR'"
cd "$DBGENPATH" || die "dbgen directory not found"
# Absolute path of the dbgen directory; the script stays in this directory.
DBGENABSPATH=$(readlink -f "$(pwd)")

echo "Using Postgres at $PGINSTDIR"
echo "Using datadir at $PGDATADIR"
echo "Scale is $SCALE"
echo "Using dbgen at $DBGENABSPATH"

# ========================== Preparing DB =========================
# Remember the start time; it is passed back to timer at the very end to
# report the elapsed time.
CURRTIME=$(timer)

# Create the database cluster from scratch: any previous content of the data
# directory is destroyed.
rm -rf -- "$PGDATADIR"
mkdir -p -- "$PGDATADIR" || die "Failed to create data directory: '$PGDATADIR'"
# Nothing below can work without a cluster, so stop if initdb fails.
LD_LIBRARY_PATH="$LD_LIBRARY_PATH":"$PGLIBDIR" "$PGBINDIR/initdb" -D "$PGDATADIR" \
	       --encoding=UTF-8 --locale=C ||
    die "initdb failed for '$PGDATADIR'"

# Apply the settings shipped next to this script, if there are any.
if ! [ -f "$BASEDIR/postgresql.conf" ]; then
    echo "Config file postgresql.conf not found, using the default"
else
    # Appending is enough to override the defaults written by initdb:
    # Postgres uses the last setting it reads.
    cat "$BASEDIR/postgresql.conf" >> "$PGDATADIR/postgresql.conf"
    echo "Postgres config applied"
fi

# Start a new instance of Postgres
# NOTE(review): the meaning of the argument 0 is defined by postgres_start,
# which is not part of this file.
postgres_start 0

# Create a database named after the current user to give access: the psql
# call below has no -d option and therefore connects to that database.
# "-h /tmp" makes the client use the Unix socket in /tmp.
LD_LIBRARY_PATH="$LD_LIBRARY_PATH":"$PGLIBDIR" "$PGBINDIR/createdb" -h /tmp \
	       -p "$PGPORT" "$(whoami)" --encoding=UTF-8 --locale=C

# Print the settings most relevant for the benchmark.
echo "Current settings are"
LD_LIBRARY_PATH="$LD_LIBRARY_PATH":"$PGLIBDIR" "$PGBINDIR/psql" -h /tmp -p "$PGPORT" \
	       -c "select name, current_setting(name) from
pg_settings where name in('debug_assertions', 'wal_level',
'checkpoint_segments', 'shared_buffers', 'wal_buffers', 'fsync',
'maintenance_work_mem', 'checkpoint_completion_target', 'max_connections');"

# Sanity checks (disabled with -a): refuse to continue unless wal_level is
# minimal and debug_assertions is off.
if [ "$SANITYCHECKS" = true ]; then
    # Number of output lines of "show wal_level" containing "minimal".
    WAL_LEVEL_MINIMAL=$(LD_LIBRARY_PATH="$LD_LIBRARY_PATH":"$PGLIBDIR" "$PGBINDIR/psql" \
	-h /tmp -p "$PGPORT" -c 'show wal_level' -t | grep -c minimal)
    if [ "$WAL_LEVEL_MINIMAL" != 1 ]; then
	die "Postgres wal_level is not set to minimal; 'Elide WAL traffic' optimization cannot be used"
    fi
    # Number of output lines of "show debug_assertions" containing "on".
    DEBUG_ASSERTIONS=$(LD_LIBRARY_PATH="$LD_LIBRARY_PATH":"$PGLIBDIR" "$PGBINDIR/psql" \
	-h /tmp -p "$PGPORT" -c 'show debug_assertions' -t | grep -c on)
    if [ "$DEBUG_ASSERTIONS" = 1 ]; then
	die "Option debug_assertions is enabled"
    fi
fi

# generate *.tbl files, if needed (skipped with -e)
if [ "$GENDATA" = true ]; then
    # Build dbgen. make runs in the current directory, which is expected to be
    # the dbgen directory entered earlier in the script.
    make -j
    if ! [ -x "$DBGENABSPATH/dbgen" ] || ! [ -x "$DBGENABSPATH/qgen" ]; then
	die "Can't find dbgen or qgen."
    fi
    mkdir -p "$TPCHTMP" || die "Failed to create temporary directory: '$TPCHTMP'"
    # dbgen writes its output to the current directory, so a failed cd would
    # scatter the generated files somewhere else.
    cd "$TPCHTMP" || die "Failed to enter temporary directory: '$TPCHTMP'"
    # needed by ./dbgen
    cp "$DBGENABSPATH/dists.dss" . || die "dists.dss not found"
    cp "$DBGENABSPATH/dss.ddl" . || die "dss.ddl not found" # table definitions
    # foreign & primary keys
    cp "$DBGENABSPATH/dss.ri" . || die "dss.ri not found"

    # Create table files separately, one background dbgen process per table,
    # to have better IO throughput.
    # -f is for overwriting existing files, -T <letter> is "generate only
    # table <letter>". Each letter must appear exactly once: a repeated letter
    # would start two processes writing the same file at the same time.
    # NOTE(review): letters presumably map to customer, supplier, nation,
    # region, orders, lineitem, part and partsupp -- confirm against dbgen.
    for TABLENAME in c s n r O L P S; do
	"$DBGENABSPATH/dbgen" -s "$SCALE" -f -T "$TABLENAME" &
    done
    # Wait for the background dbgen processes started above.
    wait_jobs
    echo "TPC-H data *.tbl files generated at $TPCHTMP"
fi

# Create the TPC-H database; nothing else can be done without it.
if ! LD_LIBRARY_PATH="$LD_LIBRARY_PATH":"$PGLIBDIR" "$PGBINDIR/createdb" -h /tmp \
	       -p "$PGPORT" "$TPCHDBNAME" --encoding=UTF-8 --locale=C; then
    die "Error: Can't proceed without database"
fi
# Record the creation time as a comment on the database.
TIME=$(date)
LD_LIBRARY_PATH="$LD_LIBRARY_PATH":"$PGLIBDIR" "$PGBINDIR/psql" -h /tmp -p "$PGPORT" \
	       -d "$TPCHDBNAME" -c "comment on database
$TPCHDBNAME is 'TPC-H data, created at $TIME'"
echo "TPC-H database created"

# Create the (empty) tables from the definitions in dss.ddl, which is expected
# to be present in the temporary directory.
LD_LIBRARY_PATH="$LD_LIBRARY_PATH":"$PGLIBDIR" "$PGBINDIR/psql" -h /tmp -p "$PGPORT" \
	       -d "$TPCHDBNAME" < "$TPCHTMP/dss.ddl"
echo "TPC-H tables created"

# Load every *.tbl file found in the temporary directory into the table of
# the same name, one background psql process per file.
cd "$TPCHTMP" || die "Failed to enter temporary directory: '$TPCHTMP'"
TBLFILESNUM=$(find . -maxdepth 1 -type f -name '*.tbl' | wc -l)
if [ "$TBLFILESNUM" -eq 0 ]; then
    die "No *.tbl files found"
fi
for f in *.tbl; do
    # bf is f without the .tbl extension. Since unquoted names are case
    # insensitive in Postgres, bf is basically a table name.
    bf="${f%.tbl}"
    # We truncate the empty table in the same transaction as the COPY to
    # enable Postgres to safely skip WAL-logging. See
    # http://www.postgresql.org/docs/current/static/populate.html#POPULATE-PITR
    # The explicit BEGIN/COMMIT is required for that: statements fed to psql
    # on stdin are otherwise committed one by one (autocommit), which would
    # put TRUNCATE and COPY into two separate transactions.
    echo "BEGIN;
	  truncate $bf;
	  COPY $bf FROM '$(pwd)/$f' WITH DELIMITER AS '|';
	  COMMIT;" |
	LD_LIBRARY_PATH="$LD_LIBRARY_PATH":"$PGLIBDIR" "$PGBINDIR/psql" -h /tmp \
		       -p "$PGPORT" -d "$TPCHDBNAME" &
done
# Wait for the background loaders started above.
wait_jobs
echo "TPC-H tables are populated with data"

# Add primary and foreign keys after the bulk load. dss.ri is read from the
# current directory, which is expected to be the temporary directory here.
LD_LIBRARY_PATH="$LD_LIBRARY_PATH":"$PGLIBDIR" "$PGBINDIR/psql" -h /tmp -p "$PGPORT" \
	       -d "$TPCHDBNAME" < "dss.ri"
echo "primary and foreign keys added"

# Remove the temporary directory with the generated data, unless -r was given.
if [ "$REMOVEGENDATA" = true ]; then
    # Leave the directory first (a bare cd goes to the home directory); the
    # removal is attempted only if that succeeded.
    if cd; then
	rm -rf "$TPCHTMP"
    fi
    echo "tpch tmp directory removed"
fi

# Create the indexes one after another, unless -x was given.
if [ "$CREATEINDEXES" = true ]; then
    # The trailing "#&" markers are kept from the original: the statements are
    # not run in the background.
    declare -a INDEXCMDS=(
	# Pg does not create indexes on foreign keys, create them manually
	"CREATE INDEX i_n_regionkey ON nation (n_regionkey);"	#& #unused on 1GB
	"CREATE INDEX i_s_nationkey ON supplier (s_nationkey);"	#&
	"CREATE INDEX i_c_nationkey ON customer (c_nationkey);"	#&
	"CREATE INDEX i_ps_suppkey ON partsupp (ps_suppkey);"	#&
	"CREATE INDEX i_ps_partkey ON partsupp (ps_partkey);"	#&
	"CREATE INDEX i_o_custkey ON orders (o_custkey);"	#&
	"CREATE INDEX i_l_orderkey ON lineitem (l_orderkey);"	#&
	"CREATE INDEX i_l_suppkey_partkey ON lineitem (l_partkey, l_suppkey);"	#&
	# other indexes
	"CREATE INDEX i_l_shipdate ON lineitem (l_shipdate);"	#&
	"CREATE INDEX i_l_partkey ON lineitem (l_partkey);"	#&
	"CREATE INDEX i_l_suppkey ON lineitem (l_suppkey);"	#&
	"CREATE INDEX i_l_receiptdate ON lineitem (l_receiptdate);"	#&
	"CREATE INDEX i_l_orderkey_quantity ON lineitem (l_orderkey, l_quantity);"	#&
	"CREATE INDEX i_o_orderdate ON orders (o_orderdate);"	#&
	"CREATE INDEX i_l_commitdate ON lineitem (l_commitdate);"	#& #unused on 1GB
    )
    for cmd in "${INDEXCMDS[@]}"; do
	LD_LIBRARY_PATH="$LD_LIBRARY_PATH":"$PGLIBDIR" "$PGBINDIR/psql" -h /tmp \
		       -p "$PGPORT" -d "$TPCHDBNAME" -c "$cmd"
    done
    # No background jobs are started by the loop above; the call is kept so
    # that the statements can be parallelised again without other changes.
    wait_jobs
    echo "Indexes created"
else
    echo "Indexes will not be created"
fi

# Always analyze after bulk-loading; when hacking Postgres, typically Postgres
# is run with autovacuum turned off.
echo "Running vacuum freeze analyze checkpoint..."
# The statements run one after another, each in its own psql invocation.
# The final checkpoint gives us a "clean slate", just in case.
for maintenance_sql in "vacuum freeze" "analyze" "checkpoint"; do
    LD_LIBRARY_PATH="$LD_LIBRARY_PATH":"$PGLIBDIR" "$PGBINDIR/psql" -h /tmp -p "$PGPORT" \
	       -d "$TPCHDBNAME" -c "$maintenance_sql"
done

# Shut the cluster down again.
# NOTE(review): the meaning of the argument 0 is defined by postgres_stop,
# which is not part of this file.
postgres_stop 0

# Generate the TPC-H queries, unless -q was given. The data directory is
# passed as a single argument even if its path contains spaces.
if [ "$GENQUERIES" = true ]; then
    gen_queries "$PGDATADIR"
fi

# Report the time elapsed since CURRTIME was taken. The substitution is
# quoted so that the format is applied exactly once, whatever timer prints.
printf 'Preparing elapsed time: %s\n' "$(timer "$CURRTIME")"