#!/bin/sh
#
# Compression of files of a PCP archive.
#
# Copyright (c) 2024 Ken McDonell, Inc.  All Rights Reserved.
# 
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# 
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
# 
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Arguments to compression programs are handled thusly:
# if -A	use all the arg arguments literally, ignore everything else
# else if $PCP_COMPRESS_<PROG>_ARGS set in the environment, use that
# else if $PCP_COMPRESS_<PROG>_ARGS set in /etc/pcp/pmlogcompress/defaults
#         then use that
# else use nothing

# pull in the PCP environment settings
# (presumably where $PCP_SYSCONF_DIR, used below, comes from)
#
. "$PCP_DIR/etc/pcp.env"

prog=`basename "$0"`

# Temporary files live in a private directory created by mktemp(1),
# not under a predictable /var/tmp/pmlogcompress.<pid> prefix that
# another user could pre-create or symlink.
# $tmp is still a filename prefix, so $tmp.usage, $tmp.tmp and
# $tmp.time continue to work unchanged.
#
tmpdir=`mktemp -d /var/tmp/pmlogcompress.XXXXXXXXXX` || exit 1
tmp="$tmpdir/pmlogcompress"

# exit status is 1 unless the script explicitly sets status=0;
# the trap removes the temporary directory on exit or signal
#
status=1
trap "rm -rf \"$tmpdir\"; exit \$status" 0 1 2 3 15

# Option specification and help text handed to pmgetopt via --config.
#
# Note -A arg (not -a arg) to avoid confusion with -a archive
#
cat <<'End-of-File' >$tmp.usage
# usage: [options] archive ...

Options:
  -A=ARG, --arg=ARG		argument for compression program
  -C=conffile, --config=conffile	alternate defaults configuration file
  -c=PROGLIST, --command=PROGLIST	candidate compression program(s)
  -f=PROG, --use=PROG		use this compression program
  -l=LIMIT, --lower-limit=LIMIT	do not compress files smaller than LIMIT
  -N, --show-me			do nothing, but show me
  -o=TYPE, --optimize=TYPE	choose program to optimize compression
  -V, --verbose			increase verbosity
  -Z=MIN, --min-zstd-size=MIN   minimum file size for compression with zstd
  --help
# end

More than one -A option is allowed, and more than one -c option is allowed.

PROGLIST may be one command, or multiple commands separated by colons,
[default zstd:xz:bzip2:gzip].

Compression program (without -o or -f) depends on installed programs and
file size (use -N to see what would be used, use -f to force a particular
compression program to be used).

Decompression program is selected based on file extension.

LIMIT is in bytes [default 4096]; use 0 to force compression. Similarly
for zstd minimum file size MIN [default 52428800].

Compression optimization TYPE may be time or space.
End-of-File

# report the size of file $1 on stdout
# (the fifth column of the long listing produced by ls(1))
#
_size()
{
    ls -l "$1" \
    | awk '{ print $5 }'
}


# default location of the file holding the PCP_COMPRESS_* defaults
#
conffile="$PCP_SYSCONF_DIR/pmlogcompress/defaults"

# establish value for variable ($1) either from the environment
# or /etc/pcp/pmlogcompress/defaults
# optional $2 is default if $1 not found elsewhere
#
# - if $1 is already set to a non-empty value it is left untouched
# - else if $conffile contains a line starting with "$1=", that line is
#   evaluated as a shell assignment
# - else $2 (when given) is used, otherwise $1 is set to the empty string
#
# Progress is reported on stderr when $verbose is greater than 1.
#
# Values are copied with eval assignments rather than through echo(1),
# so a value such as -n or -e, or one containing quotes, $ or runs of
# white space, is carried over intact.
#
_get_var()
{
    VAR="$1"
    eval "_check=\$$VAR"
    if [ -n "$_check" ]
    then
	# should have already been reported where this was set from,
	# so don't add babble here
	#
	# [ $verbose -gt 1 ] && echo >&2 "Info: $VAR ignore defaults, already set: $_check"
	return
    fi
    _val=''
    if [ -f "$conffile" ]
    then
	if grep -q "^$VAR=" "$conffile" 2>/dev/null
	then
	    # the matching line(s) are VAR=value shell assignments
	    #
	    _line=`grep "^$VAR=" "$conffile" 2>/dev/null`
	    eval "$_line"
	    eval "_val=\$$VAR"
	    [ $verbose -gt 1 ] && echo >&2 "Info: $VAR set from $conffile: $_val"
	elif [ $# -eq 2 ]
	then
	    _val="$2"
	    [ $verbose -gt 1 ] && echo >&2 "Info: $VAR not set in $conffile, using default $2"
	else
	    [ $verbose -gt 1 ] && echo >&2 "Info: $VAR not set in $conffile"
	fi
    else
	echo >&2 "$prog: Error: configuration file ($conffile) vanished"
	# still honour the caller's default, so numeric settings
	# do not end up empty
	#
	[ $# -eq 2 ] && _val="$2"
    fi

    # some variables need special handling
    #
    if [ "$VAR" = "PCP_COMPRESS_XZ_ARGS" ]
    then
	# to get here, we must have a xz(1) executable ...
	if ! xz --block-size=10MiB </dev/null >/dev/null 2>&1
	then
	    # no --block-size in older versions of xz
	    #
	    _val=`printf '%s\n' "$_val" | sed -e 's/^/ /' -e 's/--block-size=[^ ]*//' -e 's/^ //'`
	fi
    fi

    # \$_val is expanded by eval itself, so the value is never re-parsed
    #
    eval "$VAR=\$_val"
}

# measure one candidate compression program on behalf of _optimize
#
# $1 is the program name, used in messages and stored in $use_prog if
# this program is the best so far; the remaining arguments are the
# command and options that compress the file named by the final
# argument to stdout.
#
# Reads $pick (sample file), $optimize (space or time), $verbose and
# $tmp; updates $size, $cpu, and $pick_size or $pick_cpu plus $use_prog.
#
# CPU time comes from "time -f '%U %S'" (user and system time in the
# GNU time(1) format), summed and scaled by 1000 so it is an integer.
#
_optimize_try()
{
    _tool="$1"
    shift
    if time -f '%U %S' "$@" "$pick" >$tmp.tmp 2>$tmp.time
    then
	size=`_size "$tmp.tmp"`
	cpu=`awk <$tmp.time '{ print 1000*($1+$2) }'`
	[ $verbose -gt 0 ] && echo >&2 "Info: $pick: $_tool size=$size cpu=$cpu"
	if [ "$optimize" = space ]
	then
	    # first measurement always wins; the || short-circuit avoids
	    # an integer comparison with an empty best-so-far
	    #
	    if [ -z "$pick_size" ] || [ "$size" -lt "$pick_size" ]
	    then
		pick_size=$size
		use_prog=$_tool
	    fi
	else
	    if [ -z "$pick_cpu" ] || [ "$cpu" -lt "$pick_cpu" ]
	    then
		pick_cpu=$cpu
		use_prog=$_tool
	    fi
	fi
    else
	if [ $verbose -gt 0 ]
	then
	    cat >&2 $tmp.time
	    echo >&2 "Info: $pick: optimize test with $_tool failed"
	fi
    fi
}

# component archive filenames are arguments
# - pick largest (not already compressed, and at least
#   $PCP_COMPRESS_MIN_FILESIZE in size)
# - iterate over compression tools measuring size
#   or CPU time
# - set $use_prog for selected compression tool
#
# If there is no candidate file $use_prog defaults to xz; if there is a
# candidate but no tool could be measured $use_prog is left empty.
#
_optimize()
{
    pick=''
    largest=0
    for file
    do
	[ -f "$file" ] || continue
	case "$file"
	in
	    *.xz|*.lzma|*.bz2|*.bz|*.gz|*.Z|*.z|*.zst)
		    # already compressed
		    [ $verbose -gt 0 ] && echo >&2 "Info: $file: skipped, already compressed"
		    continue
		    ;;
	esac
	size=`_size "$file"`
	if [ "$size" -ge "$PCP_COMPRESS_MIN_FILESIZE" -a "$size" -gt "$largest" ]
	then
	    pick="$file"
	    largest=$size
	fi
    done
    if [ -z "$pick" ]
    then
	[ $verbose -gt 0 ] && echo >&2 "Info: optimize: failed to find a candidate file, defaulting to xz"
	use_prog=xz
	return
    fi
    [ $verbose -gt 0 ] && echo >&2 "Info: $pick: ($largest bytes) selected as candidate for optimize measurements"
    pick_size=''
    pick_cpu=''
    use_prog=''
    for tool in `echo $PCP_COMPRESS_PROGS | sed -e 's/:/ /g'`
    do
	case "$tool"
	in
	    zstd)
		$have_zstd && _optimize_try zstd zstd -zc -q
		;;

	    xz)
		if $have_xz
		then
		    if [ -n "$args" ]
		    then
			largs=" $args"
		    else
			[ -z "${PCP_COMPRESS_XZ_ARGS+onetrip}" ] && _get_var PCP_COMPRESS_XZ_ARGS
			largs=" $PCP_COMPRESS_XZ_ARGS"
		    fi
		    # $largs is deliberately unquoted, it may hold
		    # several options
		    #
		    _optimize_try xz xz $largs -zc -q
		fi
		;;

	    bzip2)
		$have_bzip2 && _optimize_try bzip2 bzip2 -zc -q
		;;

	    gzip)
		$have_gzip && _optimize_try gzip gzip -c -q
		;;

	    *)
		echo >&2 "$pick: Botch: no compression recipe for $tool program"
		;;

	esac
    done
    [ $verbose -gt 0 ] && echo >&2 "Info: optimize: pick $use_prog"
}

# compress the file(s) identified by $1
#
# $1 is either an existing file (only that file is considered) or else
# is assumed to name a PCP archive, in which case every file matching
# <archbase>* whose pmlogbasename equals <archbase> is considered.
#
# Files that are already compressed, temporal index (.index) files and
# files smaller than $PCP_COMPRESS_MIN_FILESIZE are skipped.  For each
# remaining file the first usable program from $use_prog followed by
# $PCP_COMPRESS_PROGS is run (or just shown, if $showme is true).
# With $optimize set, _optimize chooses $use_prog first.
#
# The candidate files are held in the function's positional parameters,
# not a white space separated string, so names containing white space
# are handled.
#
# A failing compression program terminates the script via exit.
#
_compress()
{
    _target="$1"
    if [ -f "$_target" ]
    then
	# single file ...
	#
	archbase=''
	set -- "$_target"
    else
	# assume it is an archive basename
	#
	archbase=`pmlogbasename "$_target"`
	[ $verbose -gt 0 ] && echo >&2 "Info: archbase=$archbase"
	set -- "$archbase"*
    fi
    [ -n "$optimize" ] && _optimize "$@"

    nfile=0
    for file
    do
	[ ! -f "$file" ] && continue
	if [ -n "$archbase" ]
	then
	    # the glob may also match files of other archives that
	    # share the same prefix
	    #
	    _base=`pmlogbasename "$file"`
	    [ "$archbase" != "$_base" ] && continue
	fi
	nfile=`expr $nfile + 1`
	case "$file"
	in
	    *.xz|*.lzma|*.bz2|*.bz|*.gz|*.Z|*.z|*.zst)
			# already compressed
			[ $verbose -gt 0 ] && echo >&2 "Info: $file: skipped, already compressed"
			continue
			;;

	    *.index)	# TI is never compressed
			continue
			;;
	esac
	size=`_size "$file"`
	if [ "$size" -lt "$PCP_COMPRESS_MIN_FILESIZE" ]
	then
	    [ $verbose -gt 0 ] && echo >&2 "Info: $file: skipped, size $size < limit $PCP_COMPRESS_MIN_FILESIZE"
	    continue
	fi

	# $use_prog (from -f or _optimize) is tried first, if set
	#
	for tool in $use_prog `echo $PCP_COMPRESS_PROGS | sed -e 's/:/ /g'`
	do
	    case "$tool"
	    in
		zstd)
		    if $have_zstd || [ "$use_prog" = zstd ]
		    then
			# zstd has its own minimum size, unless it was
			# explicitly selected
			#
			if [ "$size" -ge "$PCP_COMPRESS_ZSTD_MIN_FILESIZE" ] || [ "$use_prog" = zstd ]
			then
			    if [ -n "$args" ]
			    then
				largs=" $args"
			    else
				[ -z "${PCP_COMPRESS_ZSTD_ARGS+onetrip}" ] && _get_var PCP_COMPRESS_ZSTD_ARGS
				largs=" $PCP_COMPRESS_ZSTD_ARGS"
			    fi
			    if $showme
			    then
				echo >&2 "+ zstd$largs $file"
			    else
				if zstd$largs "$file"
				then
				    [ $verbose -gt 0 ] && echo >&2 "Info: $file: compressed with zstd"
				else
				    echo >&2 "$file: zstd failed!"
				    rm -f "$file.zst"
				    exit
				fi
			    fi
			    break
			else
			    [ $verbose -gt 0 ] && echo >&2 "Info: $file: size $size too small for zstd"
			fi
		    fi
		    ;;

		xz)
		    if $have_xz || [ "$use_prog" = xz ]
		    then
			if [ -n "$args" ]
			then
			    largs=" $args"
			else
			    [ -z "${PCP_COMPRESS_XZ_ARGS+onetrip}" ] && _get_var PCP_COMPRESS_XZ_ARGS
			    largs=" $PCP_COMPRESS_XZ_ARGS"
			fi
			if $showme
			then
			    echo >&2 "+ xz$largs $file"
			else
			    if xz$largs "$file"
			    then
				[ $verbose -gt 0 ] && echo >&2 "Info: $file: compressed with xz"
			    else
				echo >&2 "$file: xz failed!"
				exit
			    fi
			fi
			break
		    fi
		    ;;

		bzip2)
		    if $have_bzip2 || [ "$use_prog" = bzip2 ]
		    then
			if [ -n "$args" ]
			then
			    largs=" $args"
			else
			    [ -z "${PCP_COMPRESS_BZIP2_ARGS+onetrip}" ] && _get_var PCP_COMPRESS_BZIP2_ARGS
			    largs=" $PCP_COMPRESS_BZIP2_ARGS"
			fi
			if $showme
			then
			    echo >&2 "+ bzip2$largs $file"
			else
			    if bzip2$largs "$file"
			    then
				[ $verbose -gt 0 ] && echo >&2 "Info: $file: compressed with bzip2"
			    else
				echo >&2 "$file: bzip2 failed!"
				exit
			    fi
			fi
			break
		    fi
		    ;;

		gzip)
		    if $have_gzip || [ "$use_prog" = gzip ]
		    then
			if [ -n "$args" ]
			then
			    largs=" $args"
			else
			    [ -z "${PCP_COMPRESS_GZIP_ARGS+onetrip}" ] && _get_var PCP_COMPRESS_GZIP_ARGS
			    largs=" $PCP_COMPRESS_GZIP_ARGS"
			fi
			if $showme
			then
			    echo >&2 "+ gzip$largs $file"
			else
			    if gzip$largs "$file"
			    then
				[ $verbose -gt 0 ] && echo >&2 "Info: $file: compressed with gzip"
			    else
				echo >&2 "$file: gzip failed!"
				exit
			    fi
			fi
			break
		    fi
		    ;;

		*)
		    echo >&2 "$file: Botch: no compression recipe for $tool program"
		    ;;

	    esac
	done
    done

    if [ "$nfile" -eq 0 ]
    then
	echo >&2 "$prog: Warning: no PCP archive files match \"$_target\""
	return
    fi
}

# command line state, filled in by the option parsing below
#
args=''			# -A: literal argument(s) for the compression program
showme=false		# -N: show, do not do
verbose=0		# -V: verbosity level, one per -V
dir=''
optimize=''		# -o: space or time
min_size=''		# -l: lower limit on file size for compression
min_zstd_size=''	# -Z: lower limit on file size for zstd
proglist=''		# -c: colon separated candidate programs
use_prog=''		# -f: the one program to use

# pmgetopt validates the command line against the usage specification
# and its output becomes the new positional parameters
#
ARGS=`pmgetopt --progname="$prog" --config="$tmp.usage" -- "$@"`
[ $? != 0 ] && exit
eval set -- "$ARGS"

while [ $# -gt 0 ]
do
    case "$1"
    in
	-A)	# arg(s) for compression program
	    # repeated -A options accumulate, separated by a space
	    args="${args:+$args }$2"
	    shift
	    ;;

	-C)	# alternate config file for defaults
	    conffile="$2"
	    shift
	    ;;

	-c)	# candidate compression program(s)
	    # repeated -c options accumulate, separated by a colon
	    proglist="${proglist:+$proglist:}$2"
	    shift
	    ;;

	-f)	# use this compression program
	    use_prog="$2"
	    shift
	    ;;

	-l)	# lower limit on file size for compression
	    # a pattern match (not echo | sed) so that values like -n,
	    # which echo may treat as an option, are rejected
	    case "$2"
	    in
		*[!0-9]*)
		    echo >&2 "$prog: Error: -l value ($2) must be numeric"
		    exit
		    ;;
	    esac
	    min_size="$2"
	    shift
	    ;;

	-N)	# show me, do nothing
	    showme=true
	    ;;

	-o)	# optimize compression (option = space|time)
	    case "$2"
	    in
		space)
		    optimize=space
		    ;;
		time)
		    optimize=time
		    ;;
		*)
		    echo >&2 "$prog: Error: -o option must be space or time"
		    exit
		    ;;
	    esac
	    shift
	    ;;

	-V)
	    verbose=`expr $verbose + 1`
	    ;;

	-Z)	# lower limit on file size for zstd compression
	    case "$2"
	    in
		*[!0-9]*)
		    echo >&2 "$prog: Error: -Z value ($2) must be numeric"
		    exit
		    ;;
	    esac
	    min_zstd_size="$2"
	    shift
	    ;;

	--)
	    shift
	    break
	    ;;

	-\?)
	    pmgetopt --usage --progname="$prog" --config="$tmp.usage"
	    status=0
	    exit
	    ;;
    esac
    shift
done

# at least one archive (or file) argument is required
#
case $#
in
    0)
	pmgetopt --usage --progname=$prog --config=$tmp.usage
	exit
	;;
esac

# some combinations of command line args are NOT valid
#
# with -o: disallow: -f
if [ -n "$optimize" ] && [ -n "$use_prog" ]
then
    echo >&2 "$prog: Error: -f [$use_prog] not allowed when compressing with -o"
    exit
fi

# the defaults file (possibly from -C) must exist
#
[ -f "$conffile" ] || {
    echo >&2 "$prog: Error: configuration file ($conffile) not found"
    exit
}

if [ $verbose -gt 1 ]
then
    # report key variables set from the environment
    #
    # the variable names come from $conffile: comment and blank lines
    # are dropped and everything from the first = onwards is removed
    #
    if [ -f "$conffile" ]
    then
	sed <"$conffile" \
	    -e '/^#/d' \
	    -e '/^[ 	]*$/d' \
	    -e 's/=.*//' \
	| while read VAR
	do
	    # _set is non-empty if $VAR is set, even to an empty value
	    #
	    eval "_set=\${$VAR+onetrip}"
	    if [ -n "$_set" ]
	    then
		# plain assignment, not echo, so values like -n survive
		#
		eval "_val=\$$VAR"
		echo >&2 "Info: $VAR set from environment: $_val"
	    fi
	done
    else
	echo >&2 "$prog: Error: configuration file ($conffile) vanished"
    fi
fi

# now command line args override environment settings
#
# for each setting, with no command line value, _get_var falls back to
# the environment, then $conffile, then the default given here
#
if [ -z "$min_size" ]
then
    _get_var PCP_COMPRESS_MIN_FILESIZE 4096
else
    PCP_COMPRESS_MIN_FILESIZE="$min_size"
    [ $verbose -gt 1 ] && echo >&2 "Info: PCP_COMPRESS_MIN_FILESIZE set from command line: $min_size"
fi

if [ -z "$min_zstd_size" ]
then
    _get_var PCP_COMPRESS_ZSTD_MIN_FILESIZE 52428800
else
    PCP_COMPRESS_ZSTD_MIN_FILESIZE="$min_zstd_size"
    [ $verbose -gt 1 ] && echo >&2 "Info: PCP_COMPRESS_ZSTD_MIN_FILESIZE set from command line: $min_zstd_size"
fi

if [ -z "$proglist" ]
then
    _get_var PCP_COMPRESS_PROGS 'zstd:xz:bzip2:gzip'
else
    PCP_COMPRESS_PROGS="$proglist"
    [ $verbose -gt 1 ] && echo >&2 "Info: PCP_COMPRESS_PROGS set from command line: $proglist"
fi

# Work out which compression programs are available.
#
# The have_* flags hold the command name true or false, so they can be
# used directly as conditions, e.g. "if $have_xz".
# Programs are located with the POSIX "command -v" rather than which(1),
# which is not a standard utility and may not be installed.
#
have_zstd=false
have_xz=false
have_bzip2=false
have_gzip=false
if [ -n "$use_prog" ]
then
    # -f so no choice ...
    #
    if ! command -v "$use_prog" >/dev/null 2>&1
    then
	echo >&2 "$prog: cannot find a compression program ($use_prog)"
	exit
    fi
else
    # work through the -c [or default] list ...
    #
    for try in `echo $PCP_COMPRESS_PROGS | sed -e 's/:/ /g'`
    do
	case "$try"
	in
	    zstd|xz|bzip2|gzip)
		# $try is one of the four literal names above, so the
		# eval only ever sets one of the have_* flags
		#
		command -v "$try" >/dev/null 2>&1 && eval have_$try=true
		;;
	    *)
		echo >&2 "$prog: Warning: no clue how to deal with \"compression\" program $try"
		;;
	esac
    done

    if [ "$have_zstd$have_xz$have_bzip2$have_gzip" = "falsefalsefalsefalse" ]
    then
	echo >&2 "$prog: cannot find a compression program (tried $PCP_COMPRESS_PROGS)"
	exit
    fi
fi

# process each remaining command line argument (archive or file) in turn
#
for _arg
do
    _compress "$_arg"
done

# all done, report success
#
status=0
exit

