#!/bin/bash

# -----------------------------------------------------------------------------
# script parameters
#
# MAX_TEST_TIME - how long each unit test is allowed to run, in seconds.
#                 start_limit_timer() sleeps this long and then reboots.
#
MAX_TEST_TIME=$((10 * 60))

# Shared helper library.  The functions in this file call several names that
# are not defined here (print_app_line_cr, fg_run, bg_run_and_wait,
# guard_file_add, ...) -- presumably they are provided by this file.
source /ciena/scripts/utils.sh

# State files:
#   PROGRESS_FILE        - holds the number of the next test to run
#   LIMIT_TIMER_PID_FILE - holds the PID of the background limit timer
#   POSTRCS_FILE         - symlink to this script while the tests are armed
#                          (NOTE(review): presumably executed during boot --
#                          confirm against the init scripts)
PROGRESS_FILE=/mnt/sysfs/guardian_unit_test_progress
LIMIT_TIMER_PID_FILE=/var/run/limit_timer.pid
POSTRCS_FILE=/mnt/sysfs/system/postrcS

# Target of the POSTRCS_FILE symlink created by start_unit_tests().
this_script_full_path="/ciena/scripts/guardian_unit_test"
# Name used in the usage text.  "$0" is quoted so that an invocation path
# containing whitespace is not word-split before basename sees it.
this_script=$(basename "$0")

# -----------------------------------------------------------------------------
print_usage_and_exit()
{
    # Write the command summary and a short description of the test
    # sequence to stdout, then terminate the script with status 0.
    #
    # Reads the global $this_script for the program name shown in the text.
    # Each printf argument below becomes exactly one output line; the empty
    # arguments produce the blank separator lines.
    printf '%s\n' \
        "Usage:" \
        "        $this_script start - initiate test sequence" \
        "        $this_script stop  - halt test sequence" \
        "" \
        "$this_script is designed to run the guardian through a" \
        "set of unit tests.  Use the start command to setup the tests" \
        "and then reboot the system to start the sequence.  This will" \
        "run all tests and stop.  Test results are collected via the" \
        "ALERT syslog(), which should be collected in flash" \
        '(Use the "evt_show except" command on SAOS 6.x systems).' \
        ""
    exit 0
}

# -----------------------------------------------------------------------------
kill_guardian_test_after_120()
{
    # Schedule a SIGTERM for guardian_test.
    #
    # - announce the pending kill on the console
    # - in the background: wait 120 seconds, log through $SYSLOG_ALERT and
    #   send TERM to whatever PIDs pidof reports for guardian_test
    #
    # The caller is not blocked; the delayed work runs as a background job.
    print_app_line_cr "killing guardian_test after 120 seconds"

    {
        sleep 120
        $SYSLOG_ALERT "kill(TERM) guardian_test"
        # deliberately unquoted below: pidof may report more than one PID
        victims=$(pidof guardian_test)
        kill -TERM $victims
    } &
}

# -----------------------------------------------------------------------------
respawn_guardian_test_after_120()
{
    # Schedule a kill of guardian_test followed by a restart.
    #
    # - announce the pending kill/respawn on the console
    # - in the background: wait 120 seconds, log and send TERM to the PIDs
    #   pidof reports for guardian_test, wait another 10 seconds, log again
    #   and start guardian_test with the "heartbeat" argument
    #
    # The caller is not blocked; the delayed work runs as a background job.
    print_app_line_cr "kill/respawn guardian_test after 120 seconds"

    {
        sleep 120
        $SYSLOG_ALERT "kill(TERM) guardian_test"
        # deliberately unquoted below: pidof may report more than one PID
        victims=$(pidof guardian_test)
        kill -TERM $victims

        sleep 10
        $SYSLOG_ALERT "guardian_test restarting"
        bg_run_and_wait "guardian test" /ciena/bin/guardian_test heartbeat
    } &
}

# -----------------------------------------------------------------------------
halt_guardian_test_after_120()
{
    # Schedule a SIGSTOP for guardian_test.
    #
    # - announce the pending halt on the console
    # - in the background: wait 120 seconds, log through $SYSLOG_ALERT and
    #   send STOP to whatever PIDs pidof reports for guardian_test
    #
    # The caller is not blocked; the delayed work runs as a background job.
    # (No caller of this helper is visible in this file.)
    print_app_line_cr "halting guardian_test after 120 seconds"

    {
        sleep 120
        $SYSLOG_ALERT "kill(STOP) guardian_test"
        # deliberately unquoted below: pidof may report more than one PID
        victims=$(pidof guardian_test)
        kill -STOP $victims
    } &
}

# -----------------------------------------------------------------------------
test_1()
{
    # Stuck-guardian test
    #
    # - 120 seconds into initialization the guardian process
    #   (bob-the-guardian) is frozen with SIGSTOP.
    # - expected outcome: the watchdog reboots the system.
    #
    # The delayed STOP runs as a background job, so this function returns
    # right after the console announcement.
    print_app_line_cr "SIGSTOP guardian after 120 seconds"

    {
        sleep 120
        # deliberately unquoted below: pidof may report more than one PID
        guardian_pids=$(pidof bob-the-guardian)
        kill -STOP $guardian_pids
        $SYSLOG_ALERT "guardian has been STOPed"
    } &
}

# -----------------------------------------------------------------------------
test_2()
{
    # Guardian-never-reaches-"run" test
    #
    # - this function never returns, which keeps initialization from
    #   completing and so keeps the guardian out of the run state.
    # - expected outcome: the guardian reboots the system once its init
    #   timeout expires (current default is 240 seconds).
    #
    # A dot is printed every 10 seconds as a sign of life.
    print_app_line_cr "waste time until guardian init time out"

    while true; do
        sleep 10
        printf '.'
    done
}

# -----------------------------------------------------------------------------
test_3()
{
    # Config-entry-never-appears test
    #
    # - a guardian config entry (death action respawn_wait) is added for
    #   the guardian_test application during initialization.
    # - guardian_test itself is never started.
    # - expected outcome: the guardian reboots the system when it reaches
    #   the run state.
    #
    # The return status is that of guard_file_add.
    local action=respawn_wait
    local client=guardian_test

    print_app_line_cr "missing config entry test"
    guard_file_add "$action" "$client"
}

# -----------------------------------------------------------------------------
test_4()
{
    # Client death action test (default + monitor)
    #
    # - guardian_test is started and registers for guardian monitoring
    #   (there is no config entry for it).
    # - guardian_test is killed after 120 seconds.
    # - expected outcome: the guardian reboots the system once it detects
    #   that guardian_test is missing.
    #
    local client_bin=/ciena/bin/guardian_test

    print_app_line_cr "test client death action (default)"
    bg_run_and_wait "guardian test (monitor)" "$client_bin"
    kill_guardian_test_after_120
}

# -----------------------------------------------------------------------------
test_5()
{
    # Client death action test (respawn + dead client)
    #
    # - guardian_test is added to the guardian config with a death action
    #   of respawn_wait.
    # - guardian_test is started.
    # - guardian_test is killed after 120 seconds and nothing restarts it.
    # - the guardian should wait for guardian_test to respawn (120 seconds).
    # - expected outcome: the guardian reboots the system once it detects
    #   that guardian_test has not respawned in time.
    #
    local client=guardian_test
    local client_bin=/ciena/bin/guardian_test

    print_app_line_cr "test client death action (respawn + dead client)"
    guard_file_add respawn_wait "$client"
    bg_run_and_wait "guardian test" "$client_bin"
    kill_guardian_test_after_120
}

# -----------------------------------------------------------------------------
test_6()
{
    # Client death action test (respawn + recovered client)
    #
    # - guardian_test is added to the guardian config with a death action
    #   of respawn_wait.
    # - guardian_test is started (with the "heartbeat" argument).
    # - guardian_test is killed after 120 seconds and respawned shortly
    #   afterwards.
    # - the guardian should notice the death but take no explicit action,
    #   because the respawn happens within the allowed timeout.
    # - expected outcome: the limit timer reboots the system.
    #
    local client=guardian_test
    local client_bin=/ciena/bin/guardian_test

    print_app_line_cr "test client death action (respawn + recovery)"
    guard_file_add respawn_wait "$client"
    bg_run_and_wait "guardian test" "$client_bin" heartbeat
    respawn_guardian_test_after_120
}

# -----------------------------------------------------------------------------
test_7()
{
    # Client death action test (restart_group + dead client)
    #
    # - guardian_test is added to the guardian config with a death action
    #   of restart_group.
    # - guardian_test is started.
    # - guardian_test is killed after 120 seconds, and the group script
    #   does not restart it.
    # - the guardian should wait for guardian_test to respawn (120 seconds).
    # - expected outcome: the guardian reboots the system once it detects
    #   that guardian_test has not respawned in time.
    #
    local group_script=/tmp/test_group
    local log_tag

    print_app_line_cr "test client death action (restart_group + dead client)"

    # First version of the group script: registers the config entry and
    # starts guardian_test.  It is run once, right away.
    printf '%s\n' \
        '#!/bin/bash' \
        'source /ciena/scripts/utils.sh' \
        'guard_file_add restart_group guardian_test' \
        'bg_run_and_wait "guardian test" /ciena/bin/guardian_test heartbeat' \
        > "$group_script"
    chmod +x "$group_script"
    "$group_script"

    # Second version replaces the first in place: it only logs and never
    # restarts guardian_test.  The logger tag is resolved now, while the
    # file is being written, so it carries this script's name.
    log_tag=$(basename $0)
    cat > "$group_script" <<EOF
#!/bin/bash
logger -p USER.ALERT -t $log_tag "fake script will deliberately fail"
EOF
    kill_guardian_test_after_120
}

# -----------------------------------------------------------------------------
test_8()
{
    # Client death action test (restart_group + recovered client)
    #
    # - guardian_test is added to the guardian config with a death action
    #   of restart_group.
    # - guardian_test is started.
    # - guardian_test is killed after 120 seconds, and the group script
    #   restarts it.
    # - expected outcome: the limit timer reboots the system.
    #
    local group_script=/tmp/test_group

    print_app_line_cr "test client death action (restart_group + recovery)"

    # The group script both registers the config entry and starts
    # guardian_test.  It is run once here and left in place unchanged.
    # The delimiter is quoted because the text contains nothing to expand.
    cat > "$group_script" <<'EOF'
#!/bin/bash
source /ciena/scripts/utils.sh
guard_file_add restart_group guardian_test
bg_run_and_wait "guardian test" /ciena/bin/guardian_test heartbeat
EOF
    chmod +x "$group_script"
    "$group_script"

    kill_guardian_test_after_120
}

# -----------------------------------------------------------------------------
test_9()
{
    # Client death action on heartbeat stop
    #
    # - guardian_test is added to the guardian config with a death action
    #   of restart_group.
    # - guardian_test is started and registers for heartbeat monitoring.
    # - guardian_test does not heartbeat the guardian.
    # - the guardian should kill guardian_test once it detects that the
    #   heartbeats have stopped.
    # - the group script should then restart guardian_test (with proper
    #   heartbeats enabled).
    # - expected outcome: the limit timer reboots the system.
    #
    local group_script=/tmp/test_group
    local log_tag

    print_app_line_cr "test client death action on heartbeat stop"

    # First version of the group script: registers the config entry and
    # starts guardian_test in heartbeat_death mode.  It is run once, now.
    printf '%s\n' \
        '#!/bin/bash' \
        'source /ciena/scripts/utils.sh' \
        'guard_file_add restart_group guardian_test' \
        'bg_run_and_wait "guardian test" /ciena/bin/guardian_test heartbeat_death' \
        > "$group_script"
    chmod +x "$group_script"
    "$group_script"

    # Second version replaces the first in place: it logs and launches
    # guardian_test in heartbeat mode.  The logger tag is resolved now,
    # while the file is being written, so it carries this script's name.
    log_tag=$(basename $0)
    cat > "$group_script" <<EOF
#!/bin/bash
logger -p USER.ALERT -t $log_tag "respawning guardian_test"
/ciena/bin/guardian_test heartbeat &
EOF

}

# -----------------------------------------------------------------------------
start_limit_timer()
{
    # Start the per-test limit timer.
    #
    # A background job sleeps for $MAX_TEST_TIME seconds, logs through
    # $SYSLOG_ALERT and reboots the system.  Its PID is written to
    # $LIMIT_TIMER_PID_FILE, which stop_unit_tests_and_exit reads to cancel it.
    #
    # Positional arguments are ignored; the test number in the log message
    # comes from the global $TEST_NUMBER.
    local timer_pid

    {
        sleep "$MAX_TEST_TIME"
        $SYSLOG_ALERT "limit timer triggered on test $TEST_NUMBER (rebooting)"
        reboot
    } &
    timer_pid=$!

    echo "$timer_pid" > "$LIMIT_TIMER_PID_FILE"
}

# -----------------------------------------------------------------------------
run_next_test_and_exit()
{
    # Run the test whose number is stored in $PROGRESS_FILE, then exit.
    #
    # Order of operations:
    # - read the current test number and build the $SYSLOG_ALERT command
    #   (logger at USER.ALERT, tagged with this script's name)
    # - store the following test number in the progress file before the
    #   test runs
    # - start the limit timer and ack the bootchain
    # - announce and run test_<N>
    #
    # A non-zero status from test_<N> stops the sequence.  That includes
    # the case where no function of that name exists, which is what happens
    # after the last test.  Sets the globals TEST_NUMBER and SYSLOG_ALERT.
    local upcoming

    TEST_NUMBER=$(cat $PROGRESS_FILE)
    SYSLOG_ALERT="logger -p USER.ALERT -t $(basename $0) guardian test_$TEST_NUMBER"

    upcoming=$((TEST_NUMBER + 1))
    echo $upcoming > $PROGRESS_FILE

    start_limit_timer $TEST_NUMBER

    fg_run "ack bootchain" /ciena/scripts/bootack

    print_app_line_cr "RUNNING GUARDIAN TEST #$TEST_NUMBER"
    $SYSLOG_ALERT "start"

    if ! test_$TEST_NUMBER ; then
        stop_unit_tests_and_exit
    fi

    print_app_line "Complete"
    exit 0
}

# -----------------------------------------------------------------------------
start_unit_tests()
{
    # Arm the test sequence; the tests begin on the next reboot.
    #
    # Exits the script with status 1 (after printing an ERROR line) when
    # - $POSTRCS_FILE already exists,
    # - /ciena/bin/guardian_test is not executable, or
    # - the symlink from $POSTRCS_FILE to this script cannot be created.
    #
    # On success, links $POSTRCS_FILE to $this_script_full_path and resets
    # the progress file to test 1.
    local test_binary=/ciena/bin/guardian_test

    if [ -e "$POSTRCS_FILE" ] ; then
        echo "ERROR: $POSTRCS_FILE exists"
        echo "This file must be removed before the unit test can begin."
        exit 1
    fi

    if [ ! -x "$test_binary" ] ; then
        echo "ERROR: $test_binary can not be executed"
        exit 1
    fi

    if ! ln -s $this_script_full_path $POSTRCS_FILE ; then
        echo "ERROR: failed to link $POSTRCS_FILE"
        exit 1
    fi

    echo "start guardian unit tests - reboot the system to begin"
    echo 1 > $PROGRESS_FILE
}

# -----------------------------------------------------------------------------
stop_unit_tests_and_exit()
{
    # Tear down the test sequence and terminate the script with status 0.
    #
    # - remove the progress file and the postrcS link
    # - cancel the limit timer recorded in $LIMIT_TIMER_PID_FILE, if any
    #
    # Safe to call when nothing is armed: files that are already gone are
    # not reported as errors.
    local timer_pid

    echo "stop guardian unit tests"

    # -f: a missing file is not an error when "stop" is issued twice or
    # without a preceding "start".
    rm -f "$PROGRESS_FILE" "$POSTRCS_FILE"

    if [ -r "$LIMIT_TIMER_PID_FILE" ] ; then
        timer_pid=$(cat "$LIMIT_TIMER_PID_FILE")

        # The PID file is single-use.  Leaving it behind would let a later
        # "stop" signal an unrelated process that has reused the PID.
        rm -f "$LIMIT_TIMER_PID_FILE"

        case "$timer_pid" in
            ''|*[!0-9]*)
                # empty or non-numeric contents: nothing safe to signal
                ;;
            *)
                # The timer may already have exited; that is not an error.
                kill "$timer_pid" 2>/dev/null
                ;;
        esac
    fi
    exit 0
}

# --- main --------------------------------------------------------------------

# If no parameter is provided, then continue running tests if the progress
# file is present and writable, otherwise error out.
#
if [ "$#" -eq "0" ] ; then
    # Quoted so that an empty value tests false instead of collapsing to
    # the always-true one-argument form "[ -w ]".
    if [ -w "$PROGRESS_FILE" ] ; then
        echo ...
        run_next_test_and_exit
    else
        echo "$0: no tests in progress and no command provided"
        # A bare "exit" would return the status of the echo above (0);
        # report the failure explicitly.
        exit 1
    fi
fi

# A command was given on the command line: dispatch on it.  Anything other
# than "start" or "stop" prints the usage text.
command=$1

case "$command" in

    "start")
        start_unit_tests
        ;;

    "stop")
        stop_unit_tests_and_exit
        ;;

    *)
        print_usage_and_exit
        ;;
esac
