#!/pkg/bin/ksh
# ---------------------------------------------------------------------
# isis_perf - Do CPU profiling of an IS-IS process, on an IOS-XR router.
#
# September 2023, Henk Smit
#
# Copyright (c) 2023-2024 by cisco Systems, Inc.
# All rights reserved.
#--------------------------------------------------------------------
#
# isis_perf - Do CPU profiling of an IS-IS process, on an IOS-XR router.
#             Uses the standard Linux perf (1) utility.
#             See "man 1 perf" on any Linux machine for more details.
#             Works on EnXR on RHEL8 (September 2023).
#             Should also work on IOS-XR production routers.
#
# Name of this script without its directory, used in the usage text.
# Parameter expansion is used instead of basename so that a path with
# whitespace in it cannot be split into several words.
PROGNAME=${0##*/}

#
# The default values.
#   DURATION   seconds to profile (-t)
#   INSTANCE   IS-IS instance name/tag (-i)
#   PID        process ID to examine (-p)
#   THRESHOLD  percentage threshold for the report (-%)
#   CLEAN      true when only the perf.data files must be removed (-c)
#   REPEAT     true when the previous run must be shown again (-r)
#
DURATION=10
INSTANCE=
PID=
THRESHOLD=0.5
CLEAN=false
REPEAT=false

#
# Usage - print the flags and arguments of this script on stdout.
# Reads the global PROGNAME for the name shown in the synopsis line.
#
function Usage {
    printf '%s\n' \
        "" \
        "Usage: $PROGNAME [flags] [ -i <tag> | -p <pid> ]" \
        "" \
        "       Flags: -i  Name/tag of the IS-IS instance to examine" \
        "              -p  Process ID of the process to examine" \
        "              -t  Time in seconds to examine CPU usage (default: 10 sec)" \
        "              -%  Threshold percentage which functions to show (default: 0.5%)" \
        "              -r  Show the numbers from the previous run again" \
        "              -c  Delete the perf.data and perf.data.old files" \
        "              -h  See this text" \
        ""
}

#
# Work from /tmp when that directory exists.
# Perf leaves a large perf.data file behind. The -c flag of this script
# removes it, but when that is not done it is better to have the file
# sitting in /tmp than somewhere else in the file-system.
#
[[ -d /tmp ]] && cd /tmp

#
# The long --help flag is accepted as an alternative to -h, but only as
# the first argument. It prints the usage text and exits with status 1.
#
case $1 in
    --help)
        Usage
        exit 1
        ;;
esac

#
# Perf report seems to start the pager itself and honors $PAGER.
# When $PAGER is not set, perf prefers to use less.
# On SE-Linux we run old versions of perf and less, which causes
# problems with color escape-codes.
#
# So on every platform except EnXR, more is forced as the pager.
# Perf report then still prints the color escape-codes, and more can't
# interpret them. Ugly. But at least we get some output.
#
# On EnXR the user's own pager choice is respected; when nothing was
# chosen, less is picked.
#
if [[ -n $ENXR_ACTIVE ]]; then
    export PAGER="${PAGER:-less}"
else
    export PAGER=more
fi

#
# With less as the pager, make sure $LESS carries an R:
#   - empty $LESS      -> use -FMR
#   - already has an R -> leave it alone
#   - anything else    -> append R
# (Presumably the R is there so less passes the color escape-codes
# through; see the remarks about escape-codes above.)
#
if [[ $PAGER == less ]]; then
    case $LESS in
        '')  export LESS=-FMR ;;
        *R*) ;;
        *)   LESS=${LESS}R ;;
    esac
fi

#
# Parse the options from the commandline.
#
#   -i/-p/-t/-%  store their argument in INSTANCE, PID, DURATION and
#                THRESHOLD respectively.
#   -r           The previous "perf record" leaves a file perf.data
#                around, so the numbers can be shown again.
#   -c           Clean. Don't do any CPU profiling. Only remove the old
#                perf.data files.
#   -h, or any flag getopts does not accept: print the usage text and
#                exit with status 1.
#
while getopts "i:p:t:%:chr" FLAG; do
    case $FLAG in
        i)   INSTANCE=$OPTARG ;;
        p)   PID=$OPTARG ;;
        t)   DURATION=$OPTARG ;;
        %)   THRESHOLD=$OPTARG ;;
        r)   REPEAT=true ;;
        c)   CLEAN=true ;;
        h|*) Usage
             exit 1
             ;;
    esac
done

# Drop the options that were parsed from the positional parameters.
shift $((OPTIND - 1))

#
# When displaying the data, we can set the threshold percentage.
# The value is pasted into the perf report command line, so it must be
# a plain decimal number: digits with at most one decimal point and at
# least one digit (5, 5., 0.5 and .5 are fine; ".", "1.2.3" and the
# empty string are not).
#
REGEXP='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'
if [[ ! $THRESHOLD =~ $REGEXP ]]; then
    echo "Percentage must be a number. (Can include . but not % sign.)"
    exit 2
fi

# Kept as one string; it is expanded unquoted where it is run, so that
# word splitting turns it back into a command and its arguments.
REPORT_COMMAND="perf report --call-graph=graph,${THRESHOLD} --sort=cpu --stdio"

#
# With -c, remove the perf.data files and stop; no profiling is done.
# Both the current directory and /tmp are checked, and only regular
# files are removed.
#
if [[ $CLEAN == true ]]; then
    echo Cleaning old perf.data files
    for OLD_DATA in perf.data perf.data.old /tmp/perf.data /tmp/perf.data.old; do
        [[ -f $OLD_DATA ]] && rm -f $OLD_DATA
    done
    echo
    exit 0
fi

#
# Check whether the perf utility is installed or not.
# Not all XR routers might have it.
#
# The shell's own "command -v" does the lookup, so no external which
# program is needed. Both stdout and stderr of the lookup are discarded
# (stdout first, then stderr onto it); only the exit status matters.
#
if ! command -v perf > /dev/null 2>&1; then
    echo Perf utility is not installed on this router
    echo
    exit 2
fi

#
# With -r the numbers of the previous run are shown again from the
# perf.data file in the current directory. No instance name or PID is
# needed for that, so this is handled before those are checked.
# Exits with status 3 when there is no perf.data file.
#
if [[ $REPEAT == true ]]; then
    if [[ -f perf.data ]]; then
        echo "Showing perf data from previous perf run"
        sleep 1
        $REPORT_COMMAND
        exit 0
    fi
    echo "No perf.data file from a previous perf run was found"
    echo
    exit 3
fi

#
# From here on new data is going to be collected.
# Check the arguments.
#
# The process is selected by IS-IS instance name or by PID.
# Giving both is an error (exit status 4).
#
if [[ -n $PID ]] && [[ -n $INSTANCE ]]; then
    echo Cannot specify both PID and Instance Name at the same time
    echo
    exit 4
fi

#
# If no Instance Name and no PID was given, use the first instance we find.
#
# The line that follows the last "---" separator in the output of
# "isis_show --cmd summary" is taken; its first word is used as the
# instance name. When that line reports that there are no IS-IS
# instances, it is shown to the user and the script exits with status 5.
#
if [[ -z "$PID" && -z "$INSTANCE" ]]; then
    LINE=$(isis_show --cmd summary | grep -E -A1 '^---' | tail -1)
    if [[ $LINE == *"No IS-IS instances found"* ]]; then
        # Quoted, so the text is printed as-is and never glob-expanded.
        printf '%s\n' "$LINE"
        echo "Specify '-p <pid>' to examine a non IS-IS process"
        echo
        exit 5
    fi
    # First whitespace-delimited word of the line: strip leading
    # whitespace, then everything from the next whitespace onwards.
    INSTANCE=$(printf '%s\n' "$LINE" |
               sed -e 's/^[[:space:]]*//' -e 's/[[:space:]].*$//')
fi

#
# If we have an instance name, find its PID.
#
# The output of isis_show is kept in ISIS_OUTPUT and not in LINES:
# LINES is the variable that describes the terminal height, and
# overwriting it would hand a garbage value to perf and the pager
# whenever LINES is exported in the user's environment.
# The instance name is quoted so it always reaches isis_show as exactly
# one argument.
#
if [[ -n "$INSTANCE" ]]; then
    #
    # Check if the instance exists.
    # Stderr is captured too, so error texts can be matched below.
    #
    ISIS_OUTPUT=$(isis_show --cmd proto-global --instance "$INSTANCE" 2>&1)
    case "$ISIS_OUTPUT" in
        *"No IS-IS instances found"*)
            echo "No IS-IS instances running on this router"
            echo
            exit 6
            ;;
        *"No matches for supplied arguments"*)
            echo "IS-IS Instance '$INSTANCE' is not running on this router"
            echo
            exit 7
            ;;
        *)  ;;
    esac

    #
    # No errors. The instance exists. Get the PID.
    # Take the first line that contains "PID:" and drop the "PID: " label.
    #
    PID=$(isis_show --cmd proto-global --instance "$INSTANCE" |
          grep 'PID:' | head -1 | sed 's/PID: //')

fi

#
# By now a PID must be known: either given with -p, or looked up from
# the instance name. Without one there is nothing to profile
# (exit status 8).
#
[[ -n "$PID" ]] || {
    echo Error: Could not find the process ID
    echo
    exit 8
}

#
# Check if the PID is running.
#
# Only the exit status of ps is of interest; its stdout and stderr are
# both discarded (stdout is redirected first, then stderr onto it).
# $PID is deliberately left unquoted: when it was parsed out of the
# isis_show output it may still carry surrounding whitespace, which
# word splitting removes.
#
if ! ps $PID > /dev/null 2>&1; then
    echo "Process $PID is not running"
    echo
    exit 9
fi

#
# Is the timeout a valid number?
#
# Only a non-empty string of decimal digits is accepted. Anything else
# (for example "5abc" or "1+1") is rejected here, before the value is
# used in the numeric comparison below and handed to sleep.
#
case $DURATION in
    ''|*[!0-9]*)
        echo "Duration '$DURATION' is not a valid number of seconds"
        echo
        exit 10
        ;;
esac

#
# Keep the run between 1 and 600 seconds.
#
if [[ $DURATION -gt 600 || $DURATION -lt 1 ]]; then
    echo "Time $DURATION should be between 1 and 600 seconds"
    echo "If you want a longer duration, use the perf utility directly:"
    echo "  perf record -e cpu-clock --call-graph fp --freq=6997 \ "
    echo "                      -p <pid> sleep <sec> ; perf report --stdio"
    echo
    exit 11
fi

#
# Do the real work.
# Call perf. Once to gather the data. And once to display the data.
#
# $PID is deliberately left unquoted: when it was parsed out of the
# isis_show output it may still carry surrounding whitespace, which
# word splitting removes.
#
echo Running perf CPU profiling on PID $PID for $DURATION seconds

perf record -e cpu-clock --call-graph fp --freq=6997 -p $PID sleep "$DURATION"
RECORD_STATUS=$?

#
# A failed "perf record" must not go unnoticed: the report would then be
# made from whatever perf.data file is lying around. The report is still
# shown, because a partial recording can be useful, but the user is told
# that it may not be what was asked for.
#
if [[ $RECORD_STATUS -ne 0 ]]; then
    echo "Warning: perf record exited with status $RECORD_STATUS"
    echo "The report below may be incomplete or come from an older perf.data file"
    echo
fi

# Expanded unquoted on purpose: the string holds a command plus arguments.
$REPORT_COMMAND

exit 0
