#!/bin/bash
#
# Description: Gather basic HMC problem debugging information
#
# Change Activity:
#   05/22/2003 L. Brocious    - Initial version
#   08/18/2004 L. Brocious    - Serious enhancements and some corrections
#                               . Added command line options
#                               . Use DFC lookup for file locations when possible
#                               . Wait for JVM dumps to complete
#   08/20/2004 L. Brocious    - Add output from ps command to capture full command
#                               line of each process
#   05/17/2005 P. Callaghan   - Get some debugging info even if the Manager is not running.
#   09/30/2005 P. Callaghan   - Add collection of more stuff from the /proc directories and
#                               collect information about the threads in the Manager JVM.
#   10/05/2005 PM Almy        - Defect 517738 (p Series HMC)
#                               . Remove created dump/heap files (all HMCs)
#                               . If i/p Series create entry in the log (iqyylog.log)
#   10/18/2005 J. Jenks       - Add ability to specify the output tar file name (-o option).
#   12/06/2005 J. Jenks       - Update warning message to note that it is normal on z Systems.
#   05/30/2006 P. Callaghan   - Add /var/log/messages file collected.
#   05/30/2006 P. Callaghan   - Run AusMicrocodeLog in background so this script does not hang if logging
#                               is hung (seen in defect 554206).
#   07/28/2006 P. Callaghan   - Add more files.
#   10/08/2006 L. Brocious    - Gather previous versions of the stdout/stderr log file
#   10/20/2006 L. Brocious    - Fix -o and -s command line options, gather "mouse tracks" file
#   12/15/2006 L. Brocious    - Add "ps -efLm" for more thread info, gather resetccfw.log file
#   01/19/2007 L. Brocious    - Add quotes around grep regular expression
#
#STARTUSAGE
#
# This script gathers some basic information about the processes running
# on the system, including new JVM thread/heap dumps of the HMC/CCFW JVM,
# as well as other basic HMC debugging information, and creates a
# compressed tar file containing that information.  This script will
# determine whether a zSeries or pSeries HMC is running. This script 
# assumes that the JVM is the IBM JVM.  You must be logged in as root
# to run this script.
#
# It gathers debugging information such as:
# . Output from commands such as top, ps, pstree and free
# . HMC log file and compressed log files
# . HMC trace file, current and previous
# . JVM thread dump(s)
# . JVM heap dump(s)
# . HMC linemode console output, current and previous
# . HMC "mouse tracks" file showing task execution history
#
# Usage:
#    hmcdebuginfo [-j][-q][-o OUTPUT_FILE][-s][-?]
# -j option ("alljvmdumps") causes all JVM thread dumps and heap dumps
#          to be included, not just the new dumps initiated by this script.
# -q option ("quick") eliminates the wait for the JVM thread/heap dumps
#          if the JVM does not acknowledge the dump request in a timely 
#          fashion.  This script continues without gathering JVM dumps.
#          This option is convenient when the JVM is not responsive at
#          all and you still wish to gather other debugging information.
# -o option specifies that the output tar file created by this script should
#          be named OUTPUT_FILE.  If OUTPUT_FILE does not end with .tgz,
#          .tgz is appended.  The file is always created in /tmp.
# -s option specifies that an attempt should be made to use expect and gdb
#          (both of which must be accessible via the PATH environment variable)
#          to show native stack traces and local variables for the JVM threads.
# -? displays this usage information.
#
#ENDUSAGE
#****************************************************************************

# Function to return the PID of the top-level JVM process.  Returns nothing if
# the top-level JVM process cannot be found.
#
# The "ps -eH" listing is scanned from the top: first for the script that
# starts the JVM, then for the first "java" entry that appears after it.
# Arguments
# . HMC script that starts the JVM (passing "java" matches the first java
#   process in the listing)
# Outputs
# . PID of the matching java process on stdout; nothing if there is no match
getjvmpid() {
   # Keep every working variable local so that a direct call (one that is not
   # wrapped in a command substitution) cannot clobber the caller's variables.
   local searchtarget="$1"   # The first thing to find is the script that starts the JVM
   local pid tty time cmd
   # Loop through "ps -eH" output to find the PID of the top-level JVM process.
   # read drops leading blanks, so an indented command name still compares equal.
   while read -r pid tty time cmd; do
      [ "$cmd" == "$searchtarget" ] || continue
      if [ "$searchtarget" == "java" ]; then    # Found the top-level JVM process
         printf '%s\n' "$pid"   # Return this process' PID to our caller
         break
      fi
      # We found the script that starts the JVM; the next thing to look
      # for is the top-level java process
      searchtarget="java"
   done < <(ps -eH)
}


# Set default values for options
quick=0          # -q: do not keep waiting for a JVM that never acknowledges the dump request
giveUsage=0      # -? (or anything getopts rejects): print the usage text and exit
alljvmdumps=0    # -j: gather every JVM dump, not only the ones this run creates
stackTraces=0    # -s: use expect and gdb to dump the JVM threads
tarfn=''         # -o: output file name.  Cleared explicitly so that a value
                 #     inherited from the environment is never mistaken for -o.

# Parse the options.  getopts reports an unknown option or a missing -o
# argument as '?', so those cases also end up displaying the usage text.
while getopts 'jqo:s?' optname; do
   case "$optname" in
      j)  alljvmdumps=1;;
      q)  quick=1;;
      o)  tarfn="$OPTARG";;
      s)  stackTraces=1;;
      \?) giveUsage=1; break;;
   esac
done

if [ "$giveUsage" -eq 1 ]; then
   # Print out the prologue comments as usage info
   sed -e '/STARTUSAGE/,/ENDUSAGE/ s/^#//' -e '1,/STARTUSAGE/ d' -e '/ENDUSAGE/,$ d' "$0"
   exit 0
fi

# First positional parameter is at index OPTIND.  Shift so that these parms
# are easily accessible to following code.  Shifting by zero is harmless, so
# no special case is needed when no options were given.
shift $((OPTIND - 1))

# Make sure we're root
me=$(whoami)                # Current user's login name
# $me must be quoted: if whoami prints nothing, an unquoted test is malformed,
# evaluates as false, and would let the script carry on without being root.
if [ "$me" != "root" ]; then
   echo "You must be logged in as root to run this script; you are currently logged in as ${me:-unknown}."
   exit 1
fi

sleepamount='20'  # How long to wait between checks for JVM dumps, in seconds
sleepcount=15     # Number of times to sleep while waiting for JVM dumps

now=$(date +%Y%m%d.%H%M%S)      # Build a date/time stamp to identify this run
hostname=$(hostname)            # Name of this system

echo "Gathering FFDC data for system $hostname at $now..."

# Names of output files created by this script.  Include hostname and timestamp for uniqueness.
logfn="/tmp/hmcdebuginfo.$hostname.$now.log"
tracebufsfn="/tmp/hmcdebuginfo.$hostname.$now.showTraceBuf"
if [ -n "$tarfn" ]; then
   # A name was supplied: place it under /tmp and make sure it ends with .tgz
   # (compared without regard to case).  A case pattern is used so that the
   # name is never word-split or glob-expanded while it is being inspected.
   tarfn="/tmp/${tarfn}"
   case "$tarfn" in
      *.[tT][gG][zZ]) ;;
      *) tarfn="${tarfn}.tgz";;
   esac
else
   tarfn="/tmp/hmcdebuginfo.$hostname.$now.tgz"
fi
  
# Start a new log file and fill it with the output of a series of commands.
# Everything written to stdout inside the braces goes to the log; the single
# redirection on the group creates (or truncates) the file.  Each command's
# output is preceded by a banner line naming the command.
{
   echo "*** Gathering info from host $hostname at date.time $now"
   echo "*** Output from 'top -bn1' command follows ***"
   top -bn1
   echo "*** Output from 'ps -Afww' command follows ***"
   ps -Afww
   echo "*** Output from 'pstree -lp' command follows ***"
   pstree -lp
   echo "*** Output from 'free' command follows ***"
   free
   echo "*** Output from 'netstat -atpn' command follows ***"
   netstat -atpn
   echo "*** Output from 'ps -Aww -o pid,start_time,etime,%cpu,args' command follows ***"
   ps -Aww -o pid,start_time,etime,%cpu,args
   echo "*** Output from 'ps -efLm' command follows (LWPs match TIDs (hex) in iqzdtrac.trm and PIDs in javacore) ***"
   ps -efLm
   echo "*** Output from 'df' command follows ***"
   df
   echo "*** Output from 'ipcs' commands follows ***"
   ipcs
   # Run "ipcs -s -i" once for every numeric value found in columns 12-22 of
   # the "ipcs -s" listing.
   ipcs -s | cut -c 12-22 - | grep '[0-9]' | xargs -n1 ipcs -s -i
   echo "*** Output from command to display the status of each PID follows ***"
} > "$logfn"
# Unlike the commands above, find's error output is captured in the log too.
find /proc/ -name 'status' -exec cat {} \; >> "$logfn" 2>&1

# Work out which HMC flavor (zSeries or i/pSeries) is present and record the
# directories and files that matter for it.  Use an iqzddfc.trm lookup
# whenever possible.  iphmc ends up as 1 on an i/pSeries HMC and 0 otherwise.
if [[ -d /var/hsc/log/ ]]; then
   # i/pSeries HMC is installed
   iphmc=1
   hmctopdir='/opt/ccfw'
   export CONSOLE_PATH="${hmctopdir}/"
   source "${hmctopdir}/hmcfunctions"
   jvmscript='runccfw'
   consolelog="${hmctopdir}/ccfw.out"
   previousconsolelogs="${hmctopdir}/ccfw.out.*"
   ###platformspecificfiles="$(queryFileLocation iqzdtrac.trm)iqzdtrac.trm.previous" # From startccfw
   platformspecificfiles='/var/hsc/log/resetccfw.log'
else
   # zSeries HMC is running
   iphmc=0
   hmctopdir='/console'                       # Top-level HMC code/data directory
   export CONSOLE_PATH="${hmctopdir}/"        # Needed by hmcfunctions
   source "${hmctopdir}/hmcfunctions"         # Get access to common function definitions
   jvmscript='startdriver'                    # The script that starts the JVM
   consolelog='/var/log/hmc.log'              # Line mode output from startup scripts and JVM
   previousconsolelogs='/var/log/hmc.log.*'   # Earlier copies of that line mode output
   platformspecificfiles=''                   # Anything specific to this variety of HMC
fi

# More directories/files.  These are the same on both zSeries and i/pSeries.
# Each location is the output of queryFileLocation with the file name appended
# directly, so this code relies on that output ending in a directory separator.
# Wildcards kept inside these values are expanded later, where the variables
# are used unquoted.  Wildcards passed TO queryFileLocation are quoted so that
# a matching file in the current directory cannot change the lookup key.
tracefile="$(queryFileLocation iqzdtrac.trm)iqzdtrac.trm"   # Trace file
previoustracefiles="$tracefile.*"                        # Trace files from previous runs
logfile="$(queryFileLocation iqyylog.log)iqyylog.log" # Log file
compressedlogfiles="$(queryFileLocation 'iqyycom*.log')iqyycom*.log" # Compressed log files
jvmdumpdir="$(queryFileLocation jvmdumptargetdirectory)" # JVM thread/heap dumps dir
jvmthreaddumps="${jvmdumpdir}javacore*"                  # JVM thread dumps
jvmheapdumps="${jvmdumpdir}heapdump*"                    # JVM heap dumps
corefiles="$(queryFileLocation 'core.*')core*"           # Core dumps
systemmessages='/var/log/messages'                       # System messages
tracefedcfiles='/ffdc/trace/tracefedc.trm /tmp/tracefedc.trm' # Primary and alternate trace FEDC files
mousetracksfile="$(queryFileLocation actzuict.dat)actzuict.dat"  # Task execution history file

# Note the step in the log, then save the output of "showTraceBuf all"
# (stdout and stderr together) in its own working file.
echo "*** Getting all in-process trace buffers ***" >> "$logfn"
"${hmctopdir}/bin/base/showTraceBuf" all &> "$tracebufsfn"

# Request the JVM to take a thread dump and heap dump, if configured to do so and get the file descriptors for
# the main PID in the JVM.  Optionally (-s) also dump the JVM threads using expect and gdb.

# Print the console log lines that were added after the JVM was signalled,
# i.e. everything that follows the first $filesize lines.  tail is used
# because it also behaves correctly when $filesize is 0 (the whole log is new).
newconsolelines() {
   tail -n "+$((filesize + 1))" "$consolelog"
}

filesset=0   # Becomes 1 once the JVM reports that its dumps are complete
heapdump=''  # Heap dump file name(s) reported by the JVM
javacore=''  # Thread dump file name(s) reported by the JVM
pid=$(getjvmpid "$jvmscript")
if [ -z "$pid" ]; then
   echo "Unable to find top-level JVM process. Skipping getting a JVM thread/heap dump and file descriptors." | tee -a "$logfn"
else
   echo "*** Output from command to display the file descriptors of the JVM main PID ***" >> "$logfn"
   ls -la "/proc/$pid/fd" >> "$logfn" 2>&1
   # Number of lines in console log before we signal JVM.  An unreadable log
   # counts as empty so that the line arithmetic never sees a blank value.
   filesize=0
   if [ -r "$consolelog" ]; then
      filesize=$(( $(wc -l < "$consolelog") ))
   fi
   echo "About to signal process $pid to create JVM thread/heap dumps." | tee -a "$logfn"
   kill -s QUIT "$pid"  # Send the JVM a SIGQUIT signal
   killstatus=$?        # Save it now; the test below would overwrite $?
   if [ "$killstatus" -ne 0 ]; then
      echo "Attempt to signal PID $pid failed with exit status $killstatus" | tee -a "$logfn"
   else   # Signal sent; wait for thread/heap dump creation
      echo "Signal successfully sent; waiting $sleepamount seconds for dump creation..." | tee -a "$logfn"
      sleep "$sleepamount"
      # This code assumes that the JVM is an IBM JVM.  It looks for a particular sequence
      # of JVM messages in the linemode console log.  Only the lines added to the console
      # log after the JVM was signalled are searched for JVM message identifiers.
      # JVMDG217 is issued when the JVM acknowledges the SIGQUIT; JVMDG318 identifies the
      # heapdump file; JVMDG304 identifies the thread dump file; JVMDG215 indicates that
      # the JVM has completed handling the SIGQUIT.  We loop with a delay until the JVM
      # completes the dumps or runs out of time.
      finished=0
      loopcount=0
      while [[ $finished -eq 0 && $loopcount -lt $sleepcount ]]; do
         if newconsolelines | grep -q 'JVMDG215'; then
            echo "JVM dumps are complete." | tee -a "$logfn"
            # Get the filenames of the heap dump and thread dump from the JVM messages
            # (the last word of each matching message line)
            heapdump=$(newconsolelines | awk '/JVMDG318/ {print $NF}')
            javacore=$(newconsolelines | awk '/JVMDG304/ {print $NF}')
            finished=1
            filesset=1
         else
            if newconsolelines | grep -q 'JVMDG217'; then # JVM got signal
               echo "Dumps are not yet complete; waiting another $sleepamount seconds..." | tee -a "$logfn"
            else # JVM has not acknowledged signal yet
               if [ "$quick" == 1 ]; then    # User does not want to wait for unresponsive JVM
                  echo "Not waiting on the JVM any longer; continuing to gather other files..." | tee -a "$logfn"
                  finished=1   # Suppresses the timeout message below
                  break
               else    # User is willing to wait; sleep and check again...
                  echo "JVM has not acknowledged signal; waiting another $sleepamount seconds..." | tee -a "$logfn"
               fi
            fi
            sleep "$sleepamount"
            loopcount=$((loopcount + 1))
         fi
      done
      if [ "$finished" -eq 0 ]; then
         echo "Timeout occurred waiting for JVM to create thread/heap dump" | tee -a "$logfn"
      fi
   fi

   if [ "$stackTraces" -eq 1 ]; then
      # Dump information about the threads of the JVM.  All three programs
      # must be found on the PATH; "type -P" prints nothing when one is not.
      expectProgram=$(type -P expect)
      gdbProgram=$(type -P gdb)
      javaProgram=$(type -P java)
      if [ -z "$expectProgram" ]; then
         echo "Unable to find the program to automate dumping the threads of the JVM processes.  (This is normal operation on z Systems.)" | tee -a "$logfn"
      elif [ -z "$gdbProgram" ]; then
         echo "Unable to find the gdb program." | tee -a "$logfn"
      elif [ -z "$javaProgram" ]; then
         echo "Unable to find the java program." | tee -a "$logfn"
      else
         echo "Dumping information about the JVM threads which may take a short while. Please wait." | tee -a "$logfn"
         # The dumpThreads file is fed to expect on stdin ("-f -")
         "$expectProgram" -f - "$gdbProgram" "$javaProgram" "$pid" < "$hmctopdir/dumpThreads" >> "$logfn"
      fi
   fi
fi


# Build two space-separated lists: the files that go into the tar file and
# other files that are merely listed in the log.  The JVM thread/heap dump
# wildcards go into the first list when the user asked for all JVM dumps and
# into the second list otherwise.
filestogather="$logfn $tracefile $previoustracefiles $logfile $compressedlogfiles $consolelog $previousconsolelogs $javacore $heapdump $platformspecificfiles $tracebufsfn $systemmessages $tracefedcfiles $mousetracksfile"
otherfiles="$corefiles $hmctopdir/core*"
if [ "$alljvmdumps" == 1 ]; then # User wants all JVM thread/heap dumps, not just new ones
   filestogather="${filestogather} ${jvmthreaddumps} ${jvmheapdumps}"
elif [ "$alljvmdumps" == 0 ]; then
   otherfiles="${otherfiles} ${jvmthreaddumps} ${jvmheapdumps}"
fi

# Record both listings in the log.  The list variables are left unquoted on
# purpose so that they split into separate names and their wildcards expand.
{
   echo "*** Listing of files to gather ***"
   ls -l $filestogather 2>&1
   echo "*** Listing of other interesting files ***"
   ls -l $otherfiles 2>&1
} >> "$logfn"

# Create the compressed tar file and report the outcome.
echo "About to create tar file containing debugging information..." | tee -a "$logfn"
# $filestogather is left unquoted on purpose: it is a space-separated list
# whose entries may contain wildcards that must be expanded here.
tar cvzf "$tarfn" $filestogather >> "$logfn" 2>&1
tarstatus=$?   # Save tar's status now; the file test below would overwrite $?
# Success is judged by the existence of the tar file rather than by tar's exit
# status, since the list may name files that do not exist on this system.
if [ -f "$tarfn" ]; then
   echo "tar file $tarfn has been created."
else
   echo "Attempt to create tar file $tarfn failed with exit status $tarstatus" | tee -a "$logfn"
   exit 5
fi

# Once the JVM has reported a completed javacore and/or heapdump, delete those
# files: they are already in our output file, and leaving them behind would
# let them pile up and consume large amounts of disk space.
if [ "$filesset" -eq 1 ]; then
   # Unquoted on purpose: each variable is split into separate file names.
   [ -z "$javacore" ] || rm $javacore
   [ -z "$heapdump" ] || rm $heapdump
fi

# On an i/p Series HMC, write a log entry recording whether the dumps were taken.
if [ "$iphmc" -eq 1 ]; then
    echo "Running on i/p Series HMC.  Logging the event"
    lcmd="com.ibm.hsc.common.util.AusMicrocodeLog"
    ltext="hmcdebuginfo - dumps taken"
    if [ "$filesset" -eq 0 ]; then
        ltext="hmcdebuginfo - timeout reached"
    fi
    # Started in the background so that this script does not hang if logging
    # is hung.  $ltext is deliberately unquoted and is passed as separate words.
    /usr/websm/bin/wjava $lcmd $ltext &
fi

# The trace buffer capture and the log are both inside our output file by now,
# so the working copies can be deleted.
rm "$tracebufsfn" "$logfn"

exit 0
