#!/bin/bash
#
# Description: Gather basic HMC problem debugging information
#
# Change Activity:
#   05/22/2003 L. Brocious    - Initial version
#   08/18/2004 L. Brocious    - Serious enhancements and some corrections
#                               . Added command line options
#                               . Use DFC lookup for file locations when possible
#                               . Wait for JVM dumps to complete
#   08/20/2004 L. Brocious    - Add output from ps command to capture full command
#                               line of each process
#   05/17/2005 P. Callaghan   - Get some debugging info even if the Manager is not running.
#   09/30/2005 P. Callaghan   - Add collection of more stuff from the /proc directories and
#                               collect information about the threads in the Manager JVM.
#   10/05/2005 PM Almy        - Defect 517738 (p Series HMC)
#                               . Remove created dump/heap files (all HMCs)
#                               . If i/p Series create entry in the log (iqyylog.log)
#   10/18/2005 J. Jenks       - Add ability to specify the output tar file name (-o option).
#   12/06/2005 J. Jenks       - Update warning message to note that it is normal on z Systems.
#   05/30/2006 P. Callaghan   - Add /var/log/messages file collected.
#   05/30/2006 P. Callaghan   - Run AusMicrocodeLog in background so this script does not hang if logging
#                               is hung (seen in defect 554206).
#   07/28/2006 P. Callaghan   - Add more files.
#   10/08/2006 L. Brocious    - Gather previous versions of the stdout/stderr log file
#   10/20/2006 L. Brocious    - Fix -o and -s command line options, gather "mouse tracks" file
#   12/15/2006 L. Brocious    - Add "ps -efLm" for more thread info, gather resetccfw.log file
#   01/19/2007 L. Brocious    - Add quotes around grep regular expression
#   08/16/2007 L. Brocious    - Remove use of wjava on ipHMC, since it doesn't exist on ecl320 and later,
#                               and collect the task recorder files
#   08/22/2007 L. Brocious    - Add support for JVM 1.5.0 and later (heapdump/javacore messages)
#   12/04/2007 L. Brocious    - Add -d option for description
#   02/24/2008 L. Brocious    - Add -l list file support
#   02/24/2008 L. Brocious    - Gather .version file and hmcdebug.jar (ipHMC only)
#   02/24/2008 L. Brocious    - Gather /proc/distro_id for MCP level information
#   06/18/2008 B. Myers       - Add support for JVM 1.6.0 heapdump/javacore messages
#   07/08/2008 S. Feustel     - Add marker file to suppress PA collection of heapdump/javacore generated by this script (668916)
#
#STARTUSAGE
#
# This script gathers some basic information about the processes running
# on the system, including new JVM thread/heap dumps of the HMC/CCFW JVM,
# as well as other basic HMC debugging information, and creates a
# compressed tar file containing that information.  This script will
# determine whether a zSeries or pSeries HMC is running. This script 
# assumes that the JVM is the IBM JVM.  You must be logged in as root
# to run this script.
#
# It gathers debugging information such as:
# . Output from commands such as top, ps, pstree and free
# . HMC log file and compressed log files
# . HMC trace file, current and previous
# . JVM thread dump(s)
# . JVM heap dump(s)
# . HMC linemode console output, current and previous
# . HMC "mouse tracks" file showing task execution history
#
# Usage:
#    hmcdebuginfo [-d DESCRIPTION][-j][-q][-o OUTPUT_FILE][-s][-l LIST_FILE][-?]
# -d option specifies a description to be included in the output file
#          and, if created, on the log entry.  Note that this is a
#          single argument and must be properly quoted.  This is useful
#          to document who ran this script and/or why.
# -j option ("alljvmdumps") causes all JVM thread dumps and heap dumps
#          to be included, not just the new dumps initiated by this script.
# -q option ("quick") eliminates the wait for the JVM thread/heap dumps
#          if the JVM does not acknowledge the dump request in a timely 
#          fashion.  This script continues without gathering JVM dumps.
#          This option is convenient when the JVM is not responsive at
#          all and you still wish to gather other debugging information.
# -o option specifies that the output tar file created by this script should
#          be named OUTPUT_FILE.  If OUTPUT_FILE does not end with .tgz,
#          .tgz is appended.  The file is always created in /tmp.
# -s option specifies that an attempt should be made to use expect and gdb
#          (both of which must be accessible via the PATH environment variable)
#          to show native stack traces and local variables for the JVM threads.
# -l option specifies the name of a "list file" which identifies additional
#          files to be collected.  Blank lines and lines that begin with a '#'
#          are ignored.  The syntax for all other lines in the list file is:
#             FILE filename
#             DFCFILE filename dfckey
#          where filename is the name of the file to be collected.  This
#                        filename may include wildcards such as "*".
#                 dfckey is the key to be used for a "DFC lookup" (in the 
#                        iqzddfc.trm file) of filename.
# -? displays this usage information.
#
#ENDUSAGE
#****************************************************************************

# Function to print the PID of the top-level JVM process.  Prints nothing if
# the top-level JVM process cannot be found.
#
# The "ps -eH" listing is scanned from the top for a process whose command name
# equals the given script name; the first "java" process listed after that
# script is reported.  If the given name is itself "java", the first "java"
# process in the listing is reported.
#
# Arguments
# . HMC script that starts the JVM
# Output
# . PID of the JVM process on stdout (empty if not found)
getjvmpid() {
   # All working variables are local so that calling this function never
   # disturbs same-named variables in the rest of the script.
   local searchtarget="$1"   # What to look for next: first the start script, then "java"
   local pid tty cputime cmd

   # read strips the leading blanks that "ps -eH" uses to indent child processes
   ps -eH | while read -r pid tty cputime cmd; do
      [[ "$cmd" == "$searchtarget" ]] || continue

      if [[ "$searchtarget" == "java" ]]; then
         # Found the top-level JVM process; hand its PID to our caller
         printf '%s\n' "$pid"
         break
      fi

      # We found the script that starts the JVM; the next thing to look
      # for is the top-level java process
      searchtarget="java"
   done
}


# Set default values for options.  Every variable that an option can set is
# initialised here so that a same-named variable inherited from the caller's
# environment is never mistaken for a command line option.
quick=0              # -q: stop waiting if the JVM does not acknowledge the dump request
giveUsage=0          # -? (or an option getopts rejects): print the usage text and exit
alljvmdumps=0        # -j: collect all JVM dumps, not just the new ones
stackTraces=0        # -s: use expect and gdb to show native stack traces of the JVM threads
description=         # -d: description recorded in the output
tarfn=               # -o: name of the output tar file
listfile=            # -l: list file naming additional files to collect
filesfromlistfile=   # Files named by the list file; filled in later

# Parse the options
while getopts 'd:jqo:sl:?' optname; do
   case "$optname" in
      d)  description="$OPTARG" ;;
      j)  alljvmdumps=1 ;;
      q)  quick=1 ;;
      o)  tarfn="$OPTARG" ;;
      s)  stackTraces=1 ;;
      l)  listfile="$OPTARG" ;;
      \?) giveUsage=1; break ;;
   esac
done

if [[ "$giveUsage" -eq 1 ]]; then
   # Print out the prologue comments as usage info: keep only the lines between
   # the two usage markers in this file and strip their leading '#'
   sed -e '/STARTUSAGE/,/ENDUSAGE/ s/^#//' -e '1,/STARTUSAGE/ d' -e '/ENDUSAGE/,$ d' "$0"
   exit 0
fi

# First positional parameter is at index OPTIND.  Shift so that these parms
# are easily accessible to following code.  (Shifting by zero is a no-op.)
shift $((OPTIND - 1))

# Make sure we're root.  The comparison is quoted so that an empty answer from
# whoami counts as "not root" instead of producing a test syntax error that
# would let the script carry on.
me=$(whoami)                # Current user's login name
if [[ "$me" != "root" ]]; then
   echo "You must be logged in as root to run this script; you are currently logged in as $me."
   exit 1
fi

# A list file is optional, but when one was named it must be an existing regular file
if [[ -n "$listfile" && ! -f "$listfile" ]]; then
   echo "List file not found: $listfile"
   exit 1
fi

# Timing for the wait on the JVM dumps
sleepamount=20   # Seconds to wait between checks for JVM dumps
sleepcount=15    # Maximum number of such waits

# Identify this run by system name and date/time stamp
now=$(date '+%Y%m%d.%H%M%S')
hostname=$(hostname)

echo "Gathering FFDC data for system $hostname at $now..."

# Output files created by this script.  The default names include the hostname
# and the timestamp for uniqueness.
outputstem="/tmp/hmcdebuginfo.$hostname.$now"
logfn="$outputstem.log"
tracebufsfn="$outputstem.showTraceBuf"
if [ -z "$tarfn" ]; then
   tarfn="$outputstem.tgz"
else
   # A tar file name was given with -o: it always goes in /tmp, and .tgz is
   # appended unless the name already ends with it (in any letter case)
   tarfn="/tmp/$tarfn"
   case "$tarfn" in
      *.[Tt][Gg][Zz]) ;;
      *) tarfn="$tarfn.tgz" ;;
   esac
fi
  
# Start a new log file and fill it with a snapshot of the system: each command's
# output is preceded by a banner line naming the command.  Everything written
# to stdout inside the braces lands in the (freshly truncated) log file.
{
   echo "*** Gathering info from host $hostname at date.time $now"
   if [ -n "$description" ]; then
      echo "*** Description from hmcdebuginfo command='$description'"
   fi

   echo "*** Output from 'top -bn1' command follows ***"
   top -bn1

   echo "*** Output from 'ps -Afww' command follows ***"
   ps -Afww

   echo "*** Output from 'pstree -lp' command follows ***"
   pstree -lp

   echo "*** Output from 'free' command follows ***"
   free

   echo "*** Output from 'netstat -atpn' command follows ***"
   netstat -atpn

   echo "*** Output from 'ps -Aww -o pid,start_time,etime,%cpu,args' command follows ***"
   ps -Aww -o pid,start_time,etime,%cpu,args

   echo "*** Output from 'ps -efLm' command follows (LWPs match TIDs (hex) in iqzdtrac.trm and PIDs in javacore) ***"
   ps -efLm

   echo "*** Output from 'df' command follows ***"
   df

   echo "*** Output from 'ipcs' commands follows ***"
   ipcs
   # Show the details of each semaphore set: take the numeric values found in
   # columns 12-22 of the "ipcs -s" listing and query them one at a time
   ipcs -s | cut -c 12-22 - | grep '[0-9]' | xargs -n1 ipcs -s -i

   echo "*** Output from command to display the status of each PID follows ***"
   # Errors from find/cat are wanted in the log as well
   find /proc/ -name 'status' -exec cat {} \; 2>&1
} > "$logfn"

# Work out which kind of HMC this is (zSeries or i/pSeries) and set the
# variables naming the important directories and files we want to collect.
# Use an iqzddfc.trm lookup whenever possible.
# The presence of /var/hsc/log/ is what identifies an i/pSeries HMC.
if [[ ! -d /var/hsc/log/ ]]; then
   # zSeries HMC/SE is running
   iphmc=0
   hmctopdir=/console                           # Top-level HMC code/data directory
   export CONSOLE_PATH="$hmctopdir/"            # Needed by hmcfunctions
   . "$hmctopdir/hmcfunctions"                  # Get access to common function definitions
   jvmscript=startdriver                        # The script that starts the JVM
   consolelog=/var/log/hmc.log                  # Line mode output from startup scripts and JVM
   previousconsolelogs='/var/log/hmc.log.*'     # Earlier versions of that line mode output
   platformspecificfiles=                       # Anything specific to this variety of HMC
else
   # i/pSeries HMC is installed
   iphmc=1
   hmctopdir=/opt/ccfw                          # Top-level HMC code/data directory
   export CONSOLE_PATH="$hmctopdir/"            # Needed by hmcfunctions
   . "$hmctopdir/hmcfunctions"                  # Get access to common function definitions
   jvmscript=runccfw                            # The script that starts the JVM
   consolelog="$hmctopdir/ccfw.out"             # Line mode output
   previousconsolelogs="$hmctopdir/ccfw.out.*"  # Earlier versions of that line mode output
   ###platformspecificfiles="$(queryFileLocation iqzdtrac.trm)iqzdtrac.trm.previous" # From startccfw
   platformspecificfiles='/var/hsc/log/resetccfw.log /home/hscroot/.version /usr/websm/codebase/pluginjars/hmcdebug.jar'
fi

# Remaining directories/files; these are the same on both zSeries and i/pSeries.
# queryFileLocation supplies the directory prefix, to which the file name (or
# file name pattern) is appended.

# Trace file, and trace files from previous runs
tracefile=$(queryFileLocation iqzdtrac.trm)iqzdtrac.trm
previoustracefiles="${tracefile}.*"

# Log file and compressed log files
logfile=$(queryFileLocation iqyylog.log)iqyylog.log
compressedlogfiles=$(queryFileLocation iqyycom*.log)'iqyycom*.log'

# JVM thread/heap dumps directory and the dumps within it
jvmdumpdir=$(queryFileLocation jvmdumptargetdirectory)
jvmthreaddumps="${jvmdumpdir}javacore*"
jvmheapdumps="${jvmdumpdir}heapdump*"

# Core dumps
corefiles=$(queryFileLocation core.*)'core*'

# System messages
systemmessages=/var/log/messages

# Primary and alternate trace FEDC files
tracefedcfiles='/ffdc/trace/tracefedc.trm /tmp/tracefedc.trm'

# Task execution history file
mousetracksfile=$(queryFileLocation actzuict.dat)actzuict.dat

# Task event recorder files
taskrecorderfiles=$(queryFileLocation iqzdtrac.trm)'taskRecorder*.trm'

# MCP level information.  This is a weird "file" in /proc, so can't copy it;
# its contents are written into our log instead.
distrofile=/proc/distro_id
{
   echo "*** Contents of $distrofile follows ***"
   cat "$distrofile"
} >> "$logfn"

# If a list file was specified on the command line, get the list of files from it.
#
# Each line of the list file is split into a directive, a file name and an
# optional DFC key:
#   FILE filename            - collect filename as given
#   DFCFILE filename dfckey  - collect filename from the directory that
#                              queryFileLocation reports for dfckey
# Blank lines and lines starting with '#' are ignored.  Any other directive is
# reported on stderr and ends the script with exit status 1.
#
# The list file is read with "read -r" so that backslashes are kept as written,
# and the loop condition also accepts a last line that has no trailing newline
# (read reports end-of-file for such a line but still fills in the variables).
if [[ -n "$listfile" ]]; then
   filesfromlistfile=$(
      while read -r cmd fn dfckey || [[ -n "$cmd" ]]; do
         # Ignore blank lines or lines starting with a hash sign
         if [[ -z "$cmd" || "$cmd" == '#'* ]]; then
            continue
         fi

         # Check the directive on this line and handle accordingly
         if [[ "$cmd" == "FILE" ]]; then
            # Unquoted on purpose: wildcards that match existing files are expanded here
            echo $fn
         elif [[ "$cmd" == "DFCFILE" ]]; then
            # Quoted: any wildcard is kept as written in the resulting list
            echo "$(queryFileLocation $dfckey)$fn"
         else
            echo "Invalid directive in $listfile: $cmd" >&2
            exit 1   # Leaves only the command substitution; handled just below
         fi
      done < "$listfile"
   ) || exit 1   # Some error with the list file
fi

# Dump all the in-process trace buffers into their own working file; whatever
# showTraceBuf writes to stderr goes into that file as well
printf '%s\n' "*** Getting all in-process trace buffers ***" >> "$logfn"
"$hmctopdir/bin/base/showTraceBuf" all > "$tracebufsfn" 2>&1

# On an i/pSeries HMC, create a marker file that exists while javacores and
# heapdumps are generated
if [[ "$iphmc" -eq 1 ]]; then
   markerfile=$(queryFileLocation paconfig)active_hmcdebuginfo
   echo "About to create active hmcdebuginfo marker file $markerfile" | tee -a "$logfn"
   # Any complaint from touch or chmod is recorded in the log
   {
      touch "$markerfile"
      chmod 666 "$markerfile"
   } >> "$logfn" 2>&1
fi

# Request the JVM to take a thread dump and heap dump, if configured to do so, and get the file descriptors for
# the main PID in the JVM.
#
# Variables left behind for the rest of the script:
#   filesset - 1 once the JVM has reported that its dumps are complete, otherwise 0
#   heapdump - heap dump file name(s) taken from the JVM messages (set when filesset is 1)
#   javacore - thread dump file name(s) taken from the JVM messages (set when filesset is 1)
filesset=0
pid=$(getjvmpid "$jvmscript")
if [[ -z "$pid" ]]; then
   echo "Unable to find top-level JVM process. Skipping getting a JVM thread/heap dump and file descriptors." | tee -a "$logfn"
else
   ccfwjvm=$(ls -l "/proc/$pid/exe" | awk '{print $NF}')
   echo "CCFW java executable is $ccfwjvm" >> "$logfn"
   echo "*** Output from command to display the file descriptors of the JVM main PID ***" >> "$logfn"
   ls -la "/proc/$pid/fd" >> "$logfn" 2>&1

   # Number of lines in console log before we signal JVM.  Only the lines after
   # these are searched for JVM messages; "tail -n +N" is used for that because
   # it also behaves when the console log is empty (N is 1) or could not be
   # counted.
   filesize=$(wc -l "$consolelog" | awk '{print $1}')
   firstnewline=$(( ${filesize:-0} + 1 ))

   echo "About to signal process $pid to create JVM thread/heap dumps." | tee -a "$logfn"
   kill -s sigquit "$pid"  # Send the JVM a SIGQUIT signal
   killstatus=$?           # Save it now: the test below would overwrite $?
   if [[ $killstatus -ne 0 ]]; then
      echo "Attempt to signal PID $pid failed with exit status $killstatus" | tee -a "$logfn"
   else   # Signal sent; wait for thread/heap dump creation
      echo "Signal successfully sent; waiting $sleepamount seconds for dump creation..." | tee -a "$logfn"
      sleep "$sleepamount"

      # This code assumes that the JVM is an IBM JVM.  It looks for a particular sequence
      # of JVM messages in the linemode console log.  Only the lines added to the console
      # log after the JVM was signalled are searched for JVM message identifiers:
      #   <= 1.4.2    >= 1.5.0
      # - JVMDG217 | JVMDUMP006I | is issued when the JVM acknowledges the SIGQUIT
      # - JVMDG318 | JVMDUMP007I | identifies the heapdump file
      # - JVMDG304 | JVMDUMP007I | identifies the thread dump file
      # - JVMDG215 | JVMDUMP013I | indicates that the JVM has completed handling the SIGQUIT
      #
      # We loop with a delay until the JVM completes the dumps or runs out of time.

      # First, get the version number of the JVM that ccfw is running, by running that same
      # java executable with the "-version" option
      jvmversion=$("$ccfwjvm" -version 2>&1 | grep 'java version' | awk '{print $3}' | sed -e 's/"//g')
      if [[ -z "$jvmversion" ]]; then  # Unable to get it with "-version"; assume 1.5.0
         jvmversion='1.5.0'
         echo "Attempt to determine JVM version failed; assuming $jvmversion" | tee -a "$logfn"
      fi

      # Based on the JVM version, set variables to the message identifiers we'll search for.
      # (A pattern match is used so that nothing is printed while deciding.)
      case "$jvmversion" in
         1.[34]*)                                  # JVM 1.3 or 1.4
            ackmsg='JVMDG217'
            heapdumpmsg='JVMDG318'
            threaddumpmsg='JVMDG304'
            donemsg='JVMDG215'
            ;;
         *)                                        # JVM 1.5 or later
            ackmsg='JVMDUMP006I'
            heapdumpmsg='JVMDUMP007I JVM Requesting Heap'
            threaddumpmsg='JVMDUMP007I JVM Requesting Java'
            donemsg='JVMDUMP013I'
            ;;
      esac

      # Search the new information in the console log file for the message identifiers above
      finished=0
      loopcount=0
      while [[ $finished -eq 0 && $loopcount -lt $sleepcount ]]; do
         if tail -n +"$firstnewline" "$consolelog" | grep -- "$donemsg" >/dev/null; then # All done
            echo "JVM dumps are complete." | tee -a "$logfn"
            # Get the filenames of the heap dump and thread dump from the JVM messages:
            # the name is the last word of the message, wrapped in single quotes
            heapdump=$(tail -n +"$firstnewline" "$consolelog" | sed -e "/$heapdumpmsg/!d"   | awk '{print $NF}' | sed -e "s/'//g")
            javacore=$(tail -n +"$firstnewline" "$consolelog" | sed -e "/$threaddumpmsg/!d" | awk '{print $NF}' | sed -e "s/'//g")
            finished=1
            filesset=1
         else
            if tail -n +"$firstnewline" "$consolelog" | grep -- "$ackmsg" >/dev/null; then # JVM got signal
               echo "Dumps are not yet complete; waiting another $sleepamount seconds..." | tee -a "$logfn"
            else # JVM has not acknowledged signal yet
               if [[ "$quick" -eq 1 ]]; then    # User does not want to wait for unresponsive JVM
                  echo "Not waiting on the JVM any longer; continuing to gather other files..." | tee -a "$logfn"
                  finished=1
                  break
               else    # User is willing to wait; sleep and check again...
                  echo "JVM has not acknowledged signal; waiting another $sleepamount seconds..." | tee -a "$logfn"
               fi
            fi
            sleep "$sleepamount"
            loopcount=$((loopcount + 1))
         fi
      done
      if [[ $finished -eq 0 ]]; then
         echo "Timeout occurred waiting for JVM to create thread/heap dump" | tee -a "$logfn"
      fi
   fi

   if [[ "$stackTraces" -eq 1 ]]; then
      # Dump information about the threads of the JVM.  This needs expect, gdb and
      # java to be found via PATH; the first one that is missing is reported.
      expectProgram=$(command -v expect)
      gdbProgram=$(command -v gdb)
      javaProgram=$(command -v java)
      if [[ -z "$expectProgram" ]]; then
         echo "Unable to find the program to automate dumping the threads of the JVM processes.  (This is normal operation on z Systems.)" | tee -a "$logfn"
      elif [[ -z "$gdbProgram" ]]; then
         echo "Unable to find the gdb program." | tee -a "$logfn"
      elif [[ -z "$javaProgram" ]]; then
         echo "Unable to find the java program." | tee -a "$logfn"
      else
         echo "Dumping information about the JVM threads which may take a short while. Please wait." | tee -a "$logfn"
         # The dumpThreads script is fed to expect on stdin ("-f -")
         cat "$hmctopdir/dumpThreads" | "$expectProgram" -f - "$gdbProgram" "$javaProgram" "$pid" >> "$logfn"
      fi
   fi
fi


# Build two lists of file names/patterns:
#   filestogather - everything that goes into the tar file
#   otherfiles    - files that are only listed in the log, not collected
# Both lists are used unquoted so that the patterns in them are expanded.
filestogather="$logfn $tracefile $previoustracefiles $logfile $compressedlogfiles"
filestogather="$filestogather $consolelog $previousconsolelogs $javacore $heapdump"
filestogather="$filestogather $platformspecificfiles $tracebufsfn $systemmessages"
filestogather="$filestogather $tracefedcfiles $mousetracksfile $taskrecorderfiles $filesfromlistfile"
otherfiles="$corefiles $hmctopdir/core*"

# The complete set of JVM thread/heap dumps is collected when the user asked
# for all of them (not just the new ones); otherwise it is merely listed.
case "$alljvmdumps" in
   1) filestogather="$filestogather $jvmthreaddumps $jvmheapdumps" ;;
   0) otherfiles="$otherfiles $jvmthreaddumps $jvmheapdumps" ;;
esac

# Record both listings, including any complaints from ls, in the log
{
   echo "*** Listing of files to gather ***"
   ls -l $filestogather 2>&1

   echo "*** Listing of other interesting files ***"
   ls -l $otherfiles 2>&1
} >> "$logfn"

# Create the compressed tar file from the list of files to gather.  tar's file
# listing and error messages go into the log.
echo "About to create tar file containing debugging information..." | tee -a "$logfn"
tar cvzf "$tarfn" $filestogather >> "$logfn" 2>&1   # $filestogather is unquoted so its patterns expand
tarstatus=$?   # Save it now: the test below would overwrite $?

# Success is judged by the tar file existing, not by tar's exit status.
# NOTE(review): presumably because tar reports failure whenever one of the
# listed files/patterns does not exist, even though it still writes the tar
# file - confirm.
if [[ -f "$tarfn" ]]; then
   echo "tar file $tarfn has been created."
else
   echo "Attempt to create tar file $tarfn failed with exit status $tarstatus" | tee -a "$logfn"
   # Do not leave the marker file behind when giving up
   if [[ "$iphmc" -eq 1 ]]; then
      echo "About to remove active hmcdebuginfo marker file $markerfile" | tee -a "$logfn"
      rm $markerfile >> "$logfn" 2>&1
   fi
   exit 5
fi

# The javacore and/or heapdump that the JVM created for us are now inside our
# output file, so delete them here; otherwise they would accumulate and use up
# large amounts of disk space.  Nothing is deleted unless the JVM reported that
# the dumps completed.
if [[ "$filesset" -eq 1 ]]; then
   for dumpnames in "$javacore" "$heapdump"; do
      if [[ -n "$dumpnames" ]]; then
         rm $dumpnames   # Unquoted: the value may hold more than one file name
      fi
   done
fi

# With any heapdump or javacore gone, the marker file is no longer needed
if [[ "$iphmc" -eq 1 ]]; then
   echo "About to remove active hmcdebuginfo marker file $markerfile" | tee -a "$logfn"
   rm $markerfile >> "$logfn" 2>&1
fi

# On an i/p Series HMC, create a log entry recording whether the JVM dumps were
# taken or the wait for them ended without dumps.
if [[ "$iphmc" -eq 1 ]]; then
    echo "Running on i/p Series HMC.  Logging the event"
    lcmd="com.ibm.hsc.common.util.AusMicrocodeLog"

    # A description, when present, is appended to the log text after ": "
    if [[ -n "$description" ]]; then
        description=": $description"
    fi

    # Compose the log text
    if [[ "$filesset" -eq 0 ]]; then
        ltext="hmcdebuginfo timeout$description"
    else
        [[ -n "$description" ]] || description=" - dumps taken"
        ltext="hmcdebuginfo$description"
    fi

    # The classpath is "." followed by every jar file in the main ipHMC jar
    # directory, so that there's always at least something in it
    classpath=.
    for jar in /usr/websm/codebase/pluginjars/*.jar; do
       classpath="$classpath:$jar"
    done

    # Put the typical ipHMC java location at the front of PATH, then run the
    # logging class in the background so that this script does not wait for it
    PATH="/opt/IBMJava/bin:$PATH"
    java -classpath $classpath $lcmd $ltext &
fi

# The trace buffer output and the log are inside our output file by now, so
# the working copies can go.
for workfile in "$tracebufsfn" "$logfn"; do
   rm "$workfile"
done

exit 0
