Show Sub Sum (SSS) Event Log Debug Cheat Sheet
Applies to SFA OS 2.3.1.x or lower (and to 3.0.x, except for features new in that release)
##################################################################### ### SSS - Event Log Debug CheatSheet ################################ ##################################################################### === Event Log Checks === +++ Boot Process (fail/restart of controller) - Summary +++ cat diag_health_monitoring_report.txt | egrep "[*]|LOG_JOI_BUILD_INFO1|LOG_ES_ENCL_CONFIG_INFO|LOG_DUCK_CONTROLLER_CONNECTED|LOG_ST_MASTERSHIP|LOG_LOGDISK_ENABLE|LOG_ST_JIS_ACTIVE|LOG_ST_MIR_STATE_NONE|LOG_ST_OTHER_DIED|LOG_ST_MIR_STATE_NONE|LOG_ST_MI_CTRL_DOWN" | less +++ Boot Process (fail/restart of controller) - Detail +++ cat diag_health_monitoring_report.txt | egrep "[*]|LOG_AMPD_MPI_IOC_RESET_START|LOG_JOI_BUILD|LOG_ES|LOG_ES_ENCL_CONFIG_INFO|LOG_DUCK_CONTROLLER_CONNECTED|LOG_AMPD_MPI_IOC_VERSION|LOG_AMPD_MPI_IOC_RESET|LOG_ST_MASTERSHIP|LOG_LOGDISK_ENABLE|LOG_ST_JIS_ACTIVE|LOG_ST_MIR_STATE_NONE|LOG_ST_OTHER_DIED|LOG_ST_MI_CTRL_DOWN" | less +++ CLI History +++ cat diag_health_monitoring_report.txt | egrep "[*]|FROM:CLI" | less +++ General things to look for +++ cat diag_health_monitoring_report.txt | egrep "[*]|LOG_ES_ENCL_CONFIG_INFO|LOG_ES_CHANNEL_ENCLOSURE_CONNECTED|LOG_IB_INITIATOR_LOGIN|LOG_DUCK_CONTROLLER_SEQ_FAILOVER|LOG_ST_OTHER_DIED|LOG_ST_MASTERSHIP|LOG_ST_SET_WT|LOG_ST_POOL_FLUSHED|LOG_ES_DISK_SLOT_ELEMENT_REMOVED" | less +++ General ugly things to look for +++ cat diag_health_monitoring_report.txt | egrep "[*]|LOG_AMPD_MPI_SCSI_SENSE_DATA_EVT|LOG_AMPD_DSK_DEVICE_INIT_FAILED|LOG_AMPD_MPI_IOC_FAULT|LOG_ES_CTLR_PWR_SRC_CHANGED|LOG_ES_6620_LI_BATT_STATE_CHANGED|LOG_ES_6620_LI_BATT_CHARGING_STATUS|LOG_ES_COOLING_ELEMENT_REMOVED|LOG_ES_EXPANDER_ELEMENT_REMOVED|LOG_ES_DISK_SLOT_ELEMENT_REMOVED|LOG_ST_SET_AWL|LOG_ST_SPLIT_BRAIN|LOG_ST_SET_CRITICAL|LOG_RAID_UNCORRECTED_SILENT_ERR|LOG_RAID_UNCORRECTED_MEDIUM_ERR|UNCOR_ERROR" | less +++ General Show all errors +++ cat diag_health_monitoring_report.txt | egrep "[*]|ERROR" | less +++ Show down items/general checks +++ cat 
diag_health_monitoring_report.txt | egrep -i "[*]|DOWN" | less cat diag_health_monitoring_report.txt | egrep -i "[*]|MIRR_REMOVED" | less cat diag_health_monitoring_report.txt | egrep -i "[*]|NA" | less cat diag_health_monitoring_report.txt | egrep -i "[*]|CRITICAL" | less +++ Drive Paths +++ cat diag_health_monitoring_report.txt | egrep "[*]|0xffff 0xffff 0xffff 0xffff 0xffff 0xffff 0xffff 0xffff" -B 13 -A 20 | egrep "[*]|Index| NA | Disk Slot|0xffff|--|PARTIAL READY" | less cat diag_health_monitoring_report.txt | egrep -v "LOG_APP_EVENT_CREATE_PRES" | egrep "[*]|0xffff" -B 13 -A 7 | egrep "[*]|Index| NA | Disk Slot|0xffff|--" | less +++ Drive issues +++ cat diag_health_monitoring_report.txt | egrep "[*]|LOG_AMPD_.*SENSE_DATA|LOG_IOF_AUTO_QUIESCED|LOG_ES_SES_COMMAND_QUERY_FAILURE|JS_AMPD_MEDIUM_ERROR|LOG_RAID_DIF" | less cat diag_health_monitoring_report.txt | egrep -i "LOG_AMPD_.*SENSE_DATA" | awk '{print $21,$12,$13,$15,$17}' | sort | less cat diag_health_monitoring_report.txt | egrep -i "LOG_AMPD_.*SENSE_DATA" | awk '{print $21,$12,$13,$15,$17}' | sort | uniq -c | sort | less cat diag_health_monitoring_report.txt | egrep "[*]|Pool Index: |Enabled Path IDs: |Enclosure Index: |Index: " | less +++ Pool - Drive History +++ cat diag_health_monitoring_report.txt | egrep "[*]|POOL|LOG_ST_MI_PD_FAILED|LOG_ST_DS_STATUS" | less +++ Host Port login/logouts History +++ cat diag_health_monitoring_report.txt | egrep "[*]|INITIATOR_LOGOUT|INITIATOR_LOGIN" | less +++ Check for HW Errors +++ cat diag_health_monitoring_report.txt | egrep "[*]|UNCOR_ERROR|INIT_FAIL|SENSOR_ERROR|IOC_RESET_RESTART|IOC_INIT_FAIL|IOC_RESET_FAILED|IOC_RESET_COMPLETE|IOC_SHUTDOWN" | less +++ Check for S1/S2 entries +++ cat diag_health_monitoring_report.txt | egrep "[*]|S=1|S=2" | less +++ Check for Power Supply Mismatch +++ cat diag_health_monitoring_report.txt | egrep "[*]|LOG_ES_POWER_SUPPLY_MISMATCH" | less +++ Event when diag is being pulled - tells you when the diag was pulled +++ cat 
diag_health_monitoring_report.txt | egrep "[*]|LOG_ST_.*_LOG_COMP|LOG_REOPEN" | less === Files to pull out of diag === - /var/log/dmesg* - /var/log/syslog/ - /var/log/janus/eventlog - /tmp/diag_health_monitoring_report - /tmp/corefiles/dmesg* - /tmp/corefiles/logdisk (if a more in-depth look is needed) === SYSLOG Checks === +++ Find System Type +++ cat syslog* | egrep "MODEL" | less +++ Check For cores / Janus Crash +++ cat syslog* | egrep "core dumped|JANUS_CRASH" | less egrep "core dumped|JANUS_CRASH" syslog* | sort | less +++ Check For HW Failures & Errors +++ cat syslog* | egrep -i "failed|error|Fatal|Uncorrected|recover|device has no|Bad karma|Surprise" | egrep -v "comm failures|checksum error|Error accepting SSL|JHELPER" | sort | less egrep -i "failed|error|Fatal|Uncorrected|recover|device has no|Bad karma|Surprise" syslog* | egrep -v "comm failures|checksum error|Error accepting SSL|JHELPER|cron.daily|bcmxcp_usb|/USR/SBIN/CRON" | sort | less +++ Check For Reboot History +++ cat syslog* | egrep -i "kernel|JANUS_STARTUP|acpid" | egrep -i "reboot|poweroff|Janus Model|starting up|Power Button" | less === DMESG Checks === +++ failures, errors, crash +++ cat dmesg* | egrep -i "fail|SysRq|BUG|segfault|unified|error|Fatal|Uncorrected|recover|device has no|Bad karma|Surprise" | sort | less egrep -i "fail|SysRq|BUG|segfault|unified|error|Fatal|Uncorrected|recover|device has no|Bad karma|Surprise" dmesg* | sort | less +++ Reboot History +++ cat dmesg* | egrep -i "sys_reboot|Initializing cgroup|Linux version" | less #####################################################################
Sourced from DDN support portal
0 Comments