DDN SFA Debug Cheat Sheet

Robert Leong -

Show Sub Sum Event Log Debug Cheat Sheet

SFA OS 2.3.1.x or lower (and 3.0.x except for new features)

#####################################################################
### SSS - Event Log Debug CheatSheet ################################
#####################################################################
=== Event Log Checks ===
+++ Boot Process (fail/restart of controller) - Summary +++
cat diag_health_monitoring_report.txt | egrep "[*]|LOG_JOI_BUILD_INFO1|LOG_ES_ENCL_CONFIG_INFO|LOG_DUCK_CONTROLLER_CONNECTED|LOG_ST_MASTERSHIP|LOG_LOGDISK_ENABLE|LOG_ST_JIS_ACTIVE|LOG_ST_MIR_STATE_NONE|LOG_ST_OTHER_DIED|LOG_ST_MIR_STATE_NONE|LOG_ST_MI_CTRL_DOWN" | less

+++ Boot Process (fail/restart of controller) - Detail +++
cat diag_health_monitoring_report.txt | egrep "[*]|LOG_AMPD_MPI_IOC_RESET_START|LOG_JOI_BUILD|LOG_ES|LOG_ES_ENCL_CONFIG_INFO|LOG_DUCK_CONTROLLER_CONNECTED|LOG_AMPD_MPI_IOC_VERSION|LOG_AMPD_MPI_IOC_RESET|LOG_ST_MASTERSHIP|LOG_LOGDISK_ENABLE|LOG_ST_JIS_ACTIVE|LOG_ST_MIR_STATE_NONE|LOG_ST_OTHER_DIED|LOG_ST_MI_CTRL_DOWN" | less

+++ CLI History +++
cat diag_health_monitoring_report.txt | egrep "[*]|FROM:CLI" | less

+++ General things to look for +++
cat diag_health_monitoring_report.txt | egrep "[*]|LOG_ES_ENCL_CONFIG_INFO|LOG_ES_CHANNEL_ENCLOSURE_CONNECTED|LOG_IB_INITIATOR_LOGIN|LOG_DUCK_CONTROLLER_SEQ_FAILOVER|LOG_ST_OTHER_DIED|LOG_ST_MASTERSHIP|LOG_ST_SET_WT|LOG_ST_POOL_FLUSHED|LOG_ES_DISK_SLOT_ELEMENT_REMOVED" | less

+++ General ugly things to look for +++
cat diag_health_monitoring_report.txt | egrep "[*]|LOG_AMPD_MPI_SCSI_SENSE_DATA_EVT|LOG_AMPD_DSK_DEVICE_INIT_FAILED|LOG_AMPD_MPI_IOC_FAULT|LOG_ES_CTLR_PWR_SRC_CHANGED|LOG_ES_6620_LI_BATT_STATE_CHANGED|LOG_ES_6620_LI_BATT_CHARGING_STATUS|LOG_ES_COOLING_ELEMENT_REMOVED|LOG_ES_EXPANDER_ELEMENT_REMOVED|LOG_ES_DISK_SLOT_ELEMENT_REMOVED|LOG_ST_SET_AWL|LOG_ST_SPLIT_BRAIN|LOG_ST_SET_CRITICAL|LOG_RAID_UNCORRECTED_SILENT_ERR|LOG_RAID_UNCORRECTED_MEDIUM_ERR|UNCOR_ERROR" | less

+++ General Show all errors +++
cat diag_health_monitoring_report.txt | egrep "[*]|ERROR" | less

+++ Show down items/general checks +++
cat diag_health_monitoring_report.txt | egrep -i "[*]|DOWN" | less
cat diag_health_monitoring_report.txt | egrep -i "[*]|MIRR_REMOVED" | less
cat diag_health_monitoring_report.txt | egrep -i "[*]|NA" | less
cat diag_health_monitoring_report.txt | egrep -i "[*]|CRITICAL" | less

+++ Drive Paths +++
cat diag_health_monitoring_report.txt | egrep "[*]|0xffff  0xffff  0xffff  0xffff  0xffff  0xffff  0xffff  0xffff" -B 13 -A 20 | egrep "[*]|Index| NA | Disk Slot|0xffff|--|PARTIAL READY" | less
cat diag_health_monitoring_report.txt | egrep -v "LOG_APP_EVENT_CREATE_PRES" | egrep "[*]|0xffff" -B 13 -A 7 | egrep "[*]|Index| NA | Disk Slot|0xffff|--" | less

+++ Drive issues +++
cat diag_health_monitoring_report.txt | egrep "[*]|LOG_AMPD_.*SENSE_DATA|LOG_IOF_AUTO_QUIESCED|LOG_ES_SES_COMMAND_QUERY_FAILURE|JS_AMPD_MEDIUM_ERROR|LOG_RAID_DIF" | less
cat diag_health_monitoring_report.txt | egrep -i "LOG_AMPD_.*SENSE_DATA" | awk '{print $21,$12,$13,$15,$17}' | sort | less
cat diag_health_monitoring_report.txt | egrep -i "LOG_AMPD_.*SENSE_DATA" | awk '{print $21,$12,$13,$15,$17}' | sort | uniq -c | sort | less
cat diag_health_monitoring_report.txt | egrep "[*]|Pool Index:      |Enabled Path IDs:     |Enclosure Index:      |Index:                " | less

+++ Pool - Drive History +++
cat diag_health_monitoring_report.txt | egrep "[*]|POOL|LOG_ST_MI_PD_FAILED|LOG_ST_DS_STATUS" | less

+++ Host Port login/logouts History +++
cat diag_health_monitoring_report.txt | egrep "[*]|INITIATOR_LOGOUT|INITIATOR_LOGIN" | less

+++ Check for HW Errors +++
cat diag_health_monitoring_report.txt | egrep "[*]|UNCOR_ERROR|INIT_FAIL|SENSOR_ERROR|IOC_RESET_RESTART|IOC_INIT_FAIL|IOC_RESET_FAILED|IOC_RESET_COMPLETE|IOC_SHUTDOWN" | less

+++ Check for S1/S2 entries +++
cat diag_health_monitoring_report.txt | egrep "[*]|S=1|S=2" | less

+++ Check for Power Supply Mismatch +++
cat diag_health_monitoring_report.txt | egrep "[*]|LOG_ES_POWER_SUPPLY_MISMATCH" | less

+++ Diag collection event - tells you when the diag was pulled +++
cat diag_health_monitoring_report.txt | egrep "[*]|LOG_ST_.*_LOG_COMP|LOG_REOPEN" | less

=== Files to pull out of diag ===
 - /var/log/dmesg*
 - /var/log/syslog/
 - /var/log/janus/eventlog
 - /tmp/diag_health_monitoring_report
 - /tmp/corefiles/dmesg*
 - /tmp/corefiles/logdisk (if a more in-depth look is needed)


=== SYSLOG Checks ===
+++ Find System Type +++
cat syslog* | egrep "MODEL" | less

+++ Check For cores / Janus Crash +++
cat syslog* | egrep "core dumped|JANUS_CRASH" | less
egrep "core dumped|JANUS_CRASH" syslog* | sort | less

+++ Check For HW Failures & Errors +++
cat syslog* | egrep -i "failed|error|Fatal|Uncorrected|recover|device has no|Bad karma|Surprise" | egrep -v "comm failures|checksum error|Error accepting SSL|JHELPER" | sort | less
egrep -i "failed|error|Fatal|Uncorrected|recover|device has no|Bad karma|Surprise" syslog* | egrep -v "comm failures|checksum error|Error accepting SSL|JHELPER|cron.daily|bcmxcp_usb|/USR/SBIN/CRON" | sort | less

+++ Check For Reboot History +++
cat syslog* | egrep -i "kernel|JANUS_STARTUP|acpid" | egrep -i "reboot|poweroff|Janus Model|starting up|Power Button" | less

=== DMESG Checks ===
+++ Failures, Errors, Crashes +++
cat dmesg* | egrep -i "fail|SysRq|BUG|segfault|unified|error|Fatal|Uncorrected|recover|device has no|Bad karma|Surprise" | sort | less
egrep -i "fail|SysRq|BUG|segfault|unified|error|Fatal|Uncorrected|recover|device has no|Bad karma|Surprise" dmesg* | sort | less

+++ Reboot History +++
cat dmesg* | egrep -i "sys_reboot|Initializing cgroup|Linux version" | less
#####################################################################

 

Sourced from DDN support portal

 

 

Have more questions? Submit a request

0 Comments

Article is closed for comments.