一、問題
1、數據庫集群檢查
節點1
[root@dbrac1 backup]# /home/app/11.2.0/grid/bin/crsctl check crs
CRS-4638: Oracle High Availability Services is online
CRS-4535: Cannot communicate with Cluster Ready Services CRS-4529: Cluster Synchronization Services is online
CRS-4534: Cannot communicate with Event Manager
CRS-4535:無法與集群就緒服務通信
CRS-4534:無法與事件管理器通信
2、數據庫狀態檢查
節點2
[root@dbrac2 dev]# /home/app/11.2.0/grid/bin/crsctl status res -t
--------------------------------------------------------------------------------
NAME???????????TARGET??STATE????????SERVER???????????????????STATE_DETAILS
--------------------------------------------------------------------------------
Local Resources
--------------------------------------------------------------------------------
ora.ARCH.dg
ONLINE??ONLINE? ? ? ?dbrac2
ora.DATA.dg
ONLINE??ONLINE? ? ? ?dbrac2
ora.LISTENER.lsnr
ONLINE??ONLINE? ? ? ?dbrac2
ora.OCR.dg
ONLINE??ONLINE? ? ? ?dbrac2
ora.REDO.dg
ONLINE??ONLINE? ? ? ?dbrac2
ora.REDO1.dg
ONLINE??ONLINE? ? ? ?dbrac2
ora.asm
ONLINE??ONLINE? ? ? ?dbrac2???????????????Started
ora.net1.network
ONLINE??ONLINE? ? ? ?dbrac2
ora.ons
ONLINE??ONLINE? ? ? ?dbrac2
--------------------------------------------------------------------------------
Cluster Resources
--------------------------------------------------------------------------------
ora.LISTENER_SCAN1.lsnr
1????????ONLINE??ONLINE? ? ? ?dbrac2
ora.cvu
1????????ONLINE??ONLINE? ? ? ?dbrac2
ora.oc4j
1????????ONLINE??ONLINE? ? ? ?dbrac2
ora.orcl.db
1????????ONLINE??ONLINE? ? ? ?dbrac2???????????????Open
2????????ONLINE??OFFLINE
ora.dbrac1.vip
1????????ONLINE??INTERMEDIATE dbrac2???????????????FAILED OVER
ora.dbrac2.vip
1????????ONLINE??ONLINE? ? ? ?dbrac2
ora.scan1.vip
1????????ONLINE??ONLINE? ? ? ?dbrac2
數據庫集群節點1無法打開,且重啟無效
二、判斷思路
首先肯定要去檢查報警日志的,查哪些?
CRS-4535: Cannot communicate with Cluster Ready Services
CRS-4534: Cannot communicate with Event Manager
這些是集群日志
[grid@dbrac1 ~]$ cd /home/app/11.2.0/grid/log/dbrac1/
[grid@dbrac1 dbrac1]$ ls -lrt
total 156
drwxr-x— 2 root oinstall 6 Nov 27 2020 gnsd
drwxr-x— 2 grid oinstall 6 Nov 27 2020 srvm
drwxr-x— 2 grid oinstall 6 Nov 27 2020 diskmon
drwxr-x— 4 grid oinstall 34 Nov 27 2020 cvu
drwxr-xr-x 2 root oinstall 6 Nov 27 2020 acfssec
drwxr-x— 2 grid oinstall 6 Nov 27 2020 acfsrepl
drwxr-x— 2 grid oinstall 6 Nov 27 2020 acfslog
drwxrwxr-t 4 root oinstall 31 Nov 27 2020 agent
drwxr-x— 2 grid oinstall 6 Nov 27 2020 admin
drwxr-x— 2 root oinstall 6 Nov 27 2020 acfsreplroot
drwxr-x— 2 grid oinstall 43 Nov 27 2020 mdnsd
drwxr-x— 2 root oinstall 47 Nov 27 2020 crfmond
drwxr-x— 2 root oinstall 47 Nov 27 2020 crflogd
drwxr-x— 2 grid oinstall 41 Nov 27 2020 evmd
drwxrwxr-t 5 grid oinstall 88 Dec 17 2020 racg
drwxr-x— 2 root oinstall 201 May 11 21:33 crsd
drwxr-x— 2 root oinstall 213 May 24 09:21 ohasd
drwxr-x— 2 grid oinstall 212 Oct 13 06:11 cssd
drwxr-x— 2 root oinstall 224 Oct 19 13:27 ctssd
drwxrwxrwt 2 grid oinstall 4096 Oct 19 14:16 client
drwxr-x— 2 grid oinstall 65 Oct 19 14:50 gpnpd
-rw-rw-r-- 1 grid oinstall 151975 Oct 19 15:51 alertqhxsdbrac1.log
drwxr-x— 2 grid oinstall 213 Oct 19 16:08 gipcd
這些是數據庫和asm日志(與此次問題無關)
首先比對下異常的節點和正常的節點2啟動的進程
節點1
[root@dbrac1 dev]# ps -ef| grep d.bin
root 330721 1 4 14:50 ? 00:00:01 /home/app/11.2.0/grid/bin/ohasd.bin reboot
grid 330909 1 0 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/oraagent.bin
grid 330920 1 0 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/mdnsd.bin
grid 330950 1 1 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/gpnpd.bin
root 330989 1 1 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/orarootagent.bin
grid 330992 1 2 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/gipcd.bin
root 331006 1 9 14:50 ? 00:00:02 /home/app/11.2.0/grid/bin/osysmond.bin
root 331029 1 0 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/cssdmonitor
root 331047 1 0 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/cssdagent
grid 331076 1 2 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/ocssd.bin
root 331223 1 1 14:50 ? 00:00:00 /home/app/11.2.0/grid/bin/ologgerd -m dbrac2 -r -d /home/app/11.2.0/grid/crf/db/dbrac1
root 331481 1 2 14:51 ? 00:00:00 /home/app/11.2.0/grid/bin/octssd.bin reboot
root 331551 274510 0 14:51 pts/2 00:00:00 grep --color=auto d.bin
[root@qhxsdbrac1 dev]# ps -ef| grep d.bin | wc -l
14
節點2
[root@qhxsdbrac2 dev]# ps -ef| grep d.bin
root 76375 395109 0 16:23 pts/3 00:00:00 grep --color=auto d.bin
root 320436 1 0 5月28 ? 08:07:30 /home/app/11.2.0/grid/bin/ologgerd -m qhxsdbrac1 -r -d /home/app/11.2.0/grid/crf/db/dbrac2
grid 339608 1 0 2021 ? 01:28:23 /home/app/11.2.0/grid/bin/tnslsnr LISTENER -inherit
grid 340795 1 0 2021 ? 04:12:04 /home/app/11.2.0/grid/bin/scriptagent.bin
grid 340818 1 0 2021 ? 01:44:17 /home/app/11.2.0/grid/bin/tnslsnr LISTENER_SCAN1 -inherit
root 420308 1 0 2020 ? 4-23:24:52 /home/app/11.2.0/grid/bin/ohasd.bin reboot
grid 420430 1 0 2020 ? 3-11:46:22 /home/app/11.2.0/grid/bin/oraagent.bin
grid 420441 1 0 2020 ? 01:01:51 /home/app/11.2.0/grid/bin/mdnsd.bin
grid 420452 1 0 2020 ? 13:43:44 /home/app/11.2.0/grid/bin/gpnpd.bin
root 420462 1 0 2020 ? 4-17:22:04 /home/app/11.2.0/grid/bin/orarootagent.bin
grid 420465 1 0 2020 ? 4-22:02:03 /home/app/11.2.0/grid/bin/gipcd.bin
root 420478 1 13 2020 ? 89-13:54:28 /home/app/11.2.0/grid/bin/osysmond.bin
root 420491 1 0 2020 ? 23:50:01 /home/app/11.2.0/grid/bin/cssdmonitor
root 420508 1 0 2020 ? 23:56:28 /home/app/11.2.0/grid/bin/cssdagent
grid 420520 1 0 2020 ? 4-09:00:12 /home/app/11.2.0/grid/bin/ocssd.bin
root 420617 1 0 2020 ? 3-02:27:49 /home/app/11.2.0/grid/bin/octssd.bin reboot
grid 420644 1 0 2020 ? 3-01:06:44 /home/app/11.2.0/grid/bin/evmd.bin
root 420737 1 1 2020 ? 7-07:53:37 /home/app/11.2.0/grid/bin/crsd.bin reboot
grid 420813 420644 0 2020 ? 00:00:00 /home/app/11.2.0/grid/bin/evmlogger.bin -o /home/app/11.2.0/grid/evm/log/evmlogger.info -l /home/app/11.2.0/grid/evm/log/evmlogger.log
grid 420852 1 0 2020 ? 2-02:15:58 /home/app/11.2.0/grid/bin/oraagent.bin
root 420856 1 0 2020 ? 5-22:49:01 /home/app/11.2.0/grid/bin/orarootagent.bin
oracle 421019 1 0 2020 ? 4-01:21:57 /home/app/11.2.0/grid/bin/oraagent.bin
[root@qhxsdbrac2 dev]# ps -ef| grep d.bin | wc -l
22
比對以后,節點1缺少以下進程,高光的需要優先觀察的
grid 339608 1 0 2021 ? 01:28:23 /home/app/11.2.0/grid/bin/tnslsnr LISTENER -inherit
grid 340795 1 0 2021 ? 04:12:04 /home/app/11.2.0/grid/bin/scriptagent.bin
grid 340818 1 0 2021 ? 01:44:17 /home/app/11.2.0/grid/bin/tnslsnr LISTENER_SCAN1 -inherit
grid 420644 1 0 2020 ? 3-01:06:44 /home/app/11.2.0/grid/bin/evmd.bin
root 420737 1 1 2020 ? 7-07:53:37 /home/app/11.2.0/grid/bin/crsd.bin reboot grid 420813 420644 0 2020 ? 00:00:00 /home/app/11.2.0/grid/bin/evmlogger.bin -o /home/app/11.2.0/grid/evm/log/evmlogger.info -l /home/app/11.2.0/grid/evm/log/evmlogger.log
先看crs,發現幾個疑點
第一,最新的報警日志居然是2022年5月28號,懷疑是服務器時間出現了問題,結果是正常的,說明五月份宕機,運行了5個月都沒人知道,NB!
第二個,我可以理解不能打開ocr文件,但是為什么asm登錄拒絕(這個沒多想)
2022-05-28 07:47:37.580: [ CRSMAIN][1032308544] Checking the OCR device
2022-05-28 07:47:37.580: [ CRSMAIN][932718336] Policy Engine is not initialized yet!
2022-05-28 07:47:37.581: [ CRSMAIN][1032308544] Sync-up with OCR
2022-05-28 07:47:37.581: [ CRSMAIN][1032308544] Connecting to the CSS Daemon
2022-05-28 07:47:37.581: [ CRSMAIN][1032308544] Getting local node number
2022-05-28 07:47:37.582: [ CRSMAIN][1032308544] Initializing OCR
[ CLWAL][1032308544]clsw_Initialize: OLR initlevel [70000]
2022-05-28 07:47:40.853: \[ OCRASM\]\[1032308544\]proprasmo: Error in open/create file in dg \[ocr\] [ OCRASM][1032308544]SLOS : SLOS: cat=7, opn=kgfoAl06, dep=1017, loc=kgfokge
2022-05-28 07:47:40.853: [ OCRASM][1032308544]ASM Error Stack : ORA-01017: invalid username/password; logon denied
2022-05-28 07:47:41.909: [ OCRASM][1032308544]proprasmo: kgfoCheckMount returned [7]
2022-05-28 07:47:41.909: [ OCRASM][1032308544]proprasmo: The ASM instance is down
2022-05-28 07:47:41.911: [ OCRRAW][1032308544]proprioo: Failed to open [+ocr]. Returned proprasmo() with [26]. Marking location as UNAVAILABLE.
2022-05-28 07:47:41.911: [ OCRRAW][1032308544]proprioo: No OCR/OLR devices are usable
2022-05-28 07:47:41.911: [ OCRASM][1032308544]proprasmcl: asmhandle is NULL
2022-05-28 07:47:41.911: [ GIPC][1032308544] gipcCheckInitialization: possible incompatible non-threaded init from [prom.c : 690], original from [clsss.c : 5343]
2022-05-28 07:47:41.913: [ default][1032308544]clsvactversion:4: Retrieving Active Version from local storage.
2022-05-28 07:47:41.916: [ OCRRAW][1032308544]proprrepauto: The local OCR configuration matches with the configuration published by OCR Cache Writer. No repair required.
2022-05-28 07:47:41.917: [ OCRRAW][1032308544]proprinit: Could not open raw device
2022-05-28 07:47:41.917: [ OCRASM][1032308544]proprasmcl: asmhandle is NULL
2022-05-28 07:47:41.919: [ OCRAPI][1032308544]a_init:16!: Backend init unsuccessful : [26]
2022-05-28 07:47:41.919: [ CRSOCR][1032308544] OCR context init failure. Error: PROC-26: Error while accessing the physical storage
ORA-01017: invalid username/password; logon denied
2022-05-28 07:47:41.919: [ CRSD][1032308544] Created alert : (:CRSD00111:) : Could not init OCR, error: PROC-26: Error while accessing the physical storage
ORA-01017: invalid username/password; logon denied
2022-05-28 07:47:41.919: [ CRSD][1032308544][PANIC] CRSD exiting: Could not init OCR, code: 26
2022-05-28 07:47:41.919: [ CRSD][1032308544] Done.
再看evm,關鍵點
第一,和crs報警日志差不多
第二、暫時沒想法,去看看數據庫和asm的日志
2022-05-28 07:47:32.571: [ CRSMAIN][690448192] Initializing OCR
[ CLWAL][690448192]clsw_Initialize: OLR initlevel [70000]
2022-05-28 07:47:35.365: \[ OCRASM\]\[690448192\]proprasmo: Error in open/create file in dg \[ocr\] [ OCRASM][690448192]SLOS : SLOS: cat=7, opn=kgfoAl06, dep=1017, loc=kgfokge
2022-05-28 07:47:35.365: [ OCRASM][690448192]ASM Error Stack : ORA-01017: invalid username/password; logon denied
2022-05-28 07:47:36.423: [ OCRASM][690448192]proprasmo: kgfoCheckMount returned [7]
2022-05-28 07:47:36.423: [ OCRASM][690448192]proprasmo: The ASM instance is down
2022-05-28 07:47:36.424: [ OCRRAW][690448192]proprioo: Failed to open [+ocr]. Returned proprasmo() with [26]. Marking location as UNAVAILABLE.
2022-05-28 07:47:36.424: [ OCRRAW][690448192]proprioo: No OCR/OLR devices are usable
2022-05-28 07:47:36.424: [ OCRASM][690448192]proprasmcl: asmhandle is NULL
2022-05-28 07:47:36.425: [ GIPC][690448192] gipcCheckInitialization: possible incompatible non-threaded init from [prom.c : 690], original from [clsss.c : 5343]
2022-05-28 07:47:36.426: [ default][690448192]clsvactversion:4: Retrieving Active Version from local storage.
2022-05-28 07:47:36.430: [ OCRRAW][690448192]proprrepauto: The local OCR configuration matches with the configuration published by OCR Cache Writer. No repair required.
2022-05-28 07:47:36.431: [ OCRRAW][690448192]proprinit: Could not open raw device
2022-05-28 07:47:36.431: [ OCRASM][690448192]proprasmcl: asmhandle is NULL
2022-05-28 07:47:36.433: [ OCRAPI][690448192]a_init:16!: Backend init unsuccessful : [26]
2022-05-28 07:47:36.434: [ CRSOCR][690448192] OCR context init failure. Error: PROC-26: Error while accessing the physical storage
ORA-01017: invalid username/password; logon denied
2022-05-28 07:47:36.434: [ CRSD][690448192] Created alert : (:CRSD00111:) : Could not init OCR, error: PROC-26: Error while accessing the physical storage
ORA-01017: invalid username/password; logon denied
2022-05-28 07:47:36.434: [ CRSD][690448192][PANIC] CRSD exiting: Could not init OCR, code: 26
2022-05-28 07:47:36.434: [ CRSD][690448192] Done.
接著看asm集群日志注意點:
第一,高光的顯示直連失敗、不能連接、拒絕登錄
第二,真沒注意這個錯誤,陷入停滯,去看看磁盤的屬主屬組?
Sat May 28 07:16:54 2022
NOTE: client exited [266185]
WARNING: ASM communication error: op 0 state 0x0 (15055)
ERROR: direct connection failure with ASM NOTE: Deferred communication with ASM instance
Errors in file /home/app/grid/diag/asm/+asm/+ASM1/trace/+ASM1_ora_266205.trc:
ORA-15055: unable to connect to ASM instance
ORA-01017: invalid username/password; logon denied
NOTE: deferred map free for map id 2
Tue Oct 18 19:23:17 2022
Error 29701: unexpected return code 6 from the Cluster Synchronization Service
Errors in file /home/app/grid/diag/asm/+asm/+ASM1/trace/+ASM1_lmon_266127.trc:
ORA-29701: unable to connect to Cluster Synchronization Service
Tue Oct 18 19:23:17 2022
System state dump requested by (instance=1, osid=266127 (LMON)), summary=[abnormal instance termination].
LMON (ospid: 266127): terminating the instance due to error 29701
System State dumped to trace file /home/app/grid/diag/asm/+asm/+ASM1/trace/+ASM1_diag_266121_20221018192317.trc
Dumping diagnostic data in directory=[cdmp_20221018192317], requested by (instance=1, osid=266127 (LMON)), summary=[abnormal instance termination].
Instance terminated by LMON, pid = 266127
節點1、2的屬主屬組都對
[root@dbrac1 dev]# ls -lrt /dev/asm*
brw-rw---- 1 grid asmadmin 65, 128 10月 19 10:18 /dev/asm-ocr2
brw-rw---- 1 grid asmadmin 65, 0 10月 19 10:18 /dev/asm-arch1
brw-rw---- 1 grid asmadmin 65, 32 10月 19 10:18 /dev/asm-arch2
brw-rw---- 1 grid asmadmin 65, 64 10月 19 14:51 /dev/asm-arch3
brw-rw---- 1 grid asmadmin 65, 112 10月 19 14:51 /dev/asm-ocr3
brw-rw---- 1 grid asmadmin 8, 240 10月 19 17:00 /dev/asm-data3
brw-rw---- 1 grid asmadmin 8, 64 10月 19 17:00 /dev/asm-data5
brw-rw---- 1 grid asmadmin 8, 80 10月 19 17:00 /dev/asm-data7
brw-rw---- 1 grid asmadmin 8, 48 10月 19 17:00 /dev/asm-data6
brw-rw---- 1 grid asmadmin 8, 96 10月 19 17:00 /dev/asm-data8
brw-rw---- 1 grid asmadmin 8, 16 10月 19 17:00 /dev/asm-data9
brw-rw---- 1 grid asmadmin 8, 32 10月 19 17:00 /dev/asm-data4
brw-rw---- 1 grid asmadmin 8, 144 10月 19 17:00 /dev/asm-data12
brw-rw---- 1 grid asmadmin 8, 128 10月 19 17:00 /dev/asm-data10
brw-rw---- 1 grid asmadmin 8, 192 10月 19 17:00 /dev/asm-data1
brw-rw---- 1 grid asmadmin 65, 224 10月 19 17:00 /dev/asm-redo2
brw-rw---- 1 grid asmadmin 65, 208 10月 19 17:00 /dev/asm-redo1
brw-rw---- 1 grid asmadmin 8, 208 10月 19 17:00 /dev/asm-data2
brw-rw---- 1 grid asmadmin 8, 176 10月 19 17:00 /dev/asm-data13
brw-rw---- 1 grid asmadmin 8, 160 10月 19 17:00 /dev/asm-data11
brw-rw---- 1 grid asmadmin 65, 176 10月 19 17:00 /dev/asm-ocr1
思維停滯!請求外援
三、提供思路步驟
1、sqlplus /as sysam登陸失敗,從這點入手
2、不用crsctl命令,用專用的srvctl命令
第一個就拒絕登錄的思路,sqlnet.ora可以限制
節點1
[oracle@dbrac1 ~]$ vim /home/app/11.2.0/grid/network/admin/sqlnet.ora
# sqlnet.ora.qhxsdbrac1 Network Configuration File: /home/app/11.2.0/grid/network/admin/sqlnet.ora.qhxsdbrac1
# Generated by Oracle configuration tools.
NAMES.DIRECTORY_PATH= (TNSNAMES, EZCONNECT)
SQLNET.AUTHENTICATION_SERVICES=(NONE)
ADR_BASE = /home/app/grid
節點2
[root@dbrac2 dev]# vim /home/app/11.2.0/grid/network/admin/sqlnet.ora
# sqlnet.ora.qhxsdbrac2 Network Configuration File: /home/app/11.2.0/grid/network/admin/sqlnet.ora.qhxsdbrac2
# Generated by Oracle configuration tools.
NAMES.DIRECTORY_PATH= (TNSNAMES, EZCONNECT)
ADR_BASE = /home/app/grid
經過對比,發現節點1的sqlnet.ora日志多一行 SQLNET.AUTHENTICATION_SERVICES=(NONE)
百度意思是 If SQLNET. AUTHENTICATION_SERVICES=none then a valid username and password is need to connect to ASM instance.
將其注銷即正常啟動,沒做任何操作
[oracle@dbrac1 ~]$ vim /home/app/11.2.0/grid/network/admin/sqlnet.ora
# sqlnet.ora.dbrac1 Network Configuration File: /home/app/11.2.0/grid/network/admin/sqlnet.ora.dbrac1
# Generated by Oracle configuration tools.
NAMES.DIRECTORY_PATH= (TNSNAMES, EZCONNECT)
#SQLNET.AUTHENTICATION\_SERVICES=(NONE)
ADR_BASE = /home/app/grid
2、不用crsctl命令,用專用的srvctl命令(方便追蹤日志)
兩者有何區別?下回再驗證吧
~




