LustrePerfMon icon indicating copy to clipboard operation
LustrePerfMon copied to clipboard

Add new infiniband xml definition file

Open sihara opened this issue 4 years ago • 0 comments

I would like to add another infiniband definition file (e.g. infiniband2) that collects metrics with perfquery rather than scanning /sys/class/infiniband/xxxx/ports/x/counters, while also keeping the current infiniband xml file for compatibility. A new infiniband xml file is needed because some systems (e.g. docker environments) don't have /sys/class/infiniband/xxxx/ports/x/counters, so infiniband metrics cannot be collected there by filedata.

Here is an example of how to capture IB stats with perfquery.

[root@amd01 ~]# ibstat
CA 'mlx5_4'
	CA type: MT4123
	Number of ports: 1
	Firmware version: 20.26.4012
	Hardware version: 0
	Node GUID: 0x0c42a1030017c078
	System image GUID: 0x0c42a1030017c078
	Port 1:
		State: Active
		Physical state: LinkUp
		Rate: 200
		Base lid: 90
		LMC: 0
		SM lid: 2
		Capability mask: 0x2651e848
		Port GUID: 0x0c42a1030017c078
		Link layer: InfiniBand
CA 'mlx5_2'
	CA type: MT4123
	Number of ports: 1
	Firmware version: 20.26.4012
	Hardware version: 0
	Node GUID: 0x0c42a1030017bb48
	System image GUID: 0x0c42a1030017bb48
	Port 1:
		State: Down
		Physical state: Disabled
		Rate: 10
		Base lid: 65535
		LMC: 0
		SM lid: 0
		Capability mask: 0x2651e848
		Port GUID: 0x0c42a1030017bb48
		Link layer: InfiniBand
CA 'mlx5_0'
	CA type: MT4123
	Number of ports: 1
	Firmware version: 20.26.4012
	Hardware version: 0
	Node GUID: 0x0c42a1030017c090
	System image GUID: 0x0c42a1030017c090
	Port 1:
		State: Active
		Physical state: LinkUp
		Rate: 200
		Base lid: 88
		LMC: 0
		SM lid: 2
		Capability mask: 0x2651e848
		Port GUID: 0x0c42a1030017c090
		Link layer: InfiniBand
CA 'mlx5_5'
	CA type: MT4123
	Number of ports: 1
	Firmware version: 20.26.4012
	Hardware version: 0
	Node GUID: 0x0c42a1030017c079
	System image GUID: 0x0c42a1030017c078
	Port 1:
		State: Down
		Physical state: Disabled
		Rate: 40
		Base lid: 0
		LMC: 0
		SM lid: 0
		Capability mask: 0x00010000
		Port GUID: 0x0e42a1fffe17c079
		Link layer: Ethernet
CA 'mlx5_3'
	CA type: MT4123
	Number of ports: 1
	Firmware version: 20.26.4012
	Hardware version: 0
	Node GUID: 0x0c42a1030017bb49
	System image GUID: 0x0c42a1030017bb48
	Port 1:
		State: Down
		Physical state: Disabled
		Rate: 40
		Base lid: 0
		LMC: 0
		SM lid: 0
		Capability mask: 0x00010000
		Port GUID: 0x0e42a1fffe17bb49
		Link layer: Ethernet
CA 'mlx5_1'
	CA type: MT4123
	Number of ports: 1
	Firmware version: 20.26.4012
	Hardware version: 0
	Node GUID: 0x0c42a1030017c091
	System image GUID: 0x0c42a1030017c090
	Port 1:
		State: Down
		Physical state: Disabled
		Rate: 40
		Base lid: 0
		LMC: 0
		SM lid: 0
		Capability mask: 0x00010000
		Port GUID: 0x0e42a1fffe17c091
		Link layer: Ethernet

The LID of each port can be read from /sys/class/infiniband/mlx5_$i/ports/1/lid:

[root@amd01 ~]# for i in `seq 0 5`; do cat /sys/class/infiniband/mlx5_$i/ports/1/lid; done
0x58
0x0
0xffff
0x0
0x5a
0x0

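perfquery requires a LID and a port number. In the output above, a LID of "0xffff" means the port is present in InfiniBand mode but its link is down (no usable LID), and "0x0" means the port is not in InfiniBand mode (here, Ethernet). perfquery fails for both of these values, as shown below.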

[root@amd01 ~]# for i in `seq 0 5`; do                                                    
> perf
perf       perfquery  
> perfquery $(cat /sys/class/infiniband/mlx5_$i/ports/1/lid) 1
> done
# Port counters: Lid 88 port 1 (CapMask: 0x5A00)
PortSelect:......................1
CounterSelect:...................0x0000
SymbolErrorCounter:..............0
LinkErrorRecoveryCounter:........0
LinkDownedCounter:...............1
PortRcvErrors:...................0
PortRcvRemotePhysicalErrors:.....0
PortRcvSwitchRelayErrors:........0
PortXmitDiscards:................0
PortXmitConstraintErrors:........0
PortRcvConstraintErrors:.........0
CounterSelect2:..................0x00
LocalLinkIntegrityErrors:........0
ExcessiveBufferOverrunErrors:....0
QP1Dropped:......................0
VL15Dropped:.....................0
PortXmitData:....................4294967295
PortRcvData:.....................4294967295
PortXmitPkts:....................4294967295
PortRcvPkts:.....................4294967295
PortXmitWait:....................4294967295
perfquery: iberror: failed: can't resolve destination port 0x0
perfquery: iberror: failed: can't resolve destination port 0xffff
perfquery: iberror: failed: can't resolve destination port 0x0
# Port counters: Lid 90 port 1 (CapMask: 0x5A00)
PortSelect:......................1
CounterSelect:...................0x0000
SymbolErrorCounter:..............0
LinkErrorRecoveryCounter:........0
LinkDownedCounter:...............1
PortRcvErrors:...................0
PortRcvRemotePhysicalErrors:.....0
PortRcvSwitchRelayErrors:........0
PortXmitDiscards:................0
PortXmitConstraintErrors:........0
PortRcvConstraintErrors:.........0
CounterSelect2:..................0x00
LocalLinkIntegrityErrors:........0
ExcessiveBufferOverrunErrors:....0
QP1Dropped:......................0
VL15Dropped:.....................0
PortXmitData:....................4294967295
PortRcvData:.....................4294967295
PortXmitPkts:....................4294967295
PortRcvPkts:.....................4294967295
PortXmitWait:....................4294967295
perfquery: iberror: failed: can't resolve destination port 0x0

sihara avatar Oct 17 '20 10:10 sihara