CentOS7 で BUG: unable to handle kernel NULL pointer dereference at (null)
で kernel panic したのを深追いで調べていた
バックトレース
[705327.823158] BUG: unable to handle kernel NULL pointer dereference at (null) [705327.824009] IP: [<ffffffff81573762>] skb_entail+0x52/0xd0 [705327.828140] PGD 18fe72067 PUD 18fe73067 PMD 0 [705327.828140] Oops: 0002 [#1] SMP [705327.828140] Modules linked in: dccp_diag dccp tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag nls_utf8 isofs ppdev i2c_piix4 sg parport_pc virtio_balloon parport pcspkr nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sr_mod cdrom ata_generic pata_acpi cirrus virtio_blk virtio_net syscopyarea sysfillrect sysimgblt drm_kms_helper ttm ata_piix crc32c_intel drm virtio_pci virtio_ring serio_raw libata virtio i2c_core floppy dm_mirror dm_region_hash dm_log dm_mod [705327.837099] CPU: 0 PID: 2790 Comm: nginx Not tainted 3.10.0-327.36.1.el7.x86_64 #1 [705327.837099] Hardware name: Red Hat Inc. OpenStack Nova, BIOS 0.5.1 01/01/2007 [705327.837099] task: ffff880036a1dc00 ti: ffff88018fcbc000 task.ti: ffff88018fcbc000 [705327.837099] RIP: 0010:[<ffffffff81573762>] [<ffffffff81573762>] skb_entail+0x52/0xd0 [705327.837099] RSP: 0018:ffff88018fcbfb88 EFLAGS: 00010202 [705327.837099] RAX: 0000000000000000 RBX: 0000000000003890 RCX: 00000000ffffffff [705327.837099] RDX: ffff8801089c9038 RSI: ffff8801089c9038 RDI: ffff8801089c8f00 [705327.837099] RBP: ffff88018fcbfb88 R08: 00000000000006c0 R09: 0000000000000900 [705327.837099] R10: ffff880217001400 R11: 8883e3b982e39d83 R12: 0000000000000000 [705327.837099] R13: ffff8801089c8f00 R14: ffff8801089c9038 R15: 0000000000000fb0 [705327.837099] FS: 00007fca72598840(0000) GS:ffff88021fc00000(0000) knlGS:0000000000000000 [705327.837099] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [705327.837099] CR2: 0000000000000000 CR3: 000000018fe71000 CR4: 00000000000007f0 [705327.837099] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [705327.837099] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [705327.837099] Stack: [705327.837099] ffff88018fcbfc50 ffffffff81577bf2 ffff880036a1e688 ffff880000000001 [705327.837099] 0000000000000050 ffff88018fcbffd8 0000000000000000 ffff880036a1dc00 [705327.837099] ffff88018fcbffd8 ffff88018fcbfe70 00000000011b4df9 0000000000000501 [705327.837099] Call Trace: [705327.837099] [<ffffffff81577bf2>] tcp_sendmsg+0x6f2/0xc20 [705327.837099] [<ffffffff815a1c04>] inet_sendmsg+0x64/0xb0 [705327.837099] [<ffffffff815103d7>] sock_aio_write+0x157/0x180 [705327.837099] [<ffffffff811de5d9>] do_sync_readv_writev+0x79/0xd0 [705327.837099] [<ffffffff811dfbae>] do_readv_writev+0xce/0x260 [705327.837099] [<ffffffff81058aaf>] ? kvm_clock_get_cycles+0x1f/0x30 [705327.837099] [<ffffffff810d896a>] ? __getnstimeofday64+0x3a/0xd0 [705327.837099] [<ffffffff811dfdd5>] vfs_writev+0x35/0x60 [705327.837099] [<ffffffff811dff8f>] SyS_writev+0x7f/0x110 [705327.837099] [<ffffffff81646a09>] system_call_fastpath+0x16/0x1b [705327.837099] Code: 4d 00 89 46 44 89 46 40 8b 86 dc 00 00 00 c7 44 02 24 01 00 01 00 48 8b 87 40 01 00 00 48 8d 97 38 01 00 00 48 89 16 48 89 46 08 <48> 89 30 83 87 48 01 00 00 01 48 83 bf 68 02 00 00 00 48 89 b7 [705327.837099] RIP [<ffffffff81573762>] skb_entail+0x52/0xd0 [705327.837099] RSP <ffff88018fcbfb88> [705327.837099] CR2: 0000000000000000
以下、調べてみてのわかったことと推測
- sk_stream_alloc_skb で新規に割り当てた struct sk_buffer skb と、 sk->write_queue のアドレスが なぜか同一 になっている
- sk_stream_alloc_skb が返す skb は memset でゼロクリアされているので、skb->next, skb->prev 共に NULL となっている
- skb と sk->write_queue が同じアドレスを指しいるので、つまり、sk->write_queue->next, sk->write_queue->prev も NULL となっている
- 以上の状況で __skb_insert でリスト操作をする際に NULL pointer dereference が起きた
NULL pointer dereference が起きた状況は整理できたが、なぜこうなったのか、根本的な原因はよくわからない。ググってみても「これ」といったものは発見できず
- http://www.spinics.net/lists/netdev/msg309842.html で議論されている内容と近い内容のようだが確証はない
- https://patchwork.kernel.org/patch/9356243 も似たような箇所についてのパッチだが、バグの起こり方が違う
調べていったログを記しておく (全体を整頓しきれていないので、情報が飛散しているのはご了承を)
dmesg
該当のインスタンスには kdump が仕込まれていたので、vmcore と dmesg が採取できている
$ cat /var/crash/127.0.0.1-2016-10-27-01\:11\:15/vmcore-dmesg.txt | grep -A100 BUG [705327.823158] BUG: unable to handle kernel NULL pointer dereference at (null) [705327.824009] IP: [<ffffffff81573762>] skb_entail+0x52/0xd0 [705327.828140] PGD 18fe72067 PUD 18fe73067 PMD 0 [705327.828140] Oops: 0002 [#1] SMP [705327.828140] Modules linked in: dccp_diag dccp tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag nls_utf8 isofs ppdev i2c_piix4 sg parport_pc virtio_balloon parport pcspkr nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sr_mod cdrom ata_generic pata_acpi cirrus virtio_blk virtio_net syscopyarea sysfillrect sysimgblt drm_kms_helper ttm ata_piix crc32c_intel drm virtio_pci virtio_ring serio_raw libata virtio i2c_core floppy dm_mirror dm_region_hash dm_log dm_mod [705327.837099] CPU: 0 PID: 2790 Comm: nginx Not tainted 3.10.0-327.36.1.el7.x86_64 #1 [705327.837099] Hardware name: Red Hat Inc. OpenStack Nova, BIOS 0.5.1 01/01/2007 [705327.837099] task: ffff880036a1dc00 ti: ffff88018fcbc000 task.ti: ffff88018fcbc000 [705327.837099] RIP: 0010:[<ffffffff81573762>] [<ffffffff81573762>] skb_entail+0x52/0xd0 [705327.837099] RSP: 0018:ffff88018fcbfb88 EFLAGS: 00010202 [705327.837099] RAX: 0000000000000000 RBX: 0000000000003890 RCX: 00000000ffffffff [705327.837099] RDX: ffff8801089c9038 RSI: ffff8801089c9038 RDI: ffff8801089c8f00 [705327.837099] RBP: ffff88018fcbfb88 R08: 00000000000006c0 R09: 0000000000000900 [705327.837099] R10: ffff880217001400 R11: 8883e3b982e39d83 R12: 0000000000000000 [705327.837099] R13: ffff8801089c8f00 R14: ffff8801089c9038 R15: 0000000000000fb0 [705327.837099] FS: 00007fca72598840(0000) GS:ffff88021fc00000(0000) knlGS:0000000000000000 [705327.837099] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [705327.837099] CR2: 0000000000000000 CR3: 000000018fe71000 CR4: 00000000000007f0 [705327.837099] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [705327.837099] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [705327.837099] Stack: [705327.837099] ffff88018fcbfc50 ffffffff81577bf2 ffff880036a1e688 ffff880000000001 [705327.837099] 0000000000000050 ffff88018fcbffd8 0000000000000000 ffff880036a1dc00 [705327.837099] ffff88018fcbffd8 ffff88018fcbfe70 00000000011b4df9 0000000000000501 [705327.837099] Call Trace: [705327.837099] [<ffffffff81577bf2>] tcp_sendmsg+0x6f2/0xc20 [705327.837099] [<ffffffff815a1c04>] inet_sendmsg+0x64/0xb0 [705327.837099] [<ffffffff815103d7>] sock_aio_write+0x157/0x180 [705327.837099] [<ffffffff811de5d9>] do_sync_readv_writev+0x79/0xd0 [705327.837099] [<ffffffff811dfbae>] do_readv_writev+0xce/0x260 [705327.837099] [<ffffffff81058aaf>] ? kvm_clock_get_cycles+0x1f/0x30 [705327.837099] [<ffffffff810d896a>] ? __getnstimeofday64+0x3a/0xd0 [705327.837099] [<ffffffff811dfdd5>] vfs_writev+0x35/0x60 [705327.837099] [<ffffffff811dff8f>] SyS_writev+0x7f/0x110 [705327.837099] [<ffffffff81646a09>] system_call_fastpath+0x16/0x1b [705327.837099] Code: 4d 00 89 46 44 89 46 40 8b 86 dc 00 00 00 c7 44 02 24 01 00 01 00 48 8b 87 40 01 00 00 48 8d 97 38 01 00 00 48 89 16 48 89 46 08 <48> 89 30 83 87 48 01 00 00 01 48 83 bf 68 02 00 00 00 48 89 b7 [705327.837099] RIP [<ffffffff81573762>] skb_entail+0x52/0xd0 [705327.837099] RSP <ffff88018fcbfb88> [705327.837099] CR2: 0000000000000000
crash
debug-info と crash を用意した後に vmcore を解析する
crash> sys KERNEL: /usr/lib/debug/usr/lib/modules/3.10.0-327.36.1.el7.x86_64/vmlinux DUMPFILE: /var/crash/127.0.0.1-2016-10-27-01:11:15/vmcore [PARTIAL DUMP] CPUS: 4 DATE: Wed Oct 26 16:11:13 2016 UPTIME: 8 days, 03:55:27 LOAD AVERAGE: 0.13, 0.14, 0.14 TASKS: 285 NODENAME: ******************** RELEASE: 3.10.0-327.36.1.el7.x86_64 VERSION: #1 SMP Sun Sep 18 13:04:29 UTC 2016 MACHINE: x86_64 (2399 Mhz) MEMORY: 8 GB PANIC: "BUG: unable to handle kernel NULL pointer dereference at (null)"
詳細な backtrace を取る
crash> bt -FF PID: 2790 TASK: ffff880036a1dc00 CPU: 0 COMMAND: "nginx" #0 [ffff88018fcbf830] machine_kexec at ffffffff81051e9b ffff88018fcbf838: 000088018fcbf898 ffff880000000000 ffff88018fcbf848: 000000002d001000 ffff88002d001000 ffff88018fcbf858: 000000002d000000 ffff88018fcbf878 ffff88018fcbf868: 000000001e1b4df9 ffff88018fcbfad8 ffff88018fcbf878: ffff88018fcbf898 0000000000000246 ffff88018fcbf888: ffff88018fcbf958 crash_kexec+114 #1 [ffff88018fcbf890] crash_kexec at ffffffff810f27e2 ffff88018fcbf898: 0000000000000fb0 [ffff8801089c9038:TCP] ffff88018fcbf8a8: [ffff8801089c8f00:TCP] 0000000000000000 ffff88018fcbf8b8: ffff88018fcbfb88 0000000000003890 ffff88018fcbf8c8: 8883e3b982e39d83 [ffff880217001400:kmem_cache] ffff88018fcbf8d8: 0000000000000900 00000000000006c0 ffff88018fcbf8e8: 0000000000000000 00000000ffffffff ffff88018fcbf8f8: [ffff8801089c9038:TCP] [ffff8801089c9038:TCP] ffff88018fcbf908: [ffff8801089c8f00:TCP] ffffffffffffffff ffff88018fcbf918: skb_entail+82 0000000000000010 ffff88018fcbf928: 0000000000010202 ffff88018fcbfb88 ffff88018fcbf938: 0000000000000018 000000001e1b4df9 ffff88018fcbf948: 0000000000000009 ffff88018fcbfad8 ffff88018fcbf958: ffff88018fcbf980 oops_end+280 #2 [ffff88018fcbf960] oops_end at ffffffff8163f308 ffff88018fcbf968: ffff88018fcbfad8 0000000000000246 ffff88018fcbf978: 0000000000000009 ffff88018fcbf9d0 ffff88018fcbf988: no_context+640 #3 [ffff88018fcbf988] no_context at ffffffff8162f431 ffff88018fcbf990: 0003000100000003 0000000000000246 ffff88018fcbf9a0: 000000001e1b4df9 0000000000000002 ffff88018fcbf9b0: ffff88018fcbfad8 0000000000000000 ffff88018fcbf9c0: [ffff880036a1dc00:task_struct] 0000000000030001 ffff88018fcbf9d0: ffff88018fcbfa18 __bad_area_nosemaphore+115 #4 [ffff88018fcbf9d8] __bad_area_nosemaphore at ffffffff8162f4c7 ffff88018fcbf9e0: __alloc_pages_nodemask+407 ffff88021ffd8000 ffff88018fcbf9f0: ffff88018fcbfad8 0000000000000002 ffff88018fcbfa00: 0000000000000000 ffff88018fcbfad8 ffff88018fcbfa10: 0000000000000029 ffff88018fcbfa40 ffff88018fcbfa20: bad_area+67 #5 [ffff88018fcbfa20] bad_area at ffffffff8162f7eb ffff88018fcbfa28: 0000000000000002 0000000000000000 ffff88018fcbfa38: [ffff8802108eb840:mm_struct] ffff88018fcbfaa0 ffff88018fcbfa48: __do_page_fault+837 #6 [ffff88018fcbfa48] __do_page_fault at ffffffff816420f5 ffff88018fcbfa50: kmem_cache_free+437 0000000000000002 ffff88018fcbfa60: [ffff880036a1dc00:task_struct] 0000000036a1dc00 ffff88018fcbfa70: [ffff8802108eb8b8:mm_struct] ffff88018fcbfad8 ffff88018fcbfa80: 0000000000000000 [ffff8801089c8f00:TCP] ffff88018fcbfa90: [ffff8801089c9038:TCP] 0000000000000fb0 ffff88018fcbfaa0: ffff88018fcbfac8 do_page_fault+35 #7 [ffff88018fcbfaa8] do_page_fault at ffffffff81642223 ffff88018fcbfab0: ffff88018fcbfb2f 0000000000000001 ffff88018fcbfac0: 0000000000000000 ffff88018fcbfb88 ffff88018fcbfad0: page_fault+40 #8 [ffff88018fcbfad0] page_fault at ffffffff8163e508 [exception RIP: skb_entail+82] 🔥 RIP: ffffffff81573762 RSP: ffff88018fcbfb88 RFLAGS: 00010202 RAX: 0000000000000000 RBX: 0000000000003890 RCX: 00000000ffffffff RDX: ffff8801089c9038 RSI: ffff8801089c9038 RDI: ffff8801089c8f00 RBP: ffff88018fcbfb88 R8: 00000000000006c0 R9: 0000000000000900 R10: ffff880217001400 R11: 8883e3b982e39d83 R12: 0000000000000000 R13: ffff8801089c8f00 R14: ffff8801089c9038 R15: 0000000000000fb0 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 ffff88018fcbfad8: 0000000000000fb0 [ffff8801089c9038:TCP] ffff88018fcbfae8: [ffff8801089c8f00:TCP] 0000000000000000 ffff88018fcbfaf8: ffff88018fcbfb88 0000000000003890 ffff88018fcbfb08: 8883e3b982e39d83 [ffff880217001400:kmem_cache] ffff88018fcbfb18: 0000000000000900 00000000000006c0 ffff88018fcbfb28: 0000000000000000 00000000ffffffff ffff88018fcbfb38: [ffff8801089c9038:TCP] [ffff8801089c9038:TCP] ffff88018fcbfb48: [ffff8801089c8f00:TCP] ffffffffffffffff ffff88018fcbfb58: skb_entail+82 0000000000000010 ffff88018fcbfb68: 0000000000010202 ffff88018fcbfb88 ffff88018fcbfb78: 0000000000000018 [ffff8801089c8f00:TCP] ffff88018fcbfb88: ffff88018fcbfc50 tcp_sendmsg+1778 #9 [ffff88018fcbfb90] tcp_sendmsg at ffffffff81577bf2 ffff88018fcbfb98: [ffff880036a1e688:task_struct] ffff880000000001 ffff88018fcbfba8: 0000000000000050 ffff88018fcbffd8 ffff88018fcbfbb8: 0000000000000000 [ffff880036a1dc00:task_struct] ffff88018fcbfbc8: ffff88018fcbffd8 ffff88018fcbfe70 ffff88018fcbfbd8: 00000000011b4df9 0000000000000501 ffff88018fcbfbe8: [ffff880036a1e601:task_struct] [ffff8801089c9038:TCP] ffff88018fcbfbf8: 0000000001fef010 000005a800000000 ffff88018fcbfc08: 0000004000001050 ffff880100003890 ffff88018fcbfc18: 0000000000000000 000000001e1b4df9 ffff88018fcbfc28: ffff88018fcbfd60 ffff88018fcbfd60 ffff88018fcbfc38: 0000000000000002 ffff88018fcbfe60 ffff88018fcbfc48: [ffff88013ab76580:sock_inode_cache] ffff88018fcbfc80 ffff88018fcbfc58: inet_sendmsg+100 #10 [ffff88018fcbfc58] inet_sendmsg at ffffffff815a1c04 ffff88018fcbfc60: 0000000000000040 000000001e1b4df9 ffff88018fcbfc70: ffff88018fcbfc80 0000000000002000 ffff88018fcbfc80: ffff88018fcbfd50 sock_aio_write+343 #11 [ffff88018fcbfc88] sock_aio_write at ffffffff815103d7 ffff88018fcbfc90: 00002000815d3d04 ffff88018fcbfcd8 ffff88018fcbfca0: 0000900000000040 [ffff88013ab75180:sock_inode_cache] ffff88018fcbfcb0: 000020008fcbfd70 [ffff88013ab76580:sock_inode_cache] ffff88018fcbfcc0: ffff88018fcbfcc8 0000000000000000 ffff88018fcbfcd0: ffff88018fcbfcd8 0000000000000000 ffff88018fcbfce0: 0000000000000000 ffff88018fcbfe60 ffff88018fcbfcf0: 0000000000000002 0000000000000000 ffff88018fcbfd00: 0000000000000000 0000000000000040 ffff88018fcbfd10: ffff88018fcbfd60 000000001e1b4df9 ffff88018fcbfd20: 0000000000009000 ffff88018fcbff48 ffff88018fcbfd30: 0000000000002000 [ffff88007c3d5400:kmalloc-256] ffff88018fcbfd40: 0000000000000002 [ffff88007c3d5400:kmalloc-256] ffff88018fcbfd50: ffff88018fcbfe28 do_sync_readv_writev+121 #12 [ffff88018fcbfd58] do_sync_readv_writev at ffffffff811de5d9 ffff88018fcbfd60: 0000000000000001 [ffff88007c3d5400:kmalloc-256] ffff88018fcbfd70: 0000000000000000 0000000000000000 ffff88018fcbfd80: 0000000000000000 [ffff880036a1dc00:task_struct] ffff88018fcbfd90: 0000000000000000 0000000000000000 ffff88018fcbfda0: ffff88018fcbfca0 0000000000000000 ffff88018fcbfdb0: 0000000000002000 0000000000000000 ffff88018fcbfdc0: 0000000000002000 0000000000000000 ffff88018fcbfdd0: 0000000000000000 0000000000000000 ffff88018fcbfde0: 0000000000000000 0000000000000000 ffff88018fcbfdf0: 0000000000000000 0000000000000000 ffff88018fcbfe00: 0000000000000000 000000001e1b4df9 ffff88018fcbfe10: 0000000000002000 0000000000000001 ffff88018fcbfe20: ffff88018fcbfe60 ffff88018fcbff20 ffff88018fcbfe30: do_readv_writev+206 #13 [ffff88018fcbfe30] do_readv_writev at ffffffff811dfbae #13 [ffff88018fcbfe30] do_readv_writev at ffffffff811dfbae ffff88018fcbfe38: 0000000000000040 0000000000001000 ffff88018fcbfe48: 0000000000002000 ffff88018fcbff48 ffff88018fcbfe58: ffff88018fcbfe60 0000000001ff1ff0 ffff88018fcbfe68: 0000000000001000 0000000001feefc0 ffff88018fcbfe78: 0000000000001000 0000000001fedfb0 ffff88018fcbfe88: 0000000000001000 ffff88018fcbfe70 ffff88018fcbfe98: 0000000000000001 0000000000000000 ffff88018fcbfea8: 0000000000000000 0000000000000000 ffff88018fcbfeb8: ffff88018fcbff20 kvm_clock ffff88018fcbfec8: ffff88018fcbfed8 kvm_clock_get_cycles+31 ffff88018fcbfed8: ffff88018fcbff00 __getnstimeofday64+58 ffff88018fcbfee8: 000000001e1b4df9 0000000000000000 ffff88018fcbfef8: [ffff88007c3d5400:kmalloc-256] 0000000000000001 ffff88018fcbff08: 00007ffcc8b17100 0000000000000002 ffff88018fcbff18: 0000000001f702e0 ffff88018fcbff30 ffff88018fcbff28: vfs_writev+53 #14 [ffff88018fcbff28] vfs_writev at ffffffff811dfdd5 ffff88018fcbff30: ffff88018fcbff78 sys_writev+127 #15 [ffff88018fcbff38] sys_writev at ffffffff811dff8f ffff88018fcbff40: ffff88018fcbff78 0000000000000000 ffff88018fcbff50: 000000001e1b4df9 00007ffcc8b170e0 ffff88018fcbff60: 0000000001f702e0 0000000001fc3cd0 ffff88018fcbff70: 0000000000002000 0000000001ff6050 ffff88018fcbff80: system_call_fastpath+22 #16 [ffff88018fcbff80] system_call_fastpath at ffffffff81646a09 RIP: 00007fca70a8aea0 RSP: 00007ffcc8b179f0 RFLAGS: 00000246 RAX: 0000000000000014 RBX: ffffffff81646a09 RCX: 0000000000000000 RDX: 0000000000000002 RSI: 00007ffcc8b17100 RDI: 0000000000000009 RBP: 0000000001ff6050 R8: 0000000000000000 R9: 0000000000002000 R10: 000000007fffdfff R11: 0000000000000293 R12: 0000000000002000 R13: 0000000001fc3cd0 R14: 0000000001f702e0 R15: 00007ffcc8b170e0 ORIG_RAX: 0000000000000014 CS: 0033 SS: 002b
🔥 [exception RIP: skb_entail+82]
を手がかりに skb_entail
を disassemble する
crash> dis skb_entail 0xffffffff81573710 <skb_entail>: data32 data32 data32 xchg %ax,%ax [FTRACE NOP] 0xffffffff81573715 <skb_entail+5>: push %rbp 0xffffffff81573716 <skb_entail+6>: movl $0x0,0x74(%rsi) 0xffffffff8157371d <skb_entail+13>: mov 0x658(%rdi),%eax 0xffffffff81573723 <skb_entail+19>: mov 0xe0(%rsi),%rdx 0xffffffff8157372a <skb_entail+26>: orb $0x10,0x7c(%rsi) 0xffffffff8157372e <skb_entail+30>: movb $0x10,0x4c(%rsi) 0xffffffff81573732 <skb_entail+34>: mov %rsp,%rbp 0xffffffff81573735 <skb_entail+37>: movb $0x0,0x4d(%rsi) 0xffffffff81573739 <skb_entail+41>: mov %eax,0x44(%rsi) 0xffffffff8157373c <skb_entail+44>: mov %eax,0x40(%rsi) 0xffffffff8157373f <skb_entail+47>: mov 0xdc(%rsi),%eax 0xffffffff81573745 <skb_entail+53>: movl $0x10001,0x24(%rdx,%rax,1) 0xffffffff8157374d <skb_entail+61>: mov 0x140(%rdi),%rax ?null? 0xffffffff81573754 <skb_entail+68>: lea 0x138(%rdi),%rdx 0xffffffff8157375b <skb_entail+75>: mov %rdx,(%rsi) 0xffffffff8157375e <skb_entail+78>: mov %rax,0x8(%rsi) 0xffffffff81573762 <skb_entail+82>: mov %rsi,(%rax) 🔥 0xffffffff81573765 <skb_entail+85>: addl $0x1,0x148(%rdi) 0xffffffff8157376c <skb_entail+92>: cmpq $0x0,0x268(%rdi) 0xffffffff81573774 <skb_entail+100>: mov %rsi,0x140(%rdi) 0xffffffff8157377b <skb_entail+107>: je 0xffffffff815737c0 <skb_entail+176>
🔥 で NULL pointer dereference により exception を起こしている
ここまでの状況を整理
- Nginx が TCPソケットに writev(2) したプロセスコンテキストである
- exception を起こした RIP は
0xffffffff81573762 skb_entail+82
mov %rsi,(%rax)
が例外を起こした命令 - %rax が
0000000000000000
なので、(%rax) で NULL pointer dereference となり例外を起こしている
その他の情報として、このインスタンスの運用で把握していることは
- kernel panic を起こした KVMインスタンスは、 Nginx + Unicorn が動作している典型的な Rails 構成の role である
- この role は、水平にスケールする構成をとっており、同様に動く KVM インスタンスが他に数十ほどいる
- 全て合計すると Nginx プロセス数百ほどいる
という環境で発生した kernel panic なので、かなり稀にしか発生しないバグ、という推測をたてている
disassemble して解析
objdump でソースのアノテーション付きで disassemble する
objdump -D -S /usr/lib/debug/usr/lib/modules/3.10.0-327.36.1.el7.x86_64/vmlinux > disassemble.txt
RIP 0xffffffff81573762
の付近のソースは下記の通り
static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) { ffffffff81573754: 48 8d 97 38 01 00 00 lea 0x138(%rdi),%rdx __skb_queue_tail(&sk->sk_write_queue, skb); static inline void __skb_insert(struct sk_buff *newsk, struct sk_buff *prev, struct sk_buff *next, struct sk_buff_head *list) { ffffffff8157375b: 48 89 16 mov %rdx,(%rsi) newsk->next = next; ffffffff8157375e: 48 89 46 08 mov %rax,0x8(%rsi) newsk->prev = prev; ffffffff81573762: 48 89 30 mov %rsi,(%rax) 🔥 next->prev = prev->next = newsk; ffffffff81573765: 83 87 48 01 00 00 01 addl $0x1,0x148(%rdi) list->qlen++;
🔥 exception を起こした時点でのレジスタを確認する
- %rsi
struct sk_buff *next
0xffff8801089c9038
- 0x08(%rsi)
struct sk_buff *next
0xffff8801089c9038
- %rdx
struct sk_buff *next
0xffff8801089c9038
- %rax
struct sk_buff *prev
NULL
prev が NULL であることで例外をおこしている。newsk, next が同一アドレスなのはおかしい気がする。
crash で構造体を見る
crash> p *(struct sk_buff *) 0xffff8801089c9038 $12 = { next = 0xffff8801089c9038, prev = 0x0 { tstamp = { tv64 = 0 }, skb_mstamp = { { v64 = 0, { stamp_us = 0, stamp_jiffies = 0 } } } }, sk = 0x0, dev = 0x0, ... 略
crash> p ((struct sock *) 0xffff8801089c8f00)->sk_write_queue $19 = { next = 0xffff8801089c9038, /* ❓ */ prev = 0x0, qlen = 0, lock = { { rlock = { raw_lock = { { head_tail = 0, tickets = { head = 0, tail = 0 } } } } } } }
struct sk_buff は struct sk_buff_head と合わせて下記のような circular doubly linked list の構造を取る。
sk->sk_write_queue は 図の struct sk_buff_head に相当する
systemtap で確認
別の検証として systemtap で sk_stream_alloc_skb と skb_entail を呼び出した際の変数のアドレスを見てみたが、 $skb と $sk->sk_write_queue のアドレスは別になっている
[11185] sshd sk_stream_alloc_skb $skb: 0xffff880038bf7600 $skb->prev: 0x0 $skb->next: 0x0 skb_entail $skb: 0xffff880038bf7600 $skb->prev: 0x0 $skb->next: 0x0 $sk->sk_write_queue: 0xffff8800360f2e38 $sk->sk_write_queue->prev: 0xffff8800360f2e38 $sk->sk_write_queue->next: 0xffff8800360f2e38
本来はこのように別のアドレスを指すべきなのだと思われる
検証に使った systemtap
$ cat test.stap probe kernel.function("sk_stream_alloc_skb").return { printf ("---------------------------------------------------------\n") printf ("[%d] %s\n", pid(), execname()) printf ("sk_stream_alloc_skb\n") printf ("$skb: %p\n", $return) printf ("$skb->prev: %p\n", $return->prev) printf ("$skb->next: %p\n", $return->next) } probe kernel.function("skb_entail").call { printf ("skb_entail\n") printf ("$skb: %p\n", $skb) printf ("$skb->prev: %p\n", $skb->prev) printf ("$skb->next: %p\n", $skb->next); printf ("$sk->sk_write_queue: %p\n", &$sk->sk_write_queue) printf ("$sk->sk_write_queue->prev: %p\n", $sk->sk_write_queue->prev) printf ("$sk->sk_write_queue->next: %p\n", $sk->sk_write_queue->next) }
更にソースを追う
struct sk_buffer * skb
がどこからやってくるかをソースを遡ってみる
static inline void __skb_insert(struct sk_buff *newsk, struct sk_buff *prev 💀, struct sk_buff *next, struct sk_buff_head *list) { // newsk 0xffff8801089c9038 // prev NULL // next 0xffff8801089c9038 // list 0xffff8801089c9038 newsk->next = next; newsk->prev = prev; 💀 /* prev is NULL */ next->prev = prev->next = newsk; 🔥 /* oops NULL pointer derefernce */ list->qlen++; }
static inline void __skb_queue_before(struct sk_buff_head *list, struct sk_buff *next 🍣, struct sk_buff *newsk) { // newsk 0xffff8801089c9038 // next->prev NULL // next 0xffff8801089c9038 // list 0xffff8801089c9038 ??? __skb_insert(newsk, next->prev 💀, next, list); }
static inline void __skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) { // list 0xffff8801089c9038 // * list->prev NULL // newsk 0xffff8801089c9038 __skb_queue_before(list, (struct sk_buff *)list 🍣, newsk); }
static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) { // sk 0xffff8801089c8f00 // * sk->sk_write_queue->next 0xffff8801089c9038 // * sk->sk_write_queue->prev NULL 💀 // skb 0xffff8801089c9038 __skb_queue_tail(&sk->sk_write_queue 🍣, skb ); }
static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) { // sk 0xffff8801089c8f00 // skb 0xffff8801089c9038 __tcp_add_write_queue_tail(sk 🍣, skb ); /* Queue it, remembering where we must start sending. */ if (sk->sk_send_head == NULL) { sk->sk_send_head = skb; if (tcp_sk(sk)->highest_sack == NULL) tcp_sk(sk)->highest_sack = skb; } }
static void skb_entail(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); skb->csum = 0; tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; tcb->sacked = 0; __skb_header_release(skb); // sk 0xffff8801089c8f00 // skb 0xffff8801089c9038 tcp_add_write_queue_tail(sk 🍣, skb ); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; }
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size) { struct iovec *iov; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int iovlen, flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0, offset = 0; bool sg; long timeo; //... new_segment: /* Allocate new segment. If the interface is SG, * allocate skb fitting to single page. */ if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; skb = sk_stream_alloc_skb(sk, select_size(sk, sg), sk->sk_allocation); if (!skb) goto wait_for_memory; /* * Check whether we can use HW checksum. */ if (sk->sk_route_caps & NETIF_F_ALL_CSUM) skb->ip_summed = CHECKSUM_PARTIAL; // sk 0xffff8801089c8f00 %rdi // skb 0xffff8801089c9038 %rsi skb_entail(sk, skb); 🔥
sk_stream_alloc_skb で割り当てた skb は __alloc_skb の memset で 0
クリアされている
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, int flags, int node) { struct kmem_cache *cache; struct skb_shared_info *shinfo; struct sk_buff *skb; u8 *data; bool pfmemalloc; ... 略 /* * Only clear those fields we need to clear, not those that we will * actually initialise below. Hence, don't put any more fields after * the tail pointer in struct sk_buff! */ memset(skb, 0, offsetof(struct sk_buff, tail));
よって skb->prev と skb->next は 0 = NULL