デバッグ: CentOS7 BUG: unable to handle kernel NULL pointer dereference at (null)

CentOS7 で BUG: unable to handle kernel NULL pointer dereference at (null) で kernel panic したのを深追いで調べていた

バックトレース

[705327.823158] BUG: unable to handle kernel NULL pointer dereference at           (null)
[705327.824009] IP: [<ffffffff81573762>] skb_entail+0x52/0xd0
[705327.828140] PGD 18fe72067 PUD 18fe73067 PMD 0 
[705327.828140] Oops: 0002 [#1] SMP 
[705327.828140] Modules linked in: dccp_diag dccp tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag nls_utf8 isofs ppdev i2c_piix4 sg parport_pc virtio_balloon parport pcspkr nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sr_mod cdrom ata_generic pata_acpi cirrus virtio_blk virtio_net syscopyarea sysfillrect sysimgblt drm_kms_helper ttm ata_piix crc32c_intel drm virtio_pci virtio_ring serio_raw libata virtio i2c_core floppy dm_mirror dm_region_hash dm_log dm_mod
[705327.837099] CPU: 0 PID: 2790 Comm: nginx Not tainted 3.10.0-327.36.1.el7.x86_64 #1
[705327.837099] Hardware name: Red Hat Inc. OpenStack Nova, BIOS 0.5.1 01/01/2007
[705327.837099] task: ffff880036a1dc00 ti: ffff88018fcbc000 task.ti: ffff88018fcbc000
[705327.837099] RIP: 0010:[<ffffffff81573762>]  [<ffffffff81573762>] skb_entail+0x52/0xd0
[705327.837099] RSP: 0018:ffff88018fcbfb88  EFLAGS: 00010202
[705327.837099] RAX: 0000000000000000 RBX: 0000000000003890 RCX: 00000000ffffffff
[705327.837099] RDX: ffff8801089c9038 RSI: ffff8801089c9038 RDI: ffff8801089c8f00
[705327.837099] RBP: ffff88018fcbfb88 R08: 00000000000006c0 R09: 0000000000000900
[705327.837099] R10: ffff880217001400 R11: 8883e3b982e39d83 R12: 0000000000000000
[705327.837099] R13: ffff8801089c8f00 R14: ffff8801089c9038 R15: 0000000000000fb0
[705327.837099] FS:  00007fca72598840(0000) GS:ffff88021fc00000(0000) knlGS:0000000000000000
[705327.837099] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[705327.837099] CR2: 0000000000000000 CR3: 000000018fe71000 CR4: 00000000000007f0
[705327.837099] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[705327.837099] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[705327.837099] Stack:
[705327.837099]  ffff88018fcbfc50 ffffffff81577bf2 ffff880036a1e688 ffff880000000001
[705327.837099]  0000000000000050 ffff88018fcbffd8 0000000000000000 ffff880036a1dc00
[705327.837099]  ffff88018fcbffd8 ffff88018fcbfe70 00000000011b4df9 0000000000000501
[705327.837099] Call Trace:
[705327.837099]  [<ffffffff81577bf2>] tcp_sendmsg+0x6f2/0xc20
[705327.837099]  [<ffffffff815a1c04>] inet_sendmsg+0x64/0xb0
[705327.837099]  [<ffffffff815103d7>] sock_aio_write+0x157/0x180
[705327.837099]  [<ffffffff811de5d9>] do_sync_readv_writev+0x79/0xd0
[705327.837099]  [<ffffffff811dfbae>] do_readv_writev+0xce/0x260
[705327.837099]  [<ffffffff81058aaf>] ? kvm_clock_get_cycles+0x1f/0x30
[705327.837099]  [<ffffffff810d896a>] ? __getnstimeofday64+0x3a/0xd0
[705327.837099]  [<ffffffff811dfdd5>] vfs_writev+0x35/0x60
[705327.837099]  [<ffffffff811dff8f>] SyS_writev+0x7f/0x110
[705327.837099]  [<ffffffff81646a09>] system_call_fastpath+0x16/0x1b
[705327.837099] Code: 4d 00 89 46 44 89 46 40 8b 86 dc 00 00 00 c7 44 02 24 01 00 01 00 48 8b 87 40 01 00 00 48 8d 97 38 01 00 00 48 89 16 48 89 46 08 <48> 89 30 83 87 48 01 00 00 01 48 83 bf 68 02 00 00 00 48 89 b7 
[705327.837099] RIP  [<ffffffff81573762>] skb_entail+0x52/0xd0
[705327.837099]  RSP <ffff88018fcbfb88>
[705327.837099] CR2: 0000000000000000

以下、調べてみてのわかったことと推測

  • sk_stream_alloc_skb で新規に割り当てた struct sk_buffer skb と、 sk->write_queue のアドレスが なぜか同一 になっている
    • sk_stream_alloc_skb が返す skb は memset でゼロクリアされているので、skb->next, skb->prev 共に NULL となっている
    • skb と sk->write_queue が同じアドレスを指しいるので、つまり、sk->write_queue->next, sk->write_queue->prev も NULL となっている
  • 以上の状況で __skb_insert でリスト操作をする際に NULL pointer dereference が起きた

NULL pointer dereference が起きた状況は整理できたが、なぜこうなったのか、根本的な原因はよくわからない。ググってみても「これ」といったものは発見できず

調べていったログを記しておく (全体を整頓しきれていないので、情報が飛散しているのはご了承を)

dmesg

該当のインスタンスには kdump が仕込まれていたので、vmcore と dmesg が採取できている

$ cat /var/crash/127.0.0.1-2016-10-27-01\:11\:15/vmcore-dmesg.txt | grep -A100 BUG
[705327.823158] BUG: unable to handle kernel NULL pointer dereference at           (null)
[705327.824009] IP: [<ffffffff81573762>] skb_entail+0x52/0xd0
[705327.828140] PGD 18fe72067 PUD 18fe73067 PMD 0 
[705327.828140] Oops: 0002 [#1] SMP 
[705327.828140] Modules linked in: dccp_diag dccp tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag nls_utf8 isofs ppdev i2c_piix4 sg parport_pc virtio_balloon parport pcspkr nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sr_mod cdrom ata_generic pata_acpi cirrus virtio_blk virtio_net syscopyarea sysfillrect sysimgblt drm_kms_helper ttm ata_piix crc32c_intel drm virtio_pci virtio_ring serio_raw libata virtio i2c_core floppy dm_mirror dm_region_hash dm_log dm_mod
[705327.837099] CPU: 0 PID: 2790 Comm: nginx Not tainted 3.10.0-327.36.1.el7.x86_64 #1
[705327.837099] Hardware name: Red Hat Inc. OpenStack Nova, BIOS 0.5.1 01/01/2007
[705327.837099] task: ffff880036a1dc00 ti: ffff88018fcbc000 task.ti: ffff88018fcbc000
[705327.837099] RIP: 0010:[<ffffffff81573762>]  [<ffffffff81573762>] skb_entail+0x52/0xd0
[705327.837099] RSP: 0018:ffff88018fcbfb88  EFLAGS: 00010202
[705327.837099] RAX: 0000000000000000 RBX: 0000000000003890 RCX: 00000000ffffffff
[705327.837099] RDX: ffff8801089c9038 RSI: ffff8801089c9038 RDI: ffff8801089c8f00
[705327.837099] RBP: ffff88018fcbfb88 R08: 00000000000006c0 R09: 0000000000000900
[705327.837099] R10: ffff880217001400 R11: 8883e3b982e39d83 R12: 0000000000000000
[705327.837099] R13: ffff8801089c8f00 R14: ffff8801089c9038 R15: 0000000000000fb0
[705327.837099] FS:  00007fca72598840(0000) GS:ffff88021fc00000(0000) knlGS:0000000000000000
[705327.837099] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[705327.837099] CR2: 0000000000000000 CR3: 000000018fe71000 CR4: 00000000000007f0
[705327.837099] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[705327.837099] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[705327.837099] Stack:
[705327.837099]  ffff88018fcbfc50 ffffffff81577bf2 ffff880036a1e688 ffff880000000001
[705327.837099]  0000000000000050 ffff88018fcbffd8 0000000000000000 ffff880036a1dc00
[705327.837099]  ffff88018fcbffd8 ffff88018fcbfe70 00000000011b4df9 0000000000000501
[705327.837099] Call Trace:
[705327.837099]  [<ffffffff81577bf2>] tcp_sendmsg+0x6f2/0xc20
[705327.837099]  [<ffffffff815a1c04>] inet_sendmsg+0x64/0xb0
[705327.837099]  [<ffffffff815103d7>] sock_aio_write+0x157/0x180
[705327.837099]  [<ffffffff811de5d9>] do_sync_readv_writev+0x79/0xd0
[705327.837099]  [<ffffffff811dfbae>] do_readv_writev+0xce/0x260
[705327.837099]  [<ffffffff81058aaf>] ? kvm_clock_get_cycles+0x1f/0x30
[705327.837099]  [<ffffffff810d896a>] ? __getnstimeofday64+0x3a/0xd0
[705327.837099]  [<ffffffff811dfdd5>] vfs_writev+0x35/0x60
[705327.837099]  [<ffffffff811dff8f>] SyS_writev+0x7f/0x110
[705327.837099]  [<ffffffff81646a09>] system_call_fastpath+0x16/0x1b
[705327.837099] Code: 4d 00 89 46 44 89 46 40 8b 86 dc 00 00 00 c7 44 02 24 01 00 01 00 48 8b 87 40 01 00 00 48 8d 97 38 01 00 00 48 89 16 48 89 46 08 <48> 89 30 83 87 48 01 00 00 01 48 83 bf 68 02 00 00 00 48 89 b7 
[705327.837099] RIP  [<ffffffff81573762>] skb_entail+0x52/0xd0
[705327.837099]  RSP <ffff88018fcbfb88>
[705327.837099] CR2: 0000000000000000

crash

debug-info と crash を用意した後に vmcore を解析する

crash> sys
      KERNEL: /usr/lib/debug/usr/lib/modules/3.10.0-327.36.1.el7.x86_64/vmlinux
    DUMPFILE: /var/crash/127.0.0.1-2016-10-27-01:11:15/vmcore  [PARTIAL DUMP]
        CPUS: 4
        DATE: Wed Oct 26 16:11:13 2016
      UPTIME: 8 days, 03:55:27
LOAD AVERAGE: 0.13, 0.14, 0.14
       TASKS: 285
    NODENAME: ********************
     RELEASE: 3.10.0-327.36.1.el7.x86_64
     VERSION: #1 SMP Sun Sep 18 13:04:29 UTC 2016
     MACHINE: x86_64  (2399 Mhz)
      MEMORY: 8 GB
       PANIC: "BUG: unable to handle kernel NULL pointer dereference at           (null)"

詳細な backtrace を取る

crash> bt -FF
PID: 2790   TASK: ffff880036a1dc00  CPU: 0   COMMAND: "nginx"
 #0 [ffff88018fcbf830] machine_kexec at ffffffff81051e9b
    ffff88018fcbf838: 000088018fcbf898 ffff880000000000 
    ffff88018fcbf848: 000000002d001000 ffff88002d001000 
    ffff88018fcbf858: 000000002d000000 ffff88018fcbf878 
    ffff88018fcbf868: 000000001e1b4df9 ffff88018fcbfad8 
    ffff88018fcbf878: ffff88018fcbf898 0000000000000246 
    ffff88018fcbf888: ffff88018fcbf958 crash_kexec+114  
 #1 [ffff88018fcbf890] crash_kexec at ffffffff810f27e2
    ffff88018fcbf898: 0000000000000fb0 [ffff8801089c9038:TCP] 
    ffff88018fcbf8a8: [ffff8801089c8f00:TCP] 0000000000000000 
    ffff88018fcbf8b8: ffff88018fcbfb88 0000000000003890 
    ffff88018fcbf8c8: 8883e3b982e39d83 [ffff880217001400:kmem_cache] 
    ffff88018fcbf8d8: 0000000000000900 00000000000006c0 
    ffff88018fcbf8e8: 0000000000000000 00000000ffffffff 
    ffff88018fcbf8f8: [ffff8801089c9038:TCP] [ffff8801089c9038:TCP] 
    ffff88018fcbf908: [ffff8801089c8f00:TCP] ffffffffffffffff 
    ffff88018fcbf918: skb_entail+82    0000000000000010 
    ffff88018fcbf928: 0000000000010202 ffff88018fcbfb88 
    ffff88018fcbf938: 0000000000000018 000000001e1b4df9 
    ffff88018fcbf948: 0000000000000009 ffff88018fcbfad8 
    ffff88018fcbf958: ffff88018fcbf980 oops_end+280     
 #2 [ffff88018fcbf960] oops_end at ffffffff8163f308
    ffff88018fcbf968: ffff88018fcbfad8 0000000000000246 
    ffff88018fcbf978: 0000000000000009 ffff88018fcbf9d0 
    ffff88018fcbf988: no_context+640   
 #3 [ffff88018fcbf988] no_context at ffffffff8162f431
    ffff88018fcbf990: 0003000100000003 0000000000000246 
    ffff88018fcbf9a0: 000000001e1b4df9 0000000000000002 
    ffff88018fcbf9b0: ffff88018fcbfad8 0000000000000000 
    ffff88018fcbf9c0: [ffff880036a1dc00:task_struct] 0000000000030001 
    ffff88018fcbf9d0: ffff88018fcbfa18 __bad_area_nosemaphore+115 
 #4 [ffff88018fcbf9d8] __bad_area_nosemaphore at ffffffff8162f4c7
    ffff88018fcbf9e0: __alloc_pages_nodemask+407 ffff88021ffd8000 
    ffff88018fcbf9f0: ffff88018fcbfad8 0000000000000002 
    ffff88018fcbfa00: 0000000000000000 ffff88018fcbfad8 
    ffff88018fcbfa10: 0000000000000029 ffff88018fcbfa40 
    ffff88018fcbfa20: bad_area+67      
 #5 [ffff88018fcbfa20] bad_area at ffffffff8162f7eb
    ffff88018fcbfa28: 0000000000000002 0000000000000000 
    ffff88018fcbfa38: [ffff8802108eb840:mm_struct] ffff88018fcbfaa0 
    ffff88018fcbfa48: __do_page_fault+837 
 #6 [ffff88018fcbfa48] __do_page_fault at ffffffff816420f5
    ffff88018fcbfa50: kmem_cache_free+437 0000000000000002 
    ffff88018fcbfa60: [ffff880036a1dc00:task_struct] 0000000036a1dc00 
    ffff88018fcbfa70: [ffff8802108eb8b8:mm_struct] ffff88018fcbfad8 
    ffff88018fcbfa80: 0000000000000000 [ffff8801089c8f00:TCP] 
    ffff88018fcbfa90: [ffff8801089c9038:TCP] 0000000000000fb0 
    ffff88018fcbfaa0: ffff88018fcbfac8 do_page_fault+35 
 #7 [ffff88018fcbfaa8] do_page_fault at ffffffff81642223
    ffff88018fcbfab0: ffff88018fcbfb2f 0000000000000001 
    ffff88018fcbfac0: 0000000000000000 ffff88018fcbfb88 
    ffff88018fcbfad0: page_fault+40    
 #8 [ffff88018fcbfad0] page_fault at ffffffff8163e508
    [exception RIP: skb_entail+82] 🔥
    RIP: ffffffff81573762  RSP: ffff88018fcbfb88  RFLAGS: 00010202
    RAX: 0000000000000000  RBX: 0000000000003890  RCX: 00000000ffffffff
    RDX: ffff8801089c9038  RSI: ffff8801089c9038  RDI: ffff8801089c8f00
    RBP: ffff88018fcbfb88   R8: 00000000000006c0   R9: 0000000000000900
    R10: ffff880217001400  R11: 8883e3b982e39d83  R12: 0000000000000000
    R13: ffff8801089c8f00  R14: ffff8801089c9038  R15: 0000000000000fb0
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
    ffff88018fcbfad8: 0000000000000fb0 [ffff8801089c9038:TCP] 
    ffff88018fcbfae8: [ffff8801089c8f00:TCP] 0000000000000000 
    ffff88018fcbfaf8: ffff88018fcbfb88 0000000000003890 
    ffff88018fcbfb08: 8883e3b982e39d83 [ffff880217001400:kmem_cache] 
    ffff88018fcbfb18: 0000000000000900 00000000000006c0 
    ffff88018fcbfb28: 0000000000000000 00000000ffffffff 
    ffff88018fcbfb38: [ffff8801089c9038:TCP] [ffff8801089c9038:TCP] 
    ffff88018fcbfb48: [ffff8801089c8f00:TCP] ffffffffffffffff 
    ffff88018fcbfb58: skb_entail+82    0000000000000010 
    ffff88018fcbfb68: 0000000000010202 ffff88018fcbfb88 
    ffff88018fcbfb78: 0000000000000018 [ffff8801089c8f00:TCP] 
    ffff88018fcbfb88: ffff88018fcbfc50 tcp_sendmsg+1778 
 #9 [ffff88018fcbfb90] tcp_sendmsg at ffffffff81577bf2
    ffff88018fcbfb98: [ffff880036a1e688:task_struct] ffff880000000001 
    ffff88018fcbfba8: 0000000000000050 ffff88018fcbffd8 
    ffff88018fcbfbb8: 0000000000000000 [ffff880036a1dc00:task_struct] 
    ffff88018fcbfbc8: ffff88018fcbffd8 ffff88018fcbfe70 
    ffff88018fcbfbd8: 00000000011b4df9 0000000000000501 
    ffff88018fcbfbe8: [ffff880036a1e601:task_struct] [ffff8801089c9038:TCP] 
    ffff88018fcbfbf8: 0000000001fef010 000005a800000000 
    ffff88018fcbfc08: 0000004000001050 ffff880100003890 
    ffff88018fcbfc18: 0000000000000000 000000001e1b4df9 
    ffff88018fcbfc28: ffff88018fcbfd60 ffff88018fcbfd60 
    ffff88018fcbfc38: 0000000000000002 ffff88018fcbfe60 
    ffff88018fcbfc48: [ffff88013ab76580:sock_inode_cache] ffff88018fcbfc80 
    ffff88018fcbfc58: inet_sendmsg+100 
#10 [ffff88018fcbfc58] inet_sendmsg at ffffffff815a1c04
    ffff88018fcbfc60: 0000000000000040 000000001e1b4df9 
    ffff88018fcbfc70: ffff88018fcbfc80 0000000000002000 
    ffff88018fcbfc80: ffff88018fcbfd50 sock_aio_write+343 
#11 [ffff88018fcbfc88] sock_aio_write at ffffffff815103d7
    ffff88018fcbfc90: 00002000815d3d04 ffff88018fcbfcd8 
    ffff88018fcbfca0: 0000900000000040 [ffff88013ab75180:sock_inode_cache] 
    ffff88018fcbfcb0: 000020008fcbfd70 [ffff88013ab76580:sock_inode_cache] 
    ffff88018fcbfcc0: ffff88018fcbfcc8 0000000000000000 
    ffff88018fcbfcd0: ffff88018fcbfcd8 0000000000000000 
    ffff88018fcbfce0: 0000000000000000 ffff88018fcbfe60 
    ffff88018fcbfcf0: 0000000000000002 0000000000000000 
    ffff88018fcbfd00: 0000000000000000 0000000000000040 
    ffff88018fcbfd10: ffff88018fcbfd60 000000001e1b4df9 
    ffff88018fcbfd20: 0000000000009000 ffff88018fcbff48 
    ffff88018fcbfd30: 0000000000002000 [ffff88007c3d5400:kmalloc-256] 
    ffff88018fcbfd40: 0000000000000002 [ffff88007c3d5400:kmalloc-256] 
    ffff88018fcbfd50: ffff88018fcbfe28 do_sync_readv_writev+121 
#12 [ffff88018fcbfd58] do_sync_readv_writev at ffffffff811de5d9
    ffff88018fcbfd60: 0000000000000001 [ffff88007c3d5400:kmalloc-256] 
    ffff88018fcbfd70: 0000000000000000 0000000000000000 
    ffff88018fcbfd80: 0000000000000000 [ffff880036a1dc00:task_struct] 
    ffff88018fcbfd90: 0000000000000000 0000000000000000 
    ffff88018fcbfda0: ffff88018fcbfca0 0000000000000000 
    ffff88018fcbfdb0: 0000000000002000 0000000000000000 
    ffff88018fcbfdc0: 0000000000002000 0000000000000000 
    ffff88018fcbfdd0: 0000000000000000 0000000000000000 
    ffff88018fcbfde0: 0000000000000000 0000000000000000 
    ffff88018fcbfdf0: 0000000000000000 0000000000000000 
    ffff88018fcbfe00: 0000000000000000 000000001e1b4df9 
    ffff88018fcbfe10: 0000000000002000 0000000000000001 
    ffff88018fcbfe20: ffff88018fcbfe60 ffff88018fcbff20 
    ffff88018fcbfe30: do_readv_writev+206 
#13 [ffff88018fcbfe30] do_readv_writev at ffffffff811dfbae
#13 [ffff88018fcbfe30] do_readv_writev at ffffffff811dfbae
    ffff88018fcbfe38: 0000000000000040 0000000000001000 
    ffff88018fcbfe48: 0000000000002000 ffff88018fcbff48 
    ffff88018fcbfe58: ffff88018fcbfe60 0000000001ff1ff0 
    ffff88018fcbfe68: 0000000000001000 0000000001feefc0 
    ffff88018fcbfe78: 0000000000001000 0000000001fedfb0 
    ffff88018fcbfe88: 0000000000001000 ffff88018fcbfe70 
    ffff88018fcbfe98: 0000000000000001 0000000000000000 
    ffff88018fcbfea8: 0000000000000000 0000000000000000 
    ffff88018fcbfeb8: ffff88018fcbff20 kvm_clock        
    ffff88018fcbfec8: ffff88018fcbfed8 kvm_clock_get_cycles+31 
    ffff88018fcbfed8: ffff88018fcbff00 __getnstimeofday64+58 
    ffff88018fcbfee8: 000000001e1b4df9 0000000000000000 
    ffff88018fcbfef8: [ffff88007c3d5400:kmalloc-256] 0000000000000001 
    ffff88018fcbff08: 00007ffcc8b17100 0000000000000002 
    ffff88018fcbff18: 0000000001f702e0 ffff88018fcbff30 
    ffff88018fcbff28: vfs_writev+53    
#14 [ffff88018fcbff28] vfs_writev at ffffffff811dfdd5
    ffff88018fcbff30: ffff88018fcbff78 sys_writev+127   
#15 [ffff88018fcbff38] sys_writev at ffffffff811dff8f
    ffff88018fcbff40: ffff88018fcbff78 0000000000000000 
    ffff88018fcbff50: 000000001e1b4df9 00007ffcc8b170e0 
    ffff88018fcbff60: 0000000001f702e0 0000000001fc3cd0 
    ffff88018fcbff70: 0000000000002000 0000000001ff6050 
    ffff88018fcbff80: system_call_fastpath+22 
#16 [ffff88018fcbff80] system_call_fastpath at ffffffff81646a09
    RIP: 00007fca70a8aea0  RSP: 00007ffcc8b179f0  RFLAGS: 00000246
    RAX: 0000000000000014  RBX: ffffffff81646a09  RCX: 0000000000000000
    RDX: 0000000000000002  RSI: 00007ffcc8b17100  RDI: 0000000000000009
    RBP: 0000000001ff6050   R8: 0000000000000000   R9: 0000000000002000
    R10: 000000007fffdfff  R11: 0000000000000293  R12: 0000000000002000
    R13: 0000000001fc3cd0  R14: 0000000001f702e0  R15: 00007ffcc8b170e0
    ORIG_RAX: 0000000000000014  CS: 0033  SS: 002b

🔥 [exception RIP: skb_entail+82] を手がかりに skb_entail を disassemble する

crash> dis skb_entail 
0xffffffff81573710 <skb_entail>:        data32 data32 data32 xchg %ax,%ax [FTRACE NOP]
0xffffffff81573715 <skb_entail+5>:      push   %rbp
0xffffffff81573716 <skb_entail+6>:      movl   $0x0,0x74(%rsi)
0xffffffff8157371d <skb_entail+13>:     mov    0x658(%rdi),%eax
0xffffffff81573723 <skb_entail+19>:     mov    0xe0(%rsi),%rdx
0xffffffff8157372a <skb_entail+26>:     orb    $0x10,0x7c(%rsi)
0xffffffff8157372e <skb_entail+30>:     movb   $0x10,0x4c(%rsi)
0xffffffff81573732 <skb_entail+34>:     mov    %rsp,%rbp
0xffffffff81573735 <skb_entail+37>:     movb   $0x0,0x4d(%rsi)
0xffffffff81573739 <skb_entail+41>:     mov    %eax,0x44(%rsi)
0xffffffff8157373c <skb_entail+44>:     mov    %eax,0x40(%rsi)
0xffffffff8157373f <skb_entail+47>:     mov    0xdc(%rsi),%eax
0xffffffff81573745 <skb_entail+53>:     movl   $0x10001,0x24(%rdx,%rax,1)
0xffffffff8157374d <skb_entail+61>:     mov    0x140(%rdi),%rax ?null?
0xffffffff81573754 <skb_entail+68>:     lea    0x138(%rdi),%rdx
0xffffffff8157375b <skb_entail+75>:     mov    %rdx,(%rsi)
0xffffffff8157375e <skb_entail+78>:     mov    %rax,0x8(%rsi)
0xffffffff81573762 <skb_entail+82>:     mov    %rsi,(%rax) 🔥
0xffffffff81573765 <skb_entail+85>:     addl   $0x1,0x148(%rdi)
0xffffffff8157376c <skb_entail+92>:     cmpq   $0x0,0x268(%rdi)
0xffffffff81573774 <skb_entail+100>:    mov    %rsi,0x140(%rdi)
0xffffffff8157377b <skb_entail+107>:    je     0xffffffff815737c0 <skb_entail+176>

🔥 で NULL pointer dereference により exception を起こしている

ここまでの状況を整理

  • Nginx が TCPソケットに writev(2) したプロセスコンテキストである
  • exception を起こした RIP は 0xffffffff81573762 skb_entail+82 mov %rsi,(%rax) が例外を起こした命令
  • %rax0000000000000000 なので、(%rax) で NULL pointer dereference となり例外を起こしている

その他の情報として、このインスタンスの運用で把握していることは

  • kernel panic を起こした KVMインスタンスは、 Nginx + Unicorn が動作している典型的な Rails 構成の role である
  • この role は、水平にスケールする構成をとっており、同様に動く KVM インスタンスが他に数十ほどいる
  • 全て合計すると Nginx プロセス数百ほどいる

という環境で発生した kernel panic なので、かなり稀にしか発生しないバグ、という推測をたてている

disassemble して解析

objdump でソースのアノテーション付きで disassemble する

 objdump -D -S /usr/lib/debug/usr/lib/modules/3.10.0-327.36.1.el7.x86_64/vmlinux > disassemble.txt

RIP 0xffffffff81573762 の付近のソースは下記の通り

static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
{
ffffffff81573754:       48 8d 97 38 01 00 00    lea    0x138(%rdi),%rdx
        __skb_queue_tail(&sk->sk_write_queue, skb);

static inline void __skb_insert(struct sk_buff *newsk,
                                struct sk_buff *prev, struct sk_buff *next,
                                struct sk_buff_head *list)
{
ffffffff8157375b:       48 89 16                mov    %rdx,(%rsi)
        newsk->next = next;
ffffffff8157375e:       48 89 46 08             mov    %rax,0x8(%rsi)
        newsk->prev = prev;
ffffffff81573762:       48 89 30                mov    %rsi,(%rax) 🔥
        next->prev  = prev->next = newsk;
ffffffff81573765:       83 87 48 01 00 00 01    addl   $0x1,0x148(%rdi)
        list->qlen++;

🔥 exception を起こした時点でのレジスタを確認する

  • %rsi struct sk_buff *next 0xffff8801089c9038
  • 0x08(%rsi) struct sk_buff *next 0xffff8801089c9038
  • %rdx struct sk_buff *next 0xffff8801089c9038
  • %rax struct sk_buff *prev NULL

prev が NULL であることで例外をおこしている。newsk, next が同一アドレスなのはおかしい気がする。

crash で構造体を見る

crash> p *(struct sk_buff *) 0xffff8801089c9038
$12 = {
  next = 0xffff8801089c9038, 
  prev = 0x0
  {
    tstamp = {
      tv64 = 0
    }, 
    skb_mstamp = {
      {
        v64 = 0, 
        {
          stamp_us = 0, 
          stamp_jiffies = 0
        }
      }
    }
  }, 
  sk = 0x0, 
  dev = 0x0, 

... 略
crash> p ((struct sock *) 0xffff8801089c8f00)->sk_write_queue
$19 = {
  next = 0xffff8801089c9038,  /* ❓ */
  prev = 0x0, 
  qlen = 0, 
  lock = {
    {
      rlock = {
        raw_lock = {
          {
            head_tail = 0, 
            tickets = {
              head = 0, 
              tail = 0
            }
          }
        }
      }
    }
  }
}

struct sk_buff は struct sk_buff_head と合わせて下記のような circular doubly linked list の構造を取る。

f:id:hiboma:20161029164403j:plain

sk->sk_write_queue は 図の struct sk_buff_head に相当する

systemtap で確認

別の検証として systemtapsk_stream_alloc_skbskb_entail を呼び出した際の変数のアドレスを見てみたが、 $skb と $sk->sk_write_queue のアドレスは別になっている

[11185] sshd
sk_stream_alloc_skb
$skb:                      0xffff880038bf7600
$skb->prev:                0x0
$skb->next:                0x0
skb_entail
$skb:                      0xffff880038bf7600
$skb->prev:                0x0
$skb->next:                0x0
$sk->sk_write_queue:       0xffff8800360f2e38
$sk->sk_write_queue->prev: 0xffff8800360f2e38
$sk->sk_write_queue->next: 0xffff8800360f2e38

本来はこのように別のアドレスを指すべきなのだと思われる

検証に使った systemtap
$ cat test.stap
probe kernel.function("sk_stream_alloc_skb").return
{
  printf ("---------------------------------------------------------\n")
  printf ("[%d] %s\n", pid(), execname())
  printf ("sk_stream_alloc_skb\n") 
  printf ("$skb:                      %p\n", $return)
  printf ("$skb->prev:                %p\n", $return->prev)
  printf ("$skb->next:                %p\n", $return->next) 
}

probe kernel.function("skb_entail").call
{
  printf ("skb_entail\n")
  printf ("$skb:                      %p\n", $skb)
  printf ("$skb->prev:                %p\n", $skb->prev)
  printf ("$skb->next:                %p\n", $skb->next); 
  printf ("$sk->sk_write_queue:       %p\n", &$sk->sk_write_queue)
  printf ("$sk->sk_write_queue->prev: %p\n", $sk->sk_write_queue->prev)
  printf ("$sk->sk_write_queue->next: %p\n", $sk->sk_write_queue->next)
}

更にソースを追う

struct sk_buffer * skb がどこからやってくるかをソースを遡ってみる

static inline void __skb_insert(struct sk_buff *newsk,
                struct sk_buff *prev 💀, struct sk_buff *next,
                struct sk_buff_head *list)
{
        // newsk 0xffff8801089c9038
        // prev  NULL
        // next  0xffff8801089c9038
        // list  0xffff8801089c9038

    newsk->next = next;
    newsk->prev = prev; 💀               /* prev is NULL */
    next->prev  = prev->next = newsk; 🔥 /* oops NULL pointer derefernce */
    list->qlen++;
}
static inline void __skb_queue_before(struct sk_buff_head *list,
                      struct sk_buff *next 🍣,
                      struct sk_buff *newsk)
{
        // newsk 0xffff8801089c9038
        // next->prev NULL
        // next 0xffff8801089c9038
        // list 0xffff8801089c9038 ???
    __skb_insert(newsk, next->prev 💀, next, list);
}
static inline void __skb_queue_tail(struct sk_buff_head *list,
                   struct sk_buff *newsk)
{
        // list  0xffff8801089c9038
        //   * list->prev NULL
        // newsk 0xffff8801089c9038
    __skb_queue_before(list, (struct sk_buff *)list 🍣, newsk);
}
static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
{
    // sk  0xffff8801089c8f00 
    //     * sk->sk_write_queue->next 0xffff8801089c9038
    //     * sk->sk_write_queue->prev NULL 💀
    // skb 0xffff8801089c9038
        __skb_queue_tail(&sk->sk_write_queue 🍣, skb );
}
static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
{
    // sk  0xffff8801089c8f00
    // skb 0xffff8801089c9038
    __tcp_add_write_queue_tail(sk 🍣, skb );

    /* Queue it, remembering where we must start sending. */
    if (sk->sk_send_head == NULL) {
        sk->sk_send_head = skb;

        if (tcp_sk(sk)->highest_sack == NULL)
            tcp_sk(sk)->highest_sack = skb;
    }
}
static void skb_entail(struct sock *sk, struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

    skb->csum    = 0;
    tcb->seq     = tcb->end_seq = tp->write_seq;
    tcb->tcp_flags = TCPHDR_ACK;
    tcb->sacked  = 0;
    __skb_header_release(skb);
    
    // sk  0xffff8801089c8f00
    // skb 0xffff8801089c9038
    tcp_add_write_queue_tail(sk 🍣, skb ); 
    sk->sk_wmem_queued += skb->truesize;
    sk_mem_charge(sk, skb->truesize);
    if (tp->nonagle & TCP_NAGLE_PUSH)
        tp->nonagle &= ~TCP_NAGLE_PUSH;
}
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
        size_t size)
{
    struct iovec *iov;
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *skb;
    int iovlen, flags, err, copied = 0;
    int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
    bool sg;
    long timeo;

//...

new_segment:
                /* Allocate new segment. If the interface is SG,
                * allocate skb fitting to single page.
                */
                if (!sk_stream_memory_free(sk))
                    goto wait_for_sndbuf;

                skb = sk_stream_alloc_skb(sk,
                              select_size(sk, sg),
                              sk->sk_allocation);
                if (!skb)
                    goto wait_for_memory;

                /*
                * Check whether we can use HW checksum.
                */
                if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
                    skb->ip_summed = CHECKSUM_PARTIAL;

                                // sk  0xffff8801089c8f00 %rdi
                                // skb 0xffff8801089c9038 %rsi 
                skb_entail(sk, skb); 🔥

sk_stream_alloc_skb で割り当てた skb は __alloc_skb の memset で 0 クリアされている

struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
                int flags, int node)
{
    struct kmem_cache *cache;
    struct skb_shared_info *shinfo;
    struct sk_buff *skb;
    u8 *data;
    bool pfmemalloc;

... 略

    /*
    * Only clear those fields we need to clear, not those that we will
    * actually initialise below. Hence, don't put any more fields after
    * the tail pointer in struct sk_buff!
    */
    memset(skb, 0, offsetof(struct sk_buff, tail));

よって skb->prev と skb->next は 0 = NULL