linux 3.10 一个扇区异常可能引发的hung

最近遇到一例3.10内核的crash:

[ 4109.682163] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 4109.683440] fas_readwriter  D ffff881fffd147c0     0 46745  43298 0x00000080
[ 4109.683444]  ffff881f320bfc20 0000000000000082 ffff881f320bffd8 ffff881f320bffd8
[ 4109.683469]  ffff881f320bffd8 00000000000147c0 ffff881fd321ae00 ffff881571308f20
[ 4109.683487]  ffff881571308f24 ffff881f32186780 00000000ffffffff ffff881571308f28
[ 4109.683491] Call Trace:
[ 4109.683503]  [<ffffffff8163f959>] schedule_preempt_disabled+0x29/0x70
[ 4109.683507]  [<ffffffff8163d415>] __mutex_lock_slowpath+0xc5/0x1c0
[ 4109.683512]  [<ffffffff811eaff7>] ? unlazy_walk+0x87/0x140
[ 4109.683515]  [<ffffffff8163c87f>] mutex_lock+0x1f/0x2f
[ 4109.683522]  [<ffffffff81636235>] lookup_slow+0x33/0xa7
[ 4109.683525]  [<ffffffff811f0483>] path_lookupat+0x773/0x7a0
[ 4109.683531]  [<ffffffff81274454>] ? SYSC_semtimedop+0x264/0xd10
[ 4109.683537]  [<ffffffff810c0cbb>] ? should_numa_migrate_memory+0x5b/0x150
[ 4109.683542]  [<ffffffff811c3345>] ? kmem_cache_alloc+0x35/0x1d0
[ 4109.683545]  [<ffffffff811f117f>] ? getname_flags+0x4f/0x1a0
[ 4109.683548]  [<ffffffff811f04db>] filename_lookup+0x2b/0xc0
[ 4109.683550]  [<ffffffff811f22a7>] user_path_at_empty+0x67/0xc0
[ 4109.683556]  [<ffffffff811993c7>] ? handle_mm_fault+0x607/0xf40
[ 4109.683559]  [<ffffffff811f2311>] user_path_at+0x11/0x20
[ 4109.683564]  [<ffffffff811e5b53>] vfs_fstatat+0x63/0xc0
[ 4109.683567]  [<ffffffff811e60be>] SYSC_newstat+0x2e/0x60
[ 4109.683570]  [<ffffffff81273ced>] ? do_smart_update+0x11d/0x130
[ 4109.683575]  [<ffffffff8101c899>] ? read_tsc+0x9/0x10
[ 4109.683580]  [<ffffffff8110d336>] ? __audit_syscall_exit+0x1e6/0x280
[ 4109.683583]  [<ffffffff811e639e>] SyS_newstat+0xe/0x10
[ 4109.683586]  [<ffffffff81649909>] system_call_fastpath+0x16/0x1b

看来是该进程在一个mutex上以不可中断(uninterruptible,D状态)的方式等待,等待时间超过了hung_task_timeout_secs设置的阈值。

看堆栈是stat调用访问文件的元数据,

crash> struct nameidata ffff881f320bfdc0
struct nameidata {
  path = {
    mnt = 0xffff883fcb7e2820,
    dentry = 0xffff881f502552c0
  },
  last = {
    {
      {
        hash = 3495247374,
        len = 27
      },
      hash_len = 119459364366
    },
    name = 0xffff881040c56031 "00fe_0002_011712b3_00000001"-----------访问的文件名
  },
  root = {
    mnt = 0xffff883fccff1020,
    dentry = 0xffff881fcc7ff980
  },
  inode = 0xffff881571308e78,
  flags = 1,
  seq = 2,
  last_type = 0,
  depth = 0,
  saved_names = {0x7f30b11923e8 <Address 0x7f30b11923e8 out of bounds>, 0xffff881f8a6a2c40 "g", 0xffff881f46cc0438 "", 0xffff881fce793e80 "\260\334\341W\037\210\377\377\310\354\242o\037\210\377\377\020E-D\037\210\377\377\200\231\001\201\377\377\377\377\020\363\031\201\377\377\377\377", 0x80 <Address 0x80 out of bounds>, 0xffff881f320bfec0 "\260\004|\a(\177", 0xffffffff811993c7 <handle_mm_fault+1543> "A\211\302\351\327\375\377\377\220I\211\370L\211\341H\211\332L\211\356L\211\367D\211M\300\350\310]\003", 0x5cd06a0 <Address 0x5cd06a0 out of bounds>, 0x0}
}

files 看不到这个文件,说明该进程没有open 这个file。

要找到对应的mutex:

crash> struct nameidata.path ffff881f320bfdc0
  path = {
    mnt = 0xffff883fcb7e2820,
    dentry = 0xffff881f502552c0
  }
crash> dentry.d_inode 0xffff881f502552c0
  d_inode = 0xffff881571308e78
crash> struct -x dentry.d_inode
struct dentry {
  [0x30] struct inode *d_inode;
}
crash> inode.i_mutex 0xffff881571308e78
  i_mutex = {
    count = {
      counter = -1
    },
    wait_lock = {
      {
        rlock = {
          raw_lock = {
            {
              head_tail = 100009462,
              tickets = {
                head = 1526,
                tail = 1526
              }
            }
          }
        }
      }
    },
    wait_list = {
      next = 0xffff8819fc5b3c90,
      prev = 0xffff881f5097bc40
    },
    owner = 0xffff881f52764500,-------spinlock在没开启debug的情况下不会设置owner,而mutex一般看owner就知道谁持有
    {
      osq = 0x0,
      __UNIQUE_ID_rh_kabi_hide1 = {
        spin_mlock = 0x0
      },
      {<No data fields>}
    }
  }

然后看一下owner为啥拿了互斥量不放:

task 0xffff881f52764500
PID: 47270  TASK: ffff881f52764500  CPU: 25  COMMAND: "nginx"
struct task_struct {
  state = 2,
  stack = 0xffff881a5c19c000,

查看它的堆栈:

crash> bt -f 47270
PID: 47270  TASK: ffff881f52764500  CPU: 25  COMMAND: "nginx"
 #0 [ffff881a5c19f7f0] __schedule at ffffffff8163df9b
    ffff881a5c19f7f8: 0000000000000082 ffff881a5c19ffd8
    ffff881a5c19f808: ffff881a5c19ffd8 ffff881a5c19ffd8
    ffff881a5c19f818: 00000000000147c0 ffff881fd3267300
    ffff881a5c19f828: ffff8813ee889a58 ffff8813ee889a60
    ffff881a5c19f838: 7fffffffffffffff ffff881f52764500
    ffff881a5c19f848: ffff881fd042b500 ffff881a5c19f860
    ffff881a5c19f858: ffffffff8163e879
 #1 [ffff881a5c19f858] schedule at ffffffff8163e879
    ffff881a5c19f860: ffff881a5c19f908 ffffffff8163c329
 #2 [ffff881a5c19f868] schedule_timeout at ffffffff8163c329
    ffff881a5c19f870: 7fffffffffffffff ffff881a5c19f900
    ffff881a5c19f880: 0000000000000000 0000000000000000
    ffff881a5c19f890: 0000000000000000 ffff8813ee889980
    ffff881a5c19f8a0: ffff881a5c19f8b8 ffffffff812cea44
    ffff881a5c19f8b0: 0000000000000001 ffff881a5c19f968
    ffff881a5c19f8c0: ffffffffa07a51c4 000010005c19f8d8
    ffff881a5c19f8d0: 0000000000000000 00000000edb89915
    ffff881a5c19f8e0: ffff8813ee889a58 ffff8813ee889a60
    ffff881a5c19f8f0: 7fffffffffffffff ffff881f52764500
    ffff881a5c19f900: ffff881fd042b500 ffff881a5c19f968
    ffff881a5c19f910: ffffffff8163ec46
 #3 [ffff881a5c19f910] wait_for_completion at ffffffff8163ec46
    ffff881a5c19f918: 0000000000000001 ffff881f52764500
    ffff881a5c19f928: ffffffff810b9930 ffff8813ee889a68
    ffff881a5c19f938: ffff8813ee889a68 00000000edb89915
    ffff881a5c19f948: ffff8813ee889980 0000000000000001
    ffff881a5c19f958: ffffffffa07a6ce3 0000000000000000
    ffff881a5c19f968: ffff881a5c19f990 ffffffffa07a6b74
 #4 [ffff881a5c19f970] xfs_buf_submit_wait at ffffffffa07a6b74 [xfs]---------同步的buffer io提交函数
    ffff881a5c19f978: ffff8813ee889980 0000000000000001-----------ffff8813ee889980 为xfs_buf
    ffff881a5c19f988: ffffffffa07d3829 ffff881a5c19f9a0
    ffff881a5c19f998: ffffffffa07a6ce3
 #5 [ffff881a5c19f998] _xfs_buf_read at ffffffffa07a6ce3 [xfs]
    ffff881a5c19f9a0: ffff881a5c19f9e0 ffffffffa07a6dda
 #6 [ffff881a5c19f9a8] xfs_buf_read_map at ffffffffa07a6dda [xfs]----------,走这个分支说明tp为NULL,根据map找xfs_buf,找不到则申请一个,扇区为1960485944,长度为8
    ffff881a5c19f9b0: ffffffffa07e6cc0 0000000000000000
    ffff881a5c19f9c0: 0000000000000000 ffff883f001f7800
    ffff881a5c19f9d0: ffff881a5c19fa60 ffff881fd042b500
    ffff881a5c19f9e0: ffff881a5c19fa20 ffffffffa07d3829
 #7 [ffff881a5c19f9e8] xfs_trans_read_buf_map at ffffffffa07d3829 [xfs]----根据trans和xfs_buf_map调用xfs_buf_read_map 来获取xfs_buf
    ffff881a5c19f9f0: 00000000edb89915 ffff881a5c19fb10
    ffff881a5c19fa00: 0000000000000000 ffff881571308cc0
    ffff881a5c19fa10: ffffffffa07e6cc0 ffff881a5c19fa60
    ffff881a5c19fa20: ffff881a5c19faa0 ffffffffa0786204
 #8 [ffff881a5c19fa28] xfs_da_read_buf at ffffffffa0786204 [xfs]
    ffff881a5c19fa30: ffff881a5c19fa50 ffffffffa07e6cc0
    ffff881a5c19fa40: 0000000000000000 00000001a07e6650
    ffff881a5c19fa50: 0000000000000000 ffff881a5c19fa60
    ffff881a5c19fa60: 0000000074daa438 ffff881a00000008
    ffff881a5c19fa70: 00000000edb89915 0000000000000000
    ffff881a5c19fa80: ffff881a5c19fb10 ffff8810cfbd1b00
    ffff881a5c19fa90: 0000000000000016 000000000000006c
    ffff881a5c19faa0: ffff881a5c19fad0 ffffffffa078baa6
 #9 [ffff881a5c19faa8] xfs_dir3_data_read at ffffffffa078baa6 [xfs]
    ffff881a5c19fab0: ffffffffa07e6cc0 ffff881f5a13ad00
    ffff881a5c19fac0: ffff881f5a13ad00 ffff8814905453a0
    ffff881a5c19fad0: ffff881a5c19fb58 ffffffffa078f99e
#10 [ffff881a5c19fad8] xfs_dir2_leafn_lookup_for_entry at ffffffffa078f99e [xfs]
    ffff881a5c19fae0: 0000000172bc1800 ffff881f5a13ad58
    ffff881a5c19faf0: ffff883f001f7800 ffff881571308cc0
    ffff881a5c19fb00: 0000000000000000 ffff881400000016
    ffff881a5c19fb10: 0000000000000000 0080001d00800010
    ffff881a5c19fb20: ffff000501873dff 00000000edb89915
    ffff881a5c19fb30: ffff881f5a13ad40 ffff881f5a13ad00
    ffff881a5c19fb40: ffff881f5a13ad10 ffff8810cfbd1b00
    ffff881a5c19fb50: ffff881f5a13ad58 ffff881a5c19fb68
    ffff881a5c19fb60: ffffffffa0791177
#11 [ffff881a5c19fb60] xfs_dir2_leafn_lookup_int at ffffffffa0791177 [xfs]
    ffff881a5c19fb68: ffff881a5c19fbc8 ffffffffa0787726
#12 [ffff881a5c19fb70] xfs_da3_node_lookup_int at ffffffffa0787726 [xfs]
    ffff881a5c19fb78: ffff881a5c19fbdc ffff881a5c19fcb8
    ffff881a5c19fb88: 0000000000000000 ffff000100643ebe
    ffff881a5c19fb98: 00000000edb89915 ffff8810cfbd1b00
    ffff881a5c19fba8: 0000000000000000 ffff8810cfbd1b00
    ffff881a5c19fbb8: ffff881f5a13ad00 ffff881a5c19fcb8
    ffff881a5c19fbc8: ffff881a5c19fc08 ffffffffa0791ded
#13 [ffff881a5c19fbd0] xfs_dir2_node_lookup at ffffffffa0791ded [xfs]
    ffff881a5c19fbd8: ffff881a5c19fc08 00000000edb89915
    ffff881a5c19fbe8: ffff8810cfbd1b00 0000000000000000
    ffff881a5c19fbf8: 0000000000000000 0000000000000000
    ffff881a5c19fc08: ffff881a5c19fc58 ffffffffa07897b5
#14 [ffff881a5c19fc10] xfs_dir_lookup at ffffffffa07897b5 [xfs]
    ffff881a5c19fc18: ffff881a5c19fc68 0000000000000008
    ffff881a5c19fc28: 00000000edb89915 0000000000000008
    ffff881a5c19fc38: ffff881571308cc0 ffff881a5c19fcb8
    ffff881a5c19fc48: 0000000000000000 ffff881a5c19fcb0
    ffff881a5c19fc58: ffff881a5c19fca0 ffffffffa07b7bb6
#15 [ffff881a5c19fc60] xfs_lookup at ffffffffa07b7bb6 [xfs]
    ffff881a5c19fc68: ffff881a5c19fc90 00000000edb89915
    ffff881a5c19fc78: ffff881fcd1038c0 0000000000008000
    ffff881a5c19fc88: ffff881a5c19fdf0 ffff881f502552c0
    ffff881a5c19fc98: ffff881a5c19fe50 ffff881a5c19fcd8
    ffff881a5c19fca8: ffffffffa07b4a0b
#16 [ffff881a5c19fca8] xfs_vn_lookup at ffffffffa07b4a0b [xfs]
    ffff881a5c19fcb0: ffff881a5c19fe60 ffff881fcd1038f8
    ffff881a5c19fcc0: 000000000000001b 00000000edb89915
    ffff881a5c19fcd0: ffff881fcd1038c0 ffff881a5c19fcf8
    ffff881a5c19fce0: ffffffff811eac3d
#17 [ffff881a5c19fce0] lookup_real at ffffffff811eac3d
    ffff881a5c19fce8: ffff881f502552c0 ffff881a5c19ff28
    ffff881a5c19fcf8: ffff881a5c19fda8 ffffffff811ee813
#18 [ffff881a5c19fd00] do_last at ffffffff811ee813---------------do_last-->lookup_open-->lookup_real,对应2905行
    ffff881a5c19fd08: ffff881a5c19ff58 ffffffff81274abd
    ffff881a5c19fd18: 0000000000000000 ffff881f52764500
    ffff881a5c19fd28: 0000001f52764500 0000000000000000
    ffff881a5c19fd38: ffff881571308e78 ffff88041076d000------------ ffff881571308e78对应的inode
    ffff881a5c19fd48: ffff881a5c19fde4 ffff881f502552c0-------------父目录的dentry
    ffff881a5c19fd58: ffff880aed4d5700 ffff880000000024
    ffff881a5c19fd68: 01ff880aed4d5700 ffff881a5c19fd80
    ffff881a5c19fd78: 00000000edb89915 ffff881a5c19fe50
    ffff881a5c19fd88: ffff880aed4d5700 ffff88041076d000
    ffff881a5c19fd98: ffff881a5c19ff28 ffff881f52764500
    ffff881a5c19fda8: ffff881a5c19fe40 ffffffff811f0be2
#19 [ffff881a5c19fdb0] path_openat at ffffffff811f0be2
    ffff881a5c19fdb8: ffffea00515463b0 0000000000000000
    ffff881a5c19fdc8: 0000000000000200 ffff883fc04c7080
    ffff881a5c19fdd8: 0000004100000000 00000000000000a9
    ffff881a5c19fde8: 0000000000000000 00007f2c5676d000
    ffff881a5c19fdf8: ffffea00d1d8c2c0 00000000edb89915
    ffff881a5c19fe08: 00007f2c5676dff0 00000000edb89915
    ffff881a5c19fe18: 00000000ffffff9c ffff88041076d000
    ffff881a5c19fe28: ffff881a5c19ff28 0000000000000001
    ffff881a5c19fe38: 00000000000036cc ffff881a5c19ff10
    ffff881a5c19fe48: ffffffff811f23ab
#20 [ffff881a5c19fe48] do_filp_open at ffffffff811f23ab
    ffff881a5c19fe50: ffff883fcb7e2820 ffff881f502552c0
    ffff881a5c19fe60: 0000001bc5e44c0c ffff88041076d031
    ffff881a5c19fe70: ffff883fccff1020 ffff881fcc7ff980
    ffff881a5c19fe80: ffff881571308e78 0000000200000101
    ffff881a5c19fe90: 0000000000000000 0000000000000001
    ffff881a5c19fea0: 00007f2c64217510 0000000000000000
    ffff881a5c19feb0: 0000000000000000 ffff881a5c19ff00
    ffff881a5c19fec0: ffffffff811fef47 ffff883fc175c3c0
    ffff881a5c19fed0: 0010000000008000 0000000000000001
    ffff881a5c19fee0: 0000000000008000 00000000edb89915
    ffff881a5c19fef0: 0000000000000001 00000000000000e3
    ffff881a5c19ff00: 00000000ffffff9c ffff88041076d000
    ffff881a5c19ff10: ffff881a5c19ff68 ffffffff811dfd53
#21 [ffff881a5c19ff18] do_sys_open at ffffffff811dfd53
    ffff881a5c19ff20: ffff881f49292a48 ffff000000008000
    ffff881a5c19ff30: 0000010000000024 00000000edb89915
    ffff881a5c19ff40: 0000000000000000 00007f2c64217664
    ffff881a5c19ff50: 0000000000000002 00007f2c64217653
    ffff881a5c19ff60: 00000000000036cc ffff881a5c19ff78
    ffff881a5c19ff70: ffffffff811dfe6e
#22 [ffff881a5c19ff70] sys_open at ffffffff811dfe6e
    ffff881a5c19ff78: 00007f2c64217510 ffffffff81649909
#23 [ffff881a5c19ff80] system_call_fastpath at ffffffff81649909
    RIP: 00007f2e31a3dc3d  RSP: 00007f2c64217358  RFLAGS: 00010202
    RAX: 0000000000000002  RBX: ffffffff81649909  RCX: 0000000000010000
    RDX: 0000000000000005  RSI: 0000000000000000  RDI: 00007f2c64217510
    RBP: 00007f2c64217510   R8: 000000000000ffff   R9: 000000000000001f
    R10: 00007f2e30df7000  R11: 0000000000000293  R12: ffffffff811dfe6e
    R13: ffff881a5c19ff78  R14: 00000000000036cc  R15: 00007f2c64217653
    ORIG_RAX: 0000000000000002  CS: 0033  SS: 002b
crash>

该进程在一个完成量上等待:

等待的完成量:
crash>completion ffff8813ee889a58
struct completion {
  done = 0,----------------至今未完成,如果完成则为1,由complete 函数修改,并唤醒等待的task
  wait = {
    lock = {
      {
        rlock = {
          raw_lock = {
            {
              head_tail = 131074,
              tickets = {
                head = 2,
                tail = 2
              }
            }
          }
        }
      }
    },
    task_list = {
      next = 0xffff881a5c19f930,
      prev = 0xffff881a5c19f930
    }
  }
}

这个完成量其实就是在等待一个io的完成,

对应的调用链:

xfs_buf_read_map-->_xfs_buf_read-->xfs_buf_submit_wait-->_xfs_buf_ioapply-->xfs_buf_ioapply_map-->submit_bio
_xfs_buf_ioapply返回后,执行wait_for_completion,然后就一直等着了

查看对应的xfs_buf:

crash> xfs_buf 0xffff8813ee889980
struct xfs_buf {
  b_rbnode = {
    __rb_parent_color = 18446612139147634816,
    rb_right = 0x0,
    rb_left = 0x0
  },
  b_bn = 1960485944,----------扇区号
  b_length = 8,
  b_hold = {
    counter = 2
  },
  b_lru_ref = {
    counter = 1
  },
  b_flags = 1048577,也就是0x100001,io未完成,没有置为异步标志:#define XBF_ASYNC     (1 << 4),#define XBF_DONE     (1 << 5) /* all pages in the buffer uptodate */,说明需要等待的同步io
  b_sema = {
    lock = {
      raw_lock = {
        {
          head_tail = 0,
          tickets = {
            head = 0,
            tail = 0
          }
        }
      }
    },
    count = 0,
    wait_list = {
      next = 0xffff8813ee8899b8,
      prev = 0xffff8813ee8899b8
    }
  },
  b_lru = {
    next = 0xffff8813ee8899c8,
    prev = 0xffff8813ee8899c8
  },
  b_lru_flags = 0,
  b_lock = {
    {
      rlock = {
        raw_lock = {
          {
            head_tail = 0,
            tickets = {
              head = 0,
              tail = 0
            }
          }
        }
      }
    }
  },
  b_io_error = 0,
  b_waiters = {
    lock = {
      {
        rlock = {
          raw_lock = {
            {
              head_tail = 0,
              tickets = {
                head = 0,
                tail = 0
              }
            }
          }
        }
      }
    },
    task_list = {
      next = 0xffff8813ee8899f0,
      prev = 0xffff8813ee8899f0
    }
  },
  b_list = {
    next = 0xffff8813ee889a00,
    prev = 0xffff8813ee889a00
  },
  b_pag = 0xffff881f4b98db00,-------这个xfs_buf关联的ag
  b_target = 0xffff881fd042b500,
  b_addr = 0xffff880750046000,
  b_ioend_work = {----------------------这个是io结束之后,会用到的work_struct
    data = {
      counter = 0
    },
    entry = {
      next = 0x0,
      prev = 0x0
    },
    func = 0x0
  },
  b_ioend_wq = 0xffff881f5ab7c400,-----------ioend之后,初始化对应的b_ioend_work,然后挂到这个workqueue中去
  b_iodone = 0x0,-----------------------为NULL,
  b_iowait = {---------------这个就是完成量的地址
    done = 0,
    wait = {
      lock = {
        {
          rlock = {
            raw_lock = {
              {
                head_tail = 131074,
                tickets = {
                  head = 2,------------这把锁曾经吃过一次亏,记忆深刻,
                  tail = 2
                }
              }
            }
          }
        }
      },
      task_list = {
        next = 0xffff881a5c19f930,
        prev = 0xffff881a5c19f930
      }
    }
  },
  b_fspriv = 0x0,
  b_transp = 0x0,----------------关联的xfs_trans,传入的trans为NULL
  b_pages = 0xffff8813ee889a90,
  b_page_array = {0xffffea001d401180, 0x0},
  b_maps = 0xffff8813ee889aa8,
  __b_map = {
    bm_bn = 1960485944,----------扇区,由于每个ag管理的扇区数是固定的,所以可以算出这个扇区对应的是ag1
    bm_len = 8------长度
  },
  b_map_count = 1,
  b_io_length = 8,
  b_pin_count = {
    counter = 0
  },
  b_io_remaining = {
    counter = 1------------依然为1
  },
  b_page_count = 1,
  b_offset = 0,
  b_error = 0,
  b_ops = 0xffffffffa07e6cc0 <xfs_dir3_data_buf_ops>---xfs_buf负责的不同对象,有不同的操作指针

在io正常完成时,回调的 xfs_buf_ioend 会处理xfs_buf的io结果,

if (bp->b_iodone)------我们的buf不满足
(*(bp->b_iodone))(bp);
else if (bp->b_flags & XBF_ASYNC)----我们的buf也不满足
xfs_buf_relse(bp);
else
complete(&bp->b_iowait);----我们的buf该走的流程

xfs_buf是由ag来管理的,xfs使用一棵radix树来管理ag,我们的扇区号除以8就是block编号,每个ag管理的block数是固定的,那么很容易获取到对应的index。

crash> xfs_perag 0xffff881f4b98db00
struct xfs_perag {
  pag_mount = 0xffff883f001f7800,
  pag_agno = 1,-------------------------和上面对得上
  pag_ref = {
    counter = 1002
  },
  pagf_init = 1 '\001',
  pagi_init = 1 '\001',
  pagf_metadata = 0 '\000',----是否和元数据关联
  pagi_inodeok = 1 '\001',
  pagf_levels = "\001\001",
  pagf_flcount = 4,
  pagf_freeblks = 112266782,------空闲block
  pagf_longest = 112265313,----最长的空闲块
  pagf_btreeblks = 0,
  pagi_freecount = 39,--------空闲inode个数
  pagi_count = 175488,--------已经分配的inode个数
  pagl_pagino = 81,
  pagl_leftrec = 1855879528,
  pagl_rightrec = 76512,
  pagb_lock = {
    {
      rlock = {
        raw_lock = {
          {
            head_tail = 4587590,
            tickets = {
              head = 70,
              tail = 70
            }
          }
        }
      }
    }
  },
  pagb_tree = {
    rb_node = 0x0
  },
  pagf_fstrms = {
    counter = 0
  },
  pag_ici_lock = {
    {
      rlock = {
        raw_lock = {
          {
            head_tail = 65799148,
            tickets = {
              head = 1004,
              tail = 1004
            }
          }
        }
      }
    }
  },
  pag_ici_root = {
    height = 5,
    gfp_mask = 32,
    rnode = 0xffff88144b2c7909
  },
  pag_ici_reclaimable = 0,
  pag_ici_reclaim_lock = {
    count = {
      counter = 1
    },
    wait_lock = {
      {
        rlock = {
          raw_lock = {
            {
              head_tail = 0,
              tickets = {
                head = 0,
                tail = 0
              }
            }
          }
        }
      }
    },
    wait_list = {
      next = 0xffff881f4b98db70,
      prev = 0xffff881f4b98db70
    },
    owner = 0x0,
    {
      osq = 0x0,
      __UNIQUE_ID_rh_kabi_hide1 = {
        spin_mlock = 0x0
      },
      {<No data fields>}
    }
  },
  pag_ici_reclaim_cursor = 0,
  pag_buf_lock = {
    {
      rlock = {
        raw_lock = {
          {
            head_tail = 593896294,
            tickets = {
              head = 9062,
              tail = 9062
            }
          }
        }
      }
    }
  },
  pag_buf_tree = {
    rb_node = 0xffff881576c18180-----------管理xfs_buf的红黑树
  },
  callback_head = {
    next = 0x0,
    func = 0x0
  },
  pagb_count = 0
}

ag使用红黑树来管理xfs_buf,我进一步确认下我找的xfs_buf在红黑树中:

tree -t rbtree -o xfs_buf.b_rbnode ffff881f4b98dba0 -s xfs_buf.b_bn,b_length
ffff881576c18180
  b_bn = 2021058616
  b_length = 8
ffff881571630d80
  b_bn = 1975870104
  b_length = 8
ffff88144a547180
  b_bn = 1959990760
  b_length = 8
ffff881452e61800
  b_bn = 1957446840
  b_length = 8
ffff883ed4be4a80
  b_bn = 1953586592
  b_length = 8
ffff883e829a3000
  b_bn = 1953509312
  b_length = 8
ffff881577099080
  b_bn = 1953509298
  b_length = 1
ffff8817d2cd1200
  b_bn = 1953509297
  b_length = 1
ffff883e829a3300
  b_bn = 1953509299
  b_length = 1
ffff883e85d85680
  b_bn = 1953509304
  b_length = 8
ffff88144e985680
  b_bn = 1953509352
  b_length = 8
ffff883e829a2880
  b_bn = 1953509320
  b_length = 8
ffff883f3752f780
  b_bn = 1953509344
  b_length = 8
ffff883ed266a700
  b_bn = 1953585224
  b_length = 8
。。。。
ffff883ececa1b00
  b_bn = 1960485840
  b_length = 32
ffff8813ee889980---------------我们出问题的xfs_buf
  b_bn = 1960485944------------我们出问题的起始扇区和长度
  b_length = 8
ffff883e71c6c000
  b_bn = 1960486072
  b_length = 8
。。。。

说明xfs_buf是正常的,那么xfs_buf下发的io请求去哪了呢?我在这个地方因为找错了block_device,发现该block_device中的request_queue的io数为0,陷入了迷茫。后来其他同事发现是找错了设备:我找的是sdae,其实出错的是sde。

根据文件路径,可以查找对应的挂载点对应的设备的request_queue:

block_device.bd_queue 0xffff883fc1048d00
  bd_queue = 0xffff883fc1e28828
crash> request_queue 0xffff883fc1e28828
struct request_queue {
  queue_head = {
    next = 0xffff883fc1e28828,-----------prev和next都指向自己,说明该队列为空,也就是目前没有待下发给驱动执行的io(io还停留在调度器中,尚未被派发到这个队列)
    prev = 0xffff883fc1e28828
  },
  last_merge = 0xffff883e826fcd80,
  elevator = 0xffff883e86875000,------------------这个里面有cfqd
  nr_rqs = {0, 17},--------------积压了17个同步io
  nr_rqs_elvpriv = 17,-----------有17个io需要经过调度器,这个值最大就和nr_rqs的总数相同

request_fn = 0xffffffff81422f70 <scsi_request_fn>,---这个函数会将request从request queue中取出来执行
make_request_fn = 0xffffffff812ce690 <blk_queue_bio>,
prep_rq_fn = 0xffffffff814213c0 <scsi_prep_fn>,
merge_bvec_fn = 0x0,
softirq_done_fn = 0xffffffff81422e20 <scsi_softirq_done>,
rq_timed_out_fn = 0xffffffff8141ec00 <scsi_times_out>,-------当队列中的request超时之后被调用,
dma_drain_needed = 0x0,
lld_busy_fn = 0xffffffff81421ad0 <scsi_lld_busy>,

。。。
nr_sorted = 11,-----ELEVATOR_INSERT_SORT 方式加入到队列中的数量
in_flight = {0, 5},----有5个in_flight的同步io没有返回,其实有6个io request,但其中一个不是REQ_TYPE_FS,所以不会统计在 in_flight中
 rq_timeout = 30000,--------1个request的超时时间,单位是jiffies,HZ=1000时即30s,会在blk_add_timer中赋值给 req->timeout
timeout_list = {------------管理所有下发给驱动的request的超时
next = 0xffff883e9d3f7ed0,
prev = 0xffff883e9d24ecd0
},

我们目前知道下发io的xfs_buf,但是bio是动态申请的,怎么知道我们当前的这个io下发了多长时间呢?

在 xfs_buf_ioapply_map函数中,bio的申请如下:

static void
xfs_buf_ioapply_map(
    struct xfs_buf    *bp,
    int        map,
    int        *buf_offset,
    int        *count,
    int        rw)
{
    。。。
    bio = bio_alloc(GFP_NOIO, nr_pages);
    bio->bi_bdev = bp->b_target->bt_bdev;
    bio->bi_sector = sector;
    bio->bi_end_io = xfs_buf_bio_end_io;
    bio->bi_private = bp;
。。。}

可以看出,bio的bi_private成员就是我们的xfs_buf,而且bi_sector成员就是我们的扇区号,既然我们知道扇区号,可以通过search 扇区的方式找到对应的bio为 0xffff881490a76000

request为  ffff8813ee88b900:

crash> struct bio.bi_sector,bi_private,bi_bdev 0xffff881490a76000
  bi_sector = 1960485944
  bi_private = 0xffff8813ee889980
  bi_bdev = 0xffff883fc1048d00


crash> request.start_time,__sector,bio ffff8813ee88b900
  start_time = 4298780289----------io生成时的jiffies
  __sector = 1960485944
  bio = 0xffff881490a76000

crash> p jiffies
jiffies = $2 = 4298781359--------------当前的jiffies

 

可以看出,我们io的生成时间和当前的jiffies相差很小,也就是1秒钟左右。该进程虽然拿到了锁,但它并不是持有了1200s,很大的可能是它被唤醒后抢到了锁,然后提交了io,继续等待io完成,这属于正常情况。它算幸运的,因为不幸运的 46745 进程等锁已经1200s了,被khungtaskd 内核线程选中而触发了hung_task_panic 。

 不管怎么样,不幸的 46745进程等待mutex已经超过了阈值,有可能是某个进程一直拿锁而不放,也有可能它排在等待队列的后面,锁虽然释放但也没轮到它。mutex本不应该这么慢,

是什么导致了这么慢呢?

crash> cfq_data.active_queue 0xffff883e86877c00
  active_queue = 0xffff883e9efc03a0
crash> cfq_queue.dispatch_start 0xffff883e9efc03a0
  dispatch_start = 4298776243
crash> p jiffies
jiffies = $2 = 4298781359
crash> p 4298781359-4298776243
$3 = 5116

可以看出,当前active的cfq_queue 距离调度发起的时候已经过去约5秒钟(5116个jiffies,HZ=1000)。而一个request设置的超时默认为30s(rq_timeout = 30000个jiffies),所以此时还没有触发request的超时处理。

查看一下request_queue中的hash成员管理的request ,在rq_mergeable 的情况下,request的hash成员是归request queue中的hash管理的。

 elevator_queue.hash 0xffff883e86875000
  hash = {{
。。。。。
    }, {
      first = 0xffff883e826fdb80
    }, {
      first = 0xffff883e40708800
    }, {
      first = 0x0
    }, {
      first = 0x0
    }, {
      first = 0xffff8812ae0c0980
    }, {
。。。。。
    }, {
      first = 0xffff8812c96efb00
    }, {
      first = 0x0
    }, {
      first = 0x0
    }, {
      first = 0xffff8813ee88b980
    }, {
  。。。。。
    }, {
      first = 0xffff88014a437800
    }, {
      first = 0x0
    }, {
      first = 0xffff880149d39280
    }, {
      first = 0xffff883e4341e300
    }, {
  。。。。。
    }, {
      first = 0xffff883e826fce00
    }, {
    。。。。。

 通过一个个查看,确定只有9个io,而 q->nr_sorted为11个,说明有2个io是不能merge的,再加上inflight的 5个,也就是16个io,那为什么request queue中alloced了17个io呢。

那是因为,有一个io没有计入在 in_flight 中。

9个未执行的可以merge的io:
crash> request.start_time 0xffff883e826fdb00
  start_time = 4298776286
crash> request.start_time 0xffff883e40708780
  start_time = 4298780265
crash> request.start_time 0xffff8812ae0c0900
  start_time = 4298776286
crash> request.start_time 0xffff8812c96efa80
  start_time = 4298780264
crash> request.start_time 0xffff8813ee88b900
  start_time = 4298780289
crash> request.start_time 0xffff88014a437780
  start_time = 4298780258
crash> request.start_time 0xffff880149d39200
  start_time = 4298780285
crash> request.start_time 0xffff883e4341e280
  start_time = 4298780285
crash> request.start_time 0xffff883e826fcd80
  start_time = 4298776272
2个不能merge的io:

request.hash ffff883e40708c00
hash = {
next = 0x0,
pprev = 0x0

request.hash ffff8812ae0c0780
hash = {
next = 0x0,
pprev = 0x0
}

那么等待执行的6个io怎么获取呢?在request_queue的elevator中,我们看到它有10个cfq_queue等待调度,

crash> elevator_queue.elevator_data 0xffff883e86875000
  elevator_data = 0xffff883e86877c00

  
crash> cfq_data 0xffff883e86877c00
struct cfq_data {
  queue = 0xffff883fc1e28828,
  grp_service_tree = {
    rb = {
      rb_node = 0xffff883f36a09820
    },
    left = 0xffff883f36a09820,
    count = 4294964171,
    min_vdisktime = 4707409920,
    ttime = {
      last_end_request = 4295195837,
      ttime_total = 0,
      ttime_samples = 0,
      ttime_mean = 0
    }
  },
  root_group = 0xffff883f36a09800,
  serving_wl_class = BE_WORKLOAD,
  serving_wl_type = SYNC_WORKLOAD,
  workload_expires = 4298776270,---------jiffies = $1 = 4298781359,cfqd->workload_expires = jiffies + slice;
  serving_group = 0xffff883f36a09800,----根据这个当前服务的group,可以遍历其管理的所有cfq_queue
  prio_trees = {{
      rb_node = 0x0
    }, {
      rb_node = 0x0
    }, {
      rb_node = 0x0
    }, {
      rb_node = 0x0
    }, {
      rb_node = 0xffff881490a87250
    }, {
      rb_node = 0x0
    }, {
      rb_node = 0x0
    }, {
      rb_node = 0x0
    }},
  busy_queues = 10,-------------有10个queue等待调度
  busy_sync_queues = 10,
  rq_in_driver = 5,----还没完成的io数量
  rq_in_flight = {0, 5},
  rq_queued = 11,-------有11个io在cfq_data中,也就是各个cfqq中的rq_queued中的总和
  hw_tag = 1,
  hw_tag_est_depth = 32,
  hw_tag_samples = 51,
  idle_slice_timer = {
    entry = {
      next = 0x0,
      prev = 0xdead000000200200
    },
    expires = 4298772352,
    base = 0xffff881fd2fc8000,
    function = 0xffffffff812ef710 <cfq_idle_slice_timer>,
    data = 18446612400859216896,
    slack = -1,
    start_pid = -1,
    start_site = 0x0,
    start_comm = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
  },
  unplug_work = {
    data = {
      counter = 32
    },
    entry = {
      next = 0xffff883e86877d28,
      prev = 0xffff883e86877d28
    },
    func = 0xffffffff812edef0 <cfq_kick_queue>
  },
  active_queue = 0xffff883e9efc03a0,---------当前active的queue
  active_cic = 0xffff883e47904f00,
  async_cfqq = {{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, {0x0, 0x0, 0x0, 0x0, 0xffff883f1f7c9878, 0x0, 0x0, 0x0}},---根据io优先级的异步cfq_queue
  async_idle_cfqq = 0x0,
  last_position = 5943375904,
  cfq_quantum = 8,-----static const int cfq_quantum = 8;
  cfq_fifo_expire = {250, 125},---static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
  cfq_back_penalty = 2,-----static const int cfq_back_penalty = 2;
  cfq_back_max = 16384,-----static const int cfq_back_max = 16 * 1024;
  cfq_slice = {40, 100},---static int cfq_slice_async = HZ / 25;static const int cfq_slice_sync = HZ / 10;
  cfq_slice_async_rq = 2,---static const int cfq_slice_async_rq = 2;
  cfq_slice_idle = 8,----static int cfq_slice_idle = HZ / 125;
  cfq_group_idle = 8,----static int cfq_group_idle = HZ / 125;
  cfq_latency = 1,------    cfqd->cfq_latency = 1;
  cfq_target_latency = 300,---static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
  oom_cfqq = {
    ref = 1,
    flags = 64,
    cfqd = 0xffff883e86877c00,
    rb_node = {
      __rb_parent_color = 18446612400859217440,
      rb_right = 0x0,
      rb_left = 0x0
    },
    rb_key = 0,
    p_node = {
      __rb_parent_color = 18446612400859217472,
      rb_right = 0x0,
      rb_left = 0x0
    },
    p_root = 0x0,
    sort_list = {
      rb_node = 0x0
    },
    next_rq = 0x0,
    queued = {0, 0},
    allocated = {0, 0},
    fifo = {
      next = 0xffff883e86877e80,
      prev = 0xffff883e86877e80
    },
    dispatch_start = 0,
    allocated_slice = 0,
    slice_dispatch = 0,
    slice_start = 0,
    slice_end = 0,
    slice_resid = 0,
    prio_pending = 0,
    dispatched = 0,
    ioprio = 0,
    org_ioprio = 0,
    ioprio_class = 0,
    pid = 1,
    seek_history = 0,
    last_request_pos = 0,
    service_tree = 0x0,
    new_cfqq = 0x0,
    cfqg = 0xffff883f36a09800,
    nr_sectors = 0
  },
  last_delayed_sync = 4298780320
}

先找到当前的serving_group,

serving_group = 0xffff883f36a09800, 

根据这个cfq_group,

crash> struct -xo cfq_group.service_trees 0xffff883f36a09800
struct cfq_group {
  [ffff883f36a09878] struct cfq_rb_root service_trees[2][3];
}
crash> cfq_group.dispatched 0xffff883f36a09800
  dispatched = 5 ----这个dispatched和 cfqd->rq_in_flight是相同的,对的上

根据当前的io类别:

  serving_wl_class = BE_WORKLOAD,
  serving_wl_type = SYNC_WORKLOAD,

可以确定它的 service_trees 的数组下标:

enum wl_type_t {---列下标
    ASYNC_WORKLOAD = 0,
    SYNC_NOIDLE_WORKLOAD = 1,
    SYNC_WORKLOAD = 2
};

enum wl_class_t {----行下标
    BE_WORKLOAD = 0,
    RT_WORKLOAD = 1,
    IDLE_WORKLOAD = 2,
    CFQ_PRIO_NR,
};

找到对应的下标为[0][1],[0][2]。

总共10个queue
crash> tree -t rb_tree 0xffff883f36a098f8
ffff883f061a0920
ffff883e6fb8c3b0
ffff883e9efc03b0
ffff881490a87230
ffff883e71bcdee0
ffff8801c8b61df8
ffff881577ee1c28
ffff88018a4577a0
crash> tree -t rb_tree 0xffff883f36a098b8
ffff88144e961318
ffff883ea2021400

也就找到了当前group调度的cfq_queue,遍历各个cfq_queue,就知道对应的request了:

crash> cfq_queue.rb_node
struct cfq_queue {
   [16] struct rb_node rb_node;
}

由于cfq_queue通过rb_node成员嵌入到对应cfq_group的service_tree数组中对应class和类型的红黑树去,所以可以查看对应的cfq_queue为:
只有11个io,2个dispatch的
  crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883f061a0910
  allocated = {1, 0}
  ref = 2
  dispatched = 0
  prio_pending = 0
   queued = {0, 1}
  request=ffff88014a437780
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883e6fb8c3a0------相当于有一个在request_queue中或者到driver层了,有一个还未被dispatch
  allocated = {2, 0}
  ref = 4
  dispatched = 1-------------它的dispatch为1
  prio_pending = 0
  queued = {0, 1}
  request=ffff883e826fcd80
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883e9efc03a0---当前active的queue,无request了
  allocated = {1, 0}
  ref = 2
  dispatched = 1-------------它的dispatch为1,
  prio_pending = 0  
  queued = {0, 0}
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff881490a87220
  allocated = {2, 0}
  ref = 3
  dispatched = 0
  prio_pending = 0
  queued = {0, 2}
  ffff8812ae0c0780----不在hash链中
  
  request.hash ffff8812ae0c0780
    hash = {
      next = 0x0,
      pprev = 0x0
    }

  
  ffff8812ae0c0900
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883e71bcded0
  allocated = {2, 0}
  ref = 3
  dispatched = 0
  prio_pending = 0
  queued = {0, 2}
  ffff883e40708780
  ffff883e40708c00---这个io不在hash中
  
  crash> request.hash ffff883e40708c00
    hash = {
      next = 0x0,
      pprev = 0x0
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff8801c8b61de8
  allocated = {1, 0}
  ref = 2
  dispatched = 0
  prio_pending = 0
  queued = {0, 1}
  ffff8812c96efa80
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff881577ee1c18
  allocated = {1, 0}
  ref = 2
  dispatched = 0
  prio_pending = 0
  queued = {0, 1}
  ffff883e4341e280
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff88018a457790
  allocated = {1, 0}
  ref = 2
  dispatched = 0
  prio_pending = 0
  queued = {0, 1}
  ffff8813ee88b900
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff88144e961308
  allocated = {1, 0}
  ref = 2
  dispatched = 0
  prio_pending = 0
  queued = {0, 1}
  ffff883e826fdb00
crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883ea20213f0
  allocated = {1, 0}
  ref = 2
  dispatched = 0
  prio_pending = 0
  queued = {0, 1}
  ffff880149d39200

其中,所有已经dispatch的io,可以在request_queue中的timeout_list 中找到:

 crash> struct -xo request_queue.timeout_list 0xffff883fc1e28828
struct request_queue {
  [ffff883fc1e28dc0] struct list_head timeout_list;
}

crash> list request.timeout_list -s request.__sector,queuelist,cmd_type,bio -H ffff883fc1e28dc0
ffff883e9d3f7d80
  __sector = 4090189560
  queuelist = {
    next = 0xffff883e9d3f7d80,
    prev = 0xffff883e9d3f7d80
  }
  cmd_type = REQ_TYPE_FS
  bio = 0xffff883ebb2f2200
ffff8804a8440d80
  __sector = 4151431904
  queuelist = {
    next = 0xffff8804a8440d80,
    prev = 0xffff8804a8440d80
  }
  cmd_type = REQ_TYPE_FS
  bio = 0xffff88029f7a9f00
ffff8802924f8a80-------------------------------------这个io是dfs下发的ioctl
  __sector = 18446744073709551615
  queuelist = {
    next = 0xffff8802924f8a80,
    prev = 0xffff8802924f8a80
  }
  cmd_type = REQ_TYPE_BLOCK_PC
  bio = 0x0
ffff883e73374480
  __sector = 5860805840
  queuelist = {
    next = 0xffff883e73374480,
    prev = 0xffff883e73374480
  }
  cmd_type = REQ_TYPE_FS
  bio = 0xffff883ea412bd00
ffff883ed1c4ea00
  __sector = 5977690392
  queuelist = {
    next = 0xffff883ed1c4ea00,
    prev = 0xffff883ed1c4ea00
  }
  cmd_type = REQ_TYPE_FS
  bio = 0xffff883f1e7b6300
ffff883e9d24eb80
  __sector = 5943375896
  queuelist = {
    next = 0xffff883e9d24eb80,
    prev = 0xffff883e9d24eb80
  }
  cmd_type = REQ_TYPE_FS
  bio = 0xffff883f1ae6ec00

6个io,加上11个未下发到request_queue中的io,为17个,全部找到了,通过上面的遍历,对io的管理也都熟悉了,此为后话。

我们再回过头来看我们的log:

[ 3272.940387] sd 5:0:4:0: [sde] FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
[ 3272.940392] sd 5:0:4:0: [sde] Sense Key : Medium Error [current] [descriptor]
[ 3272.940395] sd 5:0:4:0: [sde] Add. Sense: Unrecovered read error
[ 3272.940397] sd 5:0:4:0: [sde] CDB: Read(16) 88 00 00 00 00 00 00 00 00 08 00 00 00 08 00 00
[ 3272.940399] blk_update_request: critical medium error, dev sde, sector 8
[ 3272.942369] XFS (sde): metadata I/O error: block 0x8 ("xfs_trans_read_buf_map") error 61 numblks 8
[ 3278.783813] sd 5:0:4:0: [sde] FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
[ 3278.783823] sd 5:0:4:0: [sde] Sense Key : Medium Error [current] [descriptor]
[ 3278.783826] sd 5:0:4:0: [sde] Add. Sense: Unrecovered read error
[ 3278.783830] sd 5:0:4:0: [sde] CDB: Read(16) 88 00 00 00 00 00 00 00 00 08 00 00 00 08 00 00
[ 3278.783832] blk_update_request: critical medium error, dev sde, sector 8
[ 3278.785946] XFS (sde): metadata I/O error: block 0x8 ("xfs_trans_read_buf_map") error 61 numblks 8
[ 3820.434234] umount:start to umount target /run/user/989 .
[ 3820.434239] umount:according to target /run/user/989 , fstype tmpfs,device tmpfs
[ 3820.434287] umount:finish umount target :/run/user/989 ,retval:0 .
[ 4000.258607] umount:start to umount target /run/user/989 .
[ 4000.258613] umount:according to target /run/user/989 , fstype tmpfs,device tmpfs
[ 4000.258661] umount:finish umount target :/run/user/989 ,retval:0 .
[ 4109.680685] INFO: task fas_readwriter:46745 blocked for more than 1200 seconds.
[ 4109.682163] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 4109.683440] fas_readwriter  D ffff881fffd147c0     0 46745  43298 0x00000080
[ 4109.683444]  ffff881f320bfc20 0000000000000082 ffff881f320bffd8 ffff881f320bffd8
[ 4109.683469]  ffff881f320bffd8 00000000000147c0 ffff881fd321ae00 ffff881571308f20
[ 4109.683487]  ffff881571308f24 ffff881f32186780 00000000ffffffff ffff881571308f28
[ 4109.683491] Call Trace:

在触发crash之前,日志中一直在打印访问8号扇区出错,对应的调用链是什么呢?

scsi_finish_command-->scsi_io_completion
case ACTION_FAIL:
           -->scsi_print_result
           -->scsi_print_sense
           -->scsi_print_command
           -->scsi_end_request---->blk_update_request

 从现象来看,综合日志中的delay打印:

read file dee46cc8be5ab43982c75a835768773e delay 3511 ms, readsize(581795) count(581795) offset(0)>

推测是由于访问硬盘慢导致了io的积压,可能是因为坏块的重试,具体原因由于功力问题,无法确认。

结论:

1.访问某些扇区,可能会导致多次重试而io缓慢,如果后面的io继续来到,则可能积压。

2.hungtask检测挑选的是第一个满足条件的task来打印,但其实有可能另外一个进程的等待时间已经超过2个hung检测周期却没有被选中:因为khungtaskd在上一次被调度运行的时候,大家都还没有到检测周期;等到了下一个检测周期,可能有多个task同时满足条件,而被挑选出来的只是第一个task。甚至有可能有的task在两次检测之间等待时间已经达到了阈值,但因为等待的资源得到满足又重新被调度了,因而不会被检测到。

猜你喜欢

转载自www.cnblogs.com/10087622blog/p/10882980.html
今日推荐