探秘malloc是如何申请内存的

今天分析下malloc申请内存时都发生了什么,Let's do it

我们都清楚malloc申请的内存不是立刻就建立虚拟地址和物理地址的映射的,当int *p = malloc(100*1024)执行这条指令之后,只是在用户空间给程序开辟一段100K左右的大小,然后就返回这段空间的首地址给程序员。

当我们尝试第一次读或者写的时候,就会经过如下步骤的:

  • CPU将此虚拟地址,送到MMU上去
  • MMU会做虚拟到物理地址的转化
  • MMU在操作时发现,此虚拟地址还没有建立物理地址关系,则发生exception
  • CPU则会跳转到exception table,根据出错的类型执行相应的调用函数
  • 此场景就会调用do_translation_fault

我们通过一个简单的malloc例子来分析

#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
 
/*
 * Demo program: allocate 200 KiB with malloc(), print the returned virtual
 * address, pause on getchar() before the first access to the buffer, then
 * write and read back the first 100 bytes.
 *
 * Returns 0 on success, 1 if the allocation fails.
 */
int main(void)
{
    int i=0;
    char *malloc_data=malloc(1024*200);

    /* malloc() may fail; never touch the buffer through a NULL pointer. */
    if (malloc_data == NULL) {
        perror("malloc");
        return 1;
    }

    /*
     * %p with a void * argument is the portable way to print a pointer;
     * passing a char * to "%lx" is undefined behavior.
     */
    printf("malloc address=%p\n",(void *)malloc_data);

    getchar();

    /* First write to the buffer. */
    for(i=0; i<100; i++)
      malloc_data[i] = i+1;

    for(i=0; i<100; i++)
      printf("data=%d\n",malloc_data[i]);

    free(malloc_data);
    return 0;
}

当执行此代码后,会在用户空间分配各个虚拟内存区域

可以看到该虚拟地址属于红色框之内的区域。有人就会问:malloc申请的内存为啥不属于heap?当malloc申请的内存小于128K(glibc默认的mmap阈值M_MMAP_THRESHOLD)的时候是属于heap的,自己可以动手实验下。当申请的内存达到或超过128K之后,就会从mmap区域申请内存的。

当我们尝试写这个虚拟地址的时候,就会发生上面一系列操作,我通过修改内核的代码,当在申请此虚拟地址的时候会发生panic,然后抓到dump。我们通过dump分析

可以看到dump时的地址和上面例子中的地址有差别,这不影响我们分析。分析dump时我们以dump中的地址为准。

当写malloc申请的内存0x76143BC000的时候,就会发生缺页异常,发生page_fault。 先来看dump的调用栈

-005|panic()
-006|do_anonymous_page(inline)
-006|handle_pte_fault(vmf = 0xFFFFFF80202A3BF0)
-007|handle_mm_fault(vma = 0xFFFFFFE314E27310, address = 0x00000076143BC000, flags = 0x55)
-008|do_page_fault(addr = 0x00000076143BC008, esr = 0x92000047, regs = 0xFFFFFF80202A3EC0)
-009|test_ti_thread_flag(inline)
-009|do_translation_fault(addr = 0x00000076143BC008, esr = 0x92000047, regs = 0xFFFFFF80202A3EC0)
-010|do_mem_abort(addr = 0x00000076143BC008, esr = 0x92000047, regs = 0xFFFFFF80202A3EC0)
-011|el0_da(asm)
 -->|exception

具体为啥会这样,大家可以看下我前面的ARM64异常处理流程,咱们根据调用栈分析代码。

/*
 * Translation-fault handler. Addresses below TASK_SIZE are forwarded to
 * do_page_fault() and its result is returned; any other address is reported
 * through do_bad_area() and 0 is returned.
 */
static int __kprobes do_translation_fault(unsigned long addr,
                      unsigned int esr,
                      struct pt_regs *regs)
{
    /* Not below TASK_SIZE: report it as a bad area. */
    if (addr >= TASK_SIZE) {
        do_bad_area(addr, esr, regs);
        return 0;
    }

    return do_page_fault(addr, esr, regs);
}

这里是判断申请的内存属于用户空间还是内核空间,用户空间的大小是TASK_SIZE的。小于此值就是用户空间

-008|do_page_fault(
    |    addr = 0x00000076143BC008,         //这就是我们上层传递下来的值,后面会将低12位清空的。
    |    esr = 0x92000047,                  //出错状态寄存器
    |    regs = 0xFFFFFF80202A3EC0)
    |  vma = 0xFFFFFFE314E27310              //这段虚拟内存区域的vma
    |  mm_flags = 0x55
    |  vm_flags = 0x2
    |  major = 0x0
    |  tsk = 0xFFFFFFE300786640              //所属的task_struct
    |  mm = 0xFFFFFFE2EBB33440              //所属的mm_struct
-009|test_ti_thread_flag(inline)
-009|do_translation_fault(
    |    addr = 0x00000076143BC008,
    |    esr = 0x92000047,
    |    regs = 0xFFFFFF80202A3EC0)
-010|do_mem_abort(
    |    addr = 0x00000076143BC008,
    |    esr = 0x92000047,
    |    regs = 0xFFFFFF80202A3EC0)
-011|el0_da(asm)
 -->|exception

此函数有点长,我们去掉不相关的,保留和我们有用的

/*
 * do_page_fault() - abridged excerpt of the page-fault entry point.
 *
 * Derives the required VMA permission (vm_flags) and the fault flags
 * (mm_flags) from esr/regs, looks up the VMA for addr, calls
 * __do_page_fault(), and then post-processes the result: retry handling,
 * major/minor fault accounting, OOM handling, or filling in a siginfo that
 * is handed to __do_user_fault(). Paths without a usable user context end
 * in __do_kernel_fault() via the no_context label.
 *
 * Every return path visible here returns 0.
 *
 * NOTE(review): this excerpt contains `goto retry` but no `retry:` label,
 * and calls up_read(&mm->mmap_sem) with no matching down_read visible; the
 * locking/retry section appears to have been trimmed. The `done:` label has
 * no goto in this excerpt either. Confirm against the full kernel source.
 */
static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
                   struct pt_regs *regs)
{
    struct task_struct *tsk;
    struct mm_struct *mm;
    struct siginfo si;
    vm_fault_t fault, major = 0;
    unsigned long vm_flags = VM_READ | VM_WRITE;
    unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
    struct vm_area_struct *vma = NULL;
 
    tsk = current;
    mm  = tsk->mm;
 
    /*
     * If we're in an interrupt or have no user context, we must not take
     * the fault.
     */
    if (faulthandler_disabled() || !mm)                    // fault handling disabled or no mm: take the no_context path
        goto no_context;
 
    if (user_mode(regs))                                   // user mode: also set FAULT_FLAG_USER in mm_flags
        mm_flags |= FAULT_FLAG_USER;
 
    if (is_el0_instruction_abort(esr)) {                   // EL0 instruction abort: the VMA must allow VM_EXEC
        vm_flags = VM_EXEC;
    } else if ((esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM)) {   // ESR has WNR set and CM clear: the VMA must allow VM_WRITE, and the fault is flagged as a write
        vm_flags = VM_WRITE;
        mm_flags |= FAULT_FLAG_WRITE;
    }
 
    if (addr < TASK_SIZE && is_el1_permission_fault(esr, regs, addr)) {         // addr is below TASK_SIZE but the permission fault was taken at EL1; the messages below name the fatal cases
        /* regs->orig_addr_limit may be 0 if we entered from EL0 */
        if (regs->orig_addr_limit == KERNEL_DS)
            die_kernel_fault("access to user memory with fs=KERNEL_DS",
                     addr, esr, regs);
 
        if (is_el1_instruction_abort(esr))
            die_kernel_fault("execution of user memory",
                     addr, esr, regs);
 
        if (!search_exception_tables(regs->pc))
            die_kernel_fault("access to user memory outside uaccess routines",
                     addr, esr, regs);
    }
 
    if (!vma || !can_reuse_spf_vma(vma, addr))                   // no VMA yet, or it cannot be reused: look it up by address with find_vma()
        vma = find_vma(mm, addr);
 
    fault = __do_page_fault(vma, addr, mm_flags, vm_flags, tsk);   // the actual fault handling happens in __do_page_fault()
    major |= fault & VM_FAULT_MAJOR;                   // remember whether VM_FAULT_MAJOR was reported; selects maj_flt vs min_flt accounting below
 
    if (fault & VM_FAULT_RETRY) {             // the handler asked for a retry
        /*
         * If we need to retry but a fatal signal is pending,
         * handle the signal first. We do not need to release
         * the mmap_sem because it would already be released
         * in __lock_page_or_retry in mm/filemap.c.
         */
        if (fatal_signal_pending(current)) {
            if (!user_mode(regs))
                goto no_context;
            return 0;
        }
 
        /*
         * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of
         * starvation.
         */
        if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
            mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
            mm_flags |= FAULT_FLAG_TRIED;
 
            /*
             * Do not try to reuse this vma and fetch it
             * again since we will release the mmap_sem.
             */
            vma = NULL;
 
            goto retry;
        }
    }
    up_read(&mm->mmap_sem);
 
done:
 
    /*
     * Handle the "normal" (no error) case first.
     */
    if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
                  VM_FAULT_BADACCESS)))) {
        /*
         * Major/minor page fault accounting is only done
         * once. If we go through a retry, it is extremely
         * likely that the page will be found in page cache at
         * that point.
         */
        if (major) {  // bump the task's major-fault counter
            tsk->maj_flt++;
            perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs,
                      addr);
        } else {
            tsk->min_flt++;  // bump the task's minor-fault counter
            perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs,
                      addr);
        }
 
        return 0;
    }
 
    /*
     * If we are in kernel mode at this point, we have no context to
     * handle this fault with.
     */
    if (!user_mode(regs))
        goto no_context;
 
    if (fault & VM_FAULT_OOM) {    // out of memory
        /*
         * We ran out of memory, call the OOM killer, and return to
         * userspace (which will retry the fault, or kill us if we got
         * oom-killed).
         */
        pagefault_out_of_memory();
        return 0;
    }
 
    clear_siginfo(&si);
    si.si_addr = (void __user *)addr;
 
    if (fault & VM_FAULT_SIGBUS) {
        /*
         * We had some memory, but were unable to successfully fix up
         * this page fault.
         */
        si.si_signo = SIGBUS;
        si.si_code  = BUS_ADRERR;
    } else if (fault & VM_FAULT_HWPOISON_LARGE) {
        unsigned int hindex = VM_FAULT_GET_HINDEX(fault);
 
        si.si_signo = SIGBUS;
        si.si_code  = BUS_MCEERR_AR;
        si.si_addr_lsb  = hstate_index_to_shift(hindex);
    } else if (fault & VM_FAULT_HWPOISON) {
        si.si_signo = SIGBUS;
        si.si_code  = BUS_MCEERR_AR;
        si.si_addr_lsb  = PAGE_SHIFT;
    } else {
        /*
         * Something tried to access memory that isn't in our memory
         * map.
         */
        si.si_signo = SIGSEGV;           // SIGSEGV: the "segmentation fault" a user program sees after a bad access
        si.si_code  = fault == VM_FAULT_BADACCESS ?
                  SEGV_ACCERR : SEGV_MAPERR;
    }
 
    __do_user_fault(&si, esr);     // hand the prepared siginfo to __do_user_fault() so user space is notified
    return 0;
 
no_context:
    __do_kernel_fault(addr, esr, regs); // kernel-side handling of the fault
    return 0;
}
  • 此函数主要是确认下当前错误是来自内核还是应用层
  • 当调用__do_page_fault处理完毕后,就会对结果做进一步处理
  • 如果是用户空间,则通过发信号的方式告知
  • 内核的话专门有__do_kernel_fault去处理的
/*
 * __do_page_fault() - validate the VMA for a faulting address and dispatch.
 *
 * Returns VM_FAULT_BADMAP when vma is NULL, or when addr lies below
 * vma->vm_start and either the VMA lacks VM_GROWSDOWN or expand_stack()
 * returns non-zero. Returns VM_FAULT_BADACCESS when vma->vm_flags shares no
 * bit with the requested vm_flags. Otherwise returns the result of
 * handle_mm_fault() for (addr & PAGE_MASK).
 *
 * tsk is not used in this excerpt.
 */
static int __do_page_fault(struct vm_area_struct *vma, unsigned long addr,
               unsigned int mm_flags, unsigned long vm_flags,
               struct task_struct *tsk)
{
    vm_fault_t fault;
 
    /* Default result for every path that reaches `out` without a good VMA. */
    fault = VM_FAULT_BADMAP;
    if (unlikely(!vma))  
        goto out;
    /* addr is below the start of the VMA: only a growable stack can still cover it. */
    if (unlikely(vma->vm_start > addr))
        goto check_stack;
 
    /*
     * Ok, we have a good vm_area for this memory access, so we can handle
     * it.
     */
good_area:
    /*
     * Check that the permissions on the VMA allow for the fault which
     * occurred.
     */
    if (!(vma->vm_flags & vm_flags)) {
        fault = VM_FAULT_BADACCESS;
        goto out;
    }
 
    return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags);
 
check_stack:
    /* VM_GROWSDOWN VMA and expand_stack() returned 0: treat it as a good area. */
    if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr))
        goto good_area;
out:
    return fault;
}
  • 检查vma,以及起始地址
  • 如果vma的起始地址大于addr,则跳到check_stack处,此情况针对栈需要扩张的情况
  • 确定vma的权限,比如此vma的权限是没有写的,只读的。如果你去写的话就会报VM_FAULT_BADACCESS错误
  • 则后续会调用handle_mm_fault处理
/*
 * handle_mm_fault() - entry point for resolving a fault on a VMA.
 *
 * Sets the current task state to TASK_RUNNING, then:
 *  - returns VM_FAULT_SIGSEGV if arch_vma_access_permitted() rejects the
 *    access described by the WRITE / INSTRUCTION / REMOTE fault flags;
 *  - returns the result of hugetlb_fault() for hugetlb VMAs;
 *  - otherwise returns the result of __handle_mm_fault().
 */
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        unsigned int flags)
{
    __set_current_state(TASK_RUNNING);

    /* Access not permitted by the architecture check: segmentation fault. */
    if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
                        flags & FAULT_FLAG_INSTRUCTION,
                        flags & FAULT_FLAG_REMOTE))
        return VM_FAULT_SIGSEGV;

    /* Huge pages take their own path. */
    if (unlikely(is_vm_hugetlb_page(vma)))
        return hugetlb_fault(vma->vm_mm, vma, address, flags);

    /* Regular path. */
    return __handle_mm_fault(vma, address, flags);
}

继续分析__handle_mm_fault函数

/*
 * __handle_mm_fault() - fill in a struct vm_fault and obtain the upper
 * page-table levels for the faulting address.
 *
 * Gets the pgd entry with pgd_offset(), then calls p4d_alloc(), pud_alloc()
 * and pmd_alloc() in turn, storing the pud and pmd pointers in vmf. Returns
 * VM_FAULT_OOM if any of those three yields NULL; otherwise returns the
 * result of handle_pte_fault(&vmf).
 *
 * `dirty` and `ret` are declared but not used in this excerpt.
 */
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
        unsigned long address, unsigned int flags)
{
    struct vm_fault vmf = {                 // initialise the vm_fault structure from the arguments
        .vma = vma,
        .address = address & PAGE_MASK,
        .flags = flags,
        .pgoff = linear_page_index(vma, address),
        .gfp_mask = __get_fault_gfp_mask(vma),
        .vma_flags = vma->vm_flags,
        .vma_page_prot = vma->vm_page_prot,
    };
    unsigned int dirty = flags & FAULT_FLAG_WRITE;
    struct mm_struct *mm = vma->vm_mm;
    pgd_t *pgd;
    p4d_t *p4d;
    vm_fault_t ret;
 
    pgd = pgd_offset(mm, address);                   // locate the pgd entry from the mm_struct and the virtual address
    p4d = p4d_alloc(mm, pgd, address);               // next level; author's note: the board analysed here uses 3-level page tables (no p4d/pud), so p4d == pgd -- TODO confirm for other configurations
    if (!p4d)
        return VM_FAULT_OOM;
 
    vmf.pud = pud_alloc(mm, p4d, address);             
    if (!vmf.pud)
        return VM_FAULT_OOM;
 
    vmf.pmd = pmd_alloc(mm, vmf.pud, address);
    if (!vmf.pmd)
        return VM_FAULT_OOM;
 
    return handle_pte_fault(&vmf);
}
  • pgd = pgd_offset(mm, address); 根据虚拟地址和mm_struct→pgd基地址就会算出pgd的值
  • p4d = p4d_alloc(mm, pgd, address); 分配p4d,目前没用p4d,#define p4d_alloc(mm, pgd, address) (pgd) 直接返回的是pgd的值
  • vmf.pud = pud_alloc(mm, p4d, address); 
/*
 * pud_alloc(): evaluates to NULL when pgd_none(*p4d) is true and
 * __pud_alloc() returns non-zero; otherwise evaluates to
 * pud_offset(p4d, address). __pud_alloc() is only called when the entry
 * is none (short-circuit &&).
 */
#define pud_alloc(mm, p4d, address) \
    ((unlikely(pgd_none(*(p4d))) && __pud_alloc(mm, p4d, address)) ? \
        NULL : pud_offset(p4d, address))
  • 是没有p4d的时候,则分配pud,这里因为p4d=pgd,则最后返回的是pgd里面的值
  • vmf.pmd = pmd_alloc(mm, vmf.pud, address);  分配pmd, 会根据pud的值算出pmd的值
  • 处理pte, 也就是说此函数就是算pgd, p4d, pud, pmd,保存到vm_fault结构体中。

来看下dump中算好的结果。

-006|handle_pte_fault(
    |    vmf = 0xFFFFFF80202A3BF0 -> (
    |      vma = 0xFFFFFFE314E27310,
    |      flags = 0x55,
    |      gfp_mask = 0x006000C0,
    |      pgoff = 0x076143BC,
    |      address = 0x00000076143BC000,
    |      sequence = 0x2,
    |      orig_pmd = (pmd = 0x0),
    |      pmd = 0xFFFFFFE2E5E5D508 -> (
    |        pmd = 0xE5E5A003),
    |      pud = 0xFFFFFFE2E5D8BEC0 -> (
    |        pgd = (pgd = 0xE5E5D003)),
    |      orig_pte = (pte = 0x0),
    |      cow_page = 0x0,
    |      memcg = 0x0,
    |      page = 0x0,
    |      pte = 0xFFFFFFE2E5E5ADE0 -> (
    |        pte = 0x00E800026F281F53),
    |      ptl = 0xFFFFFFE3698EC318,
    |      prealloc_pte = 0x0,
    |      vma_flags = 0x00100073,
    |      vma_page_prot = (pgprot = 0x0060000000000FD3)))
-007|handle_mm_fault(
    |    vma = 0xFFFFFFE314E27310,
    |    address = 0x00000076143BC000,
    |    flags = 0x55)

转化过程可以参考我的ARM64虚拟地址到物理地址转化文档(手动玩转虚拟地址到物理地址转化)

虚拟地址:0x00000076143BC000

mm_struct→pgd = rd(0xFFFFFFE2E5D8B000) = 0xE5D80003

pgd_index =  (0x00000076143BC000 >> 30) & (0x200 - 1)  = 0x01D8

pgd项地址 = 0xFFFFFFE2E5D8B000 + 0x01D8*8 = 0xFFFFFFE2E5D8BEC0,rd(0xFFFFFFE2E5D8BEC0) = 0xE5E5D003

pmd_index =  (0x00000076143BC000  >> 21) & (0x1FF ) = 0xA1

pmd项地址 = (0xE5E5D003 & ~0xFFF) + 0xA1 * 8 = 0xE5E5D000 + 0x508 = 0xE5E5D508,rd(C:0xE5E5D508) = 0xE5E5A003

通过我们手动计算和dump里面的值是一样的。继续分析代码。

/*
 * handle_pte_fault() - abridged excerpt; resolves the fault at PTE level.
 *
 *  - vmf->pte is NULL (set here when *vmf->pmd is none): anonymous VMAs go
 *    to do_anonymous_page(), all others to do_fault().
 *  - vmf->orig_pte is not present: do_swap_page().
 *  - write fault on a non-writable entry: do_wp_page().
 *  - otherwise the entry is marked young (and dirty for a write fault) and
 *    applied with ptep_set_access_flags(); the function then unlocks the
 *    pte and returns ret (0, or VM_FAULT_RETRY on the speculative path).
 *
 * NOTE(review): the `....` line is a placeholder for code the article
 * elided (presumably where vmf->pte and vmf->orig_pte are set up), and the
 * `unlock:` label has no goto in this excerpt. Confirm against the full
 * kernel source.
 */
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
    pte_t entry;
    int ret = 0;
 
    if (unlikely(pmd_none(*vmf->pmd))) {                         // the pmd entry is none, so there is no pte table yet: set vmf->pte to NULL
        vmf->pte = NULL;
    } else if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {  
         ....
    }
 
    /* No pte: first touch of this address. */
    if (!vmf->pte) {
        if (vma_is_anonymous(vmf->vma))
            return do_anonymous_page(vmf);
        else
            return do_fault(vmf);
    }
 
    if (!pte_present(vmf->orig_pte))
        return do_swap_page(vmf);
 
    entry = vmf->orig_pte;
 
    /* Write fault: a non-writable entry goes to do_wp_page(), otherwise mark it dirty. */
    if (vmf->flags & FAULT_FLAG_WRITE) {
        if (!pte_write(entry))
            return do_wp_page(vmf);
        entry = pte_mkdirty(entry);
    }
    entry = pte_mkyoung(entry);
    if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
                vmf->flags & FAULT_FLAG_WRITE)) {
        update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
    } else {
        /*
         * This is needed only for protection faults but the arch code
         * is not yet telling us if this is a protection fault or not.
         * This still avoids useless tlb flushes for .text page faults
         * with threads.
         */
        if (vmf->flags & FAULT_FLAG_WRITE)
            flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
        if (vmf->flags & FAULT_FLAG_SPECULATIVE)
            ret = VM_FAULT_RETRY;
    }
unlock:
    pte_unmap_unlock(vmf->pte, vmf->ptl);
    return ret;
}
  • 如果pmd里面的值为空(pmd_none),说明pte页表还不存在,于是把vmf->pte设置为NULL
  • 判断此vma是否是匿名页,通过判断vma→vm_ops是否为NULL,

啥是匿名页:

  • malloc申请的内存
  • stack里申请的内存
  • mmap申请的匿名的内存映射

以上三种都属于匿名页

  • 很明显我们是malloc申请的内存,就会走到匿名页里面去
  • 如果不是匿名页,那就是有文件背景的页,就是和映射的时候有对应的实体,比如磁盘中的文件
  • !pte_present(vmf→orig_pte):页表项存在,但对应的页不在内存中,说明被swap出去了,需要通过do_swap_page换回来
  • 如果是写操作引起的缺页(flags带有FAULT_FLAG_WRITE):页表项不可写时走do_wp_page,否则给页表项置上脏页flag
  • pte_mkyoung(entry); 意思是页表刚刚访问过,比较young
  • 设置访问权限,更新mmu cache等
原创文章 204 获赞 129 访问量 38万+

猜你喜欢

转载自blog.csdn.net/longwang155069/article/details/105808155