The Elegant Slab Memory Allocator (Part 3): Allocating and Freeing Memory (kmalloc/kfree)

Code Architecture


kmalloc architecture

kmalloc
   |----->return __kmalloc(size, flags);
   |   |----->return __do_kmalloc(size, flags, _RET_IP_);
   |   |   |----->cachep = kmalloc_slab(size, flags);
   |   |   |      Find the best-fitting kmem_cache instance for the requested size.
   |   |   |----->ret = slab_alloc(cachep, flags, caller);
   |   |   |      Allocate an object from the slab.
   |   |   |   |----->objp = __do_cache_alloc(cachep, flags);
   |   |   |   |   |----->return ____cache_alloc(cachep, flags);
   |   |   |   |   |   |----->ac = cpu_cache_get(cachep);
   |   |   |   |   |   |      Get the per-CPU array_cache, which maintains a set of cached objects.
   |   |   |   |   |   |----->objp = ac->entry[--ac->avail];
   |   |   |   |   |   |      If the array_cache has an available object, return it directly;
   |   |   |   |   |   |      otherwise call cache_alloc_refill to refill the array_cache
   |   |   |   |   |   |      and allocate an object from it.
   |   |   |   |   |   |----->objp = cache_alloc_refill(cachep, flags);
   |   |   |   |   |   |      This first tries to take an object from an existing slab; if
   |   |   |   |   |   |      none is available, it gets a new slab from the buddy system
   |   |   |   |   |   |      and allocates from that. See "Analysis of Key Functions".
   |   |   |   |   |   |----->return objp;

We can see that kmalloc allocates a single object whose size is greater than or equal to the requested size, so a fair amount of space can be wasted.
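
As a rough illustration, the kmalloc_slab lookup behaves like rounding the request up to the nearest size class. The sketch below models this with a hypothetical size table patterned after the common kmalloc-32 ... kmalloc-8192 caches; the table and the helper are illustrative assumptions, not the kernel's actual implementation.

#include <stdio.h>

/* Hypothetical size classes modeled on the usual kmalloc-<N> caches. */
static const size_t kmalloc_sizes[] = {
    32, 64, 96, 128, 192, 256, 512, 1024, 2048, 4096, 8192
};

/* Round a request up to the object size kmalloc would really hand back. */
static size_t kmalloc_object_size(size_t size)
{
    for (size_t i = 0; i < sizeof(kmalloc_sizes) / sizeof(kmalloc_sizes[0]); i++)
        if (size <= kmalloc_sizes[i])
            return kmalloc_sizes[i];
    return 0; /* too big for a size class: would go to the page allocator */
}

int main(void)
{
    size_t req = 100;
    size_t obj = kmalloc_object_size(req);

    /* Prints: request=100 object=128 wasted=28 */
    printf("request=%zu object=%zu wasted=%zu\n", req, obj, obj - req);
    return 0;
}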

kfree architecture

kfree
  |----->c = virt_to_cache(objp);
  |      Find the kmem_cache instance from the object's address; see "Analysis of Key Functions".
  |----->__cache_free(c, (void *)objp, _RET_IP_);
  |   |----->___cache_free(cachep, objp, caller);
  |   |   |----->cache_flusharray(cachep, ac);
  |   |   |      If the array_cache has reached its limit, this moves objects from it
  |   |   |      into the shared array cache or back into the slabs; see "Analysis of
  |   |   |      Key Functions".
  |   |   |----->ac->entry[ac->avail++] = objp;
  |   |   |      If the array_cache has not reached its limit, release the object into it.
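
The two fast paths above are symmetric: the per-CPU array_cache acts as a small stack of object pointers, where ____cache_alloc pops from the top and ___cache_free pushes back onto it, so the most recently freed (cache-hot) object is handed out first. Below is a minimal user-space model of that stack with simplified types; it sketches the behavior and is not kernel code.

#include <stdio.h>

/* Simplified stand-in for struct array_cache. */
struct array_cache {
    unsigned int avail;   /* number of cached object pointers */
    unsigned int limit;   /* capacity before a flush is needed */
    void *entry[16];      /* the cached object pointers */
};

/* Fast-path alloc: pop the most recently freed object (LIFO). */
static void *ac_alloc(struct array_cache *ac)
{
    if (ac->avail)                /* hit: no locks, no list walking */
        return ac->entry[--ac->avail];
    return NULL;                  /* miss: would call cache_alloc_refill */
}

/* Fast-path free: stash the object for quick reuse. */
static int ac_free(struct array_cache *ac, void *objp)
{
    if (ac->avail < ac->limit) {
        ac->entry[ac->avail++] = objp;
        return 0;
    }
    return -1;                    /* full: would call cache_flusharray */
}

int main(void)
{
    struct array_cache ac = { .avail = 0, .limit = 16 };
    int x;

    ac_free(&ac, &x);
    /* The pointer just freed is the first one allocated again. */
    printf("%d\n", ac_alloc(&ac) == (void *)&x);   /* prints 1 */
    return 0;
}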

Analysis of Key Functions

cache_alloc_refill

static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
    int batchcount;
    struct kmem_cache_node *n;
    struct array_cache *ac, *shared;
    int node;
    void *list = NULL;
    struct page *page;

    check_irq_off();
    node = numa_mem_id();

    ac = cpu_cache_get(cachep);
    batchcount = ac->batchcount;
    if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
        /*
         * If there was little recent activity on this cache, then
         * perform only a partial refill.  Otherwise we could generate
         * refill bouncing.
         */
        batchcount = BATCHREFILL_LIMIT;
    }
    n = get_node(cachep, node);

    BUG_ON(ac->avail > 0 || !n);
    shared = READ_ONCE(n->shared);
    if (!n->free_objects && (!shared || !shared->avail))
        goto direct_grow;

    spin_lock(&n->list_lock);
    shared = READ_ONCE(n->shared);

    /* See if we can refill from the shared array */
    if (shared && transfer_objects(ac, shared, batchcount)) {
        shared->touched = 1;
        goto alloc_done;
    }

    while (batchcount > 0) {
        /* Get slab alloc is to come from. */
        page = get_first_slab(n, false);
        if (!page)
            goto must_grow;

        check_spinlock_acquired(cachep);

        batchcount = alloc_block(cachep, ac, page, batchcount);
        fixup_slab_list(cachep, n, page, &list);
    }

must_grow:
    n->free_objects -= ac->avail;
alloc_done:
    spin_unlock(&n->list_lock);
    fixup_objfreelist_debug(cachep, &list);

direct_grow:
    if (unlikely(!ac->avail)) {
        /* Check if we can use obj in pfmemalloc slab */
        if (sk_memalloc_socks()) {
            void *obj = cache_alloc_pfmemalloc(cachep, n, flags);

            if (obj)
                return obj;
        }

        page = cache_grow_begin(cachep, gfp_exact_node(flags), node);

        /*
         * cache_grow_begin() can reenable interrupts,
         * then ac could change.
         */
        ac = cpu_cache_get(cachep);
        if (!ac->avail && page)
            alloc_block(cachep, ac, page, batchcount);
        cache_grow_end(cachep, page);

        if (!ac->avail)
            return NULL;
    }
    ac->touched = 1;

    return ac->entry[--ac->avail];
}

This function refills the array_cache of a kmem_cache instance. The processing is as follows:

  1. If a shared array_cache exists, call transfer_objects to move batchcount objects from the shared array cache into the per-CPU array_cache.
  2. If there is no shared array cache, or no objects can be obtained from it, try to move batchcount objects from the cache's slabs into the array_cache.
  3. If neither source yields an object, the only option is to get a new slab from the buddy system and move its objects into the array_cache:
    page = cache_grow_begin(cachep, gfp_exact_node(flags), node);
    alloc_block(cachep, ac, page, batchcount);

Two fields here are particularly important and deserve a note (see the transfer sketch below):

  1. ac->batchcount: how many objects can be processed at a time; in this function it is the number of objects moved into the array_cache per refill.
  2. ac->touched: whether the cache has seen recent activity. If it has not been touched, the refill is capped at BATCHREFILL_LIMIT to avoid refill bouncing, as the comment in the code explains.
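
Step 1 of the refill is essentially a bulk pointer copy plus two counter updates. The sketch below mirrors what transfer_objects does, reusing the simplified array_cache type from the earlier model; it approximates the kernel logic and is not the verbatim source.

#include <string.h>

struct array_cache {
    unsigned int avail, limit;
    void *entry[16];
};

/* Move up to max object pointers from 'from' to 'to'; return how many moved. */
static int transfer_objects(struct array_cache *to,
                            struct array_cache *from, unsigned int max)
{
    /* Move at most: what 'from' has, what was asked for, what 'to' can hold. */
    unsigned int nr = from->avail;

    if (nr > max)
        nr = max;
    if (nr > to->limit - to->avail)
        nr = to->limit - to->avail;
    if (!nr)
        return 0;

    /* Bulk-copy the top nr slots of 'from' onto the top of 'to'. */
    memcpy(to->entry + to->avail, from->entry + from->avail - nr,
           sizeof(void *) * nr);
    from->avail -= nr;
    to->avail += nr;
    return nr;
}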

virt_to_cache

virt_to_cache calls virt_to_head_page(obj) to find the (possibly compound) page holding the object, and page->slab_cache points to the kmem_cache instance. So the most important thing here is understanding how to get from an object's address to its page.

static inline struct page *compound_head(struct page *page)
{
    unsigned long head = READ_ONCE(page->compound_head);

    /* Bit 0 set marks a tail page: the remaining bits hold the
     * address of the head page. */
    if (unlikely(head & 1))
        return (struct page *) (head - 1);
    return page;
}

static inline struct page *virt_to_head_page(const void *x)
{
    struct page *page = virt_to_page(x);

    return compound_head(page);
}

The most important thing here is understanding page->compound_head. From the buddy system we know that pages are allocated and managed by order, so a group of contiguous pages can form a compound page, and in every page of that group the compound_head member points to the first (head) page. The flow has three steps:

  1. Use the object's address to find the page it lives in (virt_to_page).
  2. From that page, find the head of its page group (compound_head); a runnable model follows this list.
  3. Read page->slab_cache from the head page to get the kmem_cache instance.
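
Since struct page is word-aligned, bit 0 of a page address is always zero, so the kernel reuses it in compound_head as a "this is a tail page" flag. Here is a runnable user-space model of that encoding, with struct page reduced to the one field that matters:

#include <stdio.h>

/* Minimal model: in a tail page, compound_head holds the head page's
 * address with bit 0 set; in a head (or standalone) page, bit 0 is clear. */
struct page {
    unsigned long compound_head;
};

static struct page *compound_head(struct page *page)
{
    unsigned long head = page->compound_head;

    if (head & 1)
        return (struct page *)(head - 1);  /* tail: strip the flag bit */
    return page;                           /* already the head */
}

int main(void)
{
    struct page pages[4] = {{ 0 }};        /* an order-2 "compound page" */

    /* Point every tail page at the head, tagging bit 0. */
    for (int i = 1; i < 4; i++)
        pages[i].compound_head = (unsigned long)&pages[0] | 1;

    /* Any page in the group resolves back to the head. */
    printf("%d\n", compound_head(&pages[3]) == &pages[0]);   /* prints 1 */
    return 0;
}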

cache_flusharray

static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
{
    int batchcount;
    struct kmem_cache_node *n;
    int node = numa_mem_id();
    LIST_HEAD(list);

    batchcount = ac->batchcount;

    check_irq_off();
    n = get_node(cachep, node);
    spin_lock(&n->list_lock);
    if (n->shared) {
        struct array_cache *shared_array = n->shared;
        int max = shared_array->limit - shared_array->avail;
        if (max) {
            if (batchcount > max)
                batchcount = max;
            memcpy(&(shared_array->entry[shared_array->avail]),
                   ac->entry, sizeof(void *) * batchcount);
            shared_array->avail += batchcount;
            goto free_done;
        }
    }

    free_block(cachep, ac->entry, batchcount, node, &list);
free_done:
#if STATS
    {
        int i = 0;
        struct page *page;

        list_for_each_entry(page, &n->slabs_free, lru) {
            BUG_ON(page->active);

            i++;
        }
        STATS_SET_FREEABLE(cachep, i);
    }
#endif
    spin_unlock(&n->list_lock);
    slabs_destroy(cachep, &list);
    ac->avail -= batchcount;
    memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
}

This function moves objects from the array_cache into the shared array cache or back into the slabs. The flow is as follows (a sketch of the final compaction step follows this list):

  1. If a shared array cache exists and has not reached its limit, copy batchcount objects into it (capped by the remaining room).
  2. Otherwise call free_block to return batchcount objects to their slabs; see the analysis below.
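
One detail worth noting: the flush takes the batchcount oldest entries from the bottom of the array (entry[0 .. batchcount-1]) and then compacts the survivors with memmove, so the most recently freed, cache-hot objects stay in the per-CPU cache. A minimal sketch of that compaction, again with a simplified array_cache:

#include <string.h>

struct array_cache {
    unsigned int avail;
    void *entry[16];
};

/* After the batchcount oldest entries have been flushed out of
 * entry[0..batchcount-1], slide the remaining hot entries to the front. */
static void ac_compact(struct array_cache *ac, unsigned int batchcount)
{
    ac->avail -= batchcount;
    memmove(ac->entry, &ac->entry[batchcount], sizeof(void *) * ac->avail);
}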

free_block

static void free_block(struct kmem_cache *cachep, void **objpp,
            int nr_objects, int node, struct list_head *list)
{
    int i;
    struct kmem_cache_node *n = get_node(cachep, node);
    struct page *page;

    n->free_objects += nr_objects;

    for (i = 0; i < nr_objects; i++) {
        void *objp;
        struct page *page;

        objp = objpp[i];

        page = virt_to_head_page(objp);
        list_del(&page->lru);
        check_spinlock_acquired_node(cachep, node);
        slab_put_obj(cachep, page, objp);
        STATS_DEC_ACTIVE(cachep);

        /* fixup slab chains */
        if (page->active == 0)
            list_add(&page->lru, &n->slabs_free);
        else {
            /* Unconditionally move a slab to the end of the
             * partial list on free - maximum time for the
             * other objects to be freed, too.
             */
            list_add_tail(&page->lru, &n->slabs_partial);
        }
    }

    while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
        n->free_objects -= cachep->num;

        page = list_last_entry(&n->slabs_free, struct page, lru);
        list_move(&page->lru, list);
        n->num_slabs--;
    }
}

This function returns objects to their slabs. The flow is as follows (a counting model of step 2 follows this list):

  1. Put the objects back into their slabs one at a time. If a slab then has no objects in use (page->active == 0), it goes onto the slabs_free list; otherwise it is moved to the tail of slabs_partial.
  2. If the node's free objects exceed the limit (n->free_limit) and slabs_free is not empty, fully free slabs are detached onto list; the caller later hands them to slabs_destroy, returning their pages to the buddy system.
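
The trim loop at the end of free_block only updates counters and list membership; the detached slabs' pages are actually freed later by slabs_destroy. A counting-only user-space model of that loop, with simplified fields standing in for kmem_cache_node (illustrative, not kernel code):

/* Simplified per-node bookkeeping for the trim step in free_block. */
struct node_stats {
    unsigned int free_objects;   /* total free objects on this node */
    unsigned int free_limit;     /* keep at most this many objects free */
    unsigned int free_slabs;     /* slabs currently on slabs_free */
    unsigned int objs_per_slab;  /* cachep->num */
};

/* Return how many fully free slabs would be detached for destruction. */
static unsigned int trim_free_slabs(struct node_stats *n)
{
    unsigned int detached = 0;

    while (n->free_objects > n->free_limit && n->free_slabs > 0) {
        n->free_objects -= n->objs_per_slab;
        n->free_slabs--;
        detached++;   /* free_block moves the page onto 'list' here */
    }
    return detached;
}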

Reprinted from blog.csdn.net/liuhangtiant/article/details/81370065