slab分配器的基本思想是:将内核中经常使用的对象放到高速缓存中,并且由系统保持为初始的可利用状态。如果没有基于对象的分配器,内核将花费更多的时间分配,初始化以及释放一个对象。slab分配器的目的是缓存这些空闲的对象,保留基本结构,从而能够重复使用它们。
slab分配器由多个高速缓存组成,它们通过双向循环链表链接在一起,称为高速缓存链表。每个高速缓存包含若干slab,slab由连续的页面帧组成,它们被划分成许多小的块以存放由高速缓存所管理的数据结构和对象。
slab分配器有三个基本目标:
- 减少伙伴系统分配小块内存时所产生的内部碎片。
- 把经常使用的对象缓存起来,减少分配,初始化以及释放对象的时间开销。
- 调整对象以更好地使用L1和L2硬件高速缓存。
1 slab分配器
1.1 slab描述符
高速缓存描述符用结构体 struct kmem_cache定义,在<linux/slab_def.h>中定义如下:
struct kmem_cache {
struct array_cache __percpu *cpu_cache;
/* 1) Cache tunables. Protected by slab_mutex */
unsigned int batchcount;
unsigned int limit;
unsigned int shared;
unsigned int size;
struct reciprocal_value reciprocal_buffer_size;
/* 2) touched by every alloc & free from the backend */
unsigned int flags; /* constant flags */
unsigned int num; /* # of objs per slab */
/* 3) cache_grow/shrink */
/* order of pgs per slab (2^n) */
unsigned int gfporder;
/* force GFP flags, e.g. GFP_DMA */
gfp_t allocflags;
size_t colour; /* cache colouring range */
unsigned int colour_off; /* colour offset */
struct kmem_cache *freelist_cache;
unsigned int freelist_size;
/* constructor func */
void (*ctor)(void *obj);
/* 4) cache creation/removal */
const char *name;
struct list_head list;
int refcount;
int object_size;/
int align;
/* 5) statistics */
#ifdef CONFIG_DEBUG_SLAB
unsigned long num_active;
unsigned long num_allocations;
unsigned long high_mark;
unsigned long grown;
unsigned long reaped;
unsigned long errors;
unsigned long max_freeable;
unsigned long node_allocs;
unsigned long node_frees;
unsigned long node_overflow;
atomic_t allochit;
atomic_t allocmiss;
atomic_t freehit;
atomic_t freemiss;
/*
* If debugging is enabled, then the allocator can add additional
* fields and/or padding to every object. size contains the total
* object size including these internal fields, the following two
* variables contain the offset to the user object and its size.
*/
int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */
#ifdef CONFIG_MEMCG_KMEM
struct memcg_cache_params memcg_params;
#endif
struct kmem_cache_node *node[MAX_NUMNODES];
};
下面对高速缓存描述符的各个字段进行解释:
- cpu_cache:由结构体struct array_cache定义,本地CPU的对象缓冲池,可以加快每个CPU上的分配过程。struct array_cache在<kernel/mm/slab.c>中定义如下:
/*
* struct array_cache
*
* Purpose:
* - LIFO ordering, to hand out cache-warm objects from _alloc
* - reduce the number of linked list operations
* - reduce spinlock operations
*
* The limit is stored in the per-cpu structure to reduce the data cache
* footprint.
*
*/
struct array_cache {
/* 可用数量 */
unsigned int avail;
/* 最大对象数,多余的放回slab */
unsigned int limit;
/* 每次在slab缓存和slab中移入移出的数量 */
unsigned int batchcount;
/* 是否从SLAB缓冲池中分配过内存,从缓冲池移除一个对象时,
* 将touched置1,收缩缓存时,将touched置0
*/
unsigned int touched;
//slab对象地址
void *entry[]; /*
* Must have this definition in here for the proper
* alignment of array_cache. Also simplifies accessing
* the entries.
*
* Entries should not be directly dereferenced as
* entries belonging to slabs marked pfmemalloc will
* have the lower bits set SLAB_OBJ_PFMEMALLOC
*/
};
- batchcount:由slab_mutex保护,当CPU的本地对象缓冲池array_cache为空时,从共享的缓冲池或者slabs_partial、slabs_free列表中获取对象的数目。
- limit:高速缓存中缓存对象数量限制,当本地对象缓冲池的空闲对象数目大于limit时,就会主动释放batchcount个对象,便于内核回收和销毁slab。
- shared:用于SMP,在所有CPU之间灵活机动共享的SLAB个数
- size:slab对象长度大小,不含对齐所需补充的字节
- reciprocal_buffer_size:加快内部计算过程用的临时变量
- flags:描述slab对象属性的标志.
- num:每个slab中的对象个数
- gfporder:扩充或者收缩slab时,分配或者释放的内存order.每个slab占用2^gfporder个连续页面,分配时由伙伴系统提供。
- allocflags:分配内存的GFP标志
- colour:着色范围,一个slab中有几个不同的cache line。
- colour_off:两次着色之间的差,根据缓存行大小和对象对齐大小来计算
- freelist_cache:当kmem_cache位于slab外面时,预先分配的kmem_cache对象指针及其数量
- freelist_size:每个对象要占用1Byte来存放freelist。
- ctor:分配slab时的回调
- name:该高速缓存的名称,可以在proc/slabinfo中查看
- list:通过此字段将对象链接到cache_chain链表
- refcount:引用计数
- object_size:slab对象大小,含对齐所需字节
- align:对齐长度
- node:按节点管理的kmem_cache_node描述符。
1.2 创建slab描述符
函数kmem_cache_create用于创建slab描述符,kmem_cache_create在<mm/slab_comm.c>中实现如下:
struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s;
const char *cache_name;
int err;
get_online_cpus();
get_online_mems();
memcg_get_cache_ids();
mutex_lock(&slab_mutex);
err = kmem_cache_sanity_check(name, size);
if (err) {
s = NULL; /* suppress uninit var warning */
goto out_unlock;
}
/*
* Some allocators will constraint the set of valid flags to a subset
* of all flags. We expect them to define CACHE_CREATE_MASK in this
* case, and we'll just provide them with a sanitized version of the
* passed flags.
*/
flags &= CACHE_CREATE_MASK;
s = __kmem_cache_alias(name, size, align, flags, ctor);
if (s)
goto out_unlock;
cache_name = kstrdup_const(name, GFP_KERNEL);
if (!cache_name) {
err = -ENOMEM;
goto out_unlock;
}
s = do_kmem_cache_create(cache_name, size, size,
calculate_alignment(flags, align, size),
flags, ctor, NULL, NULL);
if (IS_ERR(s)) {
err = PTR_ERR(s);
kfree_const(cache_name);
}
out_unlock:
mutex_unlock(&slab_mutex);
memcg_put_cache_ids();
put_online_mems();
put_online_cpus();
if (err) {
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
name, err);
else {
printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",
name, err);
dump_stack();
}
return NULL;
}
return s;
}
kmem_cache_create参数:
- name:slab描述符名称
- size:缓存对象大小
- align:缓存对象需要对齐的字节数
- flags:分配掩码
- ctor:对象的构造函数
在kmem_cache_create中,函数kmem_cache_sanity_check检查是否已经创建了同名的slab,如果没有创建过同名的slab,就调用函数__kmem_cache_alias查找是否有现成的slab描述符可以复用,若没有就调用do_kmem_cache_create来创建一个新的slab描述符。do_kmem_cache_create的实现如下:
static struct kmem_cache *
do_kmem_cache_create(const char *name, size_t object_size, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *),
struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
struct kmem_cache *s;
int err;
err = -ENOMEM;
s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
if (!s)
goto out;
s->name = name;
s->object_size = object_size;
s->size = size;
s->align = align;
s->ctor = ctor;
err = init_memcg_params(s, memcg, root_cache);
if (err)
goto out_free_cache;
err = __kmem_cache_create(s, flags);
if (err)
goto out_free_cache;
s->refcount = 1;
list_add(&s->list, &slab_caches);
out:
if (err)
return ERR_PTR(err);
return s;
out_free_cache:
destroy_memcg_params(s);
kmem_cache_free(kmem_cache, s);
goto out;
}
do_kmem_cache_create调用kmem_cache_zalloc分配一个slab描述符,然后调用__kmem_cache_create创建一个slab缓存,创建成功后将新创建的slab描述符加入全局slab_caches中。
__kmem_cache_create的实现如下:
int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
size_t left_over, freelist_size;
size_t ralign = BYTES_PER_WORD;
gfp_t gfp;
int err;
size_t size = cachep->size;
/*
* Check that size is in terms of words. This is needed to avoid
* unaligned accesses for some archs when redzoning is used, and makes
* sure any on-slab bufctl's are also correctly aligned.
*/
if (size & (BYTES_PER_WORD - 1)) {
size += (BYTES_PER_WORD - 1);
size &= ~(BYTES_PER_WORD - 1);
}
if (flags & SLAB_RED_ZONE) {
ralign = REDZONE_ALIGN;
/* If redzoning, ensure that the second redzone is suitably
* aligned, by adjusting the object size accordingly. */
size += REDZONE_ALIGN - 1;
size &= ~(REDZONE_ALIGN - 1);
}
/* 3) caller mandated alignment */
if (ralign < cachep->align) {
ralign = cachep->align;
}
/* disable debug if necessary */
if (ralign > __alignof__(unsigned long long))
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
/*
* 4) Store it.
*/
cachep->align = ralign;
if (slab_is_available())
gfp = GFP_KERNEL;
else
gfp = GFP_NOWAIT;
/*
* Determine if the slab management is 'on' or 'off' slab.
* (bootstrapping cannot cope with offslab caches so don't do
* it too early on. Always use on-slab management when
* SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
*/
if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&/* SLAB对象较大 */
!(flags & SLAB_NOLEAKTRACE))
/*
* Size is large, assume best to place the slab management obj
* off-slab (should allow better packing of objs).
*/
flags |= CFLGS_OFF_SLAB;
size = ALIGN(size, cachep->align);
/*
* We should restrict the number of objects in a slab to implement
* byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
*/
if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
left_over = calculate_slab_order(cachep, size, cachep->align, flags);
if (!cachep->num)
return -E2BIG;
freelist_size = calculate_freelist_size(cachep->num, cachep->align);
/*
* If the slab has been placed off-slab, and we have enough space then
* move it on-slab. This is at the expense of any extra colouring.
*/
if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
flags &= ~CFLGS_OFF_SLAB;
left_over -= freelist_size;
}
if (flags & CFLGS_OFF_SLAB) {
/* really off slab. No need for manual alignment */
freelist_size = calculate_freelist_size(cachep->num, 0);
#ifdef CONFIG_PAGE_POISONING
/* If we're going to use the generic kernel_map_pages()
* poisoning, then it's going to smash the contents of
* the redzone and userword anyhow, so switch them off.
*/
if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
}
cachep->colour_off = cache_line_size();
/* Offset must be a multiple of the alignment. */
if (cachep->colour_off < cachep->align)
cachep->colour_off = cachep->align;
cachep->colour = left_over / cachep->colour_off;
cachep->freelist_size = freelist_size;
cachep->flags = flags;
cachep->allocflags = __GFP_COMP;
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
cachep->allocflags |= GFP_DMA;
cachep->size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
if (flags & CFLGS_OFF_SLAB) {
cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
/*
* This is a possibility for one of the kmalloc_{dma,}_caches.
* But since we go off slab only for object size greater than
* PAGE_SIZE/8, and kmalloc_{dma,}_caches get created
* in ascending order,this should not happen at all.
* But leave a BUG_ON for some lucky dude.
*/
BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
}
err = setup_cpu_cache(cachep, gfp);
if (err) {
__kmem_cache_shutdown(cachep);
return err;
}
return 0;
}
__kmem_cache_create中,函数calculate_slab_order用于计算页面分配的order数量,可以容纳多少个obj对象,及每个SLAB中的对象容量,剩下的空间用于cache colour着色。calculate_slab_order的实现如下:
static size_t calculate_slab_order(struct kmem_cache *cachep,
size_t size, size_t align, unsigned long flags)
{
unsigned long offslab_limit;
size_t left_over = 0;
int gfporder;
for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
unsigned int num;
size_t remainder;
cache_estimate(gfporder, size, align, flags, &remainder, &num);
if (!num)
continue;
/* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
if (num > SLAB_OBJ_MAX_NUM)
break;
if (flags & CFLGS_OFF_SLAB) {
size_t freelist_size_per_obj = sizeof(freelist_idx_t);
/*
* Max number of objs-per-slab for caches which
* use off-slab slabs. Needed to avoid a possible
* looping condition in cache_grow().
*/
if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
freelist_size_per_obj += sizeof(char);
offslab_limit = size;
offslab_limit /= freelist_size_per_obj;
if (num > offslab_limit)
break;
}
/* Found something acceptable - save it away */
cachep->num = num;
cachep->gfporder = gfporder;
left_over = remainder;
/*
* A VFS-reclaimable slab tends to have most allocations
* as GFP_NOFS and we really don't want to have to be allocating
* higher-order pages when we are unable to shrink dcache.
*/
if (flags & SLAB_RECLAIM_ACCOUNT)
break;
/*
* Large number of objects is good, but very large slabs are
* currently bad for the gfp()s.
*/
if (gfporder >= slab_max_order)
break;
/*
* Acceptable internal fragmentation?
*/
if (left_over * 8 <= (PAGE_SIZE << gfporder))
break;
}
return left_over;
}
回到__kmem_cache_create中,函数cache_line_size()计算L1 cache行的大小,用left_over除以L1 cache行的大小,就计算出了该slab空闲空间可以容纳多少次着色。
最后,__kmem_cache_create调用setup_cpu_cache函数来继续配置slab描述符。setup_cpu_cache函数实现如下:
static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
if (slab_state >= FULL)
return enable_cpucache(cachep, gfp);
cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1);
if (!cachep->cpu_cache)
return 1;
if (slab_state == DOWN) {
/* Creation of first cache (kmem_cache). */
set_up_node(kmem_cache, CACHE_CACHE);
} else if (slab_state == PARTIAL) {
/* For kmem_cache_node */
set_up_node(cachep, SIZE_NODE);
} else {
int node;
for_each_online_node(node) {
cachep->node[node] = kmalloc_node(
sizeof(struct kmem_cache_node), gfp, node);
BUG_ON(!cachep->node[node]);
kmem_cache_node_init(cachep->node[node]);
}
}
cachep->node[numa_mem_id()]->next_reap =
jiffies + REAPTIMEOUT_NODE +
((unsigned long)cachep) % REAPTIMEOUT_NODE;
cpu_cache_get(cachep)->avail = 0;
cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
cpu_cache_get(cachep)->batchcount = 1;
cpu_cache_get(cachep)->touched = 0;
cachep->batchcount = 1;
cachep->limit = BOOT_CPUCACHE_ENTRIES;
return 0;
}
setup_cpu_cache中,如果slab_state为FULL,即slab已经初始化完成了,就调用enable_cpucache直接分配内存并初始化。
1.3 分配slab对象
kmem_cache_alloc()函数是分配slab缓存对象的核心函数,在slab分配过程中,全程关闭本地中断。kmem_cache_alloc()在<mm/slab.c>中的实现如下:
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *ret = slab_alloc(cachep, flags, _RET_IP_);
trace_kmem_cache_alloc(_RET_IP_, ret,
cachep->object_size, cachep->size, flags);
return ret;
}
kmem_cache_alloc()通过调用slab_alloc()函数进行分配,slab_alloc()函数实现如下:
static __always_inline void *
slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
unsigned long save_flags;
void *objp;
flags &= gfp_allowed_mask;
lockdep_trace_alloc(flags);
if (slab_should_failslab(cachep, flags))
return NULL;
cachep = memcg_kmem_get_cache(cachep, flags);
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
objp = __do_cache_alloc(cachep, flags);
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
flags);
prefetchw(objp);
if (likely(objp)) {
kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
if (unlikely(flags & __GFP_ZERO))
memset(objp, 0, cachep->object_size);
}
memcg_kmem_put_cache(cachep);
return objp;
}
函数slab_alloc()关闭中断后调用函数__do_cache_alloc()进行slab分配,函数__do_cache_alloc()内部调用____cache_alloc()进行slab分配。____cache_alloc()是slab分配的核心函数,如下所示:
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
bool force_refill = false;
check_irq_off();
ac = cpu_cache_get(cachep);
if (likely(ac->avail)) {
ac->touched = 1;
objp = ac_get_obj(cachep, ac, flags, false);
/*
* Allow for the possibility all avail objects are not allowed
* by the current flags
*/
if (objp) {
STATS_INC_ALLOCHIT(cachep);
goto out;
}
force_refill = true;
}
STATS_INC_ALLOCMISS(cachep);
objp = cache_alloc_refill(cachep, flags, force_refill);
/*
* the 'ac' may be updated by cache_alloc_refill(),
* and kmemleak_erase() requires its correct value.
*/
ac = cpu_cache_get(cachep);
out:
/*
* To avoid a false negative, if an object that is in one of the
* per-CPU caches is leaked, we need to make sure kmemleak doesn't
* treat the array pointers as a reference to the object.
*/
if (objp)
kmemleak_erase(&ac->entry[ac->avail]);
return objp;
}