static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
                            bool force_refill)
{
    int batchcount;
    struct kmem_list3 *l3;
    struct array_cache *ac;
    int node;

    check_irq_off();
    node = numa_mem_id();
    if (unlikely(force_refill))
        goto force_grow;
retry:
    ac = cpu_cache_get(cachep);
    batchcount = ac->batchcount;
    if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
        /*
         * If there was little recent activity on this cache, then
         * perform only a partial refill. Otherwise we could generate
         * refill bouncing.
         */
        batchcount = BATCHREFILL_LIMIT;
    }
    l3 = cachep->nodelists[node];

    BUG_ON(ac->avail > 0 || !l3);
    spin_lock(&l3->list_lock);

    /* See if we can refill from the shared array */
    if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
        l3->shared->touched = 1;
        goto alloc_done;
    }

    while (batchcount > 0) {
        struct list_head *entry;
        struct slab *slabp;
        /* Get slab alloc is to come from. */
        entry = l3->slabs_partial.next;
        if (entry == &l3->slabs_partial) {
            l3->free_touched = 1;
            entry = l3->slabs_free.next;
            if (entry == &l3->slabs_free)
                goto must_grow;
        }

        slabp = list_entry(entry, struct slab, list);
        check_slabp(cachep, slabp);
        check_spinlock_acquired(cachep);

        /*
         * The slab was either on partial or free list so
         * there must be at least one object available for
         * allocation.
         */
        BUG_ON(slabp->inuse >= cachep->num);

        while (slabp->inuse < cachep->num && batchcount--) {
            STATS_INC_ALLOCED(cachep);
            STATS_INC_ACTIVE(cachep);
            STATS_SET_HIGH(cachep);

            ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
                                node));
        }
        check_slabp(cachep, slabp);

        /* move slabp to correct slabp list: */
        list_del(&slabp->list);
        if (slabp->free == BUFCTL_END)
            list_add(&slabp->list, &l3->slabs_full);
        else
            list_add(&slabp->list, &l3->slabs_partial);
    }

must_grow:
    l3->free_objects -= ac->avail;
alloc_done:
    spin_unlock(&l3->list_lock);

    if (unlikely(!ac->avail)) {
        int x;
force_grow:
        x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);

        /* cache_grow can reenable interrupts, then ac could change. */
        ac = cpu_cache_get(cachep);
        node = numa_mem_id();

        /* no objects in sight? abort */
        if (!x && (ac->avail == 0 || force_refill))
            return NULL;

        if (!ac->avail)   /* objects refilled by interrupt? */
            goto retry;
    }
    ac->touched = 1;

    return ac_get_obj(cachep, ac, flags, force_refill);
}
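Stripped of the locking and debug checks, the refill path is simple: the per-CPU array is a LIFO stack of object pointers, and a refill moves up to batchcount free objects from the node's lists into that stack. Here is a minimal userspace sketch of that idea; all the toy_* names are made up for illustration and are not kernel API:

#include <stdio.h>
#include <stdlib.h>

/* Toy model of struct array_cache: a bounded LIFO stack of object pointers. */
struct toy_array_cache {
    unsigned int avail;      /* objects currently in the stack */
    unsigned int limit;      /* capacity of entry[] */
    unsigned int batchcount; /* how many objects to move per refill */
    void *entry[];           /* most recently freed object is on top */
};

/* Stand-in for the node's slabs_partial/slabs_free lists: a flat pool. */
static void *free_pool[64];
static unsigned int free_objects = 64;

/* Like cache_alloc_refill(): move up to batchcount objects into the stack. */
static unsigned int toy_refill(struct toy_array_cache *ac)
{
    unsigned int batch = ac->batchcount;

    while (batch-- && free_objects && ac->avail < ac->limit)
        ac->entry[ac->avail++] = free_pool[--free_objects];
    return ac->avail; /* 0 means the node is empty: cache_grow() time */
}

/* Allocation: pop the top of the stack, refilling it first if empty. */
static void *toy_alloc(struct toy_array_cache *ac)
{
    if (!ac->avail && !toy_refill(ac))
        return NULL;
    return ac->entry[--ac->avail];
}

int main(void)
{
    struct toy_array_cache *ac;

    for (unsigned int i = 0; i < 64; i++)
        free_pool[i] = &free_pool[i]; /* dummy objects */

    ac = calloc(1, sizeof(*ac) + 120 * sizeof(void *));
    if (!ac)
        return 1;
    ac->limit = 120;
    ac->batchcount = 60;

    toy_alloc(ac); /* first alloc triggers a batch refill of 60 objects */
    printf("avail after first alloc: %u\n", ac->avail); /* prints 59 */
    free(ac);
    return 0;
}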
This fragment is the tail of the boot-time cpu-cache setup: until the allocator is fully up, each cache gets only a minimal per-CPU array of BOOT_CPUCACHE_ENTRIES entries and a batchcount of 1.

    cpu_cache_get(cachep)->avail = 0;
    cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
    cpu_cache_get(cachep)->batchcount = 1;
    cpu_cache_get(cachep)->touched = 0;
    cachep->batchcount = 1;
    cachep->limit = BOOT_CPUCACHE_ENTRIES;
    return 0;
And this fragment is the allocation fastpath in ____cache_alloc():

    ac = cpu_cache_get(cachep);
    if (likely(ac->avail)) {
        ac->touched = 1;
        objp = ac_get_obj(cachep, ac, flags, false);

        /*
         * Allow for the possibility all avail objects are not allowed
         * by the current flags
         */
        if (objp) {
            STATS_INC_ALLOCHIT(cachep);
            goto out;
        }
        force_refill = true;
    }
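Note that ac_get_obj() can return NULL even when ac->avail is non-zero: objects taken from pfmemalloc (memory-reserve) pages are handed out only to requests that are themselves entitled to dip into reserves, which is why the caller falls back to force_refill. A rough model of that filtering follows; it is a simplification of the real __ac_get_obj() logic, and the toy_* names are illustrative only:

#include <stdbool.h>
#include <stddef.h>

struct toy_obj {
    bool from_reserve; /* models an object backed by a pfmemalloc page */
};

struct toy_ac {
    unsigned int avail;
    struct toy_obj *entry[8];
};

/*
 * Like ac_get_obj(): refuse to hand out a reserve-backed object to a
 * request that is not itself allowed to use memory reserves.
 */
static struct toy_obj *toy_get_obj(struct toy_ac *ac, bool may_use_reserve)
{
    if (!ac->avail)
        return NULL;
    if (ac->entry[ac->avail - 1]->from_reserve && !may_use_reserve)
        return NULL; /* caller reacts by setting force_refill = true */
    return ac->entry[--ac->avail];
}

int main(void)
{
    struct toy_obj reserve_obj = { .from_reserve = true };
    struct toy_ac ac = { .avail = 1, .entry = { &reserve_obj } };

    /* an ordinary request gets NULL here despite avail == 1 */
    return toy_get_obj(&ac, false) == NULL ? 0 : 1;
}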
It later turned out I simply hadn't read all of the code. Consider this function, which is called after kmem_cache_init has finished initializing:
void __init kmem_cache_init_late(void)
{
    struct kmem_cache *cachep;

    slab_state = UP;

    /* 6) resize the head arrays to their final sizes */
    mutex_lock(&slab_mutex);
    list_for_each_entry(cachep, &slab_caches, list)
        if (enable_cpucache(cachep, GFP_NOWAIT))
            BUG();
    mutex_unlock(&slab_mutex);

    /* Annotate slab for lockdep -- annotate the malloc caches */
    init_lock_keys();

    /* Done! */
    slab_state = FULL;

    /*
     * Register a cpu startup notifier callback that initializes
     * cpu_cache_get for all new cpus
     */
    register_cpu_notifier(&cpucache_notifier);

#ifdef CONFIG_NUMA
    /*
     * Register a memory hotplug callback that initializes and frees
     * nodelists.
     */
    hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
#endif

    /*
     * The reap timers are started later, with a module init call: That part
     * of the kernel is not yet operational.
     */
}
/* Called with slab_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
    int err;
    int limit = 0;
    int shared = 0;
    int batchcount = 0;

    if (!is_root_cache(cachep)) {
        struct kmem_cache *root = memcg_root_cache(cachep);
        limit = root->limit;
        shared = root->shared;
        batchcount = root->batchcount;
    }

    if (limit && shared && batchcount)
        goto skip_setup;
    /*
     * The head array serves three purposes:
     * - create a LIFO ordering, i.e. return objects that are cache-warm
     * - reduce the number of spinlock operations.
     * - reduce the number of linked list operations on the slab and
     *   bufctl chains: array operations are cheaper.
     * The numbers are guessed, we should auto-tune as described by
     * Bonwick.
     */
    if (cachep->size > 131072)  /* objects larger than 128K get limit = 1 */
        limit = 1;
    else if (cachep->size > PAGE_SIZE)
        limit = 8;
    else if (cachep->size > 1024)
        limit = 24;
    else if (cachep->size > 256)
        limit = 54;
    else
        limit = 120;

    /*
     * CPU bound tasks (e.g. network routing) can exhibit cpu bound
     * allocation behaviour: Most allocs on one cpu, most free operations
     * on another cpu. For these cases, an efficient object passing between
     * cpus is necessary. This is provided by a shared array. The array
     * replaces Bonwick's magazine layer.
     * On uniprocessor, it's functionally equivalent (but less efficient)
     * to a larger limit. Thus disabled by default.
     */
    shared = 0;
    if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1) /* shared is 8 on SMP, 0 on a single core */
        shared = 8;

#if DEBUG
    /*
     * With debugging enabled, large batchcount lead to excessively long
     * periods with disabled local interrupts. Limit the batchcount
     */
    if (limit > 32)
        limit = 32;
#endif
    batchcount = (limit + 1) / 2;
skip_setup:
    err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); /* write the tuned values into the cache */
    if (err)
        printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
               cachep->name, -err);
    return err;
}
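The heuristic is small enough to lift out and test on its own. A standalone sketch follows; toy_tune is an illustrative name and PAGE_SIZE is assumed to be 4096 here:

#include <stdio.h>

#define TOY_PAGE_SIZE 4096 /* assumed; the kernel uses the arch PAGE_SIZE */

struct toy_tunables { int limit, batchcount, shared; };

/* reproduce enable_cpucache()'s size-based table */
static struct toy_tunables toy_tune(unsigned int size, int nr_cpus)
{
    struct toy_tunables t;

    if (size > 131072)
        t.limit = 1;
    else if (size > TOY_PAGE_SIZE)
        t.limit = 8;
    else if (size > 1024)
        t.limit = 24;
    else if (size > 256)
        t.limit = 54;
    else
        t.limit = 120;

    t.shared = (size <= TOY_PAGE_SIZE && nr_cpus > 1) ? 8 : 0;
    t.batchcount = (t.limit + 1) / 2;
    return t;
}

int main(void)
{
    /* object sizes taken from the /proc/slabinfo dump shown later */
    unsigned int sizes[] = { 152, 296, 1328, 2048 };

    for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
        struct toy_tunables t = toy_tune(sizes[i], 2);
        printf("size %u -> tunables %d %d %d\n",
               sizes[i], t.limit, t.batchcount, t.shared);
    }
    return 0; /* prints 120 60 8, 54 27 8, 24 12 8, 24 12 8 */
}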
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
                int batchcount, int shared, gfp_t gfp)
{
    int ret;
    struct kmem_cache *c = NULL;
    int i = 0;

    ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); /* tune the cache that was passed in */

    if (slab_state < FULL)
        return ret;

    if ((ret < 0) || !is_root_cache(cachep))
        return ret;

    VM_BUG_ON(!mutex_is_locked(&slab_mutex));
    for_each_memcg_cache_index(i) {
        c = cache_from_memcg(cachep, i);
        if (c)
            /* return value determined by the parent cache only */
            __do_tune_cpucache(c, limit, batchcount, shared, gfp);
    }

    return ret;
}
/* Always called with the slab_mutex held */
static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
                  int batchcount, int shared, gfp_t gfp)
{
    struct ccupdate_struct *new;
    int i;
    /*
     * A note on the structure used here:
     *
     *     struct ccupdate_struct {
     *         struct kmem_cache *cachep;
     *         struct array_cache *new[0];
     *     };
     */

    new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
              gfp); /* new is freed before this function returns; it only serves as a staging area */
    if (!new)
        return -ENOMEM;

    for_each_online_cpu(i) {
        new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
                           batchcount, gfp);
        if (!new->new[i]) {
            for (i--; i >= 0; i--)
                kfree(new->new[i]);
            kfree(new);
            return -ENOMEM;
        }
    }
    new->cachep = cachep;

    on_each_cpu(do_ccupdate_local, (void *)new, 1); /* key point: run do_ccupdate_local on every cpu to install the new arrays */

    check_irq_on();
    cachep->batchcount = batchcount;
    cachep->limit = limit;
    cachep->shared = shared;

    for_each_online_cpu(i) {
        struct array_cache *ccold = new->new[i];
        if (!ccold)
            continue;
        spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
        free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); /* return the old array's cached objects to the node lists */
        spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
        kfree(ccold);
    }
    kfree(new);
    return alloc_kmemlist(cachep, gfp);
}
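The update pattern is worth spelling out: the new arrays are allocated first, then swapped in on each CPU, and only afterwards are the old arrays drained and freed. A minimal single-threaded sketch of that exchange, with all toy_* names made up for illustration:

#include <stdlib.h>

#define TOY_NR_CPUS 4

struct toy_arraycache { int avail, limit; };

struct toy_cache {
    struct toy_arraycache *array[TOY_NR_CPUS]; /* per-"cpu" pointers */
};

struct toy_ccupdate {
    struct toy_cache *cachep;
    struct toy_arraycache *new[TOY_NR_CPUS];
};

/* models do_ccupdate_local() running on cpu `cpu`: exchange the pointers */
static void toy_ccupdate_local(struct toy_ccupdate *cc, int cpu)
{
    struct toy_arraycache *old = cc->cachep->array[cpu];

    cc->cachep->array[cpu] = cc->new[cpu]; /* install the new array */
    cc->new[cpu] = old;                    /* hand the old one back */
}

/* models __do_tune_cpucache(): allocate, swap everywhere, free the old */
static int toy_tune(struct toy_cache *cachep, int limit)
{
    struct toy_ccupdate cc = { .cachep = cachep };

    for (int cpu = 0; cpu < TOY_NR_CPUS; cpu++) {
        cc.new[cpu] = calloc(1, sizeof(*cc.new[cpu]));
        if (!cc.new[cpu])
            return -1; /* the kernel unwinds the earlier allocs here */
        cc.new[cpu]->limit = limit;
    }

    /* the kernel does this via on_each_cpu() with interrupts disabled */
    for (int cpu = 0; cpu < TOY_NR_CPUS; cpu++)
        toy_ccupdate_local(&cc, cpu);

    /* cc.new[] now holds the *old* arrays: drain and free them */
    for (int cpu = 0; cpu < TOY_NR_CPUS; cpu++)
        free(cc.new[cpu]); /* free_block() would drain ->avail first */
    return 0;
}

int main(void)
{
    struct toy_cache cache = { { NULL } };

    return toy_tune(&cache, 120);
}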
static void do_ccupdate_local(void *info)
{
    struct ccupdate_struct *new = info;
    struct array_cache *old;

    check_irq_off();
    old = cpu_cache_get(new->cachep);

    /*
     * new->cachep already points at our cache, so this redirects the
     * cache's per-cpu array to the freshly allocated one...
     */
    new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
    /* ...and hands the old array back to the caller for draining/freeing */
    new->new[smp_processor_id()] = old;
}

The new->new[] arrays were initialized when they were allocated; see alloc_arraycache(), called from the previous function:

static struct array_cache *alloc_arraycache(int node, int entries,
                        int batchcount, gfp_t gfp)
{
    int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
    struct array_cache *nc = NULL;

    nc = kmalloc_node(memsize, gfp, node);
    /*
     * The array_cache structures contain pointers to free object.
     * However, when such objects are allocated or transferred to another
     * cache the pointers are not cleared and they could be counted as
     * valid references during a kmemleak scan. Therefore, kmemleak must
     * not scan such objects.
     */
    kmemleak_no_scan(nc);
    if (nc) {
        nc->avail = 0;
        nc->limit = entries;
        nc->batchcount = batchcount;
        nc->touched = 0;
        spin_lock_init(&nc->lock);
    }
    return nc;
}
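Note why this swap is safe: do_ccupdate_local() runs on each CPU with interrupts disabled (hence check_irq_off()), and each CPU only touches its own array[smp_processor_id()] slot. Since the slab allocation and free fastpaths also run with local interrupts off, nothing on that CPU can interleave with the swap and observe a half-installed array.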
We can look at the slab information an actual running kernel reports:
cat /proc/slabinfo
slabinfo - version: 2.1
# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab> : tunables <limit> <batchcount> <sharedfactor> : slabdata <active_slabs> <num_slabs> <sharedavail>
nf_conntrack_expect 0 0 152 26 1 : tunables 120 60 8 : slabdata 0 0 0
nf_conntrack_8050c5f0 2 26 296 13 1 : tunables 54 27 8 : slabdata 2 2 0
bridge_fdb_cache 4 78 48 78 1 : tunables 120 60 8 : slabdata 1 1 0
fib6_nodes 12 113 32 113 1 : tunables 120 60 8 : slabdata 1 1 0
ip6_dst_cache 25 57 208 19 1 : tunables 120 60 8 : slabdata 3 3 0
ip6_mrt_cache 0 0 112 35 1 : tunables 120 60 8 : slabdata 0 0 0
RAWv6 8 15 720 5 1 : tunables 54 27 8 : slabdata 3 3 0
UDPLITEv6 0 0 688 11 2 : tunables 54 27 8 : slabdata 0 0 0
UDPv6 3 22 688 11 2 : tunables 54 27 8 : slabdata 2 2 0
tw_sock_TCPv6 0 0 144 27 1 : tunables 120 60 8 : slabdata 0 0 0
request_sock_TCPv6 0 0 112 35 1 : tunables 120 60 8 : slabdata 0 0 0
TCPv6 5 6 1328 3 1 : tunables 24 12 8 : slabdata 2 2 0
ubi_wl_entry_slab 463 580 24 145 1 : tunables 120 60 8 : slabdata 4 4 0
sd_ext_cdb 2 113 32 113 1 : tunables 120 60 8 : slabdata 1 1 0
fuse_request 0 0 384 10 1 : tunables 54 27 8 : slabdata 0 0 0
fuse_inode 0 0 416 9 1 : tunables 54 27 8 : slabdata 0 0 0
jffs2_inode_cache 15 145 24 145 1 : tunables 120 60 8 : slabdata 1 1 0
jffs2_node_frag 130 290 24 145 1 : tunables 120 60 8 : slabdata 2 2 0
uid_cache 0 0 48 78 1 : tunables 120 60 8 : slabdata 0 0 0
UNIX 24 32 480 8 1 : tunables 54 27 8 : slabdata 4 4 0
ip_mrt_cache 0 0 96 40 1 : tunables 120 60 8 : slabdata 0 0 0
UDP-Lite 0 0 560 7 1 : tunables 54 27 8 : slabdata 0 0 0
tcp_bind_bucket 6 113 32 113 1 : tunables 120 60 8 : slabdata 1 1 0
inet_peer_cache 8 24 160 24 1 : tunables 120 60 8 : slabdata 1 1 0
ip_fib_trie 7 113 32 113 1 : tunables 120 60 8 : slabdata 1 1 0
ip_fib_alias 8 145 24 145 1 : tunables 120 60 8 : slabdata 1 1 0
ip_dst_cache 6 27 144 27 1 : tunables 120 60 8 : slabdata 1 1 0
PING 0 0 528 7 1 : tunables 54 27 8 : slabdata 0 0 0
RAW 4 7 544 7 1 : tunables 54 27 8 : slabdata 1 1 0
UDP 13 14 560 7 1 : tunables 54 27 8 : slabdata 2 2 0
tw_sock_TCP 0 0 112 35 1 : tunables 120 60 8 : slabdata 0 0 0
request_sock_TCP 0 0 80 48 1 : tunables 120 60 8 : slabdata 0 0 0
TCP 1 6 1184 6 2 : tunables 24 12 8 : slabdata 1 1 0
......
size-2048(DMA) 0 0 2048 2 1 : tunables 24 12 8 : slabdata 0 0 0
size-2048 192 192 2048 2 1 : tunables 24 12 8 : slabdata 96 96 0
size-1024(DMA) 0 0 1024 4 1 : tunables 54 27 8 : slabdata 0 0 0
size-1024 215 216 1024 4 1 : tunables 54 27 8 : slabdata 54 54 0
size-512(DMA) 0 0 512 8 1 : tunables 54 27 8 : slabdata 0 0 0
size-512 601 624 512 8 1 : tunables 54 27 8 : slabdata 78 78 0
size-256(DMA) 0 0 256 15 1 : tunables 120 60 8 : slabdata 0 0 0
size-256 1234 1245 256 15 1 : tunables 120 60 8 : slabdata 83 83 0
size-192(DMA) 0 0 256 15 1 : tunables 120 60 8 : slabdata 0 0 0
size-192 287 300 256 15 1 : tunables 120 60 8 : slabdata 20 20 0
size-128(DMA) 0 0 128 30 1 : tunables 120 60 8 : slabdata 0 0 0
size-128 1890 1890 128 30 1 : tunables 120 60 8 : slabdata 63 63 0
size-96(DMA) 0 0 128 30 1 : tunables 120 60 8 : slabdata 0 0 0
size-96 930 930 128 30 1 : tunables 120 60 8 : slabdata 31 31 0
size-64(DMA) 0 0 128 30 1 : tunables 120 60 8 : slabdata 0 0 0
size-32(DMA) 0 0 128 30 1 : tunables 120 60 8 : slabdata 0 0 0
size-64 1577 1650 128 30 1 : tunables 120 60 8 : slabdata 55 55 0
size-32 6213 6300 128 30 1 : tunables 120 60 8 : slabdata 210 210 0
kmem_cache 150 160 96 40 1 : tunables 120 60 8 : slabdata 4 4 0
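These numbers line up with enable_cpucache()'s table. For example, nf_conntrack_expect has objsize 152 (at most 256 bytes), so its tunables are 120 60 8; nf_conntrack_8050c5f0 (objsize 296, above 256) gets 54 27 8; TCPv6 (objsize 1328, above 1024) gets 24 12 8. In every row, batchcount is (limit + 1) / 2 and the shared factor is 8, matching an SMP system with objects no larger than a page.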
For comparison, mm/slub.c defines this late-init hook as an empty function; SLUB has no head arrays that need a final resizing pass:

void __init kmem_cache_init_late(void)
{
}
SLAB is the foundation on which SLUB and SLOB build.
SLOB targets embedded systems, mainly those with very limited memory (say, under 32MB). It pays little attention to large SMP machines, although there have recently been some small improvements on that front.
The SLUB allocator is intended to replace the slab code. By eliminating the many queues and their associated overhead and simplifying the slab structure, SLUB promises better performance and better system scalability while keeping the existing slab allocator interface.
Having covered all that, let's wrap up with a simple diagram of the slab mechanism:
[Figure: overview of the slab mechanism]