How Physical Pages Allocated by the Buddy System Become Virtual Addresses

Overview

A follow-up question from 极客时间 (Geek Time) lesson 26, "Kernel-space memory mapping: how do you find the right meeting room?"

Buddy memory allocation is an allocation algorithm that divides memory into partitions and tries to satisfy each request as closely as possible, repeatedly splitting blocks in half to find the best fit.
When the Linux kernel or a user-space process asks for memory, the physical pages handed out by the buddy system have to be turned into virtual addresses before the caller can use them.

The sections below walk through the kernel source to see when that address translation happens for kmalloc and vmalloc in kernel space, and for malloc and mmap in user space.

Kernel space: kmalloc

When kmalloc translates buddy-system pages into virtual addresses:
For requests of at most two pages, __kmalloc goes through the SLUB allocator, which checks the per-cpu freelist, the per-cpu partial list and the per-node partial lists in turn for a free object; if none of them has one, a new slab is allocated from the buddy system, and it is at that point that page_address() converts the new pages into a virtual address.

The flow is as follows:

  1. Requests larger than two pages go straight to the buddy system; anything smaller is handled by the SLUB allocator (which, of course, still gets its memory from the buddy system in the end). A small worked example of the two-page threshold follows the code below.
  • /include/linux/slab.h
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
    ...
    if (size > KMALLOC_MAX_CACHE_SIZE)
        return kmalloc_large(size, flags);
    ...
    return __kmalloc(size, flags);
}
...
#define KMALLOC_MAX_CACHE_SIZE (1UL << KMALLOC_SHIFT_HIGH)
/*
 * SLUB directly allocates requests fitting in to an order-1 page
 * (PAGE_SIZE*2).  Larger requests are passed to the page allocator.
 */
#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
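
For a quick sanity check of that threshold, here is a small user-space sketch (my assumption: 4 KiB pages, i.e. PAGE_SHIFT = 12, as on a default x86-64 build) that simply mirrors the macros above:

#include <stdio.h>

/* Mirrors the kernel macros above, for illustration only. */
#define PAGE_SHIFT             12                        /* assumed: 4 KiB pages */
#define KMALLOC_SHIFT_HIGH     (PAGE_SHIFT + 1)
#define KMALLOC_MAX_CACHE_SIZE (1UL << KMALLOC_SHIFT_HIGH)

int main(void)
{
    /* Prints 8192: requests up to two pages stay inside SLUB,
     * anything larger goes to kmalloc_large()/the page allocator. */
    printf("KMALLOC_MAX_CACHE_SIZE = %lu\n", KMALLOC_MAX_CACHE_SIZE);
    return 0;
}
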
  2. slab_alloc is then called to enter the actual allocation path.
void *__kmalloc(size_t size, gfp_t flags)
{
    struct kmem_cache *s;
    void *ret;

    if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
        return kmalloc_large(size, flags);

    s = kmalloc_slab(size, flags);

    if (unlikely(ZERO_OR_NULL_PTR(s)))
        return s;

    ret = slab_alloc(s, flags, _RET_IP_);

    trace_kmalloc(_RET_IP_, ret, size, s->size, flags);

    ret = kasan_kmalloc(s, ret, size, flags);

    return ret;
}
  3. The fast path first looks in the per-cpu freelist:
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
        gfp_t gfpflags, int node, unsigned long addr)
{
    ...
redo:
...
    do {
        tid = this_cpu_read(s->cpu_slab->tid);
        c = raw_cpu_ptr(s->cpu_slab);
    } while (IS_ENABLED(CONFIG_PREEMPTION) &&
        unlikely(tid != READ_ONCE(c->tid)));
    ...
    object = c->freelist;
    page = c->page;
    ...
    if (unlikely(!object || !node_match(page, node))) {
        object = __slab_alloc(s, gfpflags, node, addr, c);
        stat(s, ALLOC_SLOWPATH);
    } else {
    ...
    }

    ...
    return object;
}

  4. Next it looks in the per-cpu partial list:
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
    unsigned long addr, struct kmem_cache_cpu *c)
{
    ...
    new_slab:
    if (slub_percpu_partial(c)) {
        page = c->page = slub_percpu_partial(c);
        slub_set_percpu_partial(c, page);
        ...
    }
    ...
}
  5. Then the per-node partial lists are tried:
static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
            int node, struct kmem_cache_cpu **pc)
{
    void *freelist;
    struct kmem_cache_cpu *c = *pc;
    struct page *page;

    WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));

    freelist = get_partial(s, flags, node, c);
    ...
}

/*
 * Get a partial page, lock it and return it.
 */
static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
   struct kmem_cache_cpu *c)
{
    void *object;
    int searchnode = node;

    if (node == NUMA_NO_NODE)
        searchnode = numa_mem_id();

    object = get_partial_node(s, get_node(s, searchnode), c, flags);
    if (object || node != NUMA_NO_NODE)
       return object;

     return get_any_partial(s, flags, c);
}
  6. The lookups above go through the local caches one after another: per-cpu freelist, per-cpu partial, then the per-node partial lists. The very first slab, however, has to come from the buddy system, so let's look at that path with the address translation in mind: when a new slab is allocated, page_address() converts the physical pages into a virtual address.
    > If CONFIG_SLAB_FREELIST_RANDOM is enabled, the order of objects on the freelist is shuffled ("randomize free memory") to make heap overflows harder to predict and to help with cache conflicts; the corresponding shuffle_freelist() works on the same page_address()-translated addresses.
static inline void *new_slab_objects(struct kmem_cache *s, 
          gfp_t flags, int node, struct kmem_cache_cpu **pc)
{

...
    page = new_slab(s, flags, node);
    if (page) {
       ...
    }
    return freelist;
}

static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
    ...
    return allocate_slab(s,
        flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
}

static struct page *allocate_slab(struct kmem_cache *s,
 gfp_t flags, int node)
{

    page = alloc_slab_page(s, alloc_gfp, node, oo);

    start = page_address(page);

    shuffle = shuffle_freelist(s, page);

    if (!shuffle) {
        ...
        page->freelist = start;
        ...
    }
    ...
    return page;
}

static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
{
    ...
    start = fixup_red_left(s, page_address(page));
    ...
    cur = next_freelist_entry(s, page, &pos, start, page_limit,
    freelist_count);
    cur = setup_object(s, page, cur);
    page->freelist = cur;
    ...
    return true;
}

When __kmalloc is asked for more than two pages, the pages are allocated straight from the buddy system, and here we meet the familiar order parameter:

void *__kmalloc(size_t size, gfp_t flags)
{
    ...
    if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
        return kmalloc_large(size, flags);
    ...

}

static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
{
    unsigned int order = get_order(size);
    return kmalloc_order_trace(size, flags, order);
}

static __always_inline void *
kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
{
    return kmalloc_order(size, flags, order);
}

Once the pages are allocated, page_address() produces the virtual address that is returned:

void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
{
    ...
    page = alloc_pages(flags, order);
    if (likely(page)) {
        ret = page_address(page);
        ...
    }
    ...
    return ret;
}
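
To get a feel for what order means here, below is a user-space sketch of the rounding that get_order() performs (my_get_order is my own simplified helper, assuming 4 KiB pages; the real kernel implementation uses bit operations but yields the same value):

#include <stdio.h>

/* Illustration of get_order(): smallest order such that
 * (1 << order) pages, i.e. (1 << order) * 4096 bytes, covers size. */
static unsigned int my_get_order(unsigned long size)   /* hypothetical helper */
{
    unsigned long pages = (size + 4095) / 4096;
    unsigned int order = 0;

    while ((1UL << order) < pages)
        order++;
    return order;
}

int main(void)
{
    printf("%u\n", my_get_order(8193));       /* 3 pages needed -> order 2 (4 pages) */
    printf("%u\n", my_get_order(128 * 1024)); /* 32 pages needed -> order 5          */
    return 0;
}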

In the same way, the caches behind kmem_cache_create / kmem_cache_alloc perform this translation once SLUB has taken pages from the buddy system.

Kernel space: vmalloc

Unlike kmalloc, which gets physically contiguous memory from the buddy system, vmalloc may back a request with discontiguous pages: they are requested one page at a time, i.e. with order = 0.

The virtual addresses handed out by vmalloc lie in the range VMALLOC_START ~ VMALLOC_END. Since the backing pages are not physically contiguous, the address of each individual page has to be worked out on its own, as the sketch below shows.

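A minimal kernel-side sketch of that relationship (illustrative only; vmalloc, vmalloc_to_page, page_to_pfn and vfree are existing kernel APIs, the demo function itself is made up):

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/printk.h>

static void vmalloc_pages_demo(void)      /* hypothetical helper, illustration only */
{
    void *buf = vmalloc(4 * PAGE_SIZE);   /* four pages, each an order-0 allocation */
    struct page *pg;

    if (!buf)
        return;

    /* Page i of the area sits virtually at buf + i * PAGE_SIZE, but its
     * backing frame was allocated independently; vmalloc_to_page() walks
     * the kernel page table to recover the struct page for that address. */
    pg = vmalloc_to_page(buf + PAGE_SIZE);
    pr_info("vaddr=%px pfn=%lx\n", buf + PAGE_SIZE, page_to_pfn(pg));

    vfree(buf);
}
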
  1. vmalloc ends up calling __vmalloc_node_range, which spells out the start and end of the virtual address range explicitly.

/mm/vmalloc.c

void *__vmalloc(unsigned long size, gfp_t gfp_mask)
{
    return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
           __builtin_return_address(0));
}
...
void *__vmalloc_node(unsigned long size, unsigned long align,
       gfp_t gfp_mask, int node, const void *caller)
{
     return __vmalloc_node_range(size, align, 
            VMALLOC_START, VMALLOC_END,
            gfp_mask, PAGE_KERNEL, 0, node, caller);
}
  2. Two steps follow:
    (1) __get_vm_area_node allocates a struct vmap_area, i.e. the virtual range;
    (2) __vmalloc_area_node allocates the physical pages and builds the mapping.

/mm/vmalloc.c

void *__vmalloc_node_range(unsigned long size, unsigned long align,
        unsigned long start, unsigned long end, gfp_t gfp_mask,
        pgprot_t prot, unsigned long vm_flags, int node,
        const void *caller)
{
    ...

    area = __get_vm_area_node(real_size, align,
           VM_ALLOC | VM_UNINITIALIZED | vm_flags,
           start, end, node, gfp_mask, caller);
    ...

    addr = __vmalloc_area_node(area, gfp_mask, prot, node);

    ...
}
  3. What __get_vm_area_node does:

kzalloc_node uses kmalloc to get the memory for the struct vm_struct itself; the structure is well under two pages, so this goes through SLUB.

alloc_vmap_area then reserves a region of virtual address space.

This function was reworked in kernel 5.2: vmap_area objects now come from a dedicated kmem_cache, and instead of scanning for holes between existing areas, a red-black tree of free areas augmented with subtree_max_size is searched for the best-fitting free region.

static struct vm_struct *__get_vm_area_node(unsigned long size,
        unsigned long align, unsigned long flags, unsigned long start,
        unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
    struct vmap_area *va;
    struct vm_struct *area;
    unsigned long requested_size = size;

    BUG_ON(in_interrupt());
    size = PAGE_ALIGN(size);
    if (unlikely(!size))
       return NULL;

    if (flags & VM_IOREMAP)
        align = 1ul << clamp_t(int, get_count_order_long(size),
                   PAGE_SHIFT, IOREMAP_MAX_ORDER);

    area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
    if (unlikely(!area))
        return NULL;

    if (!(flags & VM_NO_GUARD))
        size += PAGE_SIZE;

    va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
    if (IS_ERR(va)) {
        kfree(area);
        return NULL;
    }

    kasan_unpoison_vmalloc((void *)va->va_start, requested_size);

    setup_vmalloc_vm(area, va, flags, caller);

    return area;
}

So the lookup changed from walking the busy red-black tree looking for suitable holes to searching the free-area tree directly.

static struct vmap_area *alloc_vmap_area(unsigned long size,
         unsigned long align,
         unsigned long vstart, unsigned long vend,
         int node, gfp_t gfp_mask)
{
    ...
    va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
    ...
    addr = __alloc_vmap_area(size, align, vstart, vend);
    ...
    va->va_start = addr;
    va->va_end = addr + size;
    ...
    return va;
    ...
    return ERR_PTR(-EBUSY);
}

The added subtree_max_size field, which records the largest free block anywhere below a node, is what lets the search find the lowest suitable free area quickly.

(I have not fully understood this part yet.)


struct vmap_area {
    ...
    union {
        unsigned long subtree_max_size; /* in "free" tree */
        struct vm_struct *vm;           /* in "busy" tree */
        struct llist_node purge_list;   /* in purge list */
    };
};
...
static __always_inline unsigned long
__alloc_vmap_area(unsigned long size, unsigned long align,
    unsigned long vstart, unsigned long vend)
{
    ...
    va = find_vmap_lowest_match(size, align, vstart);
    ...
}

static __always_inline struct vmap_area *
find_vmap_lowest_match(unsigned long size,
    unsigned long align, unsigned long vstart)
{
    struct vmap_area *va;
    struct rb_node *node;
    unsigned long length;

    /* Start at the root of the tree of free areas. */
    node = free_vmap_area_root.rb_node;
    length = size + align - 1;

    while (node) {
        va = rb_entry(node, struct vmap_area, rb_node);

        /* Prefer the left subtree (lower addresses) as long as
         * subtree_max_size says it can hold a big enough area. */
        if (get_subtree_max_size(node->rb_left) >= length &&
                vstart < va->va_start) {
            node = node->rb_left;
        } else {
            /* Otherwise try this node itself ... */
            if (is_within_this_va(va, size, align, vstart))
                return va;

            /* ... then the right subtree, if it can hold the request ... */
            if (get_subtree_max_size(node->rb_right) >= length) {
                node = node->rb_right;
                continue;
            }

            /* ... and finally climb back up until a usable node or a
             * promising right subtree is found. */
            while ((node = rb_parent(node))) {
                va = rb_entry(node, struct vmap_area, rb_node);
                if (is_within_this_va(va, size, align, vstart))
                    return va;

                if (get_subtree_max_size(node->rb_right) >= length &&
                        vstart <= va->va_start) {
                    node = node->rb_right;
                    break;
                }
            }
        }
    }

    return NULL;
}

  4. With a free region reserved between VMALLOC_START and VMALLOC_END, __vmalloc_area_node allocates the physical pages behind it.

Note the recursion here: the pages[] array inside the vm_struct, which records every allocated page, needs memory of its own; if that array does not fit in a single page it is allocated with __vmalloc_node again (otherwise with kmalloc_node), so the size of the array decides whether the recursive path is taken. The data pages themselves are then allocated one at a time.

struct vm_struct {
    ...
    struct page    **pages;
    unsigned int    nr_pages;
    ...
};

static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                 pgprot_t prot, int node)
{
    nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
    array_size = (nr_pages * sizeof(struct page *));
    if (array_size > PAGE_SIZE) {
        pages = __vmalloc_node(array_size, 1, nested_gfp|
        highmem_mask, node, area->caller);
    } else {
        pages = kmalloc_node(array_size, nested_gfp, node);
    }

    area->pages = pages;
    area->nr_pages = nr_pages;

    for (i = 0; i < area->nr_pages; i++) {
        struct page *page;

        if (node == NUMA_NO_NODE)
            page = alloc_page(alloc_mask|highmem_mask);
        else
            page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);

         ...
         area->pages[i] = page;
         ...
    }

     ...
    if (map_kernel_range((unsigned long)area->addr,
     get_vm_area_size(area), prot, pages) < 0)
      goto fail;

    return area->addr;
fail:
    return NULL;
}

map_kernel_range then fills in the page-table entries level by level, pgd -> p4d -> pud -> pmd -> pte, starting from init_mm; if page-directory entries were modified along the way, arch_sync_kernel_mappings(start, end) propagates the change.

int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
    struct page **pages)
{
    int ret;

    ret = map_kernel_range_noflush(start, size, prot, pages);
    flush_cache_vmap(start, start + size);
    return ret;
}

int map_kernel_range_noflush(unsigned long addr, unsigned long size, 
pgprot_t prot, struct page **pages)
{
    unsigned long start = addr;
    unsigned long end = addr + size;
    unsigned long next;
    pgd_t *pgd;
    int err = 0;
    int nr = 0;
    pgtbl_mod_mask mask = 0;

    BUG_ON(addr >= end);
    pgd = pgd_offset_k(addr);
    do {
        next = pgd_addr_end(addr, end);
        if (pgd_bad(*pgd))
        mask |= PGTBL_PGD_MODIFIED;
        err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
        if (err)
           return err;
    } while (pgd++, addr = next, addr != end);

    if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
        arch_sync_kernel_mappings(start, end);

    return 0;
}

Here the relevant pgd entry is found by slicing bits out of the virtual address; on x86-64 the pgd index starts at bit 39. A small worked example of this bit-slicing follows the code below.

/include/linux/pgtable.h
#define pgd_offset_k(address)    pgd_offset(&init_mm, (address))
#define pgd_offset(mm, address)    pgd_offset_pgd((mm)->pgd, (address))

static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address)
{
    return (pgd + pgd_index(address));
};

#ifndef pgd_index
#define pgd_index(a)  (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#endif

#define PGDIR_SHIFT        39

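As a worked example of that bit-slicing (user-space illustration; it assumes x86-64 4-level paging, where the p4d level is folded away, each level indexes 512 entries with 9 bits, and 0xffffc90000000000 is the usual VMALLOC_START):

#include <stdio.h>

int main(void)
{
    unsigned long addr = 0xffffc90000001000UL;        /* a typical vmalloc address      */

    printf("pgd index : %lu\n", (addr >> 39) & 511);  /* 402 for the vmalloc range      */
    printf("pud index : %lu\n", (addr >> 30) & 511);
    printf("pmd index : %lu\n", (addr >> 21) & 511);
    printf("pte index : %lu\n", (addr >> 12) & 511);
    printf("offset    : %lu\n", addr & 4095);         /* byte offset inside the page    */
    return 0;
}
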
The directory levels pgd, p4d, pud and pmd are set up one after another, down to the page table itself, where the physical page frame number is finally written into the pte that the virtual address resolves to. That completes the mapping.


static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
        pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask)
{
    ...
    p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
    do {
        next = p4d_addr_end(addr, end);
        if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
            return -ENOMEM;
    } while (p4d++, addr = next, addr != end);
    return 0;
}

static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
        pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask)
{
    ...
    pud = pud_alloc_track(&init_mm, p4d, addr, mask);
    do {
        next = pud_addr_end(addr, end);
        if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
            return -ENOMEM;
    } while (pud++, addr = next, addr != end);
    return 0;
}

static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
        pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask)
{
    ...
    pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
    do {
        next = pmd_addr_end(addr, end);
        if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
            return -ENOMEM;
    } while (pmd++, addr = next, addr != end);
    return 0;
}

static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask)
{
    ...
    pte = pte_alloc_kernel_track(pmd, addr, mask);
    do {
        struct page *page = pages[*nr];
        ...
        set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
        (*nr)++;
    } while (pte++, addr += PAGE_SIZE, addr != end);
    *mask |= PGTBL_PTE_MODIFIED;
    return 0;
}

mk_pte builds the new page-table entry out of the physical page frame number and the protection bits, and set_pte_at() writes it into the page table. The pgprot here is the PAGE_KERNEL that __vmalloc_node passed down; a small numeric illustration follows the code below.

#define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))

static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
    phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
    pfn ^= protnone_mask(pgprot_val(pgprot));
    pfn &= PTE_PFN_MASK;
    return __pte(pfn | check_pgprot(pgprot));
}

static inline pte_t native_make_pte(pteval_t val)
{
    return (pte_t) { .pte = val };
}

...

#define set_pte_at(mm, addr, ptep, pte)    \
                 native_set_pte_at(mm, addr, ptep, pte)

static inline void native_set_pte_at(struct mm_struct *mm, 
unsigned long addr, pte_t *ptep , pte_t pte)
{
    native_set_pte(ptep, pte);
}

static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
    WRITE_ONCE(*ptep, pte);
}
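
Numerically, the resulting entry is just pfn << PAGE_SHIFT with the protection bits OR'ed in (the low bits plus the NX bit at the top). A tiny illustration with an assumed frame number and an assumed PAGE_KERNEL-like value:

#include <stdio.h>

int main(void)
{
    unsigned long pfn  = 0x1a2b3;                /* assumed physical frame number          */
    unsigned long prot = 0x8000000000000163UL;   /* assumed PAGE_KERNEL-style bits:
                                                    NX | GLOBAL | DIRTY | ACCESSED | RW | P */
    unsigned long pte  = (pfn << 12) | prot;     /* what pfn_pte()/mk_pte() boil down to   */

    printf("pte = %#lx\n", pte);                 /* 0x800000001a2b3163 */
    return 0;
}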

So vmalloc only uses SLUB (and through it the buddy system) for its bookkeeping structures; the data pages are allocated from the buddy system directly, one page at a time, and then mapped as shown above.

  5. A few recent changes:

Recent kernels removed the vmalloc page-fault handling: the *_alloc_track helpers (p4d_alloc_track, pud_alloc_track, ...) record changes to the page directories and page tables, and arch_sync_kernel_mappings synchronizes all page tables at allocation and free time.
See: x86/mm: remove vmalloc faulting

On x86-64 this looks as follows: the entry from the reference (global) pgd is copied into every pgd page on pgd_list.

void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
    sync_global_pgds(start, end);
}

void sync_global_pgds(unsigned long start, unsigned long end)
{
    if (pgtable_l5_enabled())
        sync_global_pgds_l5(start, end);
    else
        sync_global_pgds_l4(start, end);
}

...
static void sync_global_pgds_l5(unsigned long start, unsigned long end)
{
    ...
    for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
        const pgd_t *pgd_ref = pgd_offset_k(addr);
        struct page *page;

        spin_lock(&pgd_lock);
        list_for_each_entry(page, &pgd_list, lru) {
            pgd_t *pgd;
            spinlock_t *pgt_lock;

            pgd = (pgd_t *)page_address(page) + pgd_index(addr);

            pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
            spin_lock(pgt_lock);

            if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

            if (pgd_none(*pgd))
                set_pgd(pgd, *pgd_ref);

           spin_unlock(pgt_lock);
        }
        spin_unlock(&pgd_lock);
    }
}

So the reference entry is the one that the virtual address addr resolves to in the global page table, and that global pgd is swapper_pg_dir.

User space: malloc

When a user-space process calls malloc, glibc compares the request against M_MMAP_THRESHOLD to decide whether to serve it with brk or with mmap.

from /malloc/malloc.c

#ifndef DEFAULT_MMAP_THRESHOLD
#define DEFAULT_MMAP_THRESHOLD DEFAULT_MMAP_THRESHOLD_MIN
#endif

#ifndef DEFAULT_MMAP_THRESHOLD_MIN
#define DEFAULT_MMAP_THRESHOLD_MIN (128 * 1024)
#endif

So the default threshold is 128K.

I tried it with the program below, setting the malloc request to 128*1024-1 and 128*1024+1 respectively:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    void *m;
    m = malloc(128 * 1024 - 1);
    exit(0);
}

Surprisingly the results did not match the description. What now? Let's look at how glibc actually implements it.

/malloc/malloc.c

void *
__libc_malloc (size_t bytes)
{
    ...
  if (SINGLE_THREAD_P)
    {
      victim = _int_malloc (&main_arena, bytes);
      ...
      return victim;
    }

  victim = _int_malloc (ar_ptr, bytes);
  ...
}
static void *
_int_malloc (mstate av, size_t bytes)
{
    ...
    if (!checked_request2size (bytes, &nb))
    {
      __set_errno (ENOMEM);
      return NULL;
    }
    ...
}

#define request2size(req)                                         \
  (((req) + SIZE_SZ + MALLOC_ALIGN_MASK < MINSIZE)  ?             \
   MINSIZE :                                                      \
   ((req) + SIZE_SZ + MALLOC_ALIGN_MASK) & ~MALLOC_ALIGN_MASK)

/malloc/malloc-internal.h

#ifndef INTERNAL_SIZE_T
# define INTERNAL_SIZE_T size_t              /* 8 bytes here */
#endif

/* The corresponding word size.  */
#define SIZE_SZ (sizeof (INTERNAL_SIZE_T))   /* = 8 */

/* The corresponding bit mask value.  */
#define MALLOC_ALIGN_MASK (MALLOC_ALIGNMENT - 1)   /* = 15 with 16-byte alignment */

So every request is converted to nb, the requested size plus SIZE_SZ, rounded up to the alignment (16 bytes on my machine). Redoing the arithmetic:

req = 128*1024-23  ->  nb = 128*1024, so mmap is used
req = 128*1024-24  ->  nb = 128*1024-16, so brk is used
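
A user-space sketch of that arithmetic (not glibc itself: the MINSIZE branch of request2size is omitted, and SIZE_SZ = 8 with 16-byte alignment are assumed as above):

#include <stdio.h>

#define SIZE_SZ            8UL
#define MALLOC_ALIGN_MASK  (16UL - 1)

static unsigned long request2size(unsigned long req)   /* simplified */
{
    return (req + SIZE_SZ + MALLOC_ALIGN_MASK) & ~MALLOC_ALIGN_MASK;
}

int main(void)
{
    printf("%lu\n", request2size(128 * 1024 - 23));  /* 131072 >= threshold -> mmap */
    printf("%lu\n", request2size(128 * 1024 - 24));  /* 131056 <  threshold -> brk  */
    return 0;
}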

Let's verify:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    void *m;
    m = malloc(128 * 1024 - 24);
    exit(0);
}

Tracing it with strace shows brk:

# strace ./mtest
execve("./mtest", ["./mtest"], [/* 27 vars */]) = 0
...
brk(NULL)                               = 0xaa8000
brk(0xac9000)                           = 0xac9000
...

This request was served from the main arena. Now add the line below; after alignment the size is 131072 = 128*1024, and mmap is used:

    m = malloc((128*1024-23));
# strace ./mtest
execve("./mtest", ["./mtest"], [/* 27 vars */]) = 0
...
brk(NULL)                               = 0x1051000
brk(0x1072000)                          = 0x1072000
...
mmap(NULL, 135168, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)
 = 0x7f0d2b02c000
...

Adjust the added line again: after alignment the size is 131056 < 128*1024 = 131072, and brk is called:

    m = malloc((128*1024-24));
# strace ./mtest
brk(NULL)                               = 0x9e9000
brk(0xa0a000)                           = 0xa0a000
fstat(0, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 0), ...}) = 0
read(0,
"\n", 1024)                     = 1
brk(0xa4a000)                           = 0xa4a000

I originally only wanted to demonstrate malloc's two system calls, but digging deeper showed that malloc is nowhere near as simple as that; for the details see the article
glibc内存管理ptmalloc源代码分析

The brk system call flows as follows: the new break is page-aligned first, and if it falls within the same page as the old break nothing extra is allocated; if the new break is below the current one, pages are released; if it is above, the kernel checks the neighbouring VMA and then do_brk_flags sets up the new virtual address range, allocating a vm_area_struct through SLUB (and therefore ultimately from the buddy system).

SYSCALL_DEFINE1(brk, unsigned long, brk)
{
    ...

    newbrk = PAGE_ALIGN(brk);
    oldbrk = PAGE_ALIGN(mm->brk);
    if (oldbrk == newbrk) {
        mm->brk = brk;
        goto success;
    }
    ...
    if (brk <= mm->brk) {
        ...
        mm->brk = brk;
        ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
        ...
        goto success;
    }

    next = find_vma(mm, oldbrk);
    if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
        goto out;

    if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
       goto out;
    mm->brk = brk;

success:
    ...
    return brk;
out:
    ...
}

The VMA itself comes from a slab cache: when SLUB fetches pages from the buddy system for that cache, page_address() again performs the physical-to-virtual translation.

static int do_brk_flags(unsigned long addr, unsigned long len, 
unsigned long flags, struct list_head *uf)
{
    ...
    vma = vm_area_alloc(mm);
    ...
}

At this point, though, the memory the application asked for still has no physical pages behind it; only when the application touches the virtual address and finds no page present does the resulting page fault allocate the physical pages.

/arch/x86/mm/fault.c

static __always_inline void
handle_page_fault(struct pt_regs *regs, unsigned long error_code,
                  unsigned long address)
{
    ...
    if (unlikely(fault_in_kernel_space(address))) {
        do_kern_addr_fault(regs, error_code, address);
    } else {
        do_user_addr_fault(regs, error_code, address);
        ...
    }
    ...
}

do_user_addr_fault allocates the page-directory levels and creates the page-table entry, then dispatches to different handlers depending on the kind of fault.


static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
    ...
    if (!vmf->pte) {
        if (vma_is_anonymous(vmf->vma))
            return do_anonymous_page(vmf);
        else
            return do_fault(vmf);
    }

    if (!pte_present(vmf->orig_pte))
        return do_swap_page(vmf);

    if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
        return do_numa_page(vmf);
    ...
    if (vmf->flags & FAULT_FLAG_WRITE) {
        if (!pte_write(entry))
            return do_wp_page(vmf);
        ...
    }
    ...
}

do_anonymous_page covers the anonymous-page case and eventually calls into the buddy system to allocate the page:

struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
        unsigned long addr, int node, bool hugepage)
{

    if (pol->mode == MPOL_INTERLEAVE) {
        ...
        page = alloc_page_interleave(gfp, order, nid);
        goto out;
    }

    if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
    ...
        nmask = policy_nodemask(gfp, pol);
        if (!nmask || node_isset(hpage_node, *nmask)) {
            mpol_cond_put(pol);
            page = __alloc_pages_node(hpage_node,
               gfp | __GFP_THISNODE | __GFP_NORETRY, order);
                ...

            if (!page && (gfp & __GFP_DIRECT_RECLAIM))
            page = __alloc_pages_node(hpage_node,
                   gfp, order);
    ...
    nmask = policy_nodemask(gfp, pol);
    preferred_nid = policy_node(gfp, pol, node);
    page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
    mpol_cond_put(pol);
out:
    return page;
}

User space: mmap

Unlike brk, which requests memory via anonymous mappings, mmap also supports file-backed mappings. The overall flow is the same: reserve a virtual address range and allocate the in-kernel bookkeeping structures (the VMA) for it through SLUB.

/*
 * The caller must write-lock current->mm->mmap_lock.
 */
unsigned long do_mmap(struct file *file, unsigned long addr,
            unsigned long len, unsigned long prot,
            unsigned long flags, vm_flags_t vm_flags,
            unsigned long pgoff, unsigned long *populate,
            struct list_head *uf)
{
    ...
    addr = get_unmapped_area(file, addr, len, pgoff, flags);
    ...
    addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
    ...
    return addr;
}

Physical memory is again allocated from the buddy system when a page fault occurs, as the small user-space demo below illustrates.
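
A minimal user-space illustration of that deferral (nothing glibc-specific here):

#include <string.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 4 * 4096;

    /* mmap() only reserves the virtual range and its VMA; no physical
     * pages are allocated yet. */
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED)
        return 1;

    /* The first write to each page faults, and only then does the kernel
     * pull a page from the buddy system and wire it into the page table. */
    memset(p, 0x5a, len);

    munmap(p, len);
    return 0;
}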

Summary

From the APIs analysed above, the buddy system's pages get virtual addresses in two ways:

  • Translate right after allocating the pages:
    • Pages taken directly from the buddy system are converted with page_address(), e.g. kmalloc requests larger than two pages.
    • Pages taken from the buddy system through SLUB are converted with page_address() when a new slab is created, e.g. vmalloc's bookkeeping structures and VMA structures.
  • Pick the virtual address first, then attach pages to it:
    • A virtual address range is reserved up front, pages are then allocated from the buddy system, and the page-table entries are updated in one pass at allocation and free time, e.g. vmalloc's data pages.

References

图解slub
SLUB分配一个object的流程分析
vmalloc-非连续内存的分配
Linux slab 分配器剖析
Linux kernel 内存 – 页表映射(SHIFT,SIZE,MASK)和转换(32位,64位)
mm/vmalloc.c: keep track of free blocks for vmap allocation
malloc源码分析——_int_malloc
Linux中的页表实现
