First, the parameters of the fast-path page allocation:
gfp_mask: the fast path ORs in __GFP_HARDWALL, which enforces the cpuset hardwall policy, i.e. the allocation must stay within the memory nodes allowed by the current task's cpuset;
nodemask: a bitmap of nodes, telling us on which nodes the allocation may be placed;
order: the allocation order;
zonelist: when preferred_zone has no suitable pages to hand out, the fallback zones in this zonelist are scanned in order and tried one by one;
high_zoneidx: the highest zone this allocation may use; fallback normally runs high -> normal -> dma, the memory becoming more and more precious along the way, so allocation is attempted from high down to dma;
alloc_flags: flags controlling how the allocation is performed;
preferred_zone: the first suitable zone found at or below high_zoneidx; allocation is normally attempted from it first, and if that fails the next suitable zone from the zonelist takes its place;
migratetype: the migration type, used as the index into zone->free_area[].free_list[]; it exists for anti-fragmentation: the old free_area structure gained an array indexed by migrate type, each element holding the list of free pages of that migrate type;
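To make the migratetype point concrete, here is a trimmed sketch (simplified from include/linux/mmzone.h of this era of the kernel, unrelated fields omitted) showing how each order's free_area carries one free list per migrate type, with migratetype selecting the list:

struct free_area {
	struct list_head free_list[MIGRATE_TYPES];	/* one list of free blocks per migrate type */
	unsigned long nr_free;				/* total free blocks at this order */
};

struct zone {
	/* ... */
	struct free_area free_area[MAX_ORDER];		/* indexed by allocation order */
	/* ... */
};

The fast-path call itself: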
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, alloc_flags,
preferred_zone, migratetype);
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
struct zone *preferred_zone, int migratetype)
{
struct zoneref *z;
struct page *page = NULL;
int classzone_idx;
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
classzone_idx = zone_idx(preferred_zone);//index of the preferred zone
zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {//this macro picks the suitable zones out of the zonelist->_zonerefs array; a detailed explanation follows below
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))//the node z->zone sits on does not allow allocation, or the zone has already been marked full; either way skip this z
continue;
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))//cpuset checking is enabled and this zone's node is not allowed by the current task's cpuset, so skip it;
continue;
/*
* When allocating a page cache page for writing, we
* want to get it from a zone that is within its dirty
* limit, such that no single zone holds more than its
* proportional share of globally allowed dirty pages.
* The dirty limits take into account the zone's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
* This may look like it could increase pressure on
* lower zones by failing allocations in higher zones
* before they are full. But the pages that do spill
* over are limited as the lower zones are protected
* by this very same mechanism. It should not become
* a practical burden to them.
*
* XXX: For now, allow allocations to potentially
* exceed the per-zone dirty limit in the slowpath
* (ALLOC_WMARK_LOW unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* zones are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of zones in the
* dirty-throttling and the flusher threads.
*/
if ((alloc_flags & ALLOC_WMARK_LOW) &&
(gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))//check whether this zone's dirty pages have exceeded its dirty limit
goto this_zone_full;//over the dirty limit: jump to the end and mark the zone full, which balances dirty pages across the zones
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
unsigned long mark;
int ret;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))//check whether this zone has enough free pages for the allocation; analysed in detail below
goto try_this_zone;
if (IS_ENABLED(CONFIG_NUMA) &&
!did_zlc_setup && nr_online_nodes > 1) {
/*
* we do zlc_setup if there are multiple nodes
* and before considering the first zone allowed
* by the cpuset.
*/
allowednodes = zlc_setup(zonelist, alloc_flags);
zlc_active = 1;
did_zlc_setup = 1;
}
// zone_watermark_ok() above already tested whether this zone can satisfy the allocation; if it cannot, and reclaim is disabled, or this zone is outside the range we are allowed to reclaim from, all we can do is mark it full so the next scan skips it
if (zone_reclaim_mode == 0 ||
!zone_allows_reclaim(preferred_zone, zone))
goto this_zone_full;
/*
* As we may have just activated ZLC, check if the first
* eligible zone has failed zone_reclaim recently.
*///zlc_active and friends may have just been set up above, so check the cache once more and skip this zone if it is not worth trying
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
ret = zone_reclaim(zone, gfp_mask, order);//reaching this point means this zone is allowed to have pages reclaimed, so try it
switch (ret) {
case ZONE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case ZONE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default://the two cases above reclaimed nothing; reaching here means some pages were reclaimed
/* did we reclaim enough *///since some pages were just reclaimed, check again whether this zone can now satisfy the allocation
if (zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
goto try_this_zone;
/*
* Failed to reclaim enough to meet watermark.
* Only mark the zone full if checking the min
* watermark or if we failed to reclaim just
* 1<<order pages or else the page allocator
* fastpath will prematurely mark zones full
* when the watermark is between the low and
* min watermarks.
*///nothing more can be done; we genuinely tried our best with this zone
if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
ret == ZONE_RECLAIM_SOME)
goto this_zone_full;
continue;
}
}
try_this_zone://the ideal case: go ahead and allocate the memory from this zone
page = buffered_rmqueue(preferred_zone, zone, order,
gfp_mask, migratetype);
if (page)
break;
this_zone_full://this zone is full; mark it as such
if (IS_ENABLED(CONFIG_NUMA))
zlc_mark_zone_full(zonelist, z);
}//end of the loop that walks the zones and tries to allocate from each
if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
/* Disable zlc cache for second zonelist scan */
zlc_active = 0;
goto zonelist_scan;//scan the zonelist one more time
}
if (page)
/*
* page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
* necessary to allocate the page. The expectation is
* that the caller is taking steps that will free more
* memory. The caller should avoid the page being used
* for !PFMEMALLOC purposes.
*///if the page was obtained without watermark checks, the zone has barely any pages left to allocate, so pfmemalloc is set to prompt the system to free some memory;
page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
return page;
}
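For reference, the alloc_flags bits tested above (ALLOC_WMARK_LOW, ALLOC_NO_WATERMARKS, ALLOC_CPUSET and so on) are defined in mm/internal.h. Roughly, for this era of the kernel they look like the following (exact values can differ between versions), which is why zone->watermark[alloc_flags & ALLOC_WMARK_MASK] directly selects the min/low/high watermark:

/* Sketch of the mm/internal.h definitions; values are illustrative of this kernel era */
#define ALLOC_WMARK_MIN     WMARK_MIN   /* check against pages_min */
#define ALLOC_WMARK_LOW     WMARK_LOW   /* check against pages_low */
#define ALLOC_WMARK_HIGH    WMARK_HIGH  /* check against pages_high */
#define ALLOC_NO_WATERMARKS 0x04        /* do not check watermarks at all */
#define ALLOC_WMARK_MASK    (ALLOC_NO_WATERMARKS-1) /* isolate the watermark index */
#define ALLOC_HARDER        0x10        /* try to allocate harder */
#define ALLOC_HIGH          0x20        /* caller had __GFP_HIGH set */
#define ALLOC_CPUSET        0x40        /* check for correct cpuset */
#define ALLOC_CMA           0x80        /* allocation may use CMA areas */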
This macro fetches the suitable zones from the zonelist->_zonerefs array:
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
zone is the loop variable usable inside the body; the macro walks the elements of the zonelist->_zonerefs array:
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \
zone; \
z = next_zones_zonelist(++z, highidx, nodemask, &zone)) \
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
enum zone_type highest_zoneidx,
nodemask_t *nodes,
struct zone **zone)
{
return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes,
zone);
}
struct zoneref {
struct zone *zone; /* Pointer to actual zone */
int zone_idx; /* zone_idx(zoneref->zone) */
};
highest_zoneidx is the highest zone type this allocation can accept; any zoneref above it will not do, so z is advanced (z++) until a zoneref at or below it (equal being the best fit) is found and returned.
/* Returns the next zone at or below highest_zoneidx in a zonelist */
struct zoneref *next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
nodemask_t *nodes,
struct zone **zone)
{
/*
* Find the next suitable zone to use for the allocation.
* Only filter based on nodemask if it's set
*/
if (likely(nodes == NULL))
while (zonelist_zone_idx(z) > highest_zoneidx)
z++;
else
while (zonelist_zone_idx(z) > highest_zoneidx || (z->zone && !zref_in_nodemask(z, nodes)))
z++;
*zone = zonelist_zone(z);
return z;
}
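The two accessors used in next_zones_zonelist() are trivial inline helpers over struct zoneref (from include/linux/mmzone.h, shown here for completeness):

static inline int zonelist_zone_idx(struct zoneref *zoneref)
{
	return zoneref->zone_idx;	/* cached copy of zone_idx(zoneref->zone) */
}

static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
	return zoneref->zone;		/* the zone itself; NULL terminates the zonelist */
}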
To understand the zlc_zone_worth_trying() function, first look at a couple of structures:
struct zonelist {
struct zonelist_cache *zlcache_ptr; // NULL or &zlcache
struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
#ifdef CONFIG_NUMA
struct zonelist_cache zlcache; // optional ...
#endif
};
struct zonelist_cache {
unsigned short z_to_n[MAX_ZONES_PER_ZONELIST]; /* zone->nid */
DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST); /* zone full? */
unsigned long last_full_zap; /* when last zap'd (jiffies) */
};
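zlc_setup(), called at most once per allocation in the scan loop above, refreshes this cache: if the fullzones bitmap has not been cleared for about a second it is zapped, and the returned allowednodes is either the current cpuset's allowed nodes or all nodes that have memory. A sketch based on mm/page_alloc.c of this era (details may vary slightly between versions):

static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
{
	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
	nodemask_t *allowednodes;	/* zonelist_cache approximation */

	zlc = zonelist->zlcache_ptr;
	if (!zlc)
		return NULL;

	/* the "full" marks go stale quickly: wipe them roughly once a second */
	if (time_after(jiffies, zlc->last_full_zap + HZ)) {
		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
		zlc->last_full_zap = jiffies;
	}

	/* which nodes the caller may use: its cpuset, or any node with memory */
	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
					&cpuset_current_mems_allowed :
					&node_states[N_MEMORY];
	return allowednodes;
}

zlc_zone_worth_trying() then consults the cache on every iteration: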
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
nodemask_t *allowednodes)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
int i; /* index of *z in zonelist zones */
int n; /* node that zone *z is on */
zlc = zonelist->zlcache_ptr;//as the structure above shows, zlcache_ptr is the address of zlcache (or NULL)
if (!zlc)//no zlcache: this is a UMA build
return 1;
i = z - zonelist->_zonerefs;//index of z within the _zonerefs array
n = zlc->z_to_n[i];//from that index, the node id (nid) this zone lives on
/* This zone is worth trying if it is allowed but not full */
return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);//worth trying only if the nid is allowed and the zone has not been marked full;
}
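Its counterpart zlc_mark_zone_full(), reached through the this_zone_full label, is equally small: it records the zone's position in the fullzones bitmap so that later scans skip it. A sketch from the same file, with the same caveat:

static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
{
	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
	int i;				/* index of *z in zonelist zones */

	zlc = zonelist->zlcache_ptr;
	if (!zlc)			/* UMA build: nothing to mark */
		return;

	i = z - zonelist->_zonerefs;
	set_bit(i, zlc->fullzones);	/* stays set until the next zlc_setup() zap */
}

Finally, the watermark check used in the scan loop: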
/*
 * Return true if free pages are above 'mark'. This takes into account the
 * order of the allocation.
 */
static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags, long free_pages)
{
/* free_pages may go negative - that's OK */
long min = mark;
long lowmem_reserve = z->lowmem_reserve[classzone_idx];//pages that may only be allocated in an emergency
int o;
long free_cma = 0;
free_pages -= (1 << order) - 1;//subtract the pages this allocation would consume
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
if (!(alloc_flags & ALLOC_CMA))
free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
if (free_pages - free_cma <= min + lowmem_reserve)//min + lowmem_reserve is the threshold: at or below it, this zone cannot be used for the allocation
return false;
for (o = 0; o < order; o++) {//loop to discount the free pages sitting at orders lower than the requested order
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
/* Require fewer higher order pages to be free */
min >>= 1;//fewer free pages are required at the higher orders
if (free_pages <= min)
return false;
}
return true;
}
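zone_watermark_ok(), the function actually called in the scan loop, is a thin wrapper that feeds the zone's current free-page count into the helper above (a sketch for this era of the kernel):

bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
		       int classzone_idx, int alloc_flags)
{
	/* NR_FREE_PAGES is the zone's running count of free pages */
	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
				   zone_page_state(z, NR_FREE_PAGES));
}

So for, say, an order-2 request the check first demands that free_pages minus 3 clear min + lowmem_reserve, then walks the lower orders, discounting the order-0 and order-1 free pages and halving min at each step; this is what ensures the zone really holds enough higher-order blocks rather than just scattered single pages.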