linux调度器(五)- load balance(1)

5370阅读 0评论2019-10-11 静静做好一件事
分类:Android平台

前面讲到了fair class的select_task_rq_fair函数,其中有如下一段代码:

  1. group = find_idlest_group(sd, p, cpu, sd_flag);
  2.                 if (!group) {
  3.                         sd = sd->child;
  4.                         continue;
  5.                 }

  6.                 new_cpu = find_idlest_cpu(group, p, cpu);
  7.                 if (new_cpu == -1 || new_cpu == cpu) {
  8.                         /* Now try balancing at a lower domain level of cpu */
  9.                         sd = sd->child;
  10.                         continue;
  11.                 }

我们看到其中的find_idlest_group以及find_idlest_cpu函数。这其中涉及到了load balance的考虑。即scheduler会尽量平衡各个CPU上的负载,既能够降低latency,又能够提高系统的整体吞吐量。

当然光靠select_task_rq_fair里面的这点load balance的考虑是远远不够的。系统是动态的,随时都有task进入休眠状态从rq上移除。

Scheduler的load balance分为好几种:

(1)    newly idle balance。Newly idle是指CPU上没有可运行的task,准备进入idle的状态。在这种情况下,scheduler会尝试从别的CPU上pull一些进程过来运行。

(2)    idle balance。是指cpu已经进入idle状态,在tick时做的load balance。如果是NOHZ的话,会尝试做nohz idle balance。

(3)    busy balance。即cpu上有task运行时,判断是否需要做load balance。

以上三种大体可以从fair.c中找出相关的代码,但是代码其实要比这复杂的多。

idle_balance->

load_balance(…,CPU_NEWLY_IDLE…);

上面是Newly idle balance的path。下面从scheduler tick开始。

  1. void scheduler_tick(void)
  2. {
  3.         int cpu = smp_processor_id();
  4.         struct rq *rq = cpu_rq(cpu);
  5.         struct task_struct *curr = rq->curr;
  6. .
  7. #ifdef CONFIG_SMP
  8.         rq->idle_balance = idle_cpu(cpu);
  9.         trigger_load_balance(rq);
  10. #endif

这边先判断当前cpu是否处于idle状态,然后在trigger_load_balance里面会根据是否idle状态进行不同的处理。

  1. void trigger_load_balance(struct rq *rq)
  2. {
  3.         /* Don't need to rebalance while attached to NULL domain */
  4.         if (unlikely(on_null_domain(rq)))
  5.                 return;

  6.         if (time_after_eq(jiffies, rq->next_balance))
  7.                 raise_softirq(SCHED_SOFTIRQ);
  8. #ifdef CONFIG_NO_HZ_COMMON
  9.         if (nohz_kick_needed(rq))
  10.                 nohz_balancer_kick();
  11. #endif
  12. }

其中SCHED_SOFTIRQ的处理函数是在init_sched_fair_class里面注册的。

__init void init_sched_fair_class(void)

{

#ifdef CONFIG_SMP

        open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);


  1. static void run_rebalance_domains(struct softirq_action *h)
  2. {
  3.         struct rq *this_rq = this_rq();
  4.         enum cpu_idle_type idle = this_rq->idle_balance ?
  5.                                                 CPU_IDLE : CPU_NOT_IDLE;
  6. //idle_balance用于表示CPU是否在idle状态
  7.         /*
  8.          * If this cpu has a pending nohz_balance_kick, then do the
  9.          * balancing on behalf of the other idle cpus whose ticks are
  10.          * stopped. Do nohz_idle_balance *before* rebalance_domains to
  11.          * give the idle cpus a chance to load balance. Else we may
  12.          * load balance only within the local sched_domain hierarchy
  13.          * and abort nohz_idle_balance altogether if we pull some load.
  14.          */
  15.         nohz_idle_balance(this_rq, idle);
  16.         rebalance_domains(this_rq, idle);
  17. }

这里暂时先不考虑nohz相关的函数,从rebalance_domains开始入手。

        for_each_domain(cpu, sd) {

                /*  

                 * Decay the newidle max times here because this is a regular

                 * visit to all the domains. Decay ~1% per second.

                 */

                if (time_after(jiffies, sd->next_decay_max_lb_cost)) {

                        sd->max_newidle_lb_cost =

                                (sd->max_newidle_lb_cost * 253) / 256;

//这里的max_newidle_lb_cost是指做newidle load balance所花的最大时间。如上面注释所说,max_newidle_lb_cost每隔1s衰减约1%(乘以253/256)。

// next_decay_max_lb_cost是下一次进行衰减的时间,HZ个jiffies即1s时间。

                        sd->next_decay_max_lb_cost = jiffies + HZ;

                        need_decay = 1;

                }   

                max_cost += sd->max_newidle_lb_cost;

 

                if (!(sd->flags & SD_LOAD_BALANCE))

                        continue;

 

                interval = get_sd_balance_interval(sd, idle != CPU_IDLE);

获得balance的interval,每一个sched domain的load balance的时间间隔不一样,越高level的domain的时间间隔越长。因为越高level的domain之间迁移task的代价越高。

  1. need_serialize = sd->flags & SD_SERIALIZE;
  2.                 if (need_serialize) {
  3.                         if (!spin_trylock(&balancing))
  4.                                 goto out;
  5.                 }

  6.                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
  7.                         if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
  8.                                 /*
  9.                                  * The LBF_DST_PINNED logic could have changed
  10.                                  * env->dst_cpu, so we can't know our idle
  11.                                  * state even if we migrated tasks. Update it.
  12.                                  */
  13.                                 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
  14.                         }
  15.                         sd->last_balance = jiffies;
  16.                         interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
  17.                 }

这里面的关键函数是load_balance

  1. static int load_balance(int this_cpu, struct rq *this_rq,
  2.                         struct sched_domain *sd, enum cpu_idle_type idle,
  3.                         int *continue_balancing)
  4. {
  5.         int ld_moved, cur_ld_moved, active_balance = 0;
  6.         struct sched_domain *sd_parent = sd->parent;
  7.         struct sched_group *group;
  8.         struct rq *busiest;
  9.         unsigned long flags;
  10.         struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
  11.                 
  12.         struct lb_env env = {
  13.                 .sd = sd,
  14.                 .dst_cpu = this_cpu,

Load balance是将一些task从一个cpu迁移到另外一个cpu上,主要是将一些task从负载比较高的cpu上pull到负载低的cpu上。这里dst_cpu就是需要将task pull到的cpu。

                .dst_rq         = this_rq,

                .dst_grpmask    = sched_group_cpus(sd->groups),

由于task的cpus allowed的设置,导致一些task不能被迁移到dst_cpu上,所以在出现这种情况的时候,就需要从dst cpu所在的group上选择另外一个cpu。

                .idle           = idle,

当前cpu是否是idle

                .loop_break     = sched_nr_migrate_break,

                .cpus           = cpus,

                .fbq_type       = all,

                .tasks          = LIST_HEAD_INIT(env.tasks),

初始化链表,后续会将需要迁移的task暂时放在这个链表里面。

        };

        /*

         * For NEWLY_IDLE load_balancing, we don't need to consider

         * other cpus in our group

         */

        if (idle == CPU_NEWLY_IDLE)

                env.dst_grpmask = NULL;

做NEWLY IDLE balance的时候,只允许将task pull到当前CPU上。

        cpumask_copy(cpus, cpu_active_mask);

 

        schedstat_inc(sd, lb_count[idle]);

redo:

        if (!should_we_balance(&env)) {

                *continue_balancing = 0;

                goto out_balanced;

        }

判断是否需要在当前cpu上做load balance。

(1)       如果是NEWLY IDLE,需要做load balance

(2)       否则需要在idle cpu上做balance

(3)       如果没有idle cpu的话,就在group的第一个cpu上做load balance

        group = find_busiest_group(&env);

统计domain上每一个group的load信息,然后找出busiest的group。其中find_busiest_group->update_sd_lb_stats用到了一个get_sd_load_idx函数。这里面涉及到了一个unsigned long cpu_load[CPU_LOAD_IDX_MAX];数组。scheduler会根据不同的load balance类型(busy,newly idle,idle)选择不同的load进行计算。
__update_cpu_load函数用于更新cpu load,特别是decay_load_missed函数,用于计算load的衰减。计算公式为:
load = ((2^idx - 1) / 2^idx)^(n-1) * load
其中idx是cpu_load[]数组的下标(由load balance类型选出),n是经过的jiffies(tick)个数;当HZ=100时即n×10ms。
至于为什么设置这样的数组,主要是基于稳定的考虑。我们知道cpu的load带有极大的不稳定。如果由于这样的不稳定性导致task频繁的migrate是很不合适的。
同时为了方便计算,就预设了degrade_factor跟degrade_zero_ticks两个数组。

        if (!group) {

                schedstat_inc(sd, lb_nobusyg[idle]);

                goto out_balanced;

        }

 

        busiest = find_busiest_queue(&env, group);

从group中再找出busiest的cpu。

        if (!busiest) {

                schedstat_inc(sd, lb_nobusyq[idle]);

                goto out_balanced;

        }

这上面跟select_task_rq_fair有点类似,只不过select_task_rq_fair选择的是idlest的cpu,而这里选择的是busiest的cpu。

        BUG_ON(busiest == env.dst_rq);

 

        schedstat_add(sd, lb_imbalance[idle], env.imbalance);

 

        env.src_cpu = busiest->cpu;

        env.src_rq = busiest;

最终busiest cpu就是load balance的src cpu,而当前cpu是load balance的dst cpu。

接着往下

        ld_moved = 0;

ld_moved变量用于记录在load balance过程中累计转移的load。

        if (busiest->nr_running > 1) {

                /*

                 * Attempt to move tasks. If find_busiest_group has found

                 * an imbalance but busiest->nr_running <= 1, the group is

                 * still unbalanced. ld_moved simply stays zero, so it is

                 * correctly treated as an imbalance.

                 */

                env.flags |= LBF_ALL_PINNED;

                env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);

 

more_balance:

                raw_spin_lock_irqsave(&busiest->lock, flags);

 

                /*

                 * cur_ld_moved - load moved in current iteration

                 * ld_moved     - cumulative load moved across iterations

                 */

                cur_ld_moved = detach_tasks(&env);

cur_ld_moved用于记录本次运行detach_tasks转移的load,这些将要migrate的task被组织在env的tasks链表中。很显然这些task已经从原来的cpu上dequeue掉了。

env->imbalance用于表示当前系统不均衡的load,本次load balance就是需要消除这种不均衡,需要将load总量为env->imbalance的tasks从src rq迁移到dst rq上去。

                raw_spin_unlock(&busiest->lock);

 

                if (cur_ld_moved) {

                        attach_tasks(&env);

attach tasks将这些tasks添加到了新的rq中了,也就是enqueue操作。

                        ld_moved += cur_ld_moved;

                }

 

                local_irq_restore(flags);

 

                if (env.flags & LBF_NEED_BREAK) {

LBF_NEED_BREAK这个flag是在detach_tasks中设置的,因为detach_tasks是在持有spinlock的情况下运行的。长时间的spinlock会带来一些问题。

                        env.flags &= ~LBF_NEED_BREAK;

                        goto more_balance;

                }

 

                if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {

load balance的过程中有很多的flags标志位,其中LBF_SOME_PINNED这个flag,用来表示dst rq不在task的cpus allowed里面。如果该task可以migrate到group的其余cpu上去,另外用了一个flag叫做LBF_DST_PINNED来表示。在这种情况下,会从group中重新选择一个cpu作为target cpu继续load balance。很显然在NEWLY IDLE的load balance里面不会出现LBF_DST_PINNED的情况,因为NEWLY IDLE的load balance中dst_grpmask为NULL。

                        /* Prevent to re-select dst_cpu via env's cpus */

                        cpumask_clear_cpu(env.dst_cpu, env.cpus);

 

                        env.dst_rq       = cpu_rq(env.new_dst_cpu);

                        env.dst_cpu      = env.new_dst_cpu;

                        env.flags       &= ~LBF_DST_PINNED;

                        env.loop         = 0;

                        env.loop_break   = sched_nr_migrate_break;

 

                        /*

                         * Go back to "more_balance" rather than "redo" since we

                         * need to continue with same src_cpu.

                         */

                        goto more_balance;

                }

在出现上面所说的情况时,选择一个新的dst rq,继续load balance,migrate tasks。

                /*

                 * We failed to reach balance because of affinity.

                 */

                if (sd_parent) {

                        int *group_imbalance = &sd_parent->groups->sgc->imbalance;

 

                        if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)

                                *group_imbalance = 1;

                }

如果不是顶层sched domain(即sd_parent存在),并且设置了LBF_SOME_PINNED,即一些task无法转移,而且仍然存在imbalance的情况时,设置parent domain的group的sgc中相应的imbalance标志。

说到&sd_parent->groups->sgc->imbalance时,又不得不涉及struct sg_lb_stats结构体的成员变量enum group_type group_type;

这是一个枚举变量,用来在load balance的过程中表示group的类型。

enum group_type {

        group_other = 0,

        group_imbalanced,

        group_overloaded,

};

除了group_other之外,还涉及到两个类型,分别为overloaded和imbalanced。

static inline enum

group_type group_classify(struct sched_group *group,

                          struct sg_lb_stats *sgs)

{

        if (sgs->group_no_capacity)

                return group_overloaded;

 

        if (sg_imbalanced(group))

                return group_imbalanced;

 

        return group_other;

}

关于group_imbalanced在上面提及LBF_SOME_PINNED时有涉及。关于overloaded,可以从group_is_overloaded函数看出来,即一个group的使用率超过一定的百分比(80%)。

另外有一个函数can_migrate_task用来检查一个task是否可以从src rq迁移到dst rq上去。

在这个函数的注释里面提及到几种不能migrate的情况

        /*

         * We do not migrate tasks that are:

         * 1) throttled_lb_pair, or

         * 2) cannot be migrated to this CPU due to cpus_allowed, or

         * 3) running (obviously), or

         * 4) are cache-hot on their current CPU.

         */

1)其中throttled_lb_pair涉及到cgroup中关于bandwidth的部分,这边先不做涉及。

2)由于task的cpus allowed导致task不能migrate到dst rq。

3)正在running的task不能migrate。

4)cache hot的task不能migrate,因为迁移cache hot的task是得不偿失的。

        if (!ld_moved) {

如果ld_moved为0的话,很显然,上面的所有尝试都已经失败了,即pull失败了。那么下面就需要更加的aggressive了,需要在busiest cpu上进行push操作。

                schedstat_inc(sd, lb_failed[idle]);

增加load balance的统计计数lb_failed[idle]

                /*

                 * Increment the failure counter only on periodic balance.

                 * We do not want newidle balance, which can be very

                 * frequent, pollute the failure counter causing

                 * excessive cache_hot migrations and active balances.

                 */

                if (idle != CPU_NEWLY_IDLE)

                        sd->nr_balance_failed++;

 

                if (need_active_balance(&env)) {

用于判断是否需要在busiest cpu上进行push操作。这个跟上面的pull有些差异。因为更加倾向于

                        raw_spin_lock_irqsave(&busiest->lock, flags);

 

                        /* don't kick the active_load_balance_cpu_stop,

                         * if the curr task on busiest cpu can't be

                         * moved to this_cpu

                         */

                        if (!cpumask_test_cpu(this_cpu,

                                        tsk_cpus_allowed(busiest->curr))) {

                                raw_spin_unlock_irqrestore(&busiest->lock,

                                                            flags);

                                env.flags |= LBF_ALL_PINNED;

                                goto out_one_pinned;

                        }

                        /*

                         * ->active_balance synchronizes accesses to

                         * ->active_balance_work.  Once set, it's cleared

                         * only after active load balance is finished.

                         */

                        if (!busiest->active_balance) {

                                busiest->active_balance = 1;

                                busiest->push_cpu = this_cpu;

                                active_balance = 1;

                        }

                        raw_spin_unlock_irqrestore(&busiest->lock, flags);

 

                        if (active_balance) {

                                stop_one_cpu_nowait(cpu_of(busiest),

                                        active_load_balance_cpu_stop, busiest,

                                        &busiest->active_balance_work);

                        }

这边的load balance更加的激进,采用了一个stop class的进程(在前面介绍过,stop > deadline > real time> fair > idle),同时将src rq上正在running的task重新enqueue到rq中成为runnable状态。这样就将running的task纳入到了load balance的范围。







上一篇:没有了
下一篇:没有了