浅析linux组调度配额、带宽和分享多余带宽发红包-gliethttp-ChinaUnix博客

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * default: 5 msec, units: microseconds
 *
 * The value is stored in microseconds; sched_cfs_bandwidth_slice() multiplies
 * it by NSEC_PER_USEC to obtain the nanosecond figure used in the runtime
 * accounting.
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
/*
 * Start a fresh bandwidth period for @cfs_b: reset the group's runtime pool
 * to its full quota and move the expiration one period past "now".
 *
 * sched_clock_cpu() is read directly instead of rq->clock so that no extra
 * synchronization around rq->lock is needed.
 *
 * In other words, for every cfs_b->period the group is handed cfs_b->quota of
 * runtime. Whatever was consumed (or left over) in the previous period is
 * discarded here: the pool is simply overwritten with the full quota.
 *
 * This function does not arm the period timer; that is left to the caller.
 *
 * Caller must hold cfs_b->lock.
 */
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
	/* A group with unlimited quota has no pool to replenish. */
	if (cfs_b->quota != RUNTIME_INF) {
		u64 stamp = sched_clock_cpu(smp_processor_id());

		cfs_b->runtime = cfs_b->quota;
		cfs_b->runtime_expires = stamp + ktime_to_ns(cfs_b->period);
	}
}
static inline u64 sched_cfs_bandwidth_slice(void)
{
return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
}
/*
 * Try to top up this cfs_rq's local runtime (runtime_remaining) from the
 * task group's global bandwidth pool.
 *
 * Returns non-zero if cfs_rq->runtime_remaining is positive after the
 * top-up, and 0 on failure to allocate (enough) runtime.
 */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
u64 amount = 0, min_amount, expires;
/*
 * note: this is a positive sum as runtime_remaining <= 0
 *
 * The request is one slice plus whatever this cfs_rq has overdrawn:
 * runtime_remaining is expected to be non-positive here (the queue is "in
 * debt"), so subtracting it adds the deficit to the slice.
 */
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota == RUNTIME_INF)
/* Unlimited quota: grant the whole request without touching the pool. */
amount = min_amount;
else {
/*
* If the bandwidth pool has become inactive, then at least one
* period must have elapsed since the last consumption.
* Refresh the global state and ensure bandwidth timer becomes
* active.
*/
if (!cfs_b->timer_active) {
/*
 * The bandwidth timer is stopped: refill the pool (runtime is reset
 * to quota and runtime_expires to now + period) and restart the
 * timer, which begins the next period.
 */
__refill_cfs_bandwidth_runtime(cfs_b);
__start_cfs_bandwidth(cfs_b);
}
if (cfs_b->runtime > 0) {
/*
 * Take the smaller of what is left in the pool and what was requested.
 * If the pool holds less than the request (less than one slice, or
 * even less than the deficit), only the remainder is handed out.
 */
amount = min(cfs_b->runtime, min_amount);
/* Deduct the granted amount from the global pool. */
cfs_b->runtime -= amount;
/* Runtime was consumed, so the pool is not idle in this period. */
cfs_b->idle = 0;
}
}
/* Snapshot the pool's expiration while still holding cfs_b->lock. */
expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
/* Credit the granted nanoseconds to the local pool. */
cfs_rq->runtime_remaining += amount;
/*
* we may have advanced our local expiration to account for allowed
* spread between our sched_clock and the one on which runtime was
* issued.
*/
/* Only ever move the local expiration forward (wrap-safe signed compare). */
if ((s64)(expires - cfs_rq->runtime_expires) > 0)
cfs_rq->runtime_expires = expires;
return cfs_rq->runtime_remaining > 0;
}
/*
 * Hand out up to @remaining nanoseconds of runtime, valid until @expires, to
 * the throttled cfs_rqs queued on @cfs_b->throttled_cfs_rq.
 *
 * Each throttled cfs_rq receives just enough to clear its deficit plus 1 ns
 * (capped by what is still left to give), adopts @expires as its local
 * expiration, and is unthrottled if its runtime_remaining became positive.
 *
 * Returns the runtime that was not handed out.
 *
 * In this file the caller is do_sched_cfs_period_timer(), which invokes this
 * in a loop while throttled queues remain and runtime is left.
 */
static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
u64 remaining, u64 expires)
{
struct cfs_rq *cfs_rq;
u64 runtime = remaining;
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
struct rq *rq = rq_of(cfs_rq);
raw_spin_lock(&rq->lock);
/* Re-check under rq->lock; skip entries that are no longer throttled. */
if (!cfs_rq_throttled(cfs_rq))
goto next;
/*
 * Amount needed to bring runtime_remaining from its current
 * (non-positive) value up to +1: the deficit is paid off in full and
 * the queue is left with 1 ns of usable runtime.
 */
runtime = -cfs_rq->runtime_remaining + 1;
/* Cannot give more than is left to distribute. */
if (runtime > remaining)
runtime = remaining;
/* Account for what is handed out to this cfs_rq. */
remaining -= runtime;
cfs_rq->runtime_remaining += runtime;
/* The distributed runtime expires together with the donor's period. */
cfs_rq->runtime_expires = expires;
/* we check whether we're throttled above */
/*
 * Unthrottle only if the deficit was fully covered; a partial grant
 * leaves runtime_remaining <= 0 and the queue stays throttled.
 */
if (cfs_rq->runtime_remaining > 0)
unthrottle_cfs_rq(cfs_rq);
next:
raw_spin_unlock(&rq->lock);
/* Nothing left to distribute: stop walking the list. */
if (!remaining)
break;
}
rcu_read_unlock();
return remaining;
}
/*
* Responsible for refilling a task_group's bandwidth and unthrottling its
* cfs_rqs as appropriate. If there has been no activity within the last
* period the timer is deactivated until scheduling resumes; cfs_b->idle is
* used to track this state.
*/
/*
 * Runs when a new cfs_b->period_timer period arrives.
 * NOTE(review): the parts of this function elided with "......" are not
 * shown in this excerpt; presumably they refill the pool so that
 *   bandwidth amount (ns): cfs_b->runtime         (= cfs_b->quota)
 *   valid until (ns):      cfs_b->runtime_expires (= now + ktime_to_ns(cfs_b->period))
 * as __refill_cfs_bandwidth_runtime() does - confirm against the full source.
 */
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
{
......
/*
* This check is repeated as we are holding onto the new bandwidth
* while we unthrottle. This can potentially race with an unthrottled
* group trying to acquire new bandwidth from the global pool.
*/
while (throttled && runtime > 0) {
raw_spin_unlock(&cfs_b->lock);
/* we can't nest cfs_b->lock while distributing bandwidth */
/*
 * Use the available runtime to pay off the deficits of the throttled
 * cfs_rqs; whatever is not handed out comes back as the new runtime.
 */
runtime = distribute_cfs_runtime(cfs_b, runtime,
runtime_expires);
raw_spin_lock(&cfs_b->lock);
/* Re-evaluate under the lock: are any cfs_rqs still throttled? */
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
}
/* return (any) remaining runtime */
cfs_b->runtime = runtime;
......
}