I,idle进程的产生
idle进程是进程号为0的进程。我们通过ps可以看到最小进程号的进程是init,那是idle进程的儿子。看代码:
- // /init/main.c
-
- asmlinkage void __init start_kernel(void)
-
{
- ......
- /* Do the rest non-__init'ed, we're now alive */
-
rest_init();
-
}
-
-
// /init/main.c
-
- static noinline void __init_refok rest_init(void)
- {
- int pid;
- /*
- * We need to spawn init first so that it obtains pid 1, however
- * the init task will end up wanting to create kthreads, which, if
- * we schedule it before we create kthreadd, will OOPS.
- */
-
kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
-
- ....
- /* Call into cpu_idle with preempt disabled */
- cpu_idle();
-
}
- // /arch/x86/kernel/process_32.c
- /*
- * The idle thread. There's no useful work to be
- * done, so just try to conserve power and have a
- * low exit latency (ie sit in a loop waiting for
- * somebody to say that they'd like to reschedule)
-
*/
-
- void cpu_idle(void)
-
{
- /* endless idle loop with no priority at all */
- while (1) {
- stop_critical_timings();
- pm_idle();
- start_critical_timings();
-
}
- }
II,cpu_idle driver而cpu_idle()函数的核心是pm_idle()函数。pm_idle是一个全局函数指针变量,它首先会在identify_cpu()函数中,即系统识别CPU时被赋值,默认指向default_idle()函数。接下来cpu_idle这支driver会根据CPU的类型和内核中存在的idle driver来重新设定pm_idle全局变量。
确切的说,cpu_idle是一个驱动框架,在此框架下,不同的idle驱动可以根据硬件不同和需求不同来编写不同进入和退出Cx的行为。
1,核心数据结构首先是cpuidle_state,
- struct cpuidle_state {
- char name[CPUIDLE_NAME_LEN];
- char desc[CPUIDLE_DESC_LEN];
- void *driver_data;
- unsigned int flags;
- unsigned int exit_latency; /* in US */
- unsigned int power_usage; /* in mW */
- unsigned int target_residency; /* in US */
- unsigned long long usage;
- unsigned long long time; /* in US */
- int (*enter) (struct cpuidle_device *dev,
- struct cpuidle_state *state);
- };
然后是cpuidle_device,
- struct cpuidle_device {
- unsigned int registered:1;
- unsigned int enabled:1;
- unsigned int power_specified:1;
- unsigned int cpu;
- int last_residency;
- int state_count;
- struct cpuidle_state states[CPUIDLE_STATE_MAX];
- struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX];
- struct cpuidle_state *last_state;
- struct list_head device_list;
- struct kobject kobj;
- struct completion kobj_unregister;
- void *governor_data;
- struct cpuidle_state *safe_state;
- int (*prepare) (struct cpuidle_device *dev);
- };
另外,还有一个注册driver用的cpuidle_driver,
- /****************************
- * CPUIDLE DRIVER INTERFACE *
- ****************************/
- struct cpuidle_driver {
- char name[CPUIDLE_NAME_LEN];
- struct module *owner;
- };
2.2 cpuidle_register_driver(struct cpuidle_driver *drv)idle driver使用该接口将struct cpuidle_driver注册到cpu_idle框架中。查看cpuidle_register_driver的代码可知,cpuidle不允许多支idle driver同时运行。
3,核心流程上面提到pm_idle全局变量在cpuidle_register_device()函数中被赋为cpuidle_idle_call(),那么每次idle进程被运行(准确地说,idle进程并不参与调度,而是当可运行进程列表为空时才被运行),都会进入cpuidle_idle_call()函数。
- /**
- * cpuidle_idle_call - the main idle loop
- *
- * NOTE: no locks or semaphores should be used here
- */
- static void cpuidle_idle_call(void)
- {
- struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
- struct cpuidle_state *target_state;
- int next_state;
- /* check if the device is ready */
- if (!dev || !dev->enabled) {
- if (pm_idle_old)
- pm_idle_old();
- else
- #if defined(CONFIG_ARCH_HAS_DEFAULT_IDLE)
- default_idle();
- #else
- local_irq_enable();
- #endif
- return;
- }
- if (dev->prepare)
- dev->prepare(dev);
- /* ask the governor for the next state */
- next_state = cpuidle_curr_governor->select(dev);
- if (need_resched()) {
- local_irq_enable();
- return;
- }
- target_state = &dev->states[next_state];
- /* enter the state and update stats */
- dev->last_state = target_state;
- trace_power_start(POWER_CSTATE, next_state, dev->cpu);
- trace_cpu_idle(next_state, dev->cpu);
- dev->last_residency = target_state->enter(dev, target_state);
- trace_power_end(dev->cpu);
- trace_cpu_idle(PWR_EVENT_EXIT, dev->cpu);
- if (dev->last_state)
- target_state = dev->last_state;
- target_state->time += (unsigned long long)dev->last_residency;
- target_state->usage++;
- /* give the governor an opportunity to reflect on the outcome */
- if (cpuidle_curr_governor->reflect)
- cpuidle_curr_governor->reflect(dev);
- }
4,其它idle进程在每个CPU核中都有一个。事实上cpu_idle()函数除了在start_kernel()中出现外,另一个出现的地方就是smpboot.c中start_secondary()函数的末尾。而我们通过上面的代码也可以看到,cpuidle_idle_call()函数使用了__this_cpu_read()来获取当前CPU的cpuidle_devices变量。
IV,idle driver前面说过,cpuidle只是一个框架,需要更底层idle driver来执行实际的硬件操作。在目前的kernel中,存在两支idle driver:ACPI_idle和intel_idle。
ACPI_idle driver是支持ACPI的系统中默认的idle driver,按照ACPI spec实现。intel_idle driver则是为Intel的一些较新的架构(如Atom, Nehalem(i5,i7))所写的idle driver,因为Intel的CPU在电源管理部分存在很多ACPI spec之外的东西,例如C3之后的state扩展,以及更高效的进入和离开Cx state的方法。
我们简单了解一下其具体实现。1,intel idle driver在Nehalem平台下的实现
- static struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = {
- { /* MWAIT C0 */ },
- { /* MWAIT C1 */
- .name = "C1-NHM",
- .desc = "MWAIT 0x00",
- .driver_data = (void *) 0x00,
- .flags = CPUIDLE_FLAG_TIME_VALID,
- .exit_latency = 3,
- .target_residency = 6,
- .enter = &intel_idle },
- { /* MWAIT C2 */
- .name = "C3-NHM",
- .desc = "MWAIT 0x10",
- .driver_data = (void *) 0x10,
- .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
- .exit_latency = 20,
- .target_residency = 80,
- .enter = &intel_idle },
- { /* MWAIT C3 */
- .name = "C6-NHM",
- .desc = "MWAIT 0x20",
- .driver_data = (void *) 0x20,
- .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
- .exit_latency = 200,
- .target_residency = 800,
- .enter = &intel_idle },
- };
- static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)
- {
- unsigned long ecx = 1; /* break on interrupt flag */
- unsigned long eax = (unsigned long)cpuidle_get_statedata(state);
- unsigned int cstate;
- ktime_t kt_before, kt_after;
- s64 usec_delta;
- int cpu = smp_processor_id();
- cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;
- local_irq_disable();
- /*
- * leave_mm() to avoid costly and often unnecessary wakeups
- * for flushing the user TLB's associated with the active mm.
- */
- if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
- leave_mm(cpu);
- if (!(lapic_timer_reliable_states & (1 << (cstate))))
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
- kt_before = ktime_get_real();
- stop_critical_timings();
- if (!need_resched()) {
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __mwait(eax, ecx);
- }
- start_critical_timings();
- kt_after = ktime_get_real();
- usec_delta = ktime_to_us(ktime_sub(kt_after, kt_before));
- local_irq_enable();
- if (!(lapic_timer_reliable_states & (1 << (cstate))))
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
- return usec_delta;
- }
按照Oracle的这篇文章来看,这种进入Cx的方式有较低的唤醒延迟。
值得注意的是,如果内核使用intel_idle,即使你通过BIOS或者内核参数关闭ACPI,或者禁用了某个Cx state,
intel_idle一样会工作。如果要禁用intel_idle,可使用内核参数"intel_idle.max_cstate=0"
网上有人碰到这样的问题:
机器响应很迟钝,打开ACPI时,发现是acpi_idle_enter_bm()吃掉了大多数时间,关闭ACPI,发现是mwait_idle()吃掉了大多数时间。