Linux时间管理之clocksource

1950阅读 0评论2015-06-23 九阳神功爱喝茶
分类:LINUX

   前面提到了Linux下与时间相关的硬件:TSC,PIT,HPET,ACPI_PM。这些硬件以一定的频率产生时钟中断,来帮助我们计时。Linux为了管理这些硬件,抽象出了clocksource。
  1. struct clocksource {
  2.     /*
  3.      * Hotpath data, fits in a single cache line when the
  4.      * clocksource itself is cacheline aligned.
  5.      */
  6.     cycle_t (*read)(struct clocksource *cs);
  7.     cycle_t cycle_last;
  8.     cycle_t mask;
  9.     u32 mult;
  10.     u32 shift;
  11.     u64 max_idle_ns;
  12.     u32 maxadj;
  13. #ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
  14.     struct arch_clocksource_data archdata;
  15. #endif

  16.     const char *name;
  17.     struct list_head list;
  18.     int rating;
  19.     int (*enable)(struct clocksource *cs);
  20.     void (*disable)(struct clocksource *cs);
  21.     unsigned long flags;
  22.     void (*suspend)(struct clocksource *cs);
  23.     void (*resume)(struct clocksource *cs);

  24.     /* private: */
  25. #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
  26.     /* Watchdog related data, used by the framework */
  27.     struct list_head wd_list;
  28.     cycle_t cs_last;
  29.     cycle_t wd_last;
  30. #endif
  31. } ____cacheline_aligned;
    这些参数当中,比较重要的是rating,shift,mult。其中rating在上一篇博文提到了:
  • 1--99: 不适合于用作实际的时钟源,只用于启动过程或用于测试;
  • 100--199:基本可用,可用作真实的时钟源,但不推荐;
  • 200--299:精度较好,可用作真实的时钟源;
  • 300--399:很好,精确的时钟源;
  • 400--499:理想的时钟源,如有可能就必须选择它作为时钟源;
   我们基本在前面看到:    
  1. include/linux/acpi_pmtmr.h
  2. ------------------------------------------
  3. #define PMTMR_TICKS_PER_SEC 3579545

  4. drivers/clocksource/acpi_pm.c
  5. ---------------------------------------------
  6. static struct clocksource clocksource_acpi_pm = {
  7.           .name = "acpi_pm",
  8.           .rating = 200,
  9.           .read = acpi_pm_read,
  10.           .mask = (cycle_t)ACPI_PM_MASK,
  11.           .mult = 0, /*to be calculated*/
  12.           .shift = 22,
  13.           .flags = CLOCK_SOURCE_IS_CONTINUOUS,
  14.  
  15.  };

  16. dmesg output
  17. ------------------------
  18. [ 0.664201] hpet0: 8 comparators, 64-bit 14.318180 MHz counter

  19. arch/x86/kernel/hpet.c
  20. --------------------------------
  21. static struct clocksource clocksource_hpet = {
  22.     .name = "hpet",
  23.     .rating = 250,
  24.     .read = read_hpet,
  25.     .mask = HPET_MASK,
  26.     .flags = CLOCK_SOURCE_IS_CONTINUOUS,
  27.     .resume = hpet_resume_counter,
  28. #ifdef CONFIG_X86_64
  29.     .archdata = { .vclock_mode = VCLOCK_HPET },
  30. #endif
  31. };


  32. dmesg output:
  33. -----------------------------
  34. [ 0.004000] Detected 2127.727 MHz processor.


  35. arch/x86/kernel/tsc.c
  36. --------------------------------------
  37. static struct clocksource clocksource_tsc = {
  38.     .name = "tsc",
  39.     .rating = 300,
  40.     .read = read_tsc,
  41.     .resume = resume_tsc,
  42.     .mask = CLOCKSOURCE_MASK(64),
  43.     .flags = CLOCK_SOURCE_IS_CONTINUOUS |
  44.                   CLOCK_SOURCE_MUST_VERIFY,
  45. #ifdef CONFIG_X86_64
  46.     .archdata = { .vclock_mode = VCLOCK_TSC },
  47. #endif
  48. };
    从上面可以看到,acpi_pm,hpet tsc的rating分别是200,250,300,他们的rating基本是和他们的frequency符合,TSC以2127.727MHz的频率技压群雄,等级rating=300最高,被选择成current_clocksource:
  1. root@manu:~# cat /sys/devices/system/clocksource/clocksource0/available_clocksource
  2. tsc hpet acpi_pm
  3. root@manu:~# cat /sys/devices/system/clocksource/clocksource0/current_clocksource
  4. tsc
    除此外,还有两个参数shift和mult,这两个参数是干啥的呢?
   我们想一下,假如给你一个以一定频率输出中断的硬件,你如何计时?比如我有一个频率是1000Hz的硬件,当前时钟源计数值是3500,过了一段时间,我抬头看了下时钟源计数值是5500,过去了2000 cycles,我就知道过去了2000/1000 = 2 second。
  1.  times_elapse = cycles_interval / frequency 
    从上面的例子中,我抬头看了下当前计数值这个肯定是瞎掰了,实际上要想获取时钟源还是需要和硬件打交道的。在clocksource中有一个成员变量是read,这个就是一个时钟源注册的时候,提供的一个函数,如果你想获得我的当前计数值,请调用这个read 函数。以TSC时钟为例:
  1. static struct clocksource clocksource_tsc = {
  2.     .name = "tsc",
  3.     .rating = 300,
  4.     .read = read_tsc,
  5.     .resume = resume_tsc,
  6.     .mask = CLOCKSOURCE_MASK(64),
  7.     .flags = CLOCK_SOURCE_IS_CONTINUOUS |
  8.                   CLOCK_SOURCE_MUST_VERIFY,
  9. #ifdef CONFIG_X86_64
  10.     .archdata = { .vclock_mode = VCLOCK_TSC },
  11. #endif
  12. };

  13. /*--------- arch/x86/kernel/tsc.c -------------------*/
  14. static cycle_t read_tsc(struct clocksource *cs)
  15. {
  16.     cycle_t ret = (cycle_t)get_cycles();

  17.     return ret >= clocksource_tsc.cycle_last ?
  18.         ret : clocksource_tsc.cycle_last;
  19. }


  20. /*------- arch/x86/include/asm/tsc.h----------------------*/
  21. static inline cycles_t get_cycles(void)
  22. {
  23.     unsigned long long ret = 0;

  24. #ifndef CONFIG_X86_TSC
  25.     if (!cpu_has_tsc)
  26.         return 0;
  27. #endif
  28.     rdtscll(ret);

  29.     return ret;
  30. }

  31. /*------arch/x86/include/asm/msr.h-----------------*/
  32. #define rdtscll(val)                        \
  33.     ((val) = __native_read_tsc())


  34. static __always_inline unsigned long long __native_read_tsc(void)
  35. {
  36.     DECLARE_ARGS(val, low, high);

  37.     asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));

  38.     return EAX_EDX_VAL(val, low, high);
  39. }
    根据这个脉络,我们知道,最终就是rdtsc这条指令来获取当前计数值cycles。rdtsc这条指令我前面有博文介绍(摸我)。
    扯了半天read这个成员变量,可以回到shift和mult了。其实shift和mult是为了解决下面这个公式的:
  1. times_elapse = cycles_interval / frequency
   就像上面的公式,有频率就足以计时了。为啥弄出来个shift和mult?原因在于kernel搞个除法不太方便,必须转化成乘法和移位。Kernel中有很多这种把除法转化成乘法的样例。那么公式变成了:
  1. times_elapse = cycles_interval * mult >> shift
   Kernel用乘法+移位来替换除法:根据cycles来计算过去了多少ns。
  1. /**
  2.  * clocksource_cyc2ns - converts clocksource cycles to nanoseconds
  3.  * @cycles:    cycles
  4.  * @mult:    cycle to nanosecond multiplier
  5.  * @shift:    cycle to nanosecond divisor (power of two)
  6.  *
  7.  * Converts cycles to nanoseconds, using the given mult and shift.
  8.  *
  9.  * XXX - This could use some mult_lxl_ll() asm optimization
  10.  */
  11. static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift)
  12. {
  13.     return ((u64) cycles * mult) >> shift;
  14. }
    单纯从精度上讲,肯定是mult越大越好,但是计算过程可能溢出,所以mult也不能无限制的大,这个计算中有个magic number 600 :
  1. void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
  2. {
  3.     u64 sec;
  4.     /*
  5.      * Calc the maximum number of seconds which we can run before
  6.      * wrapping around. For clocksources which have a mask > 32bit
  7.      * we need to limit the max sleep time to have a good
  8.      * conversion precision. 10 minutes is still a reasonable
  9.      * amount. That results in a shift value of 24 for a
  10.      * clocksource with mask >= 40bit and f >= 4GHz. That maps to
  11.      * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
  12.      * margin as we do in clocksource_max_deferment()
  13.      */
  14.     sec = (cs->mask - (cs->mask >> 3));
  15.     do_div(sec, freq);
  16.     do_div(sec, scale);
  17.     if (!sec)
  18.         sec = 1;
  19.     else if (sec > 600 && cs->mask > UINT_MAX)
  20.         sec = 600;

  21.     clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
  22.              NSEC_PER_SEC / scale, sec * scale);

  23.     /*
  24.      * for clocksources that have large mults, to avoid overflow.
  25.      * Since mult may be adjusted by ntp, add an safety extra margin
  26.      *
  27.      */
  28.     cs->maxadj = clocksource_max_adjustment(cs);
  29.     while ((cs->mult + cs->maxadj < cs->mult)
  30.         || (cs->mult - cs->maxadj > cs->mult)) {
  31.         cs->mult >>= 1;
  32.         cs->shift--;
  33.         cs->maxadj = clocksource_max_adjustment(cs);
  34.     }

  35.     cs->max_idle_ns = clocksource_max_deferment(cs);
  36. }
    这个600的意思是600秒,表示的Timer两次计算当前计数值的差不会超过10分钟。主要考虑的是系统进入IDLE状态之后,时间信息不会被更新,10分钟内只要退出IDLE,clocksource还是可以成功的转换时间。当然了,最后的这个时间不一定就是10分钟,它由clocksource_max_deferment计算并将结果存储在max_idle_ns中.
   
    筒子比较关心的问题是如何计算 ,精度如何,其实我不太喜欢这种计算,Kernel总是因为某些原因把代码写的很蛋疼.反正揣摩代码意图要花不少时间,收益嘛其实也不太大.如何实现我也不解释了,我以TSC为例子我评估下这种mult+shift的精度.
  1. #include<stdio.h>
  2. #include<stdlib.h>

  3. typedef unsigned int u32;
  4. typedef unsigned long long u64;

  5. #define NSEC_PER_SEC 1000000000L

  6. void
  7. clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
  8. {
  9.     u64 tmp;
  10.     u32 sft, sftacc= 32;

  11.     /*
  12.      * * Calculate the shift factor which is limiting the conversion
  13.      * * range:
  14.      * */
  15.     tmp = ((u64)maxsec * from) >> 32;
  16.     while (tmp) {
  17.             tmp >>=1;
  18.             sftacc--;
  19.         }

  20.     /*
  21.      * * Find the conversion shift/mult pair which has the best
  22.      * * accuracy and fits the maxsec conversion range:
  23.      * */
  24.     for (sft = 32; sft > 0; sft--) {
  25.             tmp = (u64) to << sft;
  26.             tmp += from / 2;
  27.             //do_div(tmp, from);
  28.             tmp = tmp/from;
  29.             if ((tmp >> sftacc) == 0)
  30.                 break;
  31.         }
  32.     *mult = tmp;
  33.     *shift = sft;
  34. }


  35. int main()
  36. {

  37.     u32 tsc_mult;
  38.     u32 tsc_shift ;

  39.     
  40.     u32 tsc_frequency = 2127727000/1000; //TSC frequency(KHz)
  41.     clocks_calc_mult_shift(&tsc_mult,&tsc_shift,tsc_frequency,NSEC_PER_SEC/1000,600*1000); //NSEC_PER_SEC/1000是因为TSC的注册是clocksource_register_khz

  42.     fprintf(stderr,"mult = %d shift = %d\n",tsc_mult,tsc_shift);
  43.     return 0;
  44. }
    600是根据TSC clocksource的MASK算出来的入参,感兴趣可以自己推算看下结果:
  1. mult = 7885042 shift = 24
    root@manu:~/code/c/self/time# python
    Python 2.7.3 (default, Apr 10 2013, 05:46:21) 
    [GCC 4.6.3] on linux2
    Type "help", "copyright", "credits" or "license" for more information.
    >>> (2127727000*7885042)>>24
  2. 1000000045L
    >>> 
    我们知道TSC的frequency是2127727000Hz,如果cycle走过2127727000,就意味过去了1秒,或者说10^9(ns).按照我们的算法得出的时间是1000000045ns,即每秒多算了45ns. 这个误差是多大呢,每走10^9秒,误差是45秒,换句话说,运行257天,产生1秒的计算误差.考虑到NTP的存在,这个运算精度还可以了.
   接下来是注册和各大clocksource PK.
   各大clocksource会调用clocksource_register_khz或者clocksource_register_hz来注册. 
  1. HPET (arch/x86/kernel/hpet.c)
  2. ----------------------------------------
  3. hpet_enable
  4. |_____hpet_clocksource_register
  5.            |_____clocksource_register_hz

  6. TSC  (arch/x86/kernel/tsc.c)
  7. ----------------------------------------
  8. device_initcall(init_tsc_clocksource);

  9. init_tsc_clocksource
  10. |_____clocksource_register_khz


  11. ACPI_PM(drivers/clocksource/acpi_pm.c)
  12. -------------------------------------------
  13. fs_initcall(init_acpi_pm_clocksource);

  14. init_acpi_pm_clocksource
  15. |_____clocksource_register_hz
    最终都会调用__clocksource_register_scale. 
  1. int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
  2. {

  3.     /* Initialize mult/shift and max_idle_ns */
  4.     __clocksource_updatefreq_scale(cs, scale, freq);

  5.     /* Add clocksource to the clcoksource list */
  6.     mutex_lock(&clocksource_mutex);
  7.     clocksource_enqueue(cs);
  8.     clocksource_enqueue_watchdog(cs);
  9.     clocksource_select();
  10.     mutex_unlock(&clocksource_mutex);
  11.     return 0;
  12. }
   第一个函数是__clocksource_updatefreq_scale,计算shift,mult还有max_idle_ns,前面讲过了.
    clocksource_enqueue将clocksource链入全局链表.根据的是rating,rating高的放前面. 
   
clocksource_select会选择最好的clocksource记录在全局变量curr_clocksource,同时会通知timekeeping切换到最好的clocksource,log如下:
  1. manu@manu:~$ dmesg|grep Switching
  2. [ 0.673002] Switching to clocksource hpet
  3. [ 1.720643] Switching to clocksource tsc
      clocksource_enqueue_watchdog会将clocksource挂到watchdog链表.watchdog顾名思义,监控所有clocksource:
  1. #define WATCHDOG_INTERVAL (HZ >> 1)
  2. #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
      如果0.5秒内,误差大于0.0625s,表示这个clocksource精度极差,将rating设成0.

      总算可以睡觉了.亲下我家小宝宝 去睡觉.

参考文献:
Linux时间子系统之一:clock source(时钟源)
2 Linux 3.4.61 source code.

上一篇:Linux时间管理之hardware
下一篇:跟我一起学Load Balance(1)