TCP/IP学习(40)——Kernel中路由表的实现(3)

2450阅读 0评论2013-10-17 瀚海书香
分类:LINUX

本文的copyleft归gfree.wind@gmail.com所有,使用GPL发布,可以自由拷贝,转载。但转载请保持文档的完整性,注明原作者及原链接,严禁用于任何商业用途。
作者:gfree.wind@gmail.com
博客:linuxfocus.blog.chinaunix.net
   

今天开始学习路由表的另一种实现方式trie。
先看关键的数据结构,然后通过查找函数,来理解trie的路由表组织形式。

1. 路由节点
  1. struct rt_trie_node {
  2.     /*
  3.     该变量进行了复用。
  4.     对于32位机来说,每个节点的地址都是4字节对齐的。所以路由表中节点地址的后两位为0.
  5.     这样parent的最低位就用作了标志位,0为中间节点,1为叶子节点。
  6.     */
  7.     unsigned long parent;
  8.     /* t_key实际上为无符号整形变量,对于ipv4来说,就是IP地址 */
  9.     t_key key;
  10. };

/* 下面的宏就是用于获取父节点的类型 */
#define T_TNODE 0
#define T_LEAF  1
#define NODE_TYPE_MASK  0x1UL
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

2. 叶子节点
  1. struct leaf {
  2.     /* 同上 */
  3.     unsigned long parent;
  4.     t_key key;
  5.     struct hlist_head list;
  6.     struct rcu_head rcu;
  7. };
注意:每一个route条目都对应一个叶子节点。


3. 叶子信息
  1. struct leaf_info {
  2.     struct hlist_node hlist;
  3.     struct rcu_head rcu;
  4.     int plen;
  5.     struct list_head falh;
  6. };

4. 中间节点
  1. struct tnode {
  2.     unsigned long parent;
  3.     t_key key;
  4.     /* pos表示从何处开始检查子节点的匹配,即pos之前的位的值为该节点的key有效位 */
  5.     unsigned char pos; /* 2log(KEYLENGTH) bits needed */
  6.     /* bits表示子节点的个数 */
  7.     unsigned char bits; /* 2log(KEYLENGTH) bits needed */
  8.     unsigned int full_children; /* KEYLENGTH bits needed */
  9.     unsigned int empty_children; /* KEYLENGTH bits needed */
  10.     union {
  11.         struct rcu_head rcu;
  12.         struct work_struct work;
  13.         struct tnode *tnode_free;
  14.     };
     /* 子节点的柔性数组 */
  1.     struct rt_trie_node *child[0];
  2. };
5. 根节点
  1. struct trie {
  2.     struct rt_trie_node *trie;
  3. #ifdef CONFIG_IP_FIB_TRIE_STATS
  4.     struct trie_use_stats stats;
  5. #endif
  6. };


下面看查找函数
  1. int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
  2.          struct fib_result *res, int fib_flags)
  3. {
  4.     /* 与hash方式相同,tb->tb_data为实际的trie */
  5.     struct trie *t = (struct trie *) tb->tb_data;
  6.     int ret;
  7.     struct rt_trie_node *n;
  8.     struct tnode *pn;
  9.     unsigned int pos, bits;
  10.     /* 将目的地址转为主机序 */
  11.     t_key key = ntohl(flp->daddr);
  12.     unsigned int chopped_off;
  13.     t_key cindex = 0;
  14.     unsigned int current_prefix_length = KEYLENGTH;
  15.     struct tnode *cn;
  16.     t_key pref_mismatch;

  17.     rcu_read_lock();

  18.     n = rcu_dereference(t->trie);
  19.     if (!n)
  20.         goto failed;

  21. #ifdef CONFIG_IP_FIB_TRIE_STATS
  22.     t->stats.gets ;
  23. #endif

  24.     /* Just a leaf? */
  25.     if (IS_LEAF(n)) {
  26.         /* 
  27.         只有一个叶子节点
  28.         叶子节点保存到达某一地址的所有route信息,通过check_leaf判断地址是否匹配,并从中找出是否有匹配
  29.         的路由信息。
  30.         */
  31.         ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
  32.         goto found;
  33.     }

  34.     pn = (struct tnode *) n;
  35.     chopped_off = 0;

  36.     while (pn) {
  37.         pos = pn->pos;
  38.         bits = pn->bits;

  39.         if (!chopped_off) {
  40.             /*
  41.             mask_pfx对key取current_prefix_length的掩码
  42.             tkey_extract_bits从第一个参数的第pos位,取bits位的值
  43.             并将这个值作为孩子节点的索引。 
  44.             */
  45.             cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
  46.                          pos, bits);
  47.         }
         
         /* 以cindex为索引,取得pn该中间节点的孩子节点 */
  1.         n = tnode_get_child_rcu(pn, cindex);

  2.         if (n == NULL) {
  3.             /* 没有子节点, 需要回溯 */
  4. #ifdef CONFIG_IP_FIB_TRIE_STATS
  5.             t->stats.null_node_hit ;
  6. #endif
  7.             goto backtrace;
  8.         }

  9.         if (IS_LEAF(n)) {
  10.             /* 该节点为叶子节点,使用check_leaf来检测是否有对应的route */
  11.             ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
  12.             if (ret > 0)
  13.                 goto backtrace; //没有找到,回溯
             //找到了route
  1.             goto found;
  2.         }
          
         /* 仍然为中间节点 */
  1.         cn = (struct tnode *)n;

  2.         /*
  3.          * It's a tnode, and we can do some extra checks here if we
  4.          * like, to avoid descending into a dead-end branch.
  5.          * This tnode is in the parent's child array at index
  6.          * key[p_pos..p_pos p_bits] but potentially with some bits
  7.          * chopped off, so in reality the index may be just a
  8.          * subprefix, padded with zero at the end.
  9.          * We can also take a look at any skipped bits in this
  10.          * tnode - everything up to p_pos is supposed to be ok,
  11.          * and the non-chopped bits of the index (se previous
  12.          * paragraph) are also guaranteed ok, but the rest is
  13.          * considered unknown.
  14.          *
  15.          * The skipped bits are key[pos bits..cn->pos].
  16.          */

  17.         /* If current_prefix_length < pos bits, we are already doing
  18.          * actual prefix matching, which means everything from
  19.          * pos (bits-chopped_off) onward must be zero along some
  20.          * branch of this subtree - otherwise there is *no* valid
  21.          * prefix present. Here we can only check the skipped
  22.          * bits. Remember, since we have already indexed into the
  23.          * parent's child array, we know that the bits we chopped of
  24.          * *are* zero.
  25.          */

  26.         /* NOTA BENE: Checking only skipped bits
  27.          for the new node here */

  28.         if (current_prefix_length < pos+bits) {
  29.             /*
  30.             提前判断是否需要回溯
  31.             1. 相当于该节点对应的route的prefix都比要查找的目的地址的prefi小,所以肯定不匹配;
  32.             2. 该节点没有子节点,那么自然不匹配,需要回溯;
  33.             */
  34.             if (tkey_extract_bits(cn->key, current_prefix_length,
  35.                         cn->pos - current_prefix_length)
  36.              || !(cn->child[0]))
  37.                 goto backtrace;
  38.         }

  39.         /*
  40.          * If chopped_off=0, the index is fully validated and we
  41.          * only need to look at the skipped bits for this, the new,
  42.          * tnode. What we actually want to do is to find out if
  43.          * these skipped bits match our key perfectly, or if we will
  44.          * have to count on finding a matching prefix further down,
  45.          * because if we do, we would like to have some way of
  46.          * verifying the existence of such a prefix at this point.
  47.          */

  48.         /* The only thing we can do at this point is to verify that
  49.          * any such matching prefix can indeed be a prefix to our
  50.          * key, and if the bits in the node we are inspecting that
  51.          * do not match our key are not ZERO, this cannot be true.
  52.          * Thus, find out where there is a mismatch (before cn->pos)
  53.          * and verify that all the mismatching bits are zero in the
  54.          * new tnode's key.
  55.          */

  56.         /*
  57.          * Note: We aren't very concerned about the piece of
  58.          * the key that precede pn->pos pn->bits, since these
  59.          * have already been checked. The bits after cn->pos
  60.          * aren't checked since these are by definition
  61.          * "unknown" at this point. Thus, what we want to see
  62.          * is if we are about to enter the "prefix matching"
  63.          * state, and in that case verify that the skipped
  64.          * bits that will prevail throughout this subtree are
  65.          * zero, as they have to be if we are to find a
  66.          * matching prefix.
  67.          */
         /* 计算pos位以前的不匹配位 */
  1.         pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);

  2.         /*
  3.          * In short: If skipped bits in this node do not match
  4.          * the search key, enter the "prefix matching"
  5.          * state.directly.
  6.          */
  7.         if (pref_mismatch) {
  8.             /* 
  9.             fls找到第一个为1的位,即第一个不匹配的位
  10.             那么mp即为所有前面匹配的位的个数。
  11.             */
  12.             int mp = KEYLENGTH - fls(pref_mismatch);
              
             /*
             这个节点从mp位到pos位之间的bits,有为1的位,则这个节点以及子节点肯定不匹配,
             需要回溯
             */
  1.             if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
  2.                 goto backtrace;
              
             /* 
             该节点mp位到pos位以后的位均为0,且当前的prefix大于pos,那么prefix需要缩短到mp。
             即该节点仍然可能匹配
             */
  1.             if (current_prefix_length >= cn->pos)
  2.                 current_prefix_length = mp;
  3.         }
         
         /* 继续向下匹配 */
  1.         pn = (struct tnode *)n; /* Descend */
  2.         chopped_off = 0;
  3.         continue;

  4. backtrace:
  5.         chopped_off++;

         /* 
         放弃选定的子节点,寻找最长匹配的子节点。
         如之前选择的子节点索引为7(111),那么尝试索引为6的子节点(110)
          */
  1.         /* As zero don't change the child key (cindex) */
  2.         while ((chopped_off <= pn->bits)
  3.          && !(cindex & (1<<(chopped_off-1))))
  4.             chopped_off++;
  1.         /* Decrease current_... with bits chopped off */
  2.         /* 根据chopped_off,调整prefix长度*/
  3.         if (current_prefix_length > pn->pos+pn->bits - chopped_off)
  4.             current_prefix_length = pn->pos+pn->bits
  5.                 - chopped_off;

  6.         /*
  7.          * Either we do the actual chop off according or if we have
  8.          * chopped off all bits in this tnode walk up to our parent.
  9.          */

  10.         if (chopped_off <= pn->bits) {
  11.             /* 找到了另外一个合适的子节点 */
  12.             cindex &= ~(1 << (chopped_off-1));
  13.         } else {
  14.             /* 该节点的所有子节点都不合适,则向上回溯一个节点,然后重复最长匹配的操作 */
  15.             struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
  16.             if (!parent)
  17.                 goto failed;

  18.             /* Get Child's index */
  19.             cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
  20.             pn = parent;
  21.             chopped_off = 0;

  22. #ifdef CONFIG_IP_FIB_TRIE_STATS
  23.             t->stats.backtrack ;
  24. #endif
  25.             goto backtrace;
  26.         }
  27.     }
  28. failed:
  29.     ret = 1;
  30. found:
  31.     rcu_read_unlock();
  32.     return ret;
  33. }

基本查找的函数就是这样,通过分析查找函数,对于kernel的路由表的trie实现结构,基本上有了比较清晰的结构。但是仅仅看上面的代码,还不能完全反映出tries的结构组织。

后文,我会详细的描写tries的路由表的数据组织结构


参考文章http://blog.csdn.net/dog250/article/details/6596046 该文虽然没有具体代码,但是对于trie的原理讲解的还是很清楚的,推荐一看。

上一篇:Suse 11 SP2 修改运行进程的limits
下一篇:TCP/IP学习(41)——Kernel中路由表的实现(4)