socket创建过程

3240阅读 0评论2013-12-10 liubangbo
分类:LINUX

对于网络编程程序员来说,sockfd = socket(AF_INET, SOCK_DGRAM, 0); 这行代码是最熟悉不过的了,但这行代码的背后是......

1. socket这个api是库函数,我们直接调用就可以了,调用之后,产生0x80号软中断,linux系统由用户态切换到内核态,接着执行系统调用函数,在内核态执行相应的服务例程,针对socket这个函数,服务例程
   是sys_socket函数。至于这个过程是怎么实现的,在这里不阐述。下面我们分析sys_socket函数,看socket是怎么创建的。

2. 在分析sys_socket函数之前,我们先看一下sock_init初始化过程
   

点击(此处)折叠或打开

  1. static int __init sock_init(void)
  2. {
  3.     /*
  4.      * Initialize sock SLAB cache.
  5.      */

  6.     sk_init(); 

  7.     /*
  8.      * Initialize skbuff SLAB cache
  9.      */
  10.     skb_init();

  11.     /*
  12.      * Initialize the protocols module.
  13.      */

  14.     init_inodecache();   //在这里创建了名为sock_inode_cache的cache
  15.     register_filesystem(&sock_fs_type);
  16.     sock_mnt = kern_mount(&sock_fs_type);

  17.     /* The real protocol initialization is performed in later initcalls.
  18.      */

  19. #ifdef CONFIG_NETFILTER
  20.     netfilter_init();
  21. #endif

  22.     return 0;
  23. }



     /*
      * Container that holds a struct socket and a struct inode in one object.
      * This is the object size used for the "sock_inode_cache" slab cache, and
      * SOCKET_I() uses container_of() on the vfs_inode member to get back to
      * the socket member.
      */
     struct socket_alloc {
               struct socket socket;       /* socket half of the pair */
               struct inode vfs_inode;     /* inode half; sock_alloc_inode() returns its address */
     };

  
     /*
      * Create the slab cache named "sock_inode_cache" and store it in
      * sock_inode_cachep.
      *
      * Returns 0 on success, or -ENOMEM when kmem_cache_create() returns NULL.
      */
     static int init_inodecache(void)
     {
        sock_inode_cachep = kmem_cache_create("sock_inode_cache",
                          sizeof(struct socket_alloc),         // each object in the cache is sizeof(struct socket_alloc) bytes
                                                              // author's guess: a dedicated slab cache (rather than ordinary memory) presumably makes working with socket structures faster
                          0,
                          (SLAB_HWCACHE_ALIGN |
                           SLAB_RECLAIM_ACCOUNT |
                           SLAB_MEM_SPREAD),
                          init_once,
                          NULL);
        if (sock_inode_cachep == NULL)
            return -ENOMEM;
        return 0;
     }

     static struct vfsmount *sock_mnt __read_mostly;

     /*
      * File system type descriptor named "sockfs". sock_init() passes it to
      * register_filesystem() and kern_mount(); the result of kern_mount() is
      * kept in sock_mnt.
      */
     static struct file_system_type sock_fs_type = {    
             .name =        "sockfs",
             .get_sb =    sockfs_get_sb,       /* forwards to get_sb_pseudo() with sockfs_ops */
             .kill_sb =    kill_anon_super,
     };
    
     register_filesystem(&sock_fs_type);   //在这里注册了名为sockfs的VFS
     sock_mnt = kern_mount(&sock_fs_type);  //并在这里得到struct vfsmount 结构的sock_mnt变量,这个变量是全局变量,在创建socket的时候会用到

     /*
      * Superblock operations for sockfs. alloc_inode() calls ->alloc_inode
      * whenever the superblock provides one, which is how sock_alloc_inode()
      * gets invoked for socket inodes.
      */
     static struct super_operations sockfs_ops = {
          .alloc_inode =    sock_alloc_inode,      // the function that ultimately allocates the struct socket_alloc object
          .destroy_inode =sock_destroy_inode,
          .statfs =    simple_statfs,
     };

     /*
      * .get_sb callback of sock_fs_type. It forwards to get_sb_pseudo() with
      * the name "socket:", the sockfs_ops operations table and SOCKFS_MAGIC,
      * and returns whatever get_sb_pseudo() returns. The flags, dev_name and
      * data arguments are not used here.
      */
     static int sockfs_get_sb(struct file_system_type *fs_type,
             int flags, const char *dev_name, void *data,
             struct vfsmount *mnt)
     {
          return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC,
                                  mnt);
     }

     /*
      * Allocate one struct socket_alloc from sock_inode_cachep, reset the
      * fields of the embedded struct socket, and return a pointer to the
      * embedded inode. The sb argument is not used in this function.
      *
      * Returns NULL when kmem_cache_alloc() fails.
      */
     static struct inode *sock_alloc_inode(struct super_block *sb)
     {
          struct socket_alloc *ei;

          ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);  // this is where the memory allocation actually happens
          if (!ei)
              return NULL;
          init_waitqueue_head(&ei->socket.wait);

          ei->socket.fasync_list = NULL;          // initialise some fields of the socket structure
          ei->socket.state = SS_UNCONNECTED;
          ei->socket.flags = 0;
          ei->socket.ops = NULL;
          ei->socket.sk = NULL;
          ei->socket.file = NULL;

          /* Callers get the inode; SOCKET_I() maps it back to the socket. */
          return &ei->vfs_inode;
}


3. 前面进行的这些初始化,为后面做好了准备,接着往下看吧:

 
  

点击(此处)折叠或打开

  1. asmlinkage long sys_socket(int family, int type, int protocol)
  2. {
  3.     int retval;
  4.     struct socket *sock;

  5.     retval = sock_create(family, type, protocol, &sock);  //在这个函数完成了socket的创建过程
  6.     if (retval < 0)
  7.         goto out;

  8.     retval = sock_map_fd(sock);  //把创建的socket和文件相关联,
  9.     if (retval < 0)
  10.         goto out_release;

  11. out:
  12.     /* It may be already another descriptor 8) Not kernel problem. */
  13.     return retval;

  14. out_release:
  15.     sock_release(sock);
  16.     return retval;
  17. }
sock_create函数是封装函数,实际调用的是__sock_create函数

点击(此处)折叠或打开

  1. static int __sock_create(int family, int type, int protocol,
  2.              struct socket **res, int kern)
  3. {
  4.     int err;
  5.     struct socket *sock;
  6.     const struct net_proto_family *pf;

  7.     /*
  8.      * Check protocol is in range
  9.      */
  10.     if (family < 0 || family >= NPROTO)
  11.         return -EAFNOSUPPORT;
  12.     if (type < 0 || type >= SOCK_MAX)
  13.         return -EINVAL;

  14.     /* Compatibility.

  15.      This uglymoron is moved from INET layer to here to avoid
  16.      deadlock in module load.
  17.      */
  18.     if (family == PF_INET && type == SOCK_PACKET) {
  19.         static int warned;
  20.         if (!warned) {
  21.             warned = 1;
  22.             printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
  23.              current->comm);
  24.         }
  25.         family = PF_PACKET;
  26.     }

  27.     err = security_socket_create(family, type, protocol, kern);
  28.     if (err)
  29.         return err;

  30.     /*
  31.      *    Allocate the socket and allow the family to set things up. if
  32.      *    the protocol is 0, the family is instructed to select an appropriate
  33.      *    default.
  34.      */
  35.     sock = sock_alloc(); //这个函数调用了初始化时注册的创建socket和inode节点的回调函数,完成了socket和inode节点的创建。在unix和类unix系统中把socket当做文件节点来处理,所以有inode节点
  36.                          //后面我们分析这个函数
  37.     if (!sock) {
  38.         if (net_ratelimit())
  39.             printk(KERN_WARNING "socket: no more sockets\n");
  40.         return -ENFILE;    /* Not exactly a match, but its the
  41.                  closest posix thing */
  42.     }

  43.     sock->type = type;

  44. #if defined(CONFIG_KMOD)
  45.     /* Attempt to load a protocol module if the find failed.
  46.      *
  47.      * 12/09/1996 Marcin: this makes REALLY only sense, if the user
  48.      * requested real, full-featured networking support upon configuration.
  49.      * Otherwise module support will
  50.      */
  51.     if (net_families[family] == NULL)
  52.         request_module("net-pf-%d", family);
  53. #endif

  54.     rcu_read_lock();
  55.     pf = rcu_dereference(net_families[family]);  //根据协议族family得到struct net_proto_family结构,这个net_families数组在inet_init函数中初始化,稍后我们看看这个初始化过程
  56.     err = -EAFNOSUPPORT;
  57.     if (!pf)
  58.         goto out_release;

  59.     /*
  60.      * We will call the ->create function, that possibly is in a loadable
  61.      * module, so we have to bump that loadable module refcnt first.
  62.      */
  63.     if (!try_module_get(pf->owner))
  64.         goto out_release;

  65.     /* Now protected by module ref count */
  66.     rcu_read_unlock();

  67.     err = pf->create(sock, protocol); //在这里创建了庞大的struct sock 结构,并进行了初始化。对于PF_INET,这里调用的就是之前挂入的inet_create函数
  68.     if (err < 0)
  69.         goto out_module_put;

  70.     /*
  71.      * Now to bump the refcnt of the [loadable] module that owns this
  72.      * socket at sock_release time we decrement its refcnt.
  73.      */
  74.     if (!try_module_get(sock->ops->owner))
  75.         goto out_module_busy;

  76.     /*
  77.      * Now that we're done with the ->create function, the [loadable]
  78.      * module can have its refcnt decremented
  79.      */
  80.     module_put(pf->owner);
  81.     err = security_socket_post_create(sock, family, type, protocol, kern);
  82.     if (err)
  83.         goto out_release;
  84.     *res = sock;

  85.     return 0;

  86. out_module_busy:
  87.     err = -EAFNOSUPPORT;
  88. out_module_put:
  89.     sock->ops = NULL;
  90.     module_put(pf->owner);
  91. out_sock_release:
  92.     sock_release(sock);
  93.     return err;

  94. out_release:
  95.     rcu_read_unlock();
  96.     goto out_sock_release;
  97. }
从上面的代码中看到__sock_create函数调用了回调函数完成了socket创建和初始化过程,下面我们看创建socket结构的过程:sock = sock_alloc();

点击(此处)折叠或打开

  1. static struct socket *sock_alloc(void)
  2. {
  3.     struct inode *inode;
  4.     struct socket *sock;

  5.     inode = new_inode(sock_mnt->mnt_sb); //在这里我们看到了sock_init函数中得到的全局变量sock_mnt,稍后看下new_inode函数
  6.     if (!inode)
  7.         return NULL;

  8.     sock = SOCKET_I(inode); //得到了socket结构

  9.     inode->i_mode = S_IFSOCK | S_IRWXUGO;
  10.     inode->i_uid = current->fsuid;
  11.     inode->i_gid = current->fsgid;

  12.     get_cpu_var(sockets_in_use)++;
  13.     put_cpu_var(sockets_in_use);
  14.     return sock;
  15. }
  16. struct inode *new_inode(struct super_block *sb)
    {
        static unsigned long last_ino;
        struct inode * inode;

        spin_lock_prefetch(&inode_lock);
        
        inode = alloc_inode(sb);  //接着看这个函数
        if (inode) {
            spin_lock(&inode_lock);
            inodes_stat.nr_inodes++;
            list_add(&inode->i_list, &inode_in_use);
            list_add(&inode->i_sb_list, &sb->s_inodes);
            inode->i_ino = ++last_ino;
            inode->i_state = 0;
            spin_unlock(&inode_lock);
        }
        return inode;
    }
  17. static struct inode *alloc_inode(struct super_block *sb)
    {
        static const struct address_space_operations empty_aops;
        static struct inode_operations empty_iops;
        static const struct file_operations empty_fops;
        struct inode *inode;

        if (sb->s_op->alloc_inode) //在这里我们看到 if条件满足,因为在sock_init函数中我们挂入了sock_alloc_inode函数,之前我们也看到了sock_alloc_inode函数从slab高速缓存中分配了sizeof(struct socket_alloc)
  18.                                //大小的对象
            inode = sb->s_op->alloc_inode(sb); 
        else
            inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL);

        if (inode) {
            struct address_space * const mapping = &inode->i_data;

            inode->i_sb = sb;
            inode->i_blkbits = sb->s_blocksize_bits;
            inode->i_flags = 0;
            atomic_set(&inode->i_count, 1);
            inode->i_op = &empty_iops;
            inode->i_fop = &empty_fops;
            inode->i_nlink = 1;
            atomic_set(&inode->i_writecount, 0);
            inode->i_size = 0;
            inode->i_blocks = 0;
            inode->i_bytes = 0;
            inode->i_generation = 0;
    #ifdef CONFIG_QUOTA
            memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
    #endif
            inode->i_pipe = NULL;
            inode->i_bdev = NULL;
            inode->i_cdev = NULL;
            inode->i_rdev = 0;
            inode->dirtied_when = 0;
            if (security_inode_alloc(inode)) {
                if (inode->i_sb->s_op->destroy_inode)
                    inode->i_sb->s_op->destroy_inode(inode);
                else
                    kmem_cache_free(inode_cachep, (inode));
                return NULL;
            }

            mapping->a_ops = &empty_aops;
             mapping->host = inode;
            mapping->flags = 0;
            mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
            mapping->assoc_mapping = NULL;
            mapping->backing_dev_info = &default_backing_dev_info;

            /*
             * If the block_device provides a backing_dev_info for client
             * inodes then use that.  Otherwise the inode share the bdev's
             * backing_dev_info.
             */
            if (sb->s_bdev) {
                struct backing_dev_info *bdi;

                bdi = sb->s_bdev->bd_inode_backing_dev_info;
                if (!bdi)
                    bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
                mapping->backing_dev_info = bdi;
            }
            inode->i_private = NULL;
            inode->i_mapping = mapping;
        }
        return inode;
    }

       

从上面的分析中我们就可以很好的理解得到socket结构的过程:根据inode 得到socket
sock = SOCKET_I(inode);  
/*
 * Map an inode to its socket. The inode is taken to be the vfs_inode member
 * of a struct socket_alloc; container_of() recovers the enclosing object and
 * the address of its socket member is returned.
 * NOTE(review): only meaningful for inodes that really are embedded in a
 * struct socket_alloc (e.g. those produced by sock_alloc_inode()).
 */
static inline struct socket *SOCKET_I(struct inode *inode)
{
    return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}

4. 现在创建socket结构的过程也就完成了,下面我们看看创建struct sock结构的过程
   在inet_init函数中,
   (void)sock_register(&inet_family_ops);
  /*
   * Protocol family descriptor for PF_INET, handed to sock_register().
   * __sock_create() looks the entry up through net_families[family] and
   * calls its ->create hook, which here is inet_create().
   */
  static struct net_proto_family inet_family_ops = {
           .family = PF_INET,
           .create = inet_create,
          .owner    = THIS_MODULE,
  };
在这里我们看到了挂入的过程,net_families数组以family为下标,保存了各个协议族的创建函数,还记得执行create函数的地方吧?但在看这个函数以前先看看这里:

点击(此处)折叠或打开

  1. /* Upon startup we insert all the elements in inetsw_array[] into
  2.  * the linked list inetsw.
  3.  */
  4. static struct inet_protosw inetsw_array[] =
  5. {
  6.     {
  7.         .type = SOCK_STREAM,
  8.         .protocol = IPPROTO_TCP,
  9.         .prot = &tcp_prot,
  10.         .ops = &inet_stream_ops,
  11.         .capability = -1,
  12.         .no_check = 0,
  13.         .flags = INET_PROTOSW_PERMANENT |
  14.              INET_PROTOSW_ICSK,
  15.     },

  16.     {
  17.         .type = SOCK_DGRAM,
  18.         .protocol = IPPROTO_UDP,
  19.         .prot = &udp_prot,
  20.         .ops = &inet_dgram_ops,
  21.         .capability = -1,
  22.         .no_check = UDP_CSUM_DEFAULT,
  23.         .flags = INET_PROTOSW_PERMANENT,
  24.        },


  25.        {
  26.      .type = SOCK_RAW,
  27.      .protocol = IPPROTO_IP,    /* wild card */
  28.      .prot = &raw_prot,
  29.      .ops = &inet_sockraw_ops,
  30.      .capability = CAP_NET_RAW,
  31.      .no_check = UDP_CSUM_DEFAULT,
  32.      .flags = INET_PROTOSW_REUSE,
  33.        }
  34. };

  35. //下面的代码是在inet_init函数中执行的
  36. /* Register the socket-side information for inet_create. */
  37.     for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
  38.         INIT_LIST_HEAD(r);

  39.     for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
  40.         inet_register_protosw(q);
我们来看看struct inet_protosw 这个结构

点击(此处)折叠或打开

  1. /* This is used to register socket interfaces for IP protocols. */
  2. struct inet_protosw {
  3.     struct list_head list;

  4.         /* These two fields form the lookup key. */
  5.     unsigned short     type;     /* This is the 2nd argument to socket(2). */
  6.     unsigned short     protocol; /* This is the L4 protocol number. */

  7.     struct proto     *prot;
  8.     const struct proto_ops *ops;
  9.   
  10.     int capability; /* Which (if any) capability do
  11.                  * we need to use this socket
  12.                  * interface?
  13.                                       */
  14.     char no_check; /* checksum on rcv/xmit/none? */
  15.     unsigned char     flags; /* See INET_PROTOSW_* below. */
  16. };




点击(此处)折叠或打开

  1. /*
  2.  *    Create an inet socket. //从这个注释中我们可以看到,还可以创建其他类型的socket
  3.  */

  4. static int inet_create(struct socket *sock, int protocol)
  5. {
  6.     struct sock *sk;
  7.     struct list_head *p;
  8.     struct inet_protosw *answer;
  9.     struct inet_sock *inet;
  10.     struct proto *answer_prot;
  11.     unsigned char answer_flags;
  12.     char answer_no_check;
  13.     int try_loading_module = 0;
  14.     int err;

  15.     sock->state = SS_UNCONNECTED;

  16.     /* Look for the requested type/protocol pair. */
  17.     answer = NULL;
  18. lookup_protocol:
  19.     err = -ESOCKTNOSUPPORT;
  20.     rcu_read_lock();
  21.     list_for_each_rcu(p, &inetsw[sock->type]) {   //在这里我们遍历inetsw数组,根据是UDP,TCP,RAW类型得到了struct inet_protosw结构
  22.         answer = list_entry(p, struct inet_protosw, list);

  23.         /* Check the non-wild match. */
  24.         if (protocol == answer->protocol) {
  25.             if (protocol != IPPROTO_IP)
  26.                 break;
  27.         } else {
  28.             /* Check for the two wild cases. */
  29.             if (IPPROTO_IP == protocol) {
  30.                 protocol = answer->protocol;
  31.                 break;
  32.             }
  33.             if (IPPROTO_IP == answer->protocol)
  34.                 break;
  35.         }
  36.         err = -EPROTONOSUPPORT;
  37.         answer = NULL;
  38.     }

  39.     if (unlikely(answer == NULL)) {
  40.         if (try_loading_module < 2) {
  41.             rcu_read_unlock();
  42.             /*
  43.              * Be more specific, e.g. net-pf-2-proto-132-type-1
  44.              * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
  45.              */
  46.             if (++try_loading_module == 1)
  47.                 request_module("net-pf-%d-proto-%d-type-%d",
  48.                      PF_INET, protocol, sock->type);
  49.             /*
  50.              * Fall back to generic, e.g. net-pf-2-proto-132
  51.              * (net-pf-PF_INET-proto-IPPROTO_SCTP)
  52.              */
  53.             else
  54.                 request_module("net-pf-%d-proto-%d",
  55.                      PF_INET, protocol);
  56.             goto lookup_protocol;
  57.         } else
  58.             goto out_rcu_unlock;
  59.     }

  60.     err = -EPERM;
  61.     if (answer->capability > 0 && !capable(answer->capability))
  62.         goto out_rcu_unlock;

  63.     sock->ops = answer->ops;    //对socket结构进行了初始化
  64.     answer_prot = answer->prot;
  65.     answer_no_check = answer->no_check;
  66.     answer_flags = answer->flags;
  67.     rcu_read_unlock();

  68.     BUG_TRAP(answer_prot->slab != NULL);

  69.     err = -ENOBUFS;
  70.     sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);   //这个函数创建了struct sock 这个庞然大物
  71.     if (sk == NULL)
  72.         goto out;

  73.     err = 0;
  74.     sk->sk_no_check = answer_no_check;
  75.     if (INET_PROTOSW_REUSE & answer_flags)
  76.         sk->sk_reuse = 1;

  77.     inet = inet_sk(sk);
  78.     inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

  79.     if (SOCK_RAW == sock->type) {
  80.         inet->num = protocol;
  81.         if (IPPROTO_RAW == protocol)
  82.             inet->hdrincl = 1;
  83.     }

  84.     if (ipv4_config.no_pmtu_disc)
  85.         inet->pmtudisc = IP_PMTUDISC_DONT;
  86.     else
  87.         inet->pmtudisc = IP_PMTUDISC_WANT;

  88.     inet->id = 0;

  89.     sock_init_data(sock, sk);  //在这里对struct sock里面重要的字段进行了初始化,包括接受队列,发送队列,以及长度等

  90.     sk->sk_destruct     = inet_sock_destruct;   
  91.     sk->sk_family     = PF_INET;
  92.     sk->sk_protocol     = protocol;
  93.     sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

  94.     inet->uc_ttl    = -1;
  95.     inet->mc_loop    = 1;
  96.     inet->mc_ttl    = 1;
  97.     inet->mc_index    = 0;
  98.     inet->mc_list    = NULL;

  99.     sk_refcnt_debug_inc(sk);

  100.     if (inet->num) {    //我们看到当我们调用RAW类型的socket的时候,这个if条件就成立了
  101.         /* It assumes that any protocol which allows
  102.          * the user to assign a number at socket
  103.          * creation time automatically
  104.          * shares.
  105.          */
  106.         inet->sport = htons(inet->num);
  107.         /* Add to protocol hash chains. */
  108.         sk->sk_prot->hash(sk);
  109.     }

  110.     if (sk->sk_prot->init) {           //看L4层是否注册了初始化函数,我们看到UDP类型的socket为空,而TCP类型的socket注册了初始化函数
  111.         err = sk->sk_prot->init(sk);
  112.         if (err)
  113.             sk_common_release(sk);
  114.     }
  115. out:
  116.     return err;
  117. out_rcu_unlock:
  118.     rcu_read_unlock();
  119.     goto out;
  120. }

点击(此处)折叠或打开

  1. void sock_init_data(struct socket *sock, struct sock *sk)
  2. {
  3.     skb_queue_head_init(&sk->sk_receive_queue); //接受队列
  4.     skb_queue_head_init(&sk->sk_write_queue);   //发送队列
  5.     skb_queue_head_init(&sk->sk_error_queue);
  6. #ifdef CONFIG_NET_DMA
  7.     skb_queue_head_init(&sk->sk_async_wait_queue);
  8. #endif

  9.     sk->sk_send_head    =    NULL;

  10.     init_timer(&sk->sk_timer);

  11.     sk->sk_allocation    =    GFP_KERNEL;
  12.     sk->sk_rcvbuf        =    sysctl_rmem_default;  //接受缓冲区大小
  13.     sk->sk_sndbuf        =    sysctl_wmem_default;  //发送缓冲区大小
  14.     sk->sk_state        =    TCP_CLOSE;   //被初始化为TCP_CLOSE,在下一篇绑定分析中我们会看到对这个状态的检查
  15.     sk->sk_socket        =    sock;

  16.     sock_set_flag(sk, SOCK_ZAPPED);

  17.     if(sock)
  18.     {
  19.         sk->sk_type    =    sock->type;
  20.         sk->sk_sleep    =    &sock->wait;
  21.         sock->sk    =    sk;
  22.     } else
  23.         sk->sk_sleep    =    NULL;

  24.     rwlock_init(&sk->sk_dst_lock);
  25.     rwlock_init(&sk->sk_callback_lock);
  26.     lockdep_set_class(&sk->sk_callback_lock,
  27.              af_callback_keys + sk->sk_family);

  28.     sk->sk_state_change    =    sock_def_wakeup;
  29.     sk->sk_data_ready    =    sock_def_readable;
  30.     sk->sk_write_space    =    sock_def_write_space;
  31.     sk->sk_error_report    =    sock_def_error_report;
  32.     sk->sk_destruct        =    sock_def_destruct;

  33.     sk->sk_sndmsg_page    =    NULL;
  34.     sk->sk_sndmsg_off    =    0;

  35.     sk->sk_peercred.pid     =    0;
  36.     sk->sk_peercred.uid    =    -1;
  37.     sk->sk_peercred.gid    =    -1;
  38.     sk->sk_write_pending    =    0;
  39.     sk->sk_rcvlowat        =    1;
  40.     sk->sk_rcvtimeo        =    MAX_SCHEDULE_TIMEOUT;
  41.     sk->sk_sndtimeo        =    MAX_SCHEDULE_TIMEOUT;

  42.     sk->sk_stamp.tv_sec = -1L;
  43.     sk->sk_stamp.tv_usec = -1L;

  44.     atomic_set(&sk->sk_refcnt, 1);
  45. }
就写到这个程度吧,代码里面有很多地方没有明白,以后明白了再加上......




上一篇:数据包在L4被挂入接收队列过程
下一篇:bind函数