// 新建一个socket结构体,并且创建一个下层的sock结构体,互相关联static int sock_socket(int family, int type, int protocol){ int i, fd; struct socket *sock; struct proto_ops *ops; for (i = 0; i < NPROTO; ++i) { // 从props数组中找到family协议对应的操作函数集,props由系统初始化时sock_register进行操作 if (pops[i] == NULL) continue; if (pops[i]->family == family) break; } if (i == NPROTO) { return -EINVAL; } // 函数集 ops = pops[i]; // 检查一下类型 if ((type != SOCK_STREAM && type != SOCK_DGRAM && type != SOCK_SEQPACKET && type != SOCK_RAW && type != SOCK_PACKET) || protocol < 0) return(-EINVAL); // 分配一个新的socket结构体,下面进行分析 if (!(sock = sock_alloc())) { printk("NET: sock_socket: no more sockets\n"); return(-ENOSR); /* Was: EAGAIN, but we are out of system resources! */ } // 设置类型和操作函数集 sock->type = type; sock->ops = ops; // 创建一个struct sock结构体,和sock_alloc分配的socket结构体互相关联 if ((i = sock->ops->create(sock, protocol)) < 0) { sock_release(sock); return(i); } // 返回一个新的文件描述符,下面分析 if ((fd = get_fd(SOCK_INODE(sock))) < 0) { sock_release(sock); return(-EINVAL); } return(fd);}复制代码
由上面的代码可以知道,socket函数主要是三个步骤,下面逐个分析。 1 拿到一个新的socket结构体
struct socket *sock_alloc(void){ struct inode * inode; struct socket * sock; // 获取一个可用的inode节点 inode = get_empty_inode(); if (!inode) return NULL; // 初始化某些字段 inode->i_mode = S_IFSOCK; inode->i_sock = 1;// socket文件 inode->i_uid = current->uid; inode->i_gid = current->gid; // 执行inode的socket结构体,初始化inode结构体的socket结构体 sock = &inode->u.socket_i; sock->state = SS_UNCONNECTED; sock->flags = 0; sock->ops = NULL; sock->data = NULL; sock->conn = NULL; sock->iconn = NULL; sock->next = NULL; // 这个结构很重要,在阻塞性的网络函数里会用到,主要是用于阻塞和唤醒进程 sock->wait = &inode->i_wait; // 互相引用 sock->inode = inode; /* "backlink": we could use pointer arithmetic instead */ sock->fasync_list = NULL; // socket数加一 sockets_in_use++; // 返回新的socket结构体,实际上是inode中的一个字段 return sock;}复制代码
2 sock->ops->create,根据pops数组的结构(参见[网络源码初始化分析](https://blog.csdn.net/THEANARKH/article/details/85550187))可知,create函数对应的是inet_create
// 创建一个sock结构体,和socket结构体互相关联static int inet_create(struct socket *sock, int protocol){ struct sock *sk; struct proto *prot; int err; // 分配一个sock结构体 sk = (struct sock *) kmalloc(sizeof(*sk), GFP_KERNEL); if (sk == NULL) return(-ENOBUFS); sk->num = 0; sk->reuse = 0; switch(sock->type) { case SOCK_STREAM: case SOCK_SEQPACKET: if (protocol && protocol != IPPROTO_TCP) { kfree_s((void *)sk, sizeof(*sk)); return(-EPROTONOSUPPORT); } protocol = IPPROTO_TCP; sk->no_check = TCP_NO_CHECK; // 函数集 prot = &tcp_prot; break; case SOCK_DGRAM: if (protocol && protocol != IPPROTO_UDP) { kfree_s((void *)sk, sizeof(*sk)); return(-EPROTONOSUPPORT); } protocol = IPPROTO_UDP; sk->no_check = UDP_NO_CHECK; prot=&udp_prot; break; // 下面两种类型需要root身份 case SOCK_RAW: if (!suser()) { kfree_s((void *)sk, sizeof(*sk)); return(-EPERM); } if (!protocol) { kfree_s((void *)sk, sizeof(*sk)); return(-EPROTONOSUPPORT); } prot = &raw_prot; sk->reuse = 1; sk->no_check = 0; /* * Doesn't matter no checksum is * performed anyway. */ sk->num = protocol; break; case SOCK_PACKET: if (!suser()) { kfree_s((void *)sk, sizeof(*sk)); return(-EPERM); } if (!protocol) { kfree_s((void *)sk, sizeof(*sk)); return(-EPROTONOSUPPORT); } prot = &packet_prot; sk->reuse = 1; sk->no_check = 0; /* Doesn't matter no checksum is * performed anyway. 
*/ sk->num = protocol; break; default: kfree_s((void *)sk, sizeof(*sk)); return(-ESOCKTNOSUPPORT); } // sock结构体的socket字段指向上层的socket结构体 sk->socket = sock;#ifdef CONFIG_TCP_NAGLE_OFF sk->nonagle = 1;#else sk->nonagle = 0;#endif sk->type = sock->type; sk->stamp.tv_sec=0; sk->protocol = protocol; sk->wmem_alloc = 0; sk->rmem_alloc = 0; sk->sndbuf = SK_WMEM_MAX; sk->rcvbuf = SK_RMEM_MAX; sk->pair = NULL; sk->opt = NULL; sk->write_seq = 0; sk->acked_seq = 0; sk->copied_seq = 0; sk->fin_seq = 0; sk->urg_seq = 0; sk->urg_data = 0; sk->proc = 0; sk->rtt = 0; /*TCP_WRITE_TIME << 3;*/ sk->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/ sk->mdev = 0; sk->backoff = 0; sk->packets_out = 0; sk->cong_window = 1; /* start with only sending one packet at a time. */ sk->cong_count = 0; sk->ssthresh = 0; sk->max_window = 0; sk->urginline = 0; sk->intr = 0; sk->linger = 0; sk->destroy = 0; sk->priority = 1; sk->shutdown = 0; sk->keepopen = 0; sk->zapped = 0; sk->done = 0; sk->ack_backlog = 0; sk->window = 0; sk->bytes_rcv = 0; sk->state = TCP_CLOSE; sk->dead = 0; sk->ack_timed = 0; sk->partial = NULL; sk->user_mss = 0; sk->debug = 0; /* this is how many unacked bytes we will accept for this socket. */ sk->max_unacked = 2048; /* needs to be at most 2 full packets. */ /* how many packets we should send before forcing an ack. 
if this is set to zero it is the same as sk->delay_acks = 0 */ sk->max_ack_backlog = 0; sk->inuse = 0; sk->delay_acks = 0; skb_queue_head_init(&sk->write_queue); skb_queue_head_init(&sk->receive_queue); sk->mtu = 576; // 下层的操作函数集 sk->prot = prot; // 来自socket结构体的wait字段,wait字段来自inode的wait字段 sk->sleep = sock->wait; sk->daddr = 0; sk->saddr = 0 /* ip_my_addr() */; sk->err = 0; sk->next = NULL; sk->pair = NULL; sk->send_tail = NULL; sk->send_head = NULL; sk->timeout = 0; sk->broadcast = 0; sk->localroute = 0; init_timer(&sk->timer); init_timer(&sk->retransmit_timer); sk->timer.data = (unsigned long)sk; sk->timer.function = &net_timer; skb_queue_head_init(&sk->back_log); sk->blog = 0; // socket结构体的data字段指向底层的sock结构体 sock->data =(void *) sk; // 初始化tcp头 sk->dummy_th.doff = sizeof(sk->dummy_th)/4; sk->dummy_th.res1=0; sk->dummy_th.res2=0; sk->dummy_th.urg_ptr = 0; sk->dummy_th.fin = 0; sk->dummy_th.syn = 0; sk->dummy_th.rst = 0; sk->dummy_th.psh = 0; sk->dummy_th.ack = 0; sk->dummy_th.urg = 0; sk->dummy_th.dest = 0; sk->ip_tos=0; sk->ip_ttl=64;#ifdef CONFIG_IP_MULTICAST sk->ip_mc_loop=1; sk->ip_mc_ttl=1; *sk->ip_mc_name=0; sk->ip_mc_list=NULL;#endif // 下面两个函数用于阻塞型的网络函数被阻塞时,一旦底层条件符合,则回调下面的函数通知上层,即唤醒进程 sk->state_change = def_callback1; sk->data_ready = def_callback2; sk->write_space = def_callback3; sk->error_report = def_callback1; if (sk->num) { /* * It assumes that any protocol which allows * the user to assign a number at socket * creation time automatically * shares. */ // 根据端口,把sock结构体放到下层协议的sock_srray数组 put_sock(sk->num, sk); sk->dummy_th.source = ntohs(sk->num); } // 执行底层的初始化函数,tcp和udp都没有init函数 if (sk->prot->init) { err = sk->prot->init(sk); if (err != 0) { destroy_sock(sk); return(err); } } return(0);}复制代码
3 get_fd,经过上面的几个步骤,我们拿到了一个inode、一个socket、一个sock。最后我们要再拿到一个文件描述符返回给应用层,在操作系统中,每个进程有一个fd数组,记录进程打开的文件信息,数组的一个或多个项指向一个struct file结构体,一个或多个file结构体又指向一个inode结构体。所以我们拿到一个inode后,还需要拿到一个file结构,最后拿到一个文件描述符fd(即fd数组中空闲项的下标),返回给用户。
static int get_fd(struct inode *inode){ int fd; struct file *file; /* * Find a file descriptor suitable for return to the user. */ // 获取一个可以的file结构体 file = get_empty_filp(); if (!file) return(-1); // 挂载到进程的fd数组中 for (fd = 0; fd < NR_OPEN; ++fd) if (!current->files->fd[fd]) break; if (fd == NR_OPEN) { file->f_count = 0; return(-1); } FD_CLR(fd, ¤t->files->close_on_exec); current->files->fd[fd] = file; // 设置文件操作函数集,操作socket像操作文件一样 file->f_op = &socket_file_ops; file->f_mode = 3; file->f_flags = O_RDWR; file->f_count = 1; // 关联inode节点 file->f_inode = inode; if (inode) inode->i_count++; file->f_pos = 0; return(fd);}复制代码
最后,当我们执行完一个socket函数时,内存视图是: