leevis.com icon indicating copy to clipboard operation
leevis.com copied to clipboard

linux-0.98网络系统socket创建和使用

Open vislee opened this issue 3 years ago • 0 comments

概述

程序使用网络时,首先调用 socket 函数创建一个套接字并得到对应的文件描述符 fd,然后再进行 bind、listen 或者 connect。我们先来看 TCP 套接字的创建:int fd = socket(AF_INET, SOCK_STREAM, 0);。在 glibc 代码中,通过汇编触发 int 0x80 软中断,系统调用号是 sys_socketcall 在系统调用表 sys_call_table 中的下标;随后陷入内核态,调用 sys_socketcall 函数。

代码

/*
 * Entry point of the multiplexed socket system call.
 *
 * call selects the operation (SYS_SOCKET, SYS_BIND, ...); args points to
 * an array of long-sized arguments that is read with get_fs_long()
 * (presumably a user-space pointer - confirm against the caller).
 *
 * Returns whatever the selected sock_* handler returns, or -EINVAL for an
 * unknown call number.
 */
int
sys_socketcall(int call, unsigned long *args)
{
	switch (call) {
	case SYS_SOCKET:
		/* three arguments: family, type, protocol */
		verify_area(args, 3 * sizeof(long));
		return sock_socket(get_fs_long(args+0),
				   get_fs_long(args+1),
				   get_fs_long(args+2));

	case SYS_BIND:
		/* three arguments: fd, address pointer, address length */
		verify_area(args, 3 * sizeof(long));
		return sock_bind(get_fs_long(args+0),
				 (struct sockaddr *)get_fs_long(args+1),
				 get_fs_long(args+2));

	/* the remaining SYS_* cases are elided in this excerpt */
        ......

	default:
		return -EINVAL;
	}
}

从上面代码看,sys_socketcall 只是一个按 call 分发的封装函数,创建套接字真正的逻辑在 sock_socket 函数中。


/*
 * Implements the socket system call: look up the protocol family, check
 * the type and protocol, allocate a socket, let the family initialise it
 * and finally attach it to a file descriptor.
 *
 * For a user call socket(AF_INET, SOCK_STREAM, 0) the arguments are
 * family = AF_INET, type = SOCK_STREAM and protocol = 0.
 *
 * Returns the new descriptor on success. Fails with -EINVAL when the
 * family is unknown, the type is unsupported, the protocol is negative or
 * no descriptor can be obtained; with -EAGAIN when no socket is free; or
 * with the negative value returned by the family's create handler.
 */
static int
sock_socket(int family, int type, int protocol)
{
	int idx, fd, err;
	struct socket *sock;
	struct proto_ops *ops;

	PRINTK("sys_socket: family = %d (%s), type = %d, protocol = %d\n",
	       family, family_name(family), type, protocol);

	/* find the proto_table slot registered for this family */
	idx = 0;
	while (idx < NPROTO && proto_table[idx].family != family)
		++idx;
	if (idx == NPROTO) {
		PRINTK("sys_socket: family not found\n");
		return -EINVAL;
	}
	/* for AF_INET this is presumably inet_proto_ops (not shown here) */
	ops = proto_table[idx].ops;

	/*
	 * only the socket types handled here are accepted and the protocol
	 * must not be negative; the family may still reject the protocol in
	 * its create handler.
	 */
	if (protocol < 0)
		return -EINVAL;
	switch (type) {
	case SOCK_STREAM:
	case SOCK_DGRAM:
	case SOCK_SEQPACKET:
	case SOCK_RAW:
		break;
	default:
		return -EINVAL;
	}

	/* grab a generic socket structure */
	sock = sock_alloc(1);
	if (!sock) {
		printk("sys_socket: no more sockets\n");
		return -EAGAIN;
	}
	sock->type = type;
	sock->ops = ops;

	/*
	 * family specific setup; a protocol of 0 asks the family to pick its
	 * default (for AF_INET the handler is presumably ip_proto_create).
	 */
	err = sock->ops->create(sock, protocol);
	if (err < 0) {
		sock_release(sock);
		return err;
	}

	/* obtain a file descriptor for the socket's inode */
	fd = get_fd(SOCK_INODE(sock));
	if (fd < 0) {
		sock_release(sock);
		return -EINVAL;
	}

	return fd;
}

socket句柄创建好以后调用bind函数绑定地址。介绍该函数之前先看下通过fd如何查找socket结构体。

/*
 * Translate a descriptor of the current process into its socket.
 *
 * fd is used as an index into current->filp[]. When pfile is non-NULL the
 * matching struct file is stored through it. Returns NULL when fd is out
 * of range or not open, otherwise the result of socki_lookup() on the
 * file's inode.
 */
static inline struct socket *
sockfd_lookup(int fd, struct file **pfile)
{
	struct file *filp;

	if (fd < 0 || fd >= NR_OPEN)
		return NULL;

	/* per-process table of open files, indexed by descriptor */
	filp = current->filp[fd];
	if (!filp)
		return NULL;

	if (pfile)
		*pfile = filp;
	return socki_lookup(filp->f_inode);
}

/*
 * Find the socket that owns the given inode.
 *
 * Walks the socket array from sockets up to and including last_socket and
 * returns the first entry that is not SS_FREE and whose inode matches;
 * returns NULL when there is none.
 */
static inline struct socket *
socki_lookup(struct inode *inode)
{
	struct socket *cur = sockets;

	while (cur <= last_socket) {
		if (cur->state != SS_FREE && SOCK_INODE(cur) == inode)
			return cur;
		++cur;
	}
	return NULL;
}

知道如何根据fd查找socket结构体后,再看bind的内核函数。

/*
 * Bind a name to a socket. Handling of the local address is left entirely
 * to the protocol family's bind handler (ip_proto_bind for AF_INET,
 * according to the article's annotation).
 *
 * Returns 0 on success, -EBADF when fd does not refer to a socket, or the
 * negative value returned by the family's bind handler.
 */
static int
sock_bind(int fd, struct sockaddr *umyaddr, int addrlen)
{
	struct socket *sock;
	int err;

	PRINTK("sys_bind: fd = %d\n", fd);

	sock = sockfd_lookup(fd, NULL);
	if (!sock)
		return -EBADF;

	err = sock->ops->bind(sock, umyaddr, addrlen);
	if (err >= 0)
		return 0;

	PRINTK("sys_bind: bind failed\n");
	return err;
}

/*
 * bind() handler of the ip protocol family: attach the sock behind
 * `sock` to the local address/port given in *uaddr.
 *
 * sock     - generic socket; its protocol sock is taken from sock->data.
 * uaddr    - address to bind to, copied in with memcpy_fromfs.
 * addr_len - length of *uaddr; at most sizeof(struct sockaddr_in) bytes
 *            are copied.
 *
 * Returns 0 on success (and also when sock->data is NULL, after printing
 * a warning), -EIO when the sock is not in TCP_CLOSE or the family is
 * neither 0 nor AF_INET, -EPERM when a port <= PROT_SOCK is requested
 * without suser(), and -EADDRINUSE when the port/address pair is taken.
 */
static int
ip_proto_bind (struct socket *sock, struct sockaddr *uaddr,
             int addr_len)
{
  struct sockaddr_in addr;
  volatile struct sock *sk, *sk2;
  unsigned short snum;
  sk = sock->data;
   if (sk == NULL)
     {
      printk ("Warning: sock->data = NULL: %d\n" ,__LINE__);
      return (0);
     }
  /* check this error. */
  if (sk->state != TCP_CLOSE) return (-EIO);
  verify_area (uaddr, addr_len);
  memcpy_fromfs (&addr, uaddr, min (sizeof (addr), addr_len));
  if (addr.sin_family && addr.sin_family != AF_INET)
    return (-EIO); /* this needs to be changed. */
  /* presumably converts the port from network to host byte order */
  snum = net16(addr.sin_port);
  PRINTK ("bind sk =%X to port = %d\n", sk, snum);
  print_sk (sk);
  /* sk is re-read from sock->data; it was already loaded above */
  sk = sock->data;

  /* we can't just leave the socket bound wherever it is, it might be bound
     to a privileged port. However, since there seems to be a bug here,
     we will leave it if the port is not privileged */

  /* port 0: keep the current port if it is above PROT_SOCK, otherwise
     ask for a new port number */
  if (snum == 0)
    {
       if ( sk->num > PROT_SOCK) return (0);
       snum = get_new_socknum (sk->prot, 0);
    }

  /* ports up to and including PROT_SOCK require suser() */
  if (snum <= PROT_SOCK && !suser())
    return (-EPERM);

  /* the source address is only recorded when it passes my_ip_addr() or
     is 0; any other address leaves sk->saddr unchanged */
  if (my_ip_addr(addr.sin_addr.s_addr) || addr.sin_addr.s_addr == 0)
    sk->saddr = addr.sin_addr.s_addr;
  PRINTK ("sock_array[%d] = %X:\n", snum & (SOCK_ARRAY_SIZE -1),
        sk->prot->sock_array[snum & (SOCK_ARRAY_SIZE -1)]);
  print_sk (sk->prot->sock_array[snum & (SOCK_ARRAY_SIZE -1)]);

  /* make sure we are allowed to bind here. */
  /* walk the chain in slot snum & (SOCK_ARRAY_SIZE - 1); an entry with
     the same port and source address is only tolerated when both socks
     have reuse set */
  for (sk2 = sk->prot->sock_array[snum & (SOCK_ARRAY_SIZE -1)];
       sk2 != NULL;
       sk2 = sk2->next)
    {
       if (sk2->num != snum) continue;
       if (sk2->saddr != sk->saddr) continue;
       if (!sk->reuse) return (-EADDRINUSE);
       if (!sk2->reuse) return (-EADDRINUSE);
    }
  /* re-insert the sock under the new port number */
  remove_sock (sk);
  put_sock(snum, sk);
  /* refresh the template header's source port from sk->num and clear the
     destination address and port */
  sk->dummy_th.source = net16(sk->num);
  sk->daddr = 0;
  sk->dummy_th.dest = 0;
  return (0);
}

绑定了本机地址后就可以listen了。

/*
 * Put a socket into the listening state. The protocol family gets the
 * chance to do whatever it needs (ip_proto_listen for AF_INET, according
 * to the article's annotation); afterwards the socket is flagged as
 * accepting connections.
 *
 * The value returned by the family's listen handler is not examined.
 *
 * Returns 0 on success, -EBADF when fd does not refer to a socket and
 * -EINVAL when the socket is not unconnected or is already listening.
 */
static int
sock_listen(int fd, int backlog)
{
	struct socket *sock;

	PRINTK("sys_listen: fd = %d\n", fd);

	sock = sockfd_lookup(fd, NULL);
	if (!sock)
		return -EBADF;

	if (sock->state != SS_UNCONNECTED) {
		PRINTK("sys_listen: socket isn't unconnected\n");
		return -EINVAL;
	}

	if (sock->flags & SO_ACCEPTCON) {
		PRINTK("sys_listen: socket already accepting connections!\n");
		return -EINVAL;
	}

	/* the listen handler is optional */
	if (sock->ops) {
		if (sock->ops->listen)
			sock->ops->listen(sock, backlog);
	}

	sock->flags |= SO_ACCEPTCON;
	return 0;
}

/*
 * listen() handler of the ip protocol family: moves the protocol sock
 * stored in sock->data to TCP_LISTEN. backlog is not used.
 *
 * Always returns 0; a NULL sock->data only produces a warning.
 */
static int
ip_proto_listen(struct socket *sock, int backlog)
{
  volatile struct sock *sk = sock->data;

  if (sk != NULL)
    {
      sk->state = TCP_LISTEN;
      return (0);
    }

  printk ("Warning: sock->data = NULL: %d\n" ,__LINE__);
  return (0);
}

从代码上看,listen 实际上只是把 sk->state 改为 TCP_LISTEN,并给 socket 置上 SO_ACCEPTCON 标志;至于 backlog 参数,在这个版本中并没有被使用。

服务端口监听好以后,客户端就可以调用connect函数向服务端发起连接了。

//  ./net/socket.c
/*
 * Try to connect the socket behind fd to the server address uservaddr.
 *
 * The work is done by the family's connect handler (ip_proto_connect for
 * AF_INET, according to the article's annotation), which also receives
 * the f_flags of the open file.
 *
 * Returns 0 on success, -EBADF when fd does not refer to a socket,
 * -EINVAL when the socket is not unconnected, or the negative value
 * returned by the family's connect handler.
 */
static int
sock_connect(int fd, struct sockaddr *uservaddr, int addrlen)
{
	struct file *filp;
	struct socket *sock;
	int rc;

	PRINTK("sys_connect: fd = %d\n", fd);

	sock = sockfd_lookup(fd, &filp);
	if (!sock)
		return -EBADF;

	if (sock->state != SS_UNCONNECTED) {
		PRINTK("sys_connect: socket not unconnected\n");
		return -EINVAL;
	}

	rc = sock->ops->connect(sock, uservaddr, addrlen, filp->f_flags);
	if (rc >= 0)
		return 0;

	PRINTK("sys_connect: connect failed\n");
	return rc;
}

// ./net/tcp/sock.c
/*
 * connect() handler of the ip protocol family.
 *
 * Starts the connection through sk->prot->connect (unless an earlier call
 * was interrupted by a signal and left sk->intr set), marks the generic
 * socket SS_CONNECTED and, for blocking sockets, sleeps until the sock
 * reaches TCP_ESTABLISHED or a state >= TCP_CLOSING.
 *
 * sock     - generic socket; its protocol sock is taken from sock->data.
 * uaddr    - peer address, passed on to the protocol's connect handler.
 * addr_len - length of *uaddr.
 * flags    - file flags; O_NONBLOCK makes the call return without waiting.
 *
 * Returns 0 on success (and also when sock->data is NULL, after printing
 * a warning), -EOPNOTSUPP when the protocol has no connect handler, the
 * negative value returned by that handler, -ERESTARTSYS when a signal
 * arrives while waiting, or -sk->err when the wait ends in a state other
 * than TCP_ESTABLISHED with sk->err set.
 */
static int
ip_proto_connect (struct socket *sock, struct sockaddr * uaddr,
              int addr_len, int flags)
{
  volatile struct sock *sk;
  int err;
  sock->conn = NULL;
  sk = sock->data;
   if (sk == NULL)
     {
      printk ("Warning: sock->data = NULL: %d\n" ,__LINE__);
      return (0);
     }
  if (sk->prot->connect == NULL)
    return (-EOPNOTSUPP);

  /* sk->intr is set below when the wait is cut short by a signal; in
     that case the connect is not issued again and we only resume
     waiting */
  if (sk->intr == 0)
    {
      /* tcp_connect for TCP socks, according to the article's annotation */
      err = sk->prot->connect (sk, (struct sockaddr_in *)uaddr, addr_len);
      if (err < 0) return (err);
    }

  /* the generic socket is marked connected before the sock itself has
     reached TCP_ESTABLISHED */
  sock->state = SS_CONNECTED;
  /* non-blocking sockets return immediately */
  if (flags & O_NONBLOCK) return (0);

  cli(); /* avoid the race condition */

  /* sleep until the sock is established or has reached TCP_CLOSING or a
     later state */
  while (sk->state != TCP_ESTABLISHED && sk->state < TCP_CLOSING)
    {
      interruptible_sleep_on (sk->sleep);
      /* an unblocked signal is pending */
      if (current->signal & ~current->blocked)
      {
         sti();
         sk->intr = 1; /* remember that the wait was interrupted */
         return (-ERESTARTSYS);
      }
    }
  sti();
  sk->intr = 0;
  if (sk->state != TCP_ESTABLISHED && sk->err)
    {
      return (-sk->err);
    }
  return (0);
}

//   ./net/tcp/tcp.c
/*
 * Initiate an outgoing TCP connection: build a SYN segment that carries a
 * 4-byte option and pass it to the protocol's queue_xmit handler.
 *
 * sk       - sock to connect; must be in TCP_CLOSE.
 * usin     - peer address, copied in with memcpy_fromfs.
 * addr_len - length of *usin; values below 8 are rejected.
 *
 * Returns 0 once the SYN has been queued, -EISCONN when the sock is not
 * in TCP_CLOSE, -EINVAL when addr_len < 8, -EAFNOSUPPORT for a non-zero
 * family other than AF_INET, -ENOMEM when no buffer can be allocated and
 * -ENETUNREACH when the header cannot be built.
 */
static int
tcp_connect (volatile struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
  struct sk_buff *buff;
  struct sockaddr_in sin;
  struct device *dev=NULL;
  unsigned char *ptr;
  int tmp;
  struct tcp_header *t1;
  if (sk->state != TCP_CLOSE) return (-EISCONN);
  if (addr_len < 8) return (-EINVAL);

  verify_area (usin, addr_len);
  memcpy_fromfs (&sin,usin, min(sizeof (sin), addr_len));

  if (sin.sin_family && sin.sin_family != AF_INET) return (-EAFNOSUPPORT);

  sk->daddr = sin.sin_addr.s_addr;
  /* initial send sequence number, derived from timer_seq */
  sk->send_seq = timer_seq*SEQ_TICK-seq_offset;
  sk->rcv_ack_seq = sk->send_seq -1;
  sk->err = 0;
  /* the port is stored exactly as supplied, without net16() */
  sk->dummy_th.dest = sin.sin_port;

  buff=sk->prot->wmalloc(sk,MAX_SYN_SIZE,0);
  if (buff == NULL) 
    {
      return (-ENOMEM);
    }
  /* presumably marks the sock busy until release_sock() below - confirm */
  sk->inuse = 1;
  buff->mem_addr = buff;
  buff->mem_len = MAX_SYN_SIZE;
  /* 24 bytes = TCP header plus the 4-byte option written below (doff is
     set to 6 words further down); presumably sizeof (struct tcp_header)
     is 20. The header length returned by build_header is added later. */
  buff->len=24;
  buff->sk = sk;
  /* the packet data starts right behind the sk_buff structure */
  t1=(struct tcp_header *)(buff + 1);
  /* put in the ip_header and routing stuff. */
  /* We need to build the routing stuff from the things saved
     in skb. */
  /* ip_build_header, according to the article's annotation; dev is NULL
     on entry and is filled in by the callee */
  tmp = sk->prot->build_header (buff, sk->saddr, sk->daddr, &dev,
				IP_TCP, NULL, MAX_SYN_SIZE);
  if (tmp < 0)
    {
      sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
      release_sock (sk);
      return (-ENETUNREACH);
    }
  /* tmp is the number of header bytes written; for ip_build_header that
     is the hardware header plus the 20-byte IP header */
  buff->len += tmp;
  /* the TCP header follows those headers */
  t1 = (struct tcp_header *)((char *)t1 +tmp);

  /* start from the sock's template header */
  memcpy (t1, (void *)&(sk->dummy_th), sizeof (*t1));
  /* send_seq is advanced by one after being written into the segment */
  t1->seq = net32(sk->send_seq++);
  buff->h.seq = sk->send_seq;
  t1->ack = 0;
  /* NOTE(review): window is stored without net16() - confirm intent */
  t1->window = 2;
  t1->res1=0;
  t1->res2=0;
  t1->rst = 0;
  t1->urg = 0;
  t1->psh = 0;
  t1->syn = 1;
  t1->urg_ptr = 0;
  /* header length in 32-bit words: 6 words = 24 bytes */
  t1->doff =6;
  /* put in the tcp options to say mtu. */
  /* option kind 2, length 4 (the maximum segment size option in RFC 793),
     followed by dev->mtu - HEADER_SIZE, high byte first */
  ptr=(unsigned char *)(t1+1);
  ptr[0]=2;
  ptr[1]=4;
  ptr[2]=(dev->mtu- HEADER_SIZE) >> 8;
  ptr[3]=(dev->mtu- HEADER_SIZE) & 0xff;
  sk->mtu = dev->mtu - HEADER_SIZE;
  /* checksum over the TCP header plus the 4 option bytes */
  tcp_send_check (t1, sk->saddr, sk->daddr,
		  sizeof (struct tcp_header) + 4, sk);
  /* this must go first otherwise a really quick response will
     get reset. */
  sk->state = TCP_SYN_SENT;
  /* ip_queue_xmit, according to the article's annotation; free = 0 */
  sk->prot->queue_xmit(sk, dev, buff, 0);
  
  /* arm the sock's timer with the connect timeout */
  sk->time_wait.len = TCP_CONNECT_TIME;
  reset_timer ((struct timer *)&sk->time_wait);
  sk->retransmits = TCP_RETR1 - TCP_SYN_RETRIES;
  release_sock (sk);
  return (0);
}


// ./net/tcp/ip.c
/* This routine builds the appropriate hardware/ip headers for
   the routine.  It assumes that if *prot != NULL then the
   protocol knows what it's doing, otherwise it uses the
   routing/arp tables to select a protocol struct. */
/*
 * The headers are written at the start of the data area behind the
 * sk_buff (skb + 1): first the hardware header, if the device provides a
 * hard_header handler, then a 20-byte IP header.
 *
 * skb   - buffer to fill; skb->dev and skb->arp are set here.
 * saddr - source address; 0 is replaced by MY_IP_ADDR.
 * daddr - destination address.
 * dev   - in/out: when *dev is NULL the device is chosen by ip_route().
 * type  - value stored in the IP header's protocol field.
 * opt   - options; not used for building the header (build_options is
 *         commented out).
 * len   - length passed on to the hard_header handler.
 *
 * Returns the number of header bytes written (20 plus the hardware header
 * length), or -ENETUNREACH when no device can be found. tot_len and the
 * checksum are not filled in here; ip_queue_xmit does that.
 */

int
ip_build_header (struct sk_buff *skb, unsigned long saddr,
             unsigned long daddr, struct device **dev, int type,
             struct options *opt, int len)
{
  static struct options optmem;
  struct ip_header *iph;
  unsigned char *buff;
  static int count = 0; /* source of the IP id field, bumped per header */
  unsigned long raddr; /* for the router. */
  int tmp;
  if (saddr == 0) saddr = MY_IP_ADDR;
  PRINTK ("ip_build_header (skb=%X, saddr=%X, daddr=%X, *dev=%X,\n"
        "                 type=%d, opt=%X, len = %d)\n",
        skb, saddr, daddr, *dev, type, opt, len);
  buff = (unsigned char *)(skb + 1);
  /* see if we need to look up the device. */
  if (*dev == NULL)
    {
      *dev = ip_route(&optmem,daddr, &raddr);
      if (*dev == NULL)
      {
        return (-ENETUNREACH);
      }
      opt = &optmem;
    }
  else
    {
      /* we still need the address of the first hop. */
      ip_route (&optmem, daddr, &raddr);
    }
  /* no router address: send to the destination itself */
  if (raddr == 0) raddr = daddr;
  /* now build the header. */
  /* we need to worry about routing in here.  daddr should
     really be the address of the next hop. */
  /* but raddr is . */
  if ((*dev)->hard_header)
    {
       tmp = (*dev)->hard_header(buff, *dev, ETHERTYPE_IP, raddr, saddr, len);
    }
  else
    {
       tmp = 0;
    }
  /* a negative result is negated and used as the header length, with
     skb->arp cleared; presumably this means the hardware address still
     has to be resolved - confirm against the hard_header handler */
  if (tmp < 0)
    {
       tmp = -tmp;
       skb->arp = 0;
    }
  else
    {
       skb->arp = 1;
    }
  /* step over the hardware header */
  buff += tmp;
  len -= tmp;
  skb->dev = *dev;
  /* now build the ip header. */
  iph = (struct ip_header *)buff;
  iph->version = 4;
  iph->tos = 0;
  iph->frag_off = 0;
  iph->ttl = 32;
  iph->daddr = daddr;
  iph->saddr = saddr;
  iph->protocol=type;
  /* header length in 32-bit words: 5 words = the 20 bytes counted below */
  iph->ihl = 5;
  iph->id = net16(count++);
  /* build_options (iph, opt);*/
  return (20+tmp);
}

/* queues a packet to be sent, and starts the transmitter if
   necessary.  if free = 1 then we free the block after transmit,
   otherwise we don't. */
/* This routine also needs to put in the total length, and compute
   the checksum. */
/*
 * sk   - owning sock, may be NULL (free is then forced to 1).
 * dev  - device to send on.
 * skb  - packet; the data area behind the sk_buff must already hold the
 *        hardware and IP headers.
 * free - 0 keeps the packet on the sock's send list (linked through
 *        link3), non-zero does not.
 */
void
ip_queue_xmit (volatile struct sock *sk, struct device *dev, 
             struct sk_buff *skb, int free)
{
  struct ip_header *iph;
  unsigned char *ptr;
  /* without a sock there is no send list to keep the packet on */
  if (sk == NULL) free = 1;
  skb->free = free;
  skb->dev = dev;
  skb->when = jiffies;
  PRINTK(">>\n");
  /* the IP header sits behind the sk_buff and the hardware header */
  ptr = (unsigned char *)(skb + 1);
  ptr += dev->hard_header_len;
  iph = (struct ip_header *)ptr;
  /* total length excludes the hardware header */
  iph->tot_len = net16(skb->len-dev->hard_header_len);
  ip_send_check (iph);
  print_iph(iph);
  skb->next = NULL;
  if (!free)
    {
      /* append the packet to the sock's send list with interrupts
         disabled */
      skb->link3 = NULL;
      sk->packets_out++;
      cli();
      if (sk->send_tail == NULL)
      {
        sk->send_tail = skb;
        sk->send_head = skb;
      }
      else
      {
        sk->send_tail->link3 = skb;
        sk->send_tail = skb;
      }
      sti();
      /* arm the sock's timer with twice the round trip time */
      sk->time_wait.len = sk->rtt*2;
      sk->timeout=TIME_WRITE;
      reset_timer ((struct timer *)&sk->time_wait);
   }
  else
    {
       skb->sk = sk;
    }
  /* hand the packet to the driver when the device is up; otherwise it is
     only freed when free is set */
  if (dev->up)
    {
       if (sk)
       dev->queue_xmit(skb, dev, sk->priority);
       else
       dev->queue_xmit (skb, dev, SOPRI_NORMAL);
    }
  else
    {
       if (free) 
       free_skb (skb, FREE_WRITE);
    }
}

总结

vislee avatar Feb 02 '22 10:02 vislee