leevis.com
leevis.com copied to clipboard
linux-0.98网络系统socket创建和使用
概述
程序中使用网络时,首先会调用 socket 函数创建一个套接字,并得到与之对应的文件描述符 fd,然后再进行 bind、listen 或者 connect。
我们先来看 TCP 套接字的创建:int fd = socket(AF_INET, SOCK_STREAM, 0);
在 glibc 代码中,通过汇编指令 int 0x80 触发软中断陷入内核态,传入的系统调用号就是 sys_socketcall 在系统调用表 sys_call_table 中的下标,内核据此调用 sys_socketcall 函数。
代码
/*
 * Entry point for the multiplexed socket system call.
 *
 * call selects the operation (SYS_SOCKET, SYS_BIND, ...) and args points at
 * the argument words for that operation. Each word is fetched with
 * get_fs_long (presumably a read from user space -- confirm against its
 * definition). An unrecognised call value yields -EINVAL; otherwise the
 * result of the selected sock_* handler is returned.
 *
 * NOTE(review): verify_area is invoked as a plain statement, so any result
 * it might produce is not examined here.
 * The "......" line marks the cases that were elided from this excerpt.
 */
int
sys_socketcall(int call, unsigned long *args)
{
switch (call) {
case SYS_SOCKET:
/* three argument words: family, type, protocol */
verify_area(args, 3 * sizeof(long));
return sock_socket(get_fs_long(args+0),
get_fs_long(args+1),
get_fs_long(args+2));
case SYS_BIND:
/* three argument words: fd, address pointer, address length */
verify_area(args, 3 * sizeof(long));
return sock_bind(get_fs_long(args+0),
(struct sockaddr *)get_fs_long(args+1),
get_fs_long(args+2));
......
default:
return -EINVAL;
}
}
从上面的代码看,sys_socketcall 只是一个负责分发的封装函数:它根据 call 的取值取出相应的参数,创建套接字的真正逻辑在 sock_socket 函数中。
/*
 * Perform the socket system call: locate the requested protocol family,
 * then create a fresh socket for it.
 *
 * For a user call of socket(AF_INET, SOCK_STREAM, 0) the arguments are
 * family = AF_INET, type = SOCK_STREAM and protocol = 0.
 *
 * Returns the new file descriptor on success. Returns -EINVAL when the
 * family is not in proto_table, the type is not one of the four known
 * types, the protocol is negative, or get_fd fails; -EAGAIN when
 * sock_alloc yields no socket; or the negative value returned by the
 * family's create hook.
 */
static int
sock_socket(int family, int type, int protocol)
{
	struct proto_ops *family_ops;
	struct socket *new_sock;
	int slot, err, fd;
	int known_type;

	PRINTK("sys_socket: family = %d (%s), type = %d, protocol = %d\n",
	       family, family_name(family), type, protocol);

	/* Find the proto_table entry for this family (AF_UNIX, AF_INET). */
	slot = 0;
	while (slot < NPROTO && proto_table[slot].family != family)
		++slot;
	if (slot == NPROTO) {
		PRINTK("sys_socket: family not found\n");
		return -EINVAL;
	}
	family_ops = proto_table[slot].ops;	/* inet_proto_ops for AF_INET */

	/*
	 * Accept only the socket types we know how to handle and a
	 * non-negative protocol. The family may still reject the protocol
	 * later.
	 */
	known_type = (type == SOCK_STREAM ||
		      type == SOCK_DGRAM ||
		      type == SOCK_SEQPACKET ||
		      type == SOCK_RAW);
	if (!known_type || protocol < 0)
		return -EINVAL;

	/*
	 * Allocate the generic socket structure. With protocol 0 the
	 * family is expected to pick an appropriate default.
	 */
	new_sock = sock_alloc(1);
	if (!new_sock) {
		printk("sys_socket: no more sockets\n");
		return -EAGAIN;
	}
	new_sock->type = type;
	new_sock->ops = family_ops;

	/* Family-specific setup; for AF_INET this is ip_proto_create. */
	err = new_sock->ops->create(new_sock, protocol);
	if (err < 0) {
		sock_release(new_sock);
		return err;
	}

	/* Obtain a file descriptor for the socket's inode. */
	fd = get_fd(SOCK_INODE(new_sock));
	if (fd < 0) {
		sock_release(new_sock);
		return -EINVAL;
	}
	return fd;
}
socket句柄创建好以后调用bind函数绑定地址。介绍该函数之前先看下通过fd如何查找socket结构体。
/*
 * Map a file descriptor of the current process to its socket.
 *
 * fd indexes current->filp. When pfile is non-NULL, the file pointer found
 * in that slot is stored through it. Returns NULL when fd is outside
 * [0, NR_OPEN) or the slot is empty; otherwise returns whatever
 * socki_lookup yields for the file's inode.
 */
static inline struct socket *
sockfd_lookup(int fd, struct file **pfile)
{
	struct file *filp;

	if (fd < 0 || fd >= NR_OPEN)
		return NULL;

	/* the per-process open file table is indexed directly by fd */
	filp = current->filp[fd];
	if (!filp)
		return NULL;

	if (pfile)
		*pfile = filp;
	return socki_lookup(filp->f_inode);
}
/*
 * Find the socket that owns the given inode.
 *
 * Scans the socket array from sockets up to and including last_socket and
 * returns the first entry that is not SS_FREE and whose SOCK_INODE equals
 * inode, or NULL when there is no such entry.
 */
static inline struct socket *
socki_lookup(struct inode *inode)
{
	struct socket *cur = sockets;

	while (cur <= last_socket) {
		if (cur->state != SS_FREE && SOCK_INODE(cur) == inode)
			return cur;
		++cur;
	}
	return NULL;
}
知道如何根据fd查找socket结构体后,再看bind的内核函数。
/*
 * Bind a name to a socket. There is little to do at this level: handling
 * the local address is the protocol's responsibility, so the request is
 * handed straight to the socket's bind operation.
 *
 * Returns 0 on success, -EBADF when fd does not resolve to a socket, or
 * the negative value returned by the protocol's bind hook.
 */
static int
sock_bind(int fd, struct sockaddr *umyaddr, int addrlen)
{
	struct socket *target;
	int err;

	PRINTK("sys_bind: fd = %d\n", fd);

	target = sockfd_lookup(fd, NULL);
	if (!target)
		return -EBADF;

	/* for AF_INET sockets this is ip_proto_bind */
	err = target->ops->bind(target, umyaddr, addrlen);
	if (err >= 0)
		return 0;

	PRINTK("sys_bind: bind failed\n");
	return err;
}
static int
ip_proto_bind (struct socket *sock, struct sockaddr *uaddr,
int addr_len)
{
struct sockaddr_in addr;
volatile struct sock *sk, *sk2;
unsigned short snum;
sk = sock->data;
if (sk == NULL)
{
printk ("Warning: sock->data = NULL: %d\n" ,__LINE__);
return (0);
}
/* check this error. */
if (sk->state != TCP_CLOSE) return (-EIO);
verify_area (uaddr, addr_len);
memcpy_fromfs (&addr, uaddr, min (sizeof (addr), addr_len));
if (addr.sin_family && addr.sin_family != AF_INET)
return (-EIO); /* this needs to be changed. */
snum = net16(addr.sin_port);
PRINTK ("bind sk =%X to port = %d\n", sk, snum);
print_sk (sk);
sk = sock->data;
/* we can't just leave the socket bound wherever it is, it might be bound
to a priveledged port. However, since there seems to be a bug here,
we will leave it if the port is not priveledged(sp?) */
if (snum == 0)
{
if ( sk->num > PROT_SOCK) return (0);
snum = get_new_socknum (sk->prot, 0);
}
if (snum <= PROT_SOCK && !suser())
return (-EPERM);
if (my_ip_addr(addr.sin_addr.s_addr) || addr.sin_addr.s_addr == 0)
sk->saddr = addr.sin_addr.s_addr;
PRINTK ("sock_array[%d] = %X:\n", snum & (SOCK_ARRAY_SIZE -1),
sk->prot->sock_array[snum & (SOCK_ARRAY_SIZE -1)]);
print_sk (sk->prot->sock_array[snum & (SOCK_ARRAY_SIZE -1)]);
/* make sure we are allowed to bind here. */
for (sk2 = sk->prot->sock_array[snum & (SOCK_ARRAY_SIZE -1)];
sk2 != NULL;
sk2 = sk2->next)
{
if (sk2->num != snum) continue;
if (sk2->saddr != sk->saddr) continue;
if (!sk->reuse) return (-EADDRINUSE);
if (!sk2->reuse) return (-EADDRINUSE);
}
remove_sock (sk);
put_sock(snum, sk);
sk->dummy_th.source = net16(sk->num);
sk->daddr = 0;
sk->dummy_th.dest = 0;
return (0);
}
绑定了本机地址后就可以listen了。
/*
 * Perform a listen. The protocol is allowed to do anything it needs for a
 * listen, and only if that works is the socket marked as ready to accept
 * connections.
 *
 * Returns 0 on success, -EBADF when fd does not resolve to a socket,
 * -EINVAL when the socket is not SS_UNCONNECTED or is already accepting
 * connections, or the negative value returned by the protocol's listen
 * hook.
 */
static int
sock_listen(int fd, int backlog)
{
	struct socket *sock;
	int i;

	PRINTK("sys_listen: fd = %d\n", fd);
	if (!(sock = sockfd_lookup(fd, NULL)))
		return -EBADF;
	if (sock->state != SS_UNCONNECTED) {
		PRINTK("sys_listen: socket isn't unconnected\n");
		return -EINVAL;
	}
	if (sock->flags & SO_ACCEPTCON) {
		PRINTK("sys_listen: socket already accepting connections!\n");
		return -EINVAL;
	}
	/*
	 * Propagate a protocol failure instead of discarding it, so that
	 * SO_ACCEPTCON is set only when the protocol-level listen worked
	 * (same error handling as sock_bind and sock_connect).
	 */
	if (sock->ops && sock->ops->listen) {
		i = sock->ops->listen(sock, backlog);	/* ip_proto_listen */
		if (i < 0) {
			PRINTK("sys_listen: listen failed\n");
			return i;
		}
	}
	sock->flags |= SO_ACCEPTCON;
	return 0;
}
/*
 * Protocol-level listen for AF_INET sockets: put the protocol socket
 * behind sock into the TCP_LISTEN state. backlog is not used.
 *
 * Always returns 0. When sock->data is NULL a warning is printed and
 * nothing else happens.
 */
static int
ip_proto_listen(struct socket *sock, int backlog)
{
	volatile struct sock *sk = sock->data;

	if (sk != NULL)
	{
		sk->state = TCP_LISTEN;
		return (0);
	}

	printk ("Warning: sock->data = NULL: %d\n" ,__LINE__);
	return (0);
}
从代码上看,listen 实际上只是把协议层 sock 的状态改为 TCP_LISTEN,并在 socket 上设置 SO_ACCEPTCON 标志;backlog 参数在这里没有被使用,这个版本应该还没有实现它。
服务端口监听好以后,客户端就可以调用connect函数向服务端发起连接了。
// ./net/socket.c
/*
* attempt to connect to a socket with the server address.
*/
static int
sock_connect(int fd, struct sockaddr *uservaddr, int addrlen)
{
struct socket *sock;
struct file *file;
int i;
PRINTK("sys_connect: fd = %d\n", fd);
if (!(sock = sockfd_lookup(fd, &file)))
return -EBADF;
if (sock->state != SS_UNCONNECTED) {
PRINTK("sys_connect: socket not unconnected\n");
return -EINVAL;
}
// ip_proto_connect
i = sock->ops->connect(sock, uservaddr, addrlen, file->f_flags);
if (i < 0) {
PRINTK("sys_connect: connect failed\n");
return i;
}
return 0;
}
// ./net/tcp/sock.c
/*
 * Protocol-family connect for AF_INET sockets.
 *
 * Starts the connection through sk->prot->connect (unless sk->intr shows
 * that an earlier call was interrupted by a signal), marks the generic
 * socket SS_CONNECTED and, for blocking sockets, sleeps until the protocol
 * socket reaches TCP_ESTABLISHED or a state >= TCP_CLOSING.
 *
 * Returns 0 on success, for O_NONBLOCK sockets once the connect has been
 * started, and also when sock->data is NULL (after a warning).
 * Returns -EOPNOTSUPP when the protocol has no connect hook, the negative
 * value of that hook on failure, -ERESTARTSYS when a signal interrupts the
 * wait, or -sk->err when the wait ends without TCP_ESTABLISHED and an error
 * is recorded.
 *
 * NOTE(review): sock->state becomes SS_CONNECTED before the wait loop, i.e.
 * before the protocol socket is known to be established.
 */
static int
ip_proto_connect (struct socket *sock, struct sockaddr * uaddr,
int addr_len, int flags)
{
volatile struct sock *sk;
int err;
sock->conn = NULL;
sk = sock->data;
if (sk == NULL)
{
printk ("Warning: sock->data = NULL: %d\n" ,__LINE__);
return (0);
}
if (sk->prot->connect == NULL)
return (-EOPNOTSUPP);
if (sk->intr == 0) // skipped when a previous call was interrupted by a signal
{
// tcp_connect
err = sk->prot->connect (sk, (struct sockaddr_in *)uaddr, addr_len);
if (err < 0) return (err);
}
sock->state = SS_CONNECTED;
// a non-blocking socket returns immediately
if (flags & O_NONBLOCK) return (0);
cli(); /* avoid the race condition */
/* wait until the state is TCP_ESTABLISHED or has reached TCP_CLOSING or beyond */
while (sk->state != TCP_ESTABLISHED && sk->state < TCP_CLOSING)
{
interruptible_sleep_on (sk->sleep);
/* a pending, unblocked signal ends the wait */
if (current->signal & ~current->blocked)
{
sti();
sk->intr = 1; // remember that a signal interrupted this connect
return (-ERESTARTSYS);
}
}
sti();
sk->intr = 0;
if (sk->state != TCP_ESTABLISHED && sk->err)
{
return (-sk->err);
}
return (0);
}
// ./net/tcp/tcp.c
/*
 * Initiate an outgoing TCP connection: build a SYN segment for the
 * destination given in usin, queue it for transmission and move the socket
 * to TCP_SYN_SENT.
 *
 * Returns 0 once the SYN has been handed to queue_xmit, -EISCONN when the
 * socket is not in TCP_CLOSE, -EINVAL when addr_len is below 8,
 * -EAFNOSUPPORT when the family is neither 0 nor AF_INET, -ENOMEM when no
 * buffer could be allocated, and -ENETUNREACH when build_header fails.
 *
 * NOTE(review): verify_area is invoked as a plain statement; any result it
 * might produce is not examined.
 */
static int
tcp_connect (volatile struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
struct sk_buff *buff;
struct sockaddr_in sin;
struct device *dev=NULL;
unsigned char *ptr;
int tmp;
struct tcp_header *t1;
if (sk->state != TCP_CLOSE) return (-EISCONN);
if (addr_len < 8) return (-EINVAL);
verify_area (usin, addr_len);
memcpy_fromfs (&sin,usin, min(sizeof (sin), addr_len));
if (sin.sin_family && sin.sin_family != AF_INET) return (-EAFNOSUPPORT);
/* record the peer and pick the initial sequence number */
sk->daddr = sin.sin_addr.s_addr;
sk->send_seq = timer_seq*SEQ_TICK-seq_offset;
sk->rcv_ack_seq = sk->send_seq -1;
sk->err = 0;
sk->dummy_th.dest = sin.sin_port;
buff=sk->prot->wmalloc(sk,MAX_SYN_SIZE,0);
if (buff == NULL)
{
return (-ENOMEM);
}
sk->inuse = 1;
buff->mem_addr = buff;
buff->mem_len = MAX_SYN_SIZE;
/* The original note asked whether 24 is a MAC address length. It presumably
   is not: 24 matches the TCP header plus the 4 option bytes written below
   (doff = 6 words) -- confirm against struct tcp_header. */
buff->len=24;
buff->sk = sk;
/* packet data starts directly after the sk_buff structure */
t1=(struct tcp_header *)(buff + 1);
/* put in the ip_header and routing stuff. */
/* We need to build the routing stuff from the things saved
in skb. */
// ip_build_header
tmp = sk->prot->build_header (buff, sk->saddr, sk->daddr, &dev,
IP_TCP, NULL, MAX_SYN_SIZE);
if (tmp < 0)
{
sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
release_sock (sk);
return (-ENETUNREACH);
}
/* tmp is the size of the headers placed in front of the TCP header
   (ip_build_header returns 20 plus the hardware header length) */
buff->len += tmp;
t1 = (struct tcp_header *)((char *)t1 +tmp);
/* start from the socket's template header, then fill in the SYN fields */
memcpy (t1, (void *)&(sk->dummy_th), sizeof (*t1));
t1->seq = net32(sk->send_seq++);
buff->h.seq = sk->send_seq;
t1->ack = 0;
t1->window = 2;
t1->res1=0;
t1->res2=0;
t1->rst = 0;
t1->urg = 0;
t1->psh = 0;
t1->syn = 1;
t1->urg_ptr = 0;
t1->doff =6;
/* put in the tcp options to say mtu. */
/* four option bytes: 2, 4, then dev->mtu - HEADER_SIZE, high byte first */
ptr=(unsigned char *)(t1+1);
ptr[0]=2;
ptr[1]=4;
ptr[2]=(dev->mtu- HEADER_SIZE) >> 8;
ptr[3]=(dev->mtu- HEADER_SIZE) & 0xff;
sk->mtu = dev->mtu - HEADER_SIZE;
tcp_send_check (t1, sk->saddr, sk->daddr,
sizeof (struct tcp_header) + 4, sk);
/* this must go first otherwise a really quick response will
get reset. */
sk->state = TCP_SYN_SENT;
// ip_queue_xmit
sk->prot->queue_xmit(sk, dev, buff, 0);
/* arm the connect timer and set the retransmit counter */
sk->time_wait.len = TCP_CONNECT_TIME;
reset_timer ((struct timer *)&sk->time_wait);
sk->retransmits = TCP_RETR1 - TCP_SYN_RETRIES;
release_sock (sk);
return (0);
}
// ./net/tcp/ip.c
/* This routine builds the appropriate hardware/ip headers for
the routine. It assumes that if *prot != NULL then the
protocol knows what it's doing, otherwise it uses the
routing/arp tables to select a protocol struct.

The headers are written into the data area that directly follows skb.
When *dev is NULL on entry it is filled in from ip_route. A saddr of 0
is replaced by MY_IP_ADDR.

Returns 20 plus the hardware header length on success, or -ENETUNREACH
when no device could be found for daddr.

NOTE(review): raddr is only written through the pointer handed to
ip_route; if ip_route can return without storing it, the raddr == 0
test reads an indeterminate value -- confirm.
NOTE(review): opt is assigned but never read in this function (the
build_options call is commented out). */
int
ip_build_header (struct sk_buff *skb, unsigned long saddr,
unsigned long daddr, struct device **dev, int type,
struct options *opt, int len)
{
static struct options optmem;
struct ip_header *iph;
unsigned char *buff;
/* running counter used for the IP id field; shared by all callers */
static int count = 0;
unsigned long raddr; /* for the router. */
int tmp;
if (saddr == 0) saddr = MY_IP_ADDR;
PRINTK ("ip_build_header (skb=%X, saddr=%X, daddr=%X, *dev=%X,\n"
" type=%d, opt=%X, len = %d)\n",
skb, saddr, daddr, *dev, type, opt, len);
buff = (unsigned char *)(skb + 1);
/* see if we need to look up the device. */
if (*dev == NULL)
{
*dev = ip_route(&optmem,daddr, &raddr);
if (*dev == NULL)
{
return (-ENETUNREACH);
}
opt = &optmem;
}
else
{
/* we still need the address of the first hop. */
ip_route (&optmem, daddr, &raddr);
}
/* no router address: use the destination itself as the first hop */
if (raddr == 0) raddr = daddr;
/* now build the header. */
/* we need to worry about routing in here. daddr should
really be the address of the next hop. */
/* but raddr is . */
if ((*dev)->hard_header)
{
tmp = (*dev)->hard_header(buff, *dev, ETHERTYPE_IP, raddr, saddr, len);
}
else
{
tmp = 0;
}
/* a negative result is treated as a header of length -tmp with skb->arp
cleared (presumably the hardware address is still unresolved -- confirm) */
if (tmp < 0)
{
tmp = -tmp;
skb->arp = 0;
}
else
{
skb->arp = 1;
}
/* step over the hardware header */
buff += tmp;
len -= tmp;
skb->dev = *dev;
/* now build the ip header. */
iph = (struct ip_header *)buff;
iph->version = 4;
iph->tos = 0;
iph->frag_off = 0;
iph->ttl = 32;
iph->daddr = daddr;
iph->saddr = saddr;
iph->protocol=type;
iph->ihl = 5;
iph->id = net16(count++);
/* build_options (iph, opt);*/
/* 20 = size of the IP header without options (ihl = 5 words) */
return (20+tmp);
}
/* queues a packet to be sent, and starts the transmitter if
necessary. if free = 1 then we free the block after transmit,
otherwise we don't. */
/* This routine also needs to put in the total length, and compute
the checksum. */
/* With free == 0 the buffer is also appended to the socket's send queue
(send_head/send_tail, linked through link3) and the write timer is
restarted. A NULL sk forces free = 1, so the queueing branch, which
dereferences sk, is never entered without a socket.
NOTE(review): when the device is not up and free == 0, nothing is
transmitted here and the buffer simply stays on the send queue. */
void
ip_queue_xmit (volatile struct sock *sk, struct device *dev,
struct sk_buff *skb, int free)
{
struct ip_header *iph;
unsigned char *ptr;
if (sk == NULL) free = 1;
skb->free = free;
skb->dev = dev;
skb->when = jiffies;
PRINTK(">>\n");
/* the IP header sits after the sk_buff and the hardware header */
ptr = (unsigned char *)(skb + 1);
ptr += dev->hard_header_len;
iph = (struct ip_header *)ptr;
/* total length excludes the hardware header */
iph->tot_len = net16(skb->len-dev->hard_header_len);
ip_send_check (iph);
print_iph(iph);
skb->next = NULL;
if (!free)
{
skb->link3 = NULL;
sk->packets_out++;
/* append to the socket's send queue with interrupts disabled */
cli();
if (sk->send_tail == NULL)
{
sk->send_tail = skb;
sk->send_head = skb;
}
else
{
sk->send_tail->link3 = skb;
sk->send_tail = skb;
}
sti();
/* restart the write timer at twice the socket's rtt value */
sk->time_wait.len = sk->rtt*2;
sk->timeout=TIME_WRITE;
reset_timer ((struct timer *)&sk->time_wait);
}
else
{
skb->sk = sk;
}
if (dev->up)
{
if (sk)
dev->queue_xmit(skb, dev, sk->priority);
else
dev->queue_xmit (skb, dev, SOPRI_NORMAL);
}
else
{
/* device is down: drop the buffer if we own it */
if (free)
free_skb (skb, FREE_WRITE);
}
}