The Linux 0.98 Network Subsystem
Overview
An application creates a socket handle with socket, binds an address with bind, listens with listen, initiates a connection with connect, accepts a connection with accept, reads data with read or recv, and sends data with write or send. Let's look at how these functions are implemented in the kernel.
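Before diving into the kernel code, here is a minimal user-space AF_UNIX echo server (modern POSIX style, error handling omitted; the socket path /tmp/demo.sock is only an illustration, not something taken from the kernel sources) that exercises exactly this call sequence:
/* Minimal AF_UNIX echo server -- an illustration of the call sequence above,
 * not code from the Linux 0.98 tree. Error handling is omitted for brevity. */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/un.h>

int main(void)
{
    struct sockaddr_un addr;
    char buf[128];
    ssize_t n;
    int lfd, cfd;

    lfd = socket(AF_UNIX, SOCK_STREAM, 0);              /* socket() -> sock_socket() */

    memset(&addr, 0, sizeof(addr));
    addr.sun_family = AF_UNIX;
    strcpy(addr.sun_path, "/tmp/demo.sock");            /* illustrative path */
    unlink(addr.sun_path);                              /* remove a stale socket file */
    bind(lfd, (struct sockaddr *)&addr, sizeof(addr));  /* bind()   -> sock_bind()   */

    listen(lfd, 5);                                     /* listen() -> sock_listen() */
    cfd = accept(lfd, NULL, NULL);                      /* accept() -> sock_accept() */

    while ((n = read(cfd, buf, sizeof(buf))) > 0)       /* read()   -> sock_read()   */
        write(cfd, buf, n);                             /* write()  -> sock_write()  */

    close(cfd);
    close(lfd);
    return 0;
}
A matching client would call socket, connect, and then write and read against the same path.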
Code Analysis
In Linux everything is a file, and the network subsystem is likewise built on top of the file system. So let's first review the per-process open-file structure: every socket is also backed by a file structure, indexed by its fd.
struct file {
unsigned short f_mode;
unsigned short f_flags;
unsigned short f_count;
unsigned short f_reada;
unsigned short f_rdev; /* needed for /dev/tty */
struct inode * f_inode;
struct file_operations * f_op;
off_t f_pos;
};
Now on to the network subsystem. During kernel initialization, sock_init is called to set it up.
First, the kernel's socket structure, defined in net/kern_sock.h:
struct socket {
short type; /* SOCK_STREAM, ... */
socket_state state; // state of this socket
long flags;
struct proto_ops *ops; /* protocols do most everything */
void *data; /* protocol data */ // for AF_UNIX this points to a struct unix_proto_data
struct socket *conn; /* server socket connected to */
struct socket *iconn; /* incomplete client connections */ // queue of client sockets waiting to be accepted; think of it as the half-open connection queue
struct socket *next;
struct wait_queue **wait; /* ptr to place to wait on */
void *dummy; // points to the socket's inode
};
In net/socket.c:
// supported protocol families
static struct {
short family;
char *name;
struct proto_ops *ops;
} proto_table[] = {
{AF_UNIX, "AF_UNIX", &unix_proto_ops}, // Unix domain sockets
#ifdef INET_SOCKETS
{AF_INET, "AF_INET", &inet_proto_ops}, // Internet sockets
#endif
};
void
sock_init(void)
{
struct socket *sock;
int i, ok;
// initialize the kernel's sockets[] array
for (sock = sockets; sock <= last_socket; ++sock)
sock->state = SS_FREE;
// initialize each supported protocol family
for (i = ok = 0; i < NPROTO; ++i) {
printk("sock_init: initializing family %d (%s)\n",
proto_table[i].family, proto_table[i].name);
if ((*proto_table[i].ops->init)() < 0) {
printk("sock_init: init failed.\n",
proto_table[i].family);
proto_table[i].family = -1;
}
else
++ok;
}
if (!ok)
printk("sock_init: warning: no protocols initialized\n");
return;
}
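The sockets and last_socket symbols that sock_init walks over are not shown above; judging from the loop bounds, they form a fixed-size static pool, declared along these lines (a sketch, not the verbatim source; NSOCKETS is the compile-time pool size):
/* Sketch of the socket pool iterated by sock_init -- inferred, not verbatim. */
static struct socket sockets[NSOCKETS];
#define last_socket (sockets + NSOCKETS - 1)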
The AF_UNIX (Unix domain socket) implementation lives in net/unix.c:
// per-socket data for a Unix domain socket
static struct unix_proto_data {
int refcnt; /* cnt of reference 0=free */ // reference count; 0 means the slot is free
struct socket *socket; /* socket we're bound to */
int protocol;
struct sockaddr_un sockaddr_un;
short sockaddr_len; /* >0 if name bound */
char *buf;
int bp_head, bp_tail;
struct inode *inode;
struct unix_proto_data *peerupd;
} unix_datas[NSOCKETS];
struct proto_ops unix_proto_ops = {
unix_proto_init,
unix_proto_create,
unix_proto_dup,
unix_proto_release,
unix_proto_bind,
unix_proto_connect,
unix_proto_socketpair,
unix_proto_accept,
unix_proto_getname,
unix_proto_read,
unix_proto_write,
unix_proto_select,
unix_proto_ioctl,
unix_proto_listen,
unix_proto_send,
unix_proto_recv,
unix_proto_sendto,
unix_proto_recvfrom,
unix_proto_shutdown,
unix_proto_setsockopt,
unix_proto_getsockopt,
NULL /* unix_proto_fcntl. */
};
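For orientation, the initializer order above corresponds one-to-one to the callback slots of struct proto_ops declared in net/kern_sock.h. That declaration is not reproduced in this article; a sketch covering only the slots exercised by the code quoted here, with signatures inferred from those call sites, might look like this (slots whose signatures never appear in the quoted code are left unprototyped):
/* Sketch of struct proto_ops, inferred from the call sites quoted in this
 * article -- not the verbatim net/kern_sock.h declaration. */
struct proto_ops {
    int (*init)(void);
    int (*create)(struct socket *sock, int protocol);
    int (*dup)(struct socket *newsock, struct socket *oldsock);
    int (*release)();        /* signature not shown in the quoted code */
    int (*bind)(struct socket *sock, struct sockaddr *umyaddr, int sockaddr_len);
    int (*connect)(struct socket *sock, struct sockaddr *uservaddr,
                   int sockaddr_len, int flags);
    int (*socketpair)();     /* signature not shown in the quoted code */
    int (*accept)(struct socket *sock, struct socket *newsock, int flags);
    int (*getname)(struct socket *sock, struct sockaddr *usockaddr,
                   int *usockaddr_len, int peer);
    int (*read)(struct socket *sock, char *ubuf, int size, int nonblock);
    int (*write)(struct socket *sock, char *ubuf, int size, int nonblock);
    /* select, ioctl, listen, send, recv, sendto, recvfrom, shutdown,
     * setsockopt, getsockopt and fcntl follow, in the same order as the
     * unix_proto_ops initializer above. */
};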
// initialize the array of Unix socket data structures
static int
unix_proto_init(void)
{
struct unix_proto_data *upd;
PRINTK("unix_proto_init: initializing...\n");
for (upd = unix_datas; upd <= last_unix_data; ++upd)
upd->refcnt = 0;
return 0;
}
/*
* upon a create, we allocate an empty protocol data, and grab a page to
* buffer writes
*/
static int
unix_proto_create(struct socket *sock, int protocol)
{
struct unix_proto_data *upd;
PRINTK("unix_proto_create: socket 0x%x, proto %d\n", sock, protocol);
if (protocol != 0) {
PRINTK("unix_proto_create: protocol != 0\n");
return -EINVAL;
}
// allocate a unix_proto_data slot from the array
if (!(upd = unix_data_alloc())) {
printk("unix_proto_create: can't allocate buffer\n");
return -ENOMEM;
}
// allocate one page of physical memory for the data buffer
if (!(upd->buf = (char *)get_free_page(GFP_USER))) {
printk("unix_proto_create: can't get page!\n");
unix_data_deref(upd);
return -ENOMEM;
}
upd->protocol = protocol;
upd->socket = sock;
UN_DATA(sock) = upd;
PRINTK("unix_proto_create: allocated data 0x%x\n", upd);
return 0;
}
Linux 0.98 funnels the calls listed in the overview through the sys_socketcall system call; socket itself ends up in the kernel's sock_socket function.
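The sys_socketcall dispatcher itself is not reproduced in this article. Conceptually it receives a call number plus a pointer to the packed argument array in user space, fetches the arguments with get_fs_long, and hands them to the matching sock_* helper. A rough sketch of that dispatch pattern (the SYS_* constants and the exact argument handling here are assumptions, not the verbatim 0.98 code):
/* Illustrative sketch of the sys_socketcall dispatch pattern -- not verbatim. */
int
sys_socketcall(int call, unsigned long *args)
{
    switch (call) {
    case SYS_SOCKET:                     /* socket(family, type, protocol) */
        verify_area(args, 3 * sizeof(long));
        return sock_socket(get_fs_long(args + 0),
                           get_fs_long(args + 1),
                           get_fs_long(args + 2));
    case SYS_BIND:                       /* bind(fd, umyaddr, addrlen) */
        verify_area(args, 3 * sizeof(long));
        return sock_bind(get_fs_long(args + 0),
                         (struct sockaddr *)get_fs_long(args + 1),
                         get_fs_long(args + 2));
    /* SYS_CONNECT, SYS_LISTEN, SYS_ACCEPT, SYS_SEND, SYS_RECV, ...
     * are dispatched the same way. */
    default:
        return -EINVAL;
    }
}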
/*
* perform the socket system call. we locate the appropriate family, then
* create a fresh socket.
*/
static int
sock_socket(int family, int type, int protocol)
{
int i, fd;
struct socket *sock;
struct proto_ops *ops;
PRINTK("sys_socket: family = %d (%s), type = %d, protocol = %d\n",
family, family_name(family), type, protocol);
/*
* locate the correct protocol family
*/
// is this protocol family supported?
for (i = 0; i < NPROTO; ++i)
if (proto_table[i].family == family)
break;
if (i == NPROTO) {
PRINTK("sys_socket: family not found\n");
return -EINVAL;
}
ops = proto_table[i].ops;
/*
* check that this is a type that we know how to manipulate and
* the protocol makes sense here. the family can still reject the
* protocol later.
*/
// is this a socket type we support?
if ((type != SOCK_STREAM &&
type != SOCK_DGRAM &&
type != SOCK_SEQPACKET &&
type != SOCK_RAW) ||
protocol < 0)
return -EINVAL;
/*
* allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select an appropriate
* default.
*/
// synchronously (possibly blocking) allocate a socket structure and its inode from the sockets[] array
if (!(sock = sock_alloc(1))) {
printk("sys_socket: no more sockets\n");
return -EAGAIN;
}
sock->type = type;
// as the proto_table array above shows, Unix sockets use unix_proto_ops,
// so the create callback invoked here is unix_proto_create
sock->ops = ops;
if ((i = sock->ops->create(sock, protocol)) < 0) {
sock_release(sock);
return i;
}
// allocate a file descriptor for the socket
if ((fd = get_fd(SOCK_INODE(sock))) < 0) {
sock_release(sock);
return -EINVAL;
}
return fd;
}
// find a free slot (its index becomes fd) in the current process's open-file array
// and point that file's f_inode at the given inode
static int
get_fd(struct inode *inode)
{
int fd, i;
struct file *file;
/*
* find a file descriptor suitable for return to the user.
*/
for (fd = 0; fd < NR_OPEN; ++fd)
if (!current->filp[fd])
break;
if (fd == NR_OPEN)
return -1;
current->close_on_exec &= ~(1 << fd);
for (file = file_table, i = 0; i < NR_FILE; ++i, ++file)
if (!file->f_count)
break;
if (i == NR_FILE)
return -1;
current->filp[fd] = file;
file->f_op = &socket_file_ops;
file->f_mode = 3;
file->f_flags = 0;
file->f_count = 1;
file->f_inode = inode;
file->f_pos = 0;
return fd;
}
Given the file descriptor fd, the kernel finds the corresponding open-file structure in current->filp[fd]. That file's f_inode is the inode belonging to the socket structure, i.e. the value stored in socket.dummy; socket.data points to the protocol-specific data structure, and socket.ops is the protocol family's callback table.
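The two accessor macros used throughout the quoted code are not reproduced in this article; judging from how they are used (SOCK_INODE(sock) yields the inode handed to get_fd, and UN_DATA(sock) = upd stores the protocol data), they presumably look something like this:
/* Sketch of the accessor macros, inferred from their call sites -- not copied
 * from the 0.98 headers. Assigning through UN_DATA() relies on the old GCC
 * "cast as lvalue" extension that kernel code of this era used freely. */
#define SOCK_INODE(S)  ((struct inode *)(S)->dummy)              /* net/kern_sock.h */
#define UN_DATA(SOCK)  ((struct unix_proto_data *)(SOCK)->data)  /* net/unix.c      */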
Once the socket handle has been created, the next step is to bind an address. In net/socket.c:
static int
sock_bind(int fd, struct sockaddr *umyaddr, int addrlen)
{
struct socket *sock;
int i;
PRINTK("sys_bind: fd = %d\n", fd);
// look up the socket structure for this fd
if (!(sock = sockfd_lookup(fd, NULL)))
return -EBADF;
// dispatch to the protocol family's bind callback; for a Unix socket this calls unix_proto_bind
if ((i = sock->ops->bind(sock, umyaddr, addrlen)) < 0) {
PRINTK("sys_bind: bind failed\n");
return i;
}
return 0;
}
static inline struct socket *
socki_lookup(struct inode *inode)
{
struct socket *sock;
// walk all socket structures, looking for the one that owns this inode
for (sock = sockets; sock <= last_socket; ++sock)
if (sock->state != SS_FREE && SOCK_INODE(sock) == inode)
return sock;
return NULL;
}
static inline struct socket *
sockfd_lookup(int fd, struct file **pfile)
{
struct file *file;
if (fd < 0 || fd >= NR_OPEN || !(file = current->filp[fd]))
return NULL;
if (pfile)
*pfile = file;
return socki_lookup(file->f_inode);
}
In net/unix.c:
static int
unix_proto_bind(struct socket *sock, struct sockaddr *umyaddr,
int sockaddr_len)
{
struct unix_proto_data *upd = UN_DATA(sock);
char fname[sizeof(((struct sockaddr_un *)0)->sun_path) + 1];
int i;
unsigned long old_fs;
PRINTK("unix_proto_bind: socket 0x%x, len=%d\n", sock,
sockaddr_len);
if (sockaddr_len <= UN_PATH_OFFSET ||
sockaddr_len >= sizeof(struct sockaddr_un)) {
PRINTK("unix_proto_bind: bad length %d\n", sockaddr_len);
return -EINVAL;
}
if (upd->sockaddr_len || upd->inode) {
printk("unix_proto_bind: already bound!\n");
return -EINVAL;
}
verify_area(umyaddr, sockaddr_len);
memcpy_fromfs(&upd->sockaddr_un, umyaddr, sockaddr_len);
if (upd->sockaddr_un.sun_family != AF_UNIX) {
PRINTK("unix_proto_bind: family is %d, not AF_UNIX (%d)\n",
upd->sockaddr_un.sun_family, AF_UNIX);
return -EINVAL;
}
memcpy(fname, upd->sockaddr_un.sun_path, sockaddr_len-UN_PATH_OFFSET);
fname[sockaddr_len-UN_PATH_OFFSET] = '\0';
old_fs = get_fs();
set_fs(get_ds());
// call do_mknod (fs/namei.c) to create the inode for the Unix socket's file;
// in other words, binding a Unix socket creates a file in the file system.
i = do_mknod(fname, S_IFSOCK | 0777, 0);
// if that succeeded, call open_namei (fs/namei.c) to obtain that file's inode.
if (i == 0)
i = open_namei(fname, 0, S_IFSOCK, &upd->inode, NULL);
set_fs(old_fs);
if (i < 0) {
printk("unix_proto_bind: can't open socket %s\n", fname);
return i;
}
upd->sockaddr_len = sockaddr_len; /* now its legal */
PRINTK("unix_proto_bind: bound socket address: ");
#ifdef SOCK_DEBUG
sockaddr_un_printk(&upd->sockaddr_un, upd->sockaddr_len);
#endif
return 0;
}
So bind finds the socket structure via the fd and calls the corresponding protocol family's bind callback. For a Unix socket this creates and opens a file, and that file's inode is stored in the Unix socket's upd->inode.
The server program then calls listen. For the Unix socket family, listen does nothing substantial; it merely marks the socket structure with sock->flags |= SO_ACCEPTCON; (see the sketch below).
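A sketch of that wrapper, paraphrased from the description above rather than quoted from net/socket.c:
/* Sketch of sock_listen, paraphrased from the description above --
 * not the verbatim net/socket.c source. */
static int
sock_listen(int fd, int backlog)
{
    struct socket *sock;

    if (!(sock = sockfd_lookup(fd, NULL)))
        return -EBADF;
    if (sock->state != SS_UNCONNECTED)
        return -EINVAL;
    if (sock->ops && sock->ops->listen)
        sock->ops->listen(sock, backlog);  /* unix_proto_listen is essentially a no-op */
    sock->flags |= SO_ACCEPTCON;           /* mark the socket as willing to accept */
    return 0;
}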
The client program, in turn, calls connect to initiate a connection. In the kernel this is sock_connect in net/socket.c, which is essentially a thin wrapper around the protocol family's sock->ops->connect callback.
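A sketch of that wrapper, again paraphrased from the description rather than quoted verbatim:
/* Sketch of sock_connect, paraphrased from the description above --
 * not the verbatim net/socket.c source. */
static int
sock_connect(int fd, struct sockaddr *uservaddr, int addrlen)
{
    struct socket *sock;
    struct file *file;

    if (!(sock = sockfd_lookup(fd, &file)))
        return -EBADF;
    if (sock->state != SS_UNCONNECTED)
        return -EINVAL;
    /* for AF_UNIX this lands in unix_proto_connect, shown below */
    return sock->ops->connect(sock, uservaddr, addrlen, file->f_flags);
}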
In net/unix.c:
static int
unix_proto_connect(struct socket *sock, struct sockaddr *uservaddr,
int sockaddr_len, int flags)
{
int i;
struct unix_proto_data *serv_upd;
struct sockaddr_un sockun;
PRINTK("unix_proto_connect: socket 0x%x, servlen=%d\n", sock,
sockaddr_len);
if (sockaddr_len <= UN_PATH_OFFSET ||
sockaddr_len >= sizeof(struct sockaddr_un)) {
PRINTK("unix_proto_connect: bad length %d\n", sockaddr_len);
return -EINVAL;
}
verify_area(uservaddr, sockaddr_len);
memcpy_fromfs(&sockun, uservaddr, sockaddr_len);
if (sockun.sun_family != AF_UNIX) {
PRINTK("unix_proto_connect: family is %d, not AF_UNIX (%d)\n",
sockun.sun_family, AF_UNIX);
return -EINVAL;
}
// use the file the server's Unix socket is bound to in order to look up the server's unix_proto_data directly in the kernel
if (!(serv_upd = unix_data_lookup(&sockun, sockaddr_len))) {
PRINTK("unix_proto_connect: can't locate peer\n");
return -EINVAL;
}
if ((i = sock_awaitconn(sock, serv_upd->socket)) < 0) {
PRINTK("unix_proto_connect: can't await connection\n");
return i;
}
unix_data_ref(UN_DATA(sock->conn));
UN_DATA(sock)->peerupd = UN_DATA(sock->conn); /* ref server */
return 0;
}
int
sock_awaitconn(struct socket *mysock, struct socket *servsock)
{
struct socket *last;
PRINTK("sock_awaitconn: trying to connect socket 0x%x to 0x%x\n",
mysock, servsock);
if (!(servsock->flags & SO_ACCEPTCON)) {
PRINTK("sock_awaitconn: server not accepting connections\n");
return -EINVAL;
}
/*
* put ourselves on the server's incomplete connection queue.
*/
mysock->next = NULL;
cli();
// the server keeps a queue, so several clients can be waiting to connect
if (!(last = servsock->iconn))
servsock->iconn = mysock;
else {
while (last->next)
last = last->next;
last->next = mysock;
}
mysock->state = SS_CONNECTING;
mysock->conn = servsock;
sti();
/*
* wake up server, then await connection. server will set state to
* SS_CONNECTED if we're connected.
*/
// wake up the server process to handle the connection
wake_up(servsock->wait);
if (mysock->state != SS_CONNECTED) {
// the client process blocks until the server accepts the connection
interruptible_sleep_on(mysock->wait);
if (mysock->state != SS_CONNECTED) {
/*
* if we're not connected we could have been
* 1) interrupted, so we need to remove ourselves
* from the server list
* 2) rejected (mysock->conn == NULL), and have
* already been removed from the list
*/
if (mysock->conn == servsock) {
cli();
if ((last = servsock->iconn) == mysock)
servsock->iconn = mysock->next;
else {
while (last->next != mysock)
last = last->next;
last->next = mysock->next;
}
sti();
}
return mysock->conn ? -EINTR : -EACCES;
}
}
return 0;
}
After the client has initiated a connection with connect, the server program calls accept to take it.
// net/socket.c
static int
sock_accept(int fd, struct sockaddr *upeer_sockaddr, int *upeer_addrlen)
{
struct file *file;
struct socket *sock, *newsock;
int i;
PRINTK("sys_accept: fd = %d\n", fd);
if (!(sock = sockfd_lookup(fd, &file)))
return -EBADF;
if (sock->state != SS_UNCONNECTED) {
PRINTK("sys_accept: socket isn't unconnected\n");
return -EINVAL;
}
if (!(sock->flags & SO_ACCEPTCON)) {
PRINTK("sys_accept: socket not accepting connections!\n");
return -EINVAL;
}
// allocate a fresh socket structure for the server's new connection
if (!(newsock = sock_alloc(0))) {
printk("sys_accept: no more sockets\n");
return -EAGAIN;
}
newsock->type = sock->type;
newsock->ops = sock->ops;
if ((i = sock->ops->dup(newsock, sock)) < 0) {
sock_release(newsock);
return i;
}
// find a free slot in the process's open-file table for the new connection's socket and return its index as fd
if ((fd = get_fd(SOCK_INODE(newsock))) < 0) {
sock_release(newsock);
return -EINVAL;
}
// for a Unix socket this calls unix_proto_accept, which takes one connection off the pending (half-open) queue
i = newsock->ops->accept(sock, newsock, file->f_flags);
if ( i < 0)
{
sys_close (fd);
return (i);
}
PRINTK("sys_accept: connected socket 0x%x via 0x%x\n",
sock, newsock);
if (upeer_sockaddr)
newsock->ops->getname(newsock, upeer_sockaddr,
upeer_addrlen, 1);
return fd;
}
// net/unix.c
static int
unix_proto_accept(struct socket *sock, struct socket *newsock, int flags)
{
struct socket *clientsock;
PRINTK("unix_proto_accept: socket 0x%x accepted via socket 0x%x\n",
sock, newsock);
/*
* if there aren't any sockets awaiting connection, then wait for
* one, unless nonblocking
*/
while (!(clientsock = sock->iconn)) {
if (flags & O_NONBLOCK)
return -EAGAIN;
interruptible_sleep_on(sock->wait);
if (current->signal & ~current->blocked) {
PRINTK("sys_accept: sleep was interrupted\n");
return -ERESTARTSYS;
}
}
/*
* great. finish the connection relative to server and client,
* wake up the client and return the new fd to the server
*/
sock->iconn = clientsock->next;
clientsock->next = NULL;
newsock->conn = clientsock;
clientsock->conn = newsock;
clientsock->state = SS_CONNECTED;
newsock->state = SS_CONNECTED;
wake_up(clientsock->wait);
unix_data_ref (UN_DATA(newsock->conn));
UN_DATA(newsock)->peerupd = UN_DATA(newsock->conn);
return 0;
}
Once the connection is established, data can be read and written with read and write. write enters the kernel through sys_write, and read through sys_read.
// fs/read_write.c
int sys_read(unsigned int fd,char * buf,unsigned int count)
{
struct file * file;
struct inode * inode;
if (fd>=NR_OPEN || !(file=current->filp[fd]) || !(inode=file->f_inode))
return -EBADF;
if (!(file->f_mode & 1))
return -EBADF;
if (!count)
return 0;
verify_area(buf,count);
if (file->f_op && file->f_op->read)
return file->f_op->read(inode,file,buf,count); // for a socket this actually calls sock_read in net/socket.c
return -EINVAL;
}
int sys_write(unsigned int fd,char * buf,unsigned int count)
{
struct file * file;
struct inode * inode;
if (fd>=NR_OPEN || !(file=current->filp[fd]) || !(inode=file->f_inode))
return -EBADF;
if (!(file->f_mode&2))
return -EBADF;
if (!count)
return 0;
if (file->f_op && file->f_op->write)
return file->f_op->write(inode,file,buf,count); // for a socket this actually calls sock_write in net/socket.c
return -EINVAL;
}
// When sock_socket created the socket, it called get_fd to allocate an open-file structure
// and set the file's callbacks there: file->f_op = &socket_file_ops;
// so the f_op->read / f_op->write calls above land in sock_read and sock_write.
// net/socket.c
static int
sock_read(struct inode *inode, struct file *file, char *ubuf, int size)
{
struct socket *sock;
PRINTK("sock_read: buf=0x%x, size=%d\n", ubuf, size);
if (!(sock = socki_lookup(inode))) {
printk("sock_read: can't find socket for inode!\n");
return -EBADF;
}
if (sock->flags & SO_ACCEPTCON)
return -EINVAL;
return sock->ops->read(sock, ubuf, size, (file->f_flags & O_NONBLOCK));
}
static int
sock_write(struct inode *inode, struct file *file, char *ubuf, int size)
{
struct socket *sock;
PRINTK("sock_write: buf=0x%x, size=%d\n", ubuf, size);
if (!(sock = socki_lookup(inode))) {
printk("sock_write: can't find socket for inode!\n");
return -EBADF;
}
if (sock->flags & SO_ACCEPTCON)
return -EINVAL;
return sock->ops->write(sock, ubuf, size,(file->f_flags & O_NONBLOCK));
}
// When sock_socket created the socket, it looked the callback table up in proto_table by
// protocol family; for Unix sockets that is unix_proto_ops in net/unix.c (see sock->ops = ops;),
// so for Unix sockets the reads and writes above end up in unix_proto_read and unix_proto_write.
/*
* we read from our own buf.
*/
static int
unix_proto_read(struct socket *sock, char *ubuf, int size, int nonblock)
{
struct unix_proto_data *upd;
int todo, avail;
if ((todo = size) <= 0)
return 0;
upd = UN_DATA(sock);
while (!(avail = UN_BUF_AVAIL(upd))) {
if (sock->state != SS_CONNECTED) {
PRINTK("unix_proto_read: socket not connected\n");
return (sock->state == SS_DISCONNECTING) ? 0 : -EINVAL;
}
PRINTK("unix_proto_read: no data available...\n");
if (nonblock)
return -EAGAIN;
interruptible_sleep_on(sock->wait);
if (current->signal & ~current->blocked) {
PRINTK("unix_proto_read: interrupted\n");
return -ERESTARTSYS;
}
if (sock->state == SS_DISCONNECTING) {
PRINTK("unix_proto_read: disconnected\n");
return 0;
}
}
/*
* copy from the read buffer into the user's buffer, watching for
* wraparound. then we wake up the writer
*/
do {
int part, cando;
if (avail <= 0) {
PRINTK("unix_proto_read: AVAIL IS NEGATIVE!!!\n");
send_sig(SIGKILL,current,1);
return -EINTR;
}
if ((cando = todo) > avail)
cando = avail;
if (cando > (part = BUF_SIZE - upd->bp_tail))
cando = part;
PRINTK("unix_proto_read: avail=%d, todo=%d, cando=%d\n",
avail, todo, cando);
verify_area(ubuf, cando);
memcpy_tofs(ubuf, upd->buf + upd->bp_tail, cando);
upd->bp_tail = (upd->bp_tail + cando) & (BUF_SIZE-1);
ubuf += cando;
todo -= cando;
if (sock->state == SS_CONNECTED)
wake_up(sock->conn->wait);
avail = UN_BUF_AVAIL(upd);
} while (todo && avail);
return size - todo;
}
/*
* we write to our peer's buf. when we connected we ref'd this peer so we
* are safe that the buffer remains, even after the peer has disconnected,
* which we check other ways.
*/
static int
unix_proto_write(struct socket *sock, char *ubuf, int size, int nonblock)
{
struct unix_proto_data *pupd;
int todo, space;
if ((todo = size) <= 0)
return 0;
if (sock->state != SS_CONNECTED) {
PRINTK("unix_proto_write: socket not connected\n");
if (sock->state == SS_DISCONNECTING) {
send_sig(SIGPIPE,current,1);
return -EINTR;
}
return -EINVAL;
}
pupd = UN_DATA(sock)->peerupd; /* safer than sock->conn */
while (!(space = UN_BUF_SPACE(pupd))) {
PRINTK("unix_proto_write: no space left...\n");
if (nonblock)
return -EAGAIN;
interruptible_sleep_on(sock->wait);
if (current->signal & ~current->blocked) {
PRINTK("unix_proto_write: interrupted\n");
return -ERESTARTSYS;
}
if (sock->state == SS_DISCONNECTING) {
PRINTK("unix_proto_write: disconnected (SIGPIPE)\n");
send_sig(SIGPIPE,current,1);
return -EINTR;
}
}
/*
* copy from the user's buffer to the write buffer, watching for
* wraparound. then we wake up the reader
*/
do {
int part, cando;
if (space <= 0) {
PRINTK("unix_proto_write: SPACE IS NEGATIVE!!!\n");
send_sig(SIGKILL,current,1);
return -EINTR;
}
/*
* we may become disconnected inside this loop, so watch
* for it (peerupd is safe until we close)
*/
if (sock->state == SS_DISCONNECTING) {
send_sig(SIGPIPE,current,1);
return -EINTR;
}
if ((cando = todo) > space)
cando = space;
if (cando > (part = BUF_SIZE - pupd->bp_head))
cando = part;
PRINTK("unix_proto_write: space=%d, todo=%d, cando=%d\n",
space, todo, cando);
verify_area(ubuf, cando);
memcpy_fromfs(pupd->buf + pupd->bp_head, ubuf, cando);
pupd->bp_head = (pupd->bp_head + cando) & (BUF_SIZE-1);
ubuf += cando;
todo -= cando;
if (sock->state == SS_CONNECTED)
wake_up(sock->conn->wait);
space = UN_BUF_SPACE(pupd);
} while (todo && space);
return size - todo;
}
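Both loops above treat the one-page buf as a circular buffer: bp_head is the write index, bp_tail is the read index, and both wrap with & (BUF_SIZE-1), which only works because the buffer size is a power of two (one page). The helper macros are not reproduced in this article; inferred from that usage, they are presumably along these lines:
/* Sketch of the circular-buffer helpers used above, inferred from the code --
 * not the verbatim net/unix.c definitions. One slot is kept unused so that
 * bp_head == bp_tail unambiguously means "buffer empty". */
#define BUF_SIZE           PAGE_SIZE
#define UN_BUF_AVAIL(UPD)  (((UPD)->bp_head - (UPD)->bp_tail) & (BUF_SIZE - 1))
#define UN_BUF_SPACE(UPD)  ((BUF_SIZE - 1) - UN_BUF_AVAIL(UPD))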
Summary
- At system initialization, sock_init is called; it invokes the init function of each protocol family the kernel supports (AF_UNIX, AF_INET): unix_proto_ops.init (unix_proto_init) and inet_proto_ops.init (ip_proto_init).